## Amazon and IMDB review sentiment classification using SpaCy

In [None]:
# Application of NLP
# text classification
# spa filters
 #voice text nessaging sentiment analysis
    # search suggestion 
    #search corrections 
    # automatic review analysis system
    # machine translation

In [None]:
# Data Cleaninig options
# case normalization
# removing punct, stop words or specail symbols
# lemmatization or stemming
# POS tagging
# entity detection
# bag of words
# TF_IDF

In [None]:
# Bag-of-words simplest word embedding techniques

In [1]:
import spacy
from spacy import displacy

In [2]:
nlp = spacy.load('en_core_web_sm')

In [10]:
text= "this is first sentence for today. those we are going to analyze"

In [4]:
doc=nlp(text)

In [5]:
for token in doc:
    print(token)

this
is
first
sentence
for
today
those
we
are
going
to
analyze


In [6]:
sentence =nlp.create_pipe('sentencizer')

In [7]:
nlp.add_pipe(sentence, before = 'parser') # beofre parsing coz token will become a single word 

In [11]:
doc = nlp(text)

In [12]:
for sent in doc.sents:
    print(sent)

this is first sentence for today.
those we are going to analyze


In [13]:
from spacy.lang.en.stop_words import STOP_WORDS

In [14]:
stop_words = list(STOP_WORDS)

In [15]:
len(stop_words)

326

In [16]:
# removing stop words from the sentences
for token in doc:
    if token.is_stop == False:
        print(token)

sentence
today
.
going
analyze


## Lemmatization

In [17]:
 doc = nlp('run runs running runner')

In [18]:
for lem in doc:
    print(lem.text, lem.lemma_)

run run
runs run
running run
runner runner


## POS tagging

In [19]:
doc = nlp('All is well from India')

In [20]:
for token in doc:
    print(token.text, token.pos_)

All DET
is AUX
well ADJ
from ADP
India PROPN


In [21]:
displacy.render(doc,style='dep')

## Entity Detection

In [25]:
doc="Tambola, also known as Tombola, Indian Bingo or Housie is a popular game that is believed to be originated in Italy in early 1500s. ... Each player must buy at least one ticket to enter a game. Tambola is played with Numbers (1-90) being called out one at a time and players striking out those Numbers on their Tickets"

In [26]:
text =nlp(doc)

In [27]:
displacy.render(text, style='ent')

## Text Classification

In [28]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report

  return f(*args, **kwds)


In [30]:
yelp = pd.read_csv('yelp_labelled.txt',sep='\t',header=None )

In [35]:
yelp.head()

Unnamed: 0,Review,Sentiment
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [34]:
col_names = ['Review','Sentiment']
yelp.columns=col_names

In [36]:
yelp.shape

(1000, 2)

In [37]:
amazon_text=pd.read_csv('amazon_cells_labelled.txt',sep='\t',header=None)
amazon_text.columns=col_names

In [38]:
amazon_text.shape

(1000, 2)

In [40]:
imdb_rev = pd.read_csv('imdb_labelled.txt',sep='\t',header=None)
imdb_rev.columns=col_names

In [42]:
imdb_rev.shape

(748, 2)

In [43]:
data = yelp.append([amazon_text,imdb_rev],ignore_index=True)

In [44]:
data.head()

Unnamed: 0,Review,Sentiment
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [45]:
data['Sentiment'].value_counts()

1    1386
0    1362
Name: Sentiment, dtype: int64

In [47]:
data.isnull().sum()

Review       0
Sentiment    0
dtype: int64

# tokenization

In [48]:
import string # to get punctuations

In [49]:
punct = string.punctuation

In [50]:
punct

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [52]:
def text_data_cleaning(sentence):
    doc = nlp(sentence)
    
    tokens=[]
    for token in doc:
        if token.lemma_!="-PRON-":
            temp = token.lemma_.lower().strip() #strip out the extra characters
        else:
            temp = token.lower_
        tokens.append(temp)
    cleaned_tokens = []
    for token in tokens:
        if token not in stop_words and token not in punct:
            cleaned_tokens.append(token)
    return cleaned_tokens

In [54]:
text_data_cleaning("Hello what's up how is it going with you in this pandemic")

['hello', 'pandemic']

## Vectorization Feature Engineering(Tf-Idf)

In [57]:
from sklearn.svm import LinearSVC

In [58]:
tfidf = TfidfVectorizer(tokenizer = text_data_cleaning)
classifier = LinearSVC()

In [59]:
X = data['Review']
Y= data['Sentiment']

In [60]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.2,random_state= 42)

In [62]:
X_train.shape, Y_test.shape

((2198,), (550,))

In [63]:
clf = Pipeline([('tfidf',tfidf),('clf',classifier)])

In [64]:
clf.fit(X_train,Y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function text_data_cleaning at 0x7f263d28e830>,
                                 use_idf=True, vocabulary=None)),
                ('clf',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_intercept

In [65]:
Y_pred = clf.predict(X_test)

In [68]:
print(classification_report(Y_test,Y_pred))

              precision    recall  f1-score   support

           0       0.77      0.81      0.79       285
           1       0.78      0.74      0.76       265

    accuracy                           0.78       550
   macro avg       0.78      0.78      0.78       550
weighted avg       0.78      0.78      0.78       550



In [69]:
confusion_matrix(Y_test,Y_pred)

array([[230,  55],
       [ 68, 197]])

In [70]:
clf.predict(['Hell yeah, we are perfectly alright'])

array([0])