In [155]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from gensim.models.word2vec import Word2Vec

### Read Yelp reviews data set and check the contents of dataframe

In [156]:
# Load the Yelp reviews CSV from the working directory into a DataFrame.
data=pd.read_csv('yelp.csv')
# Show the last five rows as a quick sanity check of the columns and values.
data.tail()

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny
9995,VY_tvNUCCXGXQeSvJl757Q,2012-07-28,Ubyfp2RSDYW0g7Mbr8N3iA,3,First visit...Had lunch here today - used my G...,review,_eqQoPtQ3e3UxLE4faT6ow,1,2,0
9996,EKzMHI1tip8rC1-ZAy64yg,2012-01-18,2XyIOQKbVFb6uXQdJ0RzlQ,4,Should be called house of deliciousness!\n\nI ...,review,ROru4uk5SaYc3rg8IU7SQw,0,0,0
9997,53YGfwmbW73JhFiemNeyzQ,2010-11-16,jyznYkIbpqVmlsZxSDSypA,4,I recently visited Olive and Ivy for business ...,review,gGbN1aKQHMgfQZkqlsuwzg,0,0,0
9998,9SKdOoDHcFoxK5ZtsgHJoA,2012-12-02,5UKq9WQE1qQbJ0DJbc-B6Q,2,My nephew just moved to Scottsdale recently so...,review,0lyVoNazXa20WzUyZPLaQQ,0,0,0
9999,pF7uRzygyZsltbmVpjIyvw,2010-10-16,vWSmOhg2ID1MNZHaWapGbA,5,4-5 locations.. all 4.5 star average.. I think...,review,KSBFytcdjPKZgXKQnYQdkA,0,0,0


### Create custom analyzer to clean punctuation, stopwords, numerics, stem words

In [157]:
import string
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from functools import lru_cache

# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


@lru_cache(maxsize=None)
def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loaded once and then cached."""
    return frozenset(stopwords.words('english'))


def clean_review_text(review):
    """
    Turn one raw review string into a list of stemmed tokens.

    1. Remove punctuation
    2. Remove stop words and tokens that are not purely alphabetic
    3. Apply the Snowball stemmer to strip morphological affixes, leaving only the word stem

    Parameters
    ----------
    review : str
        Raw review text.

    Returns
    -------
    list of str
        Stemmed tokens in their original order.
    """
    stemmer = SnowballStemmer("english")
    # The stop-word list used to be re-read and scanned linearly for every
    # single token; a cached frozenset makes each lookup O(1).
    stop_words = _english_stopwords()
    no_punc = review.translate(_PUNCTUATION_TABLE)
    return [
        stemmer.stem(word)
        for word in no_punc.split()
        if word.lower() not in stop_words and word.isalpha()
    ]
    

### Clean each review using the custom analyzer. The cleaned text is a list of lists (one token list per review), which can later be used as the corpus for our own W2V model

In [158]:

# tqdm is for printing the status bar
from tqdm import tqdm
# Run the custom analyzer over every review text; the result is one token list per review.
preprocessed_reviews = [
    clean_review_text(raw_review) for raw_review in tqdm(data['text'].values)
]

100%|██████████| 10000/10000 [05:03<00:00, 32.98it/s]


In [197]:
preprocessed_reviews[0:3]
#len(preprocessed_reviews)

[['wife',
  'took',
  'birthday',
  'breakfast',
  'excel',
  'weather',
  'perfect',
  'made',
  'sit',
  'outsid',
  'overlook',
  'ground',
  'absolut',
  'pleasur',
  'waitress',
  'excel',
  'food',
  'arriv',
  'quick',
  'semibusi',
  'saturday',
  'morn',
  'look',
  'like',
  'place',
  'fill',
  'pretti',
  'quick',
  'earlier',
  'get',
  'better',
  'favor',
  'get',
  'bloodi',
  'mari',
  'phenomen',
  'simpli',
  'best',
  'ive',
  'ever',
  'im',
  'pretti',
  'sure',
  'use',
  'ingredi',
  'garden',
  'blend',
  'fresh',
  'order',
  'amaz',
  'everyth',
  'menu',
  'look',
  'excel',
  'white',
  'truffl',
  'scrambl',
  'egg',
  'veget',
  'skillet',
  'tasti',
  'delici',
  'came',
  'piec',
  'griddl',
  'bread',
  'amaz',
  'absolut',
  'made',
  'meal',
  'complet',
  'best',
  'toast',
  'ive',
  'ever',
  'anyway',
  'cant',
  'wait',
  'go',
  'back'],
 ['idea',
  'peopl',
  'give',
  'bad',
  'review',
  'place',
  'goe',
  'show',
  'pleas',
  'everyon',
  

### Train a Word2Vec model from scratch with gensim, using the list of lists we created from the review texts

In [216]:

from gensim.models import Word2Vec
from gensim.models import KeyedVectors
# min_count=2 keeps only words that occurred at least 2 times in the corpus.
# seed together with workers=1 makes training repeatable, so the neighbours
# printed below and every downstream score stay stable between runs
# (reproducibility across interpreter launches also needs PYTHONHASHSEED set).
# NOTE(review): `size` is the gensim 3.x name for the embedding dimension.
w2v_model = Word2Vec(preprocessed_reviews, min_count=2, size=300, seed=101, workers=1)
# Nearest neighbours of a clearly positive and a clearly negative stem.
print(w2v_model.wv.most_similar('great'))
print('='*50)
print(w2v_model.wv.most_similar('worst'))
# Raw embedding vector of a single word.
print(w2v_model.wv['worst'])

[('awesom', 0.9006556868553162), ('fantast', 0.8613003492355347), ('enjoy', 0.8420926928520203), ('wonder', 0.8332855701446533), ('excel', 0.8252487778663635), ('good', 0.8063887357711792), ('amaz', 0.7996500134468079), ('cheap', 0.7953023910522461), ('hotwir', 0.7921968698501587), ('casual', 0.7865387201309204)]
[('life', 0.9445294141769409), ('gotten', 0.9285500049591064), ('cleanest', 0.9175071120262146), ('bradley', 0.9172347784042358), ('jew', 0.9139502644538879), ('eaten', 0.9080661535263062), ('weve', 0.9065256714820862), ('part', 0.9053100943565369), ('tempephoenix', 0.9040247797966003), ('heard', 0.9039435386657715)]
[ 0.25224653 -0.02100236 -0.09549011 -0.25566593  0.03660142 -0.0782662
 -0.07191881  0.22030371 -0.10144444 -0.09333372 -0.38288698  0.06886412
 -0.01381888 -0.02306023  0.13424507  0.19485225 -0.32522297  0.37036988
 -0.06445486 -0.03232805 -0.04107588  0.16701028  0.1826102   0.10448657
  0.21379161  0.0945508  -0.12504855  0.07027788 -0.34442204 -0.21104077
  

In [218]:
print(w2v_model.wv.most_similar('wife'))

[('dilla', 0.9426113367080688), ('husband', 0.9396105408668518), ('marti', 0.9355227947235107), ('dad', 0.9271290302276611), ('benni', 0.9196608066558838), ('boyfriend', 0.9152238368988037), ('boop', 0.9145913124084473), ('carbonara', 0.9131380319595337), ('goround', 0.9117599725723267), ('campanell', 0.9076445698738098)]


### get all the w2vec words 

In [219]:
# Materialise the trained model's vocabulary as a plain list of words.
w2v_words=list(w2v_model.wv.vocab)
# Report how many distinct words the model kept.
print(len(w2v_words))


12122


### Create a MeanEmbeddingVectorizer which uses average W2V to convert sentences into vectors (the mean over all words in the sentence)

In [220]:
#Source: http://nadbordrozd.github.io/blog/2016/05/20/text-classification-with-word2vec/ 
from tqdm import tqdm
class MeanEmbeddingVectorizer(object):
    """Convert tokenised texts into fixed-length vectors by averaging word embeddings.

    Embeddings are looked up through the notebook-level ``w2v_model`` and
    ``w2v_words`` objects, not through the constructor argument.
    """

    def __init__(self, word2vec):
        # Stored as given; transform() does not read it.
        self.word2vec = word2vec
        # Length of the first item of ``word2vec``; transform() does not read it.
        # NOTE(review): when a list of token lists is passed in, this is the
        # token count of the first text, not the embedding size -- confirm intent.
        self.dim = len(word2vec[0])

    def fit(self, X, y=None):
        """Do nothing and return self; ``y`` is optional and ignored."""
        return self

    def transform(self, X):
        """Return one averaged embedding per text in ``X``.

        Parameters
        ----------
        X : iterable of iterables of str
            Tokenised texts.

        Returns
        -------
        list of numpy.ndarray
            One vector per text. A text with no in-vocabulary word yields an
            all-zero vector.
        """
        # Build the vocabulary set once: membership tests on the original
        # list were O(vocabulary size) for every token.
        vocab = set(w2v_words)
        vectors = w2v_model.wv
        # Take the width from the trained model instead of hard-coding 300.
        dim = w2v_model.vector_size

        sent_vectors = []  # the avg-w2v for each sentence/review is stored in this list
        for sent in tqdm(X):
            sent_vec = np.zeros(dim)
            cnt_words = 0  # number of words in this text that have an embedding
            for word in sent:
                if word in vocab:
                    sent_vec += vectors[word]
                    cnt_words += 1
            # Guard against division by zero for texts without known words.
            if cnt_words != 0:
                sent_vec /= cnt_words
            sent_vectors.append(sent_vec)
        return sent_vectors

### Let's restrict the input data to reviews with 5 stars or 1 star

In [221]:
# Keep only the polar reviews: those rated 1 star or 5 stars.
df = data[data['stars'].isin([1, 5])]
# Review text is the model input, the star rating is the class label.
X_W2V, y_W2V = df['text'], df['stars']


### Clean Input features/text using custom analyzer we created above

In [222]:
# Tokenise, de-stopword and stem each selected review with the custom analyzer.
clean_text = [clean_review_text(raw_review) for raw_review in tqdm(X_W2V)]

100%|██████████| 4086/4086 [02:02<00:00, 33.23it/s]


In [223]:
print(len(clean_text))
print(clean_text[0:2])


4086
[['wife', 'took', 'birthday', 'breakfast', 'excel', 'weather', 'perfect', 'made', 'sit', 'outsid', 'overlook', 'ground', 'absolut', 'pleasur', 'waitress', 'excel', 'food', 'arriv', 'quick', 'semibusi', 'saturday', 'morn', 'look', 'like', 'place', 'fill', 'pretti', 'quick', 'earlier', 'get', 'better', 'favor', 'get', 'bloodi', 'mari', 'phenomen', 'simpli', 'best', 'ive', 'ever', 'im', 'pretti', 'sure', 'use', 'ingredi', 'garden', 'blend', 'fresh', 'order', 'amaz', 'everyth', 'menu', 'look', 'excel', 'white', 'truffl', 'scrambl', 'egg', 'veget', 'skillet', 'tasti', 'delici', 'came', 'piec', 'griddl', 'bread', 'amaz', 'absolut', 'made', 'meal', 'complet', 'best', 'toast', 'ive', 'ever', 'anyway', 'cant', 'wait', 'go', 'back'], ['idea', 'peopl', 'give', 'bad', 'review', 'place', 'goe', 'show', 'pleas', 'everyon', 'probabl', 'gripe', 'someth', 'faultther', 'mani', 'peopl', 'like', 'case', 'friend', 'arriv', 'pm', 'past', 'sunday', 'pretti', 'crowd', 'thought', 'sunday', 'even', 'though

In [167]:
len(y_W2V)

4086

### Convert the cleaned input data into vectors using average W2V (the MeanEmbeddingVectorizer we created above)

In [224]:
# NOTE(review): the vectorizer is constructed with the tokenised reviews rather
# than with a word-embedding object; its transform() reads the embeddings from
# the notebook-level w2v_model, so the averaged vectors are still produced.
model = MeanEmbeddingVectorizer(clean_text)
# One averaged Word2Vec vector per cleaned review.
sent_vectors_input = model.transform(clean_text)

100%|██████████| 4086/4086 [00:19<00:00, 214.34it/s]


### Split the input data into training and test data

In [225]:
# Hold out 20% of the averaged review vectors for testing; the seed fixes the split.
(X_train_W2V, X_test_W2V,
 y_train_W2V, y_test_W2V) = train_test_split(
    sent_vectors_input, y_W2V, test_size=0.2, random_state=101)



### Convert the cleaned test data into vectors using average W2V (the MeanEmbeddingVectorizer we created above)

In [170]:
# #convert teest data into mean vectors which 
# sent_vectors_test=[[]]
# model=MeanEmbeddingVectorizer(X_test_W2V)
# sent_vectors_test=model.transform(X_test_W2V)

In [171]:
#X_train_W2V, X_test_W2V, y_train_W2V, y_test_W2V = train_test_split(clean_text,y_W2V, test_size=0.3, random_state=101)

In [172]:
# #convert the training data into mean vectors
# sent_vectors_train=[[]]
# model=MeanEmbeddingVectorizer(X_train_W2V)
# sent_vectors_train=model.transform(X_train_W2V)

In [173]:
# #convert teest data into mean vectors which 
# sent_vectors_test=[[]]
# model=MeanEmbeddingVectorizer(X_test_W2V)
# sent_vectors_test=model.transform(X_test_W2V)

### Let's find the optimal K value using the 10-fold cross-validation score

In [226]:
# Candidate neighbourhood sizes: the odd numbers from 1 to 25.
n_neigh = list(range(1, 26, 2))

# For each candidate k, run 10-fold cross-validation on the training split
# only and record the mean accuracy over the folds.
cv_scores = []
for k in n_neigh:
    model = KNeighborsClassifier(n_neighbors=k, weights='distance')
    fold_accuracies = cross_val_score(model, X_train_W2V, y_train_W2V, cv=10, scoring='accuracy')
    cv_scores.append(fold_accuracies.mean())

# Misclassification error (1 - mean accuracy) per k; report the k with the smallest error.
MSE = [1 - mean_accuracy for mean_accuracy in cv_scores]
lowest_error = min(MSE)
print("Errors  are:", MSE)
print("Least Error is :", lowest_error)
print("k value to use is: ", n_neigh[MSE.index(lowest_error)])

Errors  are: [0.1900208251252321, 0.1606517701356449, 0.15116508133055662, 0.1536087503048723, 0.1508620851391156, 0.14780304309487635, 0.14872328849364924, 0.14872328849364946, 0.1496444719611265, 0.15025796889364185, 0.14719705071199418, 0.145972871053076, 0.149338661563573]
Least Error is : 0.145972871053076
k value to use is:  23


In [227]:

# Take k from the cross-validation cell instead of hard-coding it, so this
# model keeps matching the selection whenever the embeddings or the split change.
best_k = n_neigh[MSE.index(min(MSE))]
model_KNN = KNeighborsClassifier(n_neighbors=best_k, weights='distance')
model_KNN.fit(X_train_W2V, y_train_W2V)
predictions_KN = model_KNN.predict(X_test_W2V)

#Analyze Results
print(confusion_matrix(y_test_W2V,predictions_KN))
print(classification_report(y_test_W2V,predictions_KN))
print(accuracy_score(y_test_W2V,predictions_KN))

[[ 58  92]
 [ 11 657]]
              precision    recall  f1-score   support

           1       0.84      0.39      0.53       150
           5       0.88      0.98      0.93       668

    accuracy                           0.87       818
   macro avg       0.86      0.69      0.73       818
weighted avg       0.87      0.87      0.85       818

0.8740831295843521


### Let's use a random forest (RF) classifier to compare results with KNN

In [228]:

from sklearn.ensemble import RandomForestClassifier

# Split the averaged review vectors 80/20 with a fixed seed.
X_train_RF_W2V, X_test_RF_W2V, y_train_RF_W2V, y_test_RF_W2V = train_test_split(sent_vectors_input,y_W2V, test_size=0.2, random_state=101)

# Seed the forest as well: an unseeded RandomForestClassifier draws different
# bootstrap samples and feature subsets on every run, so the reported metrics
# would not be repeatable.
model1 = RandomForestClassifier(random_state=101)
model1.fit(X_train_RF_W2V, y_train_RF_W2V)
predictions_RF = model1.predict(X_test_RF_W2V)

# Evaluate on the held-out reviews.
print(confusion_matrix(y_test_RF_W2V,predictions_RF))
print(classification_report(y_test_RF_W2V,predictions_RF))
print(accuracy_score(y_test_RF_W2V,predictions_RF))

[[ 73  77]
 [ 24 644]]
              precision    recall  f1-score   support

           1       0.75      0.49      0.59       150
           5       0.89      0.96      0.93       668

    accuracy                           0.88       818
   macro avg       0.82      0.73      0.76       818
weighted avg       0.87      0.88      0.87       818

0.8765281173594132


## Lets use TF-IDF W2V  

### First compute the TF-IDF W2V for each word from the cleaned reviews with 5 stars or 1 star

In [None]:
# convert list of list into list of strings

In [229]:
# Re-join every token list into one space-separated string per review.
reviews_string = [' '.join(tokens) for tokens in preprocessed_reviews]

In [204]:
reviews_string[0:2]

['wife took birthday breakfast excel weather perfect made sit outsid overlook ground absolut pleasur waitress excel food arriv quick semibusi saturday morn look like place fill pretti quick earlier get better favor get bloodi mari phenomen simpli best ive ever im pretti sure use ingredi garden blend fresh order amaz everyth menu look excel white truffl scrambl egg veget skillet tasti delici came piec griddl bread amaz absolut made meal complet best toast ive ever anyway cant wait go back',
 'idea peopl give bad review place goe show pleas everyon probabl gripe someth faultther mani peopl like case friend arriv pm past sunday pretti crowd thought sunday even thought would wait forev get seat said well seat girl come back seat someon els seat waiter came got drink order everyon pleasant host seat us waiter server price good well place order decid want share bake spaghetti calzon small here beef pizza tri calzon huge got smallest one person got small pizza awesom friend like pizza better 

##### Get the TF-IDF values for each word 

In [230]:
# TF-IDF vectorizer with default settings.
tf_idf_vectroizer =TfidfVectorizer()
# Fit on the cleaned review strings; the returned matrix is only displayed,
# later cells read the fitted vectorizer's vocabulary and idf_ values.
tf_idf_vectroizer.fit_transform(reviews_string)


<10000x25512 sparse matrix of type '<class 'numpy.float64'>'
	with 546478 stored elements in Compressed Sparse Row format>

In [178]:
### Create a dictionary which will contain each feature/word as key and corresponding TF-IDF as value

In [231]:
# Map every TF-IDF feature (word) to its inverse document frequency.
idf_by_position = tf_idf_vectroizer.idf_
dictionary = dict(zip(tf_idf_vectroizer.get_feature_names(), idf_by_position))

In [232]:
tf_idf_vectroizer.idf_

array([8.4186809 , 9.11182808, 9.51729319, ..., 9.51729319, 9.51729319,
       9.51729319])

In [233]:
print(max(tf_idf_vectroizer.idf_))
print(min(tf_idf_vectroizer.idf_))

9.517293186416572
1.7792408887272548


In [234]:
# TF-IDF weighted Word2Vec
from collections import Counter

tfidf_feat = tf_idf_vectroizer.get_feature_names() # tfidf words/col-names

# Set-based membership: scanning the two vocabulary lists for every token
# made this loop needlessly slow.
w2v_vocab = set(w2v_words)
tfidf_vocab = set(tfidf_feat)

tfidf_sent_vectors = [] # the tfidf-w2v for each sentence/review is stored in this list
for sent in tqdm(clean_text): # for each review/sentence
    # Take the width from the trained model instead of hard-coding 300.
    sent_vec = np.zeros(w2v_model.vector_size)
    weight_sum = 0 # sum of the tf-idf weights applied in this review
    # Count each token once up front instead of calling sent.count(word)
    # inside the loop, which was quadratic in the review length.
    term_counts = Counter(sent)
    sent_len = len(sent)
    # NOTE(review): the loop visits every occurrence of a word and each visit is
    # already scaled by its term frequency, so repeated words are weighted more
    # than once -- confirm this is intended.
    for word in sent: # for each word in a review/sentence
        if word in w2v_vocab and word in tfidf_vocab:
            vec = w2v_model.wv[word]
            # dictionary[word] = idf value of the word over the whole corpus
            # term_counts[word] / sent_len = tf value of the word in this review
            tf_idf = dictionary[word] * (term_counts[word] / sent_len)
            sent_vec += (vec * tf_idf)
            weight_sum += tf_idf
    # Normalise by the total weight; reviews without usable words stay all-zero.
    if weight_sum != 0:
        sent_vec /= weight_sum
    tfidf_sent_vectors.append(sent_vec)

100%|██████████| 4086/4086 [02:45<00:00, 24.64it/s]


In [235]:
tfidf_sent_vectors[0:2]

[array([ 1.17523705e-01,  5.93179261e-02,  6.01419858e-02, -2.71541181e-01,
        -2.51580368e-01, -2.57317663e-01, -1.30837711e-01, -5.94815061e-02,
        -1.16868554e-01, -5.90304903e-02, -3.26315937e-01,  1.71290373e-01,
        -1.71241059e-01,  1.12406905e-01, -7.17380385e-02,  6.64943500e-02,
        -1.31141760e-01,  2.08006331e-01, -1.51114679e-01, -6.19295455e-02,
         1.22057865e-01, -1.44803749e-01, -1.94574314e-01,  1.14193521e-01,
         1.48872522e-01,  1.92155831e-01, -1.44985800e-01, -1.30655096e-01,
        -4.35410594e-01, -2.98753940e-01,  1.71577481e-01, -1.24333487e-01,
         1.77292220e-01,  4.30290326e-04,  4.75290782e-02, -6.17484342e-02,
        -9.68141151e-02, -4.00205664e-03, -1.85403260e-01,  6.75381966e-02,
        -4.05337015e-03, -2.44661675e-03,  2.48271722e-02,  3.87210883e-01,
         5.66518724e-02,  2.96917530e-01, -4.32251723e-01, -1.93252636e-02,
        -1.83223170e-01, -3.05307707e-01, -1.26852014e-01, -6.10682554e-02,
        -1.9

In [211]:
#len(tfidf_sent_vectors)
len(y_W2V)

4086

In [236]:
# Hold out 30% of the TF-IDF weighted review vectors for testing; the seed fixes the split.
(X_train_TW2V, X_test_TW2V,
 y_train_TW2V, y_test_TW2V) = train_test_split(
    tfidf_sent_vectors, y_W2V, test_size=0.3, random_state=101)

In [237]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest: an unseeded RandomForestClassifier gives different trees,
# and therefore different metrics, on every run.
model2 = RandomForestClassifier(random_state=101)
model2.fit(X_train_TW2V, y_train_TW2V)
predictions_TRF = model2.predict(X_test_TW2V)

# Evaluate on the held-out TF-IDF weighted vectors.
print(confusion_matrix(y_test_TW2V,predictions_TRF))
print(classification_report(y_test_TW2V,predictions_TRF))
print(accuracy_score(y_test_TW2V,predictions_TRF))

[[103 125]
 [ 33 965]]
              precision    recall  f1-score   support

           1       0.76      0.45      0.57       228
           5       0.89      0.97      0.92       998

    accuracy                           0.87      1226
   macro avg       0.82      0.71      0.75      1226
weighted avg       0.86      0.87      0.86      1226

0.8711256117455138


In [238]:
# 70/30 split of the TF-IDF weighted review vectors with a fixed seed.
(X_train_TW2V, X_test_TW2V,
 y_train_TW2V, y_test_TW2V) = train_test_split(
    tfidf_sent_vectors, y_W2V, test_size=0.3, random_state=101)

# Distance-weighted KNN with k=23; fit() returns the estimator, so fitting and
# predicting can be chained.
model_KNN = KNeighborsClassifier(n_neighbors=23, weights='distance')
predictions_KN = model_KNN.fit(X_train_TW2V, y_train_TW2V).predict(X_test_TW2V)

#Analyze Results
for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(y_test_TW2V, predictions_KN))

[[ 71 157]
 [ 15 983]]
              precision    recall  f1-score   support

           1       0.83      0.31      0.45       228
           5       0.86      0.98      0.92       998

    accuracy                           0.86      1226
   macro avg       0.84      0.65      0.69      1226
weighted avg       0.86      0.86      0.83      1226

0.8597063621533442


### Let's test using Naive Bayes. As the data is imbalanced, let's use fit_prior=False

In [None]:
## X_W2V has reviews text with stars=1 or 5 , y_W2V has class labels 1 or 5

In [300]:
# Split the raw review texts (not the embeddings) 70/30 for the Naive Bayes pipeline.
(X_train_NB, X_test_NB,
 y_train_NB, y_test_NB) = train_test_split(
    X_W2V, y_W2V, test_size=0.3, random_state=101)

In [310]:
# Bag-of-words counts feeding a multinomial Naive Bayes classifier.
# Variants tried but left disabled: analyzer=clean_review_text on the
# vectorizer, a TfidfTransformer step, and alpha=1000 / fit_prior=False on
# the classifier.
# NOTE(review): class_prior is given explicitly as [.8, .2]; presumably it is
# matched positionally to the class labels 1 and 5 -- confirm the order is intended.
nb_steps = [
    ('countvec', CountVectorizer()),
    ('algorithm', MultinomialNB(alpha=1, class_prior=[.8, .2])),
]
pipeline_NB = Pipeline(nb_steps)

# fit() returns the pipeline itself, so training and prediction can be chained.
predictions_NB = pipeline_NB.fit(X_train_NB, y_train_NB).predict(X_test_NB)

#Analyze Results
for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(y_test_NB, predictions_NB))

[[183  45]
 [ 38 960]]
              precision    recall  f1-score   support

           1       0.83      0.80      0.82       228
           5       0.96      0.96      0.96       998

    accuracy                           0.93      1226
   macro avg       0.89      0.88      0.89      1226
weighted avg       0.93      0.93      0.93      1226

0.932300163132137


### Observation: the model works better if we do not use the TF-IDF transformer. fit_prior=False improves performance a little, as the data is imbalanced. We also see that increasing the alpha value reduces model performance.