In [1]:
#import libraries
import pandas as pd

In [2]:
#load saved file as pandas dataframe
df = pd.read_csv('yelp_dataset\\review_classification.csv', encoding = "utf-8", usecols = ['stars','text'])

In [3]:
#Data details analysis
df.head(5)

Unnamed: 0,stars,text
0,2.0,Very busy and noisy restaurant.\r\nAsparagas w...
1,5.0,On yelp 5 stars = Woohoo! as good as it gets! ...
2,5.0,A great culinary experience from start to fini...
3,5.0,Had the steak salad. It was amazing. Also had ...
4,5.0,Had Dinner at Delmonico on 11-5-15. Still the...


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 709155 entries, 0 to 709154
Data columns (total 2 columns):
stars    709155 non-null float64
text     709155 non-null object
dtypes: float64(1), object(1)
memory usage: 10.8+ MB


In [5]:
df.describe

<bound method NDFrame.describe of         stars                                               text
0         2.0  Very busy and noisy restaurant.\r\nAsparagas w...
1         5.0  On yelp 5 stars = Woohoo! as good as it gets! ...
2         5.0  A great culinary experience from start to fini...
3         5.0  Had the steak salad. It was amazing. Also had ...
4         5.0  Had Dinner at Delmonico on 11-5-15.  Still the...
5         3.0  Disappointed. Steak overdone while waiter have...
6         2.0  Rib eye steak for $51 should be pretty darn go...
7         5.0  Amazing restaurant. Lets start off by saying r...
8         4.0  4 Star for the whole experience and 5 stars fo...
9         5.0  The meal started with Ryan greeting us and tak...
10        5.0  This place is first class in every way. Lobste...
11        5.0  This place is amazing!!!! Came here for an ann...
12        1.0  Went there for my anniversary.  Knowing that t...
13        3.0  5 stars for our waitress. Her service was

In [6]:
# For this project and opinion mining, we have set stars that higher than 4 to be positive, else negative
df['target'] = df['stars'] > 4
target_value = df['target'].values
#print the target values
target_value

array([False,  True,  True, ...,  True,  True,  True])

In [7]:
# understand data pattern using mean and std
target_value.mean(), target_value.std(), target_value.shape

(0.47785321967693944, 0.4995092793145308, (709155,))

In [8]:
# Get the reviews of dataframs
reviews = df['text'].values
reviews[0]

'Very busy and noisy restaurant.\r\nAsparagas was cooked perfectly, however quite flavorless. The mashed potatoes were tasty.  \r\nFor the price, the spinach should have been fresh and the cream sauce needs improvement. \r\nMy organic filet was good and nicely cooked to medium rare, however not near as tasty as other organic beef I have had for half the price.\r\nThe New Orleans gumbo was a tad too salty.  The yorkshire style buns were average and were cold.  \r\nThe key lime pie was average.  The tartness was lacking.  The apple pie was a disappointment, with a doughy flavoured crust.\r\nAnother thing that  high end restaurants need to learn is how to choose great coffees like good wines.  I asked where the beans were from and they had no idea.  I would expect excellence in all areas of my food consumption and yes, even with my coffee. The espresso was extremely poor.\r\nThis is a restaurant that is way overpriced for average to good tasting food.  Nothing was outstanding or excellent

In [9]:
# We have split the data into 80-20 split
from sklearn.cross_validation import train_test_split



In [10]:
x_train, x_test, y_train, y_test = train_test_split(reviews, target_value, test_size = 0.2, random_state = 66)

In [11]:
# We use TFIDF for feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer
vec = TfidfVectorizer(stop_words = 'english', max_features = 350)

In [12]:
# Model Training and fitiing
x_train_ = vec.fit_transform(x_train).toarray()
x_train_.shape

(567324, 350)

In [13]:
# bag of words of tfidf
bow = vec.get_feature_names()
#fitting ntest data
x_test_ = vec.fit_transform(x_test).toarray()
x_test_.shape

(141831, 350)

In [14]:
#Similarity search in the corpus
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [15]:
rand_no = 66
search_rev = x_test[rand_no]
search_revs = [search_rev]
#print randon reviews
print(search_revs)

["We didn't eat here for fear of not finishing our food and having to get spanked lol but we did order the giant jello syringe shots which were good and strong! Wearing the hospital gown was fun and it was fun watching other people receive their punishment for not finishing their meals haha."]


In [16]:
#to array
vector_ = vec.transform(search_revs).toarray()

In [17]:
#check similarity scores
sim_scores = cosine_similarity(vector_, x_train_)

In [18]:
sim_scores

array([[0.15609274, 0.        , 0.        , ..., 0.03885151, 0.        ,
        0.01914538]])

In [19]:
# check for top 5 similar reviews
n = 5
#sort the array based on similarity scores and return the top 5
reviews = [x_train[i] for i in np.argsort(sim_scores[0])[::-1][:n]] 

In [20]:
# print the top 5 most similar reviews
#print review
print('Review:')
print(search_rev)

#print similar reviews
print('\nTop %s most similar reviews are displayed below:' % n)
for i, j in enumerate(reviews):
    print('Review %s:' % i)
    print('\n',j,'\n')

Review:
We didn't eat here for fear of not finishing our food and having to get spanked lol but we did order the giant jello syringe shots which were good and strong! Wearing the hospital gown was fun and it was fun watching other people receive their punishment for not finishing their meals haha.

Top 5 most similar reviews are displayed below:
Review 0:

 the garlic knots are off the hook, so fresh and light...perfect amount of garlic and butter...a must try 

Review 1:

 Food was good! But when i ordered my food ( oxtail soup ) which is great! I asked my server if i could have some garlic on the side with it! Guess what??? The cafe dont have garlic! Are you kidding me? A restaurant with NO GARLIC! Lol... sooo if you are alergic or dont like garlic, this is the place for you! 

Review 2:

 Yum a ding a ding dong!! We had the triple garlic, Carolina gold,  and curry! Soooo good!! 

Review 3:

 Last night I ordered a pizza with garlic as one of the toppings. I expected some minced garl

Classifying the reviews as positive or negative

In [21]:
# Approach 1 - baseline approach using Naive bayes
from sklearn.naive_bayes import GaussianNB

In [22]:
#fit the model
nb = GaussianNB()
nb.fit(x_train_, y_train)

GaussianNB(priors=None)

In [23]:
# Accuracy score for training set with guassian NB
nb.score(x_train_, y_train)

0.7530088626604903

In [24]:
# Accuracy score for test set using guassion NB
nb.score(x_test_, y_test)

0.6589109574070549

In [25]:
#To fine tune the parameters, we have used grid search CV
#import relevant libraries
#import metrices - classification report
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

In [26]:
#Impoprt logistic regression model
from sklearn.linear_model import LogisticRegression

In [27]:
# Parameter tuning values
param_grid = [{'penalty':['l1'], 'C':[0.1, 1, 10]},
              {'penalty':['l2'], 'C':[0.1, 1, 10]}]

accuracy = ['accuracy']

In [28]:
for i in accuracy:
    print("Fine tuning the hyper-parameters for %s" % i + "\n")
    #pass tuning parameters
    model = GridSearchCV(LogisticRegression(), param_grid, cv=5,scoring=i)
    #fit the model
    model.fit(x_train_[:500,:], y_train[:500])
    print("Best parameters found on training set:\n")
    print(model.best_params_)
    #print classification report
    print("\nClassification report:\n")
    actual, predicted = y_test, model.predict(x_test_)
    print(classification_report(actual, predicted))

Fine tuning the hyper-parameters for accuracy

Best parameters found on training set:

{'C': 1, 'penalty': 'l2'}

Classification report:

             precision    recall  f1-score   support

      False       0.63      0.83      0.72     73967
       True       0.72      0.47      0.57     67864

avg / total       0.68      0.66      0.65    141831



In [29]:
#fit logistic regression model with fine tuned parameters
lr = LogisticRegression(C =1 ,penalty = 'l2')
lr.fit(x_train_, y_train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [30]:
# Accuracy score for training set using logistic regression
lr.score(x_train_, y_train)

0.7990389971162862

In [31]:
# Accuracy score for test set using logistic regression
lr.score(x_test_, y_test)

0.7023006253921921

In [32]:
# Print the top 20 features or words that make positive prediction
n = 20
[bow[i] for i in np.argsort(lr.coef_[0])[::-1][:n]]

['amazing',
 'best',
 'thank',
 'awesome',
 'delicious',
 'perfect',
 'highly',
 'fantastic',
 'excellent',
 'favorite',
 'great',
 'wonderful',
 'love',
 'perfectly',
 'loved',
 'definitely',
 'vegas',
 'happy',
 'absolutely',
 'chef']

In [33]:
# Print the top 20 features or words that make negative prediction
[bow[i] for i in np.argsort(lr.coef_[0])[:n]]

['worst',
 'horrible',
 'rude',
 'ok',
 'okay',
 'slow',
 'dry',
 'decent',
 'wasn',
 'reason',
 'wouldn',
 'overall',
 'bad',
 'maybe',
 'money',
 'cold',
 'pretty',
 'didn',
 'used',
 'told']

In [34]:
#import random forest classifier
from sklearn.ensemble import RandomForestClassifier

In [35]:
# fine tune the hyper-parameters
param_grid = [{'n_estimators':[5, 10,15,20], 'min_samples_leaf':[1, 3, 5, 7]},
              {'n_estimators':[5, 10,15,20], 'min_samples_leaf':[1, 3, 5, 7]}]

#scoring metrics
accuracy = ['accuracy']

In [36]:
for i in accuracy:
    print("Fine tuning the hyper-parameters for %s" % i + "\n")
    model = GridSearchCV(RandomForestClassifier(), param_grid, cv=5, scoring=i)
    #fitting the model
    model.fit(x_train_[:500,:], y_train[:500])
    print("Best parameters found on training set:\n")
    print(model.best_params_)
    #print classification report
    print("\nClassification report:\n")
    actual, predicted = y_test, model.predict(x_test_)
    print(classification_report(actual, predicted))

Fine tuning the hyper-parameters for accuracy

Best parameters found on training set:

{'n_estimators': 15, 'min_samples_leaf': 5}

Classification report:

             precision    recall  f1-score   support

      False       0.63      0.79      0.70     73967
       True       0.68      0.49      0.57     67864

avg / total       0.65      0.64      0.64    141831



In [37]:
rf = RandomForestClassifier(max_depth = None,n_estimators = 15, min_samples_leaf = 1)
rf.fit(x_train_, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=15, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [38]:
# Accuracy score for training set using random forests
rf.score(x_train_, y_train)

0.9943559588524371

In [39]:
# Accuracy score for test set using random forests
rf.score(x_test_, y_test)

0.6867257510699354

In [40]:
# Print the top 20 most important features/words
n = 20
[bow[i] for i in np.argsort(rf.feature_importances_)[::-1][:n]]

['amazing',
 'great',
 'best',
 'delicious',
 'love',
 'awesome',
 'good',
 'ok',
 'vegas',
 'definitely',
 'didn',
 'food',
 'bad',
 'like',
 'excellent',
 'favorite',
 'place',
 'asked',
 'friendly',
 'service']

In [41]:
#import libraries for cross validation score
#import time to check execution time for different models
from sklearn.model_selection import cross_val_score
import time

In [42]:
start_time = time.time()
# naive bayes cross validation 
cv_ = cross_val_score(nb,x_train_,y_train,cv = 5, scoring="accuracy")
exec_time = time.time() - start_time
#print execution time
print("Execution Time for Navie Bayes -: ",exec_time," Accuracy: ",np.mean(cv_))

Execution Time for Navie Bayes -:  52.23256754875183  Accuracy:  0.7528519865428325


In [43]:
start_time = time.time()
# logistic regression execution time
cv_ = cross_val_score(lr,x_train_, y_train, cv = 5,scoring="accuracy")
exec_time = time.time() - start_time
#print execution time
print("Execution time for Logistic Regression -: ",exec_time," Accuracy: ",np.mean(cv_))

Execution time for Logistic Regression -:  48.92613506317139  Accuracy:  0.7985718919218331


In [44]:
start_time = time.time()
# random forest execution time
cv_ = cross_val_score(rf,x_train_, y_train, cv = 5, scoring="accuracy")
exe_time = time.time() - start_time
#print execution time
print("Execution time for Random Forest -: ",exec_time," Accuracy: ",np.mean(cv_))

Execution time for Random Forest -:  48.92613506317139  Accuracy:  0.778463449191958


From the different models we can see that the accuracy is best for Logistic regression. Random forests accuracy is less comparitively.