In [2]:
# importing the important libraries 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# Load the restaurant reviews from the tab-separated file.
# quoting=3 is csv.QUOTE_NONE: quote characters inside a review are kept as
# ordinary text instead of being interpreted as field delimiters.
data = pd.read_csv('Restaurant_Reviews.tsv', delimiter='\t', quoting=3)

In [4]:
# Preview the first five rows.
# Liked = 1 indicates a positive review whereas Liked = 0 indicates a negative review.
data.head(n=5)

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [5]:
# Preview the last five rows as well.
data.tail(n=5)

Unnamed: 0,Review,Liked
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0
999,"Then, as if I hadn't wasted enough of my life ...",0


In [6]:
# Class balance check: how many reviews are negative (0) and positive (1).
data.Liked.value_counts()

0    500
1    500
Name: Liked, dtype: int64

# Cleaning the text data 

In [35]:
# stopwords.words('english')

In [44]:
# Text preprocessing (data cleaning is a very important step):
#   1. replace every character that is not a-z / A-Z with a space,
#   2. lower-case and split into words,
#   3. drop English stop words,
#   4. lemmatize the remaining words and join them back into one string.
# The cleaned reviews are collected, in row order, in `corpus`.
import re
import time

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

start = time.time()

lemmatizer = WordNetLemmatizer()

# Build the stop-word collection ONCE. Calling stopwords.words('english')
# inside the comprehension re-reads the list for every single word, and a
# list membership test is linear; a set makes each lookup constant time.
# NOTE(review): the NLTK English list contains negations such as "not", so
# "not good" becomes "good" - worth confirming this is wanted for sentiment.
english_stopwords = set(stopwords.words('english'))

corpus = []
# Iterate over the column values directly instead of data['Review'][i], which
# is a label lookup and only works while the index is the default 0..n-1.
for review in data['Review']:
    words = re.sub('[^a-zA-Z]', ' ', review).lower().split()
    kept = [lemmatizer.lemmatize(word) for word in words if word not in english_stopwords]
    corpus.append(" ".join(kept))

end = time.time()

# Show a small sample only; printing all cleaned reviews floods the notebook.
print(corpus[:5])

print("The time of execution of above program is :", end-start)

The time of execution of above program is : 1.9742746353149414


In [48]:
# Spot-check one cleaned review to see what the preprocessing produced.
corpus[100]

'server fantastic found wife love roasted garlic bone marrow added extra meal another marrow go'

In [63]:
# TF-IDF features built from unigrams, bigrams and trigrams of the cleaned
# reviews, capped at 1500 terms; the sparse result is densified into X.
# NOTE(review): the vectorizer is fitted on the whole corpus before the
# train/test split below, so the vocabulary and IDF weights also see the test
# reviews - consider fitting on the training text only.
from sklearn.feature_extraction.text import TfidfVectorizer

Tfidf_v = TfidfVectorizer(ngram_range=(1, 3), max_features=1500)
tfidf_sparse = Tfidf_v.fit_transform(corpus)
X = tfidf_sparse.toarray()
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [70]:
# (number of reviews, number of TF-IDF features)
X.shape

(1000, 1500)

In [71]:
# Target vector: the 0/1 'Liked' label of every review.
y = data.loc[:, 'Liked']
y.shape

(1000,)

In [72]:
# Hold out 20% of the reviews for testing; the fixed random_state makes the
# shuffle (and therefore every score below) repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [73]:
# Shapes of the four splits, in the order X_train, X_test, y_train, y_test.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((800, 1500), (200, 1500), (800,), (200,))

In [74]:
# checking the feature names
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() whenever the installed version has it and
# fall back to the old method otherwise. list() keeps the display a plain list.
if hasattr(Tfidf_v, 'get_feature_names_out'):
    feature_names = list(Tfidf_v.get_feature_names_out())
else:
    feature_names = Tfidf_v.get_feature_names()
feature_names[:50]

['absolutely',
 'absolutely amazing',
 'acknowledged',
 'actually',
 'added',
 'ago',
 'almost',
 'also',
 'also taste',
 'although',
 'always',
 'always great',
 'always hit',
 'amazing',
 'ambiance',
 'ambience',
 'amount',
 'another',
 'another minute',
 'another said',
 'anyone',
 'anything',
 'anytime',
 'anytime soon',
 'anyway',
 'appetizer',
 'area',
 'around',
 'around like',
 'arrived',
 'ask',
 'asked',
 'assure',
 'ate',
 'atmosphere',
 'atmosphere fun',
 'attack',
 'attentive',
 'attitude',
 'authentic',
 'authentic thai',
 'average',
 'average best',
 'avoid',
 'avoid place',
 'away',
 'awesome',
 'awesome service',
 'awful',
 'baby']

In [75]:
# Inspect the vectorizer's full configuration, including the settings that
# were left at their defaults.
Tfidf_v.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.float64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': 1500,
 'min_df': 1,
 'ngram_range': (1, 3),
 'norm': 'l2',
 'preprocessor': None,
 'smooth_idf': True,
 'stop_words': None,
 'strip_accents': None,
 'sublinear_tf': False,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'use_idf': True,
 'vocabulary': None}

In [76]:
# Training matrix as a labelled DataFrame: one column per TF-IDF term.
# (Despite the name, count_df holds TF-IDF weights, not raw counts.)
# get_feature_names() was removed in scikit-learn 1.2, so use
# get_feature_names_out() when available and fall back on older versions.
if hasattr(Tfidf_v, 'get_feature_names_out'):
    tfidf_columns = list(Tfidf_v.get_feature_names_out())
else:
    tfidf_columns = Tfidf_v.get_feature_names()
count_df = pd.DataFrame(X_train, columns = tfidf_columns)
count_df.tail(20)

Unnamed: 0,absolutely,absolutely amazing,acknowledged,actually,added,ago,almost,also,also taste,although,...,wow,wrap,wrong,year,year ago,yet,yum,yummy,zero,zero star
780,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
781,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
782,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
783,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
784,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
785,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
786,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
787,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
788,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
789,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.558809,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [77]:
# plotting confusion matrix
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a one-line status message and draw `cm` as an annotated heat map.

    See full source and example:
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

    Parameters
    ----------
    cm : 2-D array-like
        Confusion matrix; rows are drawn on the 'True label' axis and columns
        on the 'Predicted label' axis.
    classes : sequence
        Tick labels, one per class, used on both axes.
    normalize : bool, default False
        When True every row is divided by its row sum before drawing, and the
        cell annotations are shown with two decimals.
    title : str
        Title placed above the plot.
    cmap : matplotlib colormap
        Colormap used for the heat map.

    Returns
    -------
    None. The plot is drawn on the current pyplot figure.
    """
    # Accept nested lists as well as arrays.
    cm = np.asarray(cm)

    # Normalize BEFORE drawing so the colours, the colour bar and the text
    # annotations all describe the same values.
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cells darker than the mid-point get white text so they stay readable.
    thresh = cm.max() / 2.
    # np.ndindex walks every (row, column) pair in row-major order; it replaces
    # itertools.product, which was used here without itertools being imported.
    for i, j in np.ndindex(cm.shape[0], cm.shape[1]):
        cell_text = format(cm[i, j], '.2f') if normalize else cm[i, j]
        plt.text(j, i, cell_text,
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

# Gaussian Naive Bayes Algorithm - a simple baseline for text data (Multinomial Naive Bayes is usually the better fit for TF-IDF / word-count features)

In [78]:
from sklearn.naive_bayes import GaussianNB

In [79]:
# Gaussian Naive Bayes classifier, created with its default settings.
classifier = GaussianNB()

In [80]:
# Train on the training split only; the test split stays unseen by the model.
classifier.fit(X_train , y_train)

GaussianNB()

In [82]:
# Predicted 0/1 label for every review in the held-out test split.
y_pred = classifier.predict(X_test)
y_pred

array([1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0,
       1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1,
       0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1,
       1, 1], dtype=int64)

In [83]:
from sklearn.metrics import accuracy_score , confusion_matrix , classification_report 

In [85]:
accuracy_score(y_test , y_pred)
# Called like this, accuracy_score returns the FRACTION of correct predictions.
# 0.69 of the 200 test reviews is 138 correct; that raw count is what
# accuracy_score(..., normalize=False) would return instead.

138.0

In [86]:
# Rows are the true labels (0, 1) and columns the predicted labels (0, 1).
confusion_matrix(y_test , y_pred)

array([[55, 42],
       [20, 83]], dtype=int64)

In [88]:
# Per-class precision, recall and F1 plus the overall accuracy; print() is
# needed because the report is a pre-formatted multi-line string.
print(classification_report(y_test , y_pred))

              precision    recall  f1-score   support

           0       0.73      0.57      0.64        97
           1       0.66      0.81      0.73       103

    accuracy                           0.69       200
   macro avg       0.70      0.69      0.68       200
weighted avg       0.70      0.69      0.69       200

