In [28]:
# Importing the libraries
import numpy as np
import pandas as pd

In [29]:
# Load the restaurant reviews. The file is tab-separated, hence sep='\t'.
# quoting=3 is csv.QUOTE_NONE: quote characters inside a review are kept as ordinary text.
dataset = pd.read_csv('./Restaurant_Reviews.tsv', sep='\t', quoting=3)
# The frame holds 1000 restaurant reviews in two columns: Review and Liked (0 or 1).
dataset

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
5,Now I am getting angry and I want my damn pho.,0
6,Honeslty it didn't taste THAT fresh.),0
7,The potatoes were like rubber and you could te...,0
8,The fries were great too.,1
9,A great touch.,1


In [30]:
#re is the regular expression library
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# corpus will collect the cleaned text of every review, one string per review
corpus = []

In [31]:
# Look at the raw text of the first review (row label 0, column 'Review')
dataset.loc[0, 'Review']

'Wow... Loved this place.'

In [32]:
# Regular-expression cleanup.
# The pattern [^a-zA-Z] matches every character that is not one of the 26 English
# letters (upper or lower case). Each match is replaced by the second argument,
# a single space -- the characters are substituted, not simply deleted.

# In this review the three dots after 'Wow' and the final '.' each become a space.
example = re.sub(pattern='[^a-zA-Z]', repl=' ', string=dataset['Review'][0])
example

'Wow    Loved this place '

In [33]:
# Normalize case so that e.g. 'Loved' and 'loved' end up as the same token
example = example.lower()
example

'wow    loved this place '

In [34]:
# Tokenize with the built-in str.split() method. Called without arguments it splits
# on any run of whitespace, so the repeated spaces left by the regex step
# do not produce empty tokens.
example = example.split()
example

['wow', 'loved', 'this', 'place']

In [35]:
# Stemming demo: print the stem of each word variant, one per line
ps = PorterStemmer()
example_words = ["python", "pythoner", "pythoning", "pythoned"]
print('\n'.join(map(ps.stem, example_words)))


python
python
python
python


In [36]:
# Stemming demo on a whole sentence, one stem printed per line:
#   running  -> run
#   tracking -> track
#   days     -> day

new_text = "I am running and tracking my speed after days"
words = new_text.split()
print('\n'.join(ps.stem(token) for token in words))

I
am
run
and
track
my
speed
after
day


In [37]:
# Removing stopwords: keep only the tokens that are absent from NLTK's English stopword list

stop_words = stopwords.words('english')
print([token for token in example if token not in stop_words])

['wow', 'loved', 'place']


In [48]:
# Apply the full pre-processing pipeline to every review in the dataframe.
# corpus is rebuilt from scratch here, so re-running this cell cannot append a
# second copy of the reviews (which would break the row match between X and y).
# NOTE(review): the stopword list presumably contains negations such as 'not';
# dropping them may hurt sentiment classification -- confirm and consider keeping them.
ps = PorterStemmer()               # created once instead of once per review
stop_word_set = set(stop_words)    # set lookup instead of scanning the list for every word

corpus = []
# Iterate over the column itself rather than a hard-coded range(0, 1000),
# so the loop follows the actual number of rows in the dataframe.
for raw_review in dataset['Review']:
    # keep letters only (anything else becomes a space), lowercase, then tokenize
    tokens = re.sub('[^a-zA-Z]', ' ', raw_review).lower().split()
    # drop stopwords, stem what is left, and join the stems back into one string
    corpus.append(' '.join(ps.stem(word) for word in tokens if word not in stop_word_set))

In [39]:
# Demonstration: turning text into a sparse matrix of token counts with CountVectorizer

from sklearn.feature_extraction.text import CountVectorizer

# a corpus consisting of a single document
text = ["The quick brown fox jumped over the lazy dog."]

# create the vectorizer and learn the vocabulary (fit returns the fitted vectorizer)
vectorizer = CountVectorizer().fit(text)

# mapping of each token to its column index
print(vectorizer.vocabulary_)

# encode the document as token counts; the result is a sparse matrix,
# shown densely here with toarray()
vector = vectorizer.transform(text)
print(type(vector), vector.toarray(), sep='\n')

# Column indices follow alphabetical order of the tokens: 'brown' is column 0, 'dog' column 1, and so on.
# 'the' occurs twice in the sentence (tokens are lowercased), which is why its count is 2.

{'the': 7, 'quick': 6, 'brown': 0, 'fox': 2, 'jumped': 3, 'over': 5, 'lazy': 4, 'dog': 1}
<class 'scipy.sparse.csr.csr_matrix'>
[[1 1 1 1 1 1 1 2]]


In [40]:
# Encode another sentence with the already-fitted CountVectorizer: transform() does not refit,
# so the columns are the same eight vocabulary entries as in the previous example

text2 = ["the dog"]
vector = vectorizer.transform(text2)
print(vector.toarray())

[[0 1 0 0 0 0 0 1]]


In [41]:
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer

# Cap the vocabulary at 1500 features.
# "max_features" is a parameter worth experimenting with to get better results.
cv = CountVectorizer(max_features = 1500)

# X is the feature matrix (independent variables): one row per cleaned review
# in corpus, one column per vocabulary term.
X = cv.fit_transform(corpus).toarray()

# y is the target (dependent variable): the Liked label, 1 for a positive review
# and 0 for a negative one. The column is selected by name rather than by position
# so the right column is used even if the column order of the file changes.
y = dataset['Liked'].values

In [42]:
# Dimensions of the feature matrix: (rows, columns); the column count is capped by max_features
X.shape

(1000, 1500)

In [43]:
y.shape
# X and y must have the same number of rows before train_test_split can be applied to them.

(1000,)

In [44]:
# Hold out part of the data for evaluation
import sklearn
from sklearn.model_selection import train_test_split

# 30% of the rows go to the test set ("test_size" is worth experimenting with
# to get better results); random_state pins the shuffle so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [45]:
# Multinomial Naive Bayes

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score

# Train on the training split (fit returns the fitted estimator), then label the test split
classifier = MultinomialNB(alpha=0.1).fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Confusion matrix of true labels against predicted labels
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

# Accuracy is reported as a percentage; precision and recall as fractions
score1 = accuracy_score(y_test, y_pred)
score2 = precision_score(y_test, y_pred)
score3 = recall_score(y_test, y_pred)
print("\n")
print("Accuracy is ", round(score1 * 100, 2), "%")
print("Precision is ", round(score2, 2))
print("Recall is ", round(score3, 2))

Confusion Matrix:
 [[119  33]
 [ 34 114]]


Accuracy is  77.67 %
Precision is  0.78
Recall is  0.77


In [46]:
# Bernoulli Naive Bayes

from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score

# Train on the training split (fit returns the fitted estimator), then label the test split
classifier = BernoulliNB(alpha=0.8).fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Confusion matrix of true labels against predicted labels
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

# Accuracy is reported as a percentage; precision and recall as fractions
score1 = accuracy_score(y_test, y_pred)
score2 = precision_score(y_test, y_pred)
score3 = recall_score(y_test, y_pred)
print("\n")
print("Accuracy is ", round(score1 * 100, 2), "%")
print("Precision is ", round(score2, 2))
print("Recall is ", round(score3, 2))


Confusion Matrix:
 [[115  37]
 [ 32 116]]


Accuracy is  77.0 %
Precision is  0.76
Recall is  0.78


In [47]:
# Logistic Regression

from sklearn import linear_model
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score

# Train on the training split (fit returns the fitted estimator), then label the test split
classifier = linear_model.LogisticRegression(C=1.5).fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Confusion matrix of true labels against predicted labels
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

# Accuracy is reported as a percentage; precision and recall as fractions
score1 = accuracy_score(y_test, y_pred)
score2 = precision_score(y_test, y_pred)
score3 = recall_score(y_test, y_pred)
print("\n")
print("Accuracy is ", round(score1 * 100, 2), "%")
print("Precision is ", round(score2, 2))
print("Recall is ", round(score3, 2))



Confusion Matrix:
 [[125  27]
 [ 43 105]]


Accuracy is  76.67 %
Precision is  0.8
Recall is  0.71
