In [None]:
# Mount Google Drive at /content/drive (requires the Google Colab runtime).
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import pandas as pd

# Tab-separated review file stored on the mounted Drive.
reviews_path = '/content/drive/My Drive/Colab Notebooks/Restaurant_Reviews.tsv'
# quoting=3 is csv.QUOTE_NONE: quote characters in the reviews are read as plain text.
dataset = pd.read_csv(reviews_path, sep='\t', quoting=3)
dataset

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
...,...,...
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0


In [None]:
# Preview the first ten reviews.
# print() renders the frame as plain text; the trailing newline in the label keeps
# the column header aligned above the rows instead of being pushed right by the label.
print("Show first ten reviews: \n", dataset.iloc[0:10])
# OR: as the last expression of the cell, head() gets the notebook's rich table rendering
dataset.head(10)

Shoe first ten reviews:                                                Review  Liked
0                           Wow... Loved this place.      1
1                                 Crust is not good.      0
2          Not tasty and the texture was just nasty.      0
3  Stopped by during the late May bank holiday of...      1
4  The selection on the menu was great and so wer...      1
5     Now I am getting angry and I want my damn pho.      0
6              Honeslty it didn't taste THAT fresh.)      0
7  The potatoes were like rubber and you could te...      0
8                          The fries were great too.      1
9                                     A great touch.      1


Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
5,Now I am getting angry and I want my damn pho.,0
6,Honeslty it didn't taste THAT fresh.),0
7,The potatoes were like rubber and you could te...,0
8,The fries were great too.,1
9,A great touch.,1


In [None]:
# Dependencies for cleaning the review text
import re # regular expressions
import nltk
nltk.download('stopwords') # fetch the NLTK stop-word corpus used below
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer # Porter stemmer: reduces each word to its stem


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Clean every review into a space-joined string of stemmed, non-stop-word tokens.
corpus = []

# Loop-invariant helpers, built once up front instead of once per review / per word.
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))
# NOTE(review): the NLTK English stop-word list appears to include negations such as
# "not", so "Crust is not good." would lose its negation here - confirm this is the
# intended behaviour for sentiment classification.

# Iterate over the column itself so the loop follows the real number of rows and
# does not depend on the index labels being exactly 0..999.
for i, raw_review in enumerate(dataset['Review']):
  show_steps = i < 1 # trace the pipeline for the first review only
  if show_steps: print("Original review: ", raw_review)
  review = re.sub('[^a-zA-Z]',' ', raw_review) # replace every non-letter character with a space
  if show_steps: print("Step1: Keep letters from a-z, A-Z and space::", review)
  # normalise case so "Loved" and "loved" map to the same token
  review = review.lower()
  if show_steps: print("Step 2: Convert review to lower case: ", review)
  review = review.split()
  if show_steps: print("Step 3: Convert review to list of words: ", review)
  # drop stop words, then stem what is left
  review = [ps.stem(word) for word in review if word not in stop_words]
  if show_steps: print("Step 4: Remove stopwords and convert to rootwords: ", review)
  review = ' '.join(review) # back to a single string, as the vectorizer expects text
  if show_steps: print("Step 5: Join all the words: ", review)
  corpus.append(review) # one cleaned string per review, in row order

print("Show first ten reviews: \n", dataset.iloc[0:10])
print("Cleaned version: ", corpus[0:10])

Original review:  Wow... Loved this place.
Step1: Keep letters from a-z, A-Z and space:: Wow    Loved this place 
Step 2: Convert review to lower case:  wow    loved this place 
Step 3: Convert review to list of words:  ['wow', 'loved', 'this', 'place']
Step 4: Remove stopwords and convert to rootwords:  ['wow', 'love', 'place']
Step 5: Join all the words:  wow love place
Show first ten reviews: 
                                               Review  Liked
0                           Wow... Loved this place.      1
1                                 Crust is not good.      0
2          Not tasty and the texture was just nasty.      0
3  Stopped by during the late May bank holiday of...      1
4  The selection on the menu was great and so wer...      1
5     Now I am getting angry and I want my damn pho.      0
6              Honeslty it didn't taste THAT fresh.)      0
7  The potatoes were like rubber and you could te...      0
8                          The fries were great too.      1

In [None]:
# Bag of words: turn the cleaned corpus into a document-term count matrix.
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=1500) # vocabulary size cap; tune as needed
bow_sparse = cv.fit_transform(corpus)
X = bow_sparse.toarray() # dense matrix, one row per review
y = dataset.iloc[:, 1].values # labels: second column of the dataset
vocab = cv.get_feature_names_out()
print("BoW vocab: \n", vocab)
print("Vector Matrix \n", X[1, 0:300])
print("Labels: \n", y[0:10])

BoW vocab: 
 ['absolut' 'account' 'ach' ... 'yum' 'yummi' 'zero']
Vector Matrix 
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
 0 0 0 0]
Labels: 
 [1 0 0 1 1 0 0 0 1 1]


In [None]:
# Static 70/30 hold-out split: 30% of the rows are reserved for testing.
from sklearn.model_selection import train_test_split

split_parts = train_test_split(X, y, test_size=0.3, random_state=30)
X_train, X_test, y_train, y_test = split_parts

# Report the shape of each (features, labels) pair.
shape_report = (
  ("Shape of original data: ", X, y),
  ("Shape of Training data: ", X_train, y_train),
  ("Shape of testing data: ", X_test, y_test),
)
for label, features, targets in shape_report:
  print(label, features.shape, targets.shape)

Shape of original data:  (1000, 1500) (1000,)
Shape of Training data:  (700, 1500) (700,)
Shape of testing data:  (300, 1500) (300,)


In [None]:
# Single learner baseline: Gaussian naive Bayes
from sklearn.naive_bayes import GaussianNB

clf1 = GaussianNB()
clf1.fit(X_train, y_train)
y_pred = clf1.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix and test-set accuracy (in percent) for the naive Bayes model.
cm = confusion_matrix(y_test, y_pred)
print(cm)
nb_accuracy_pct = clf1.score(X_test, y_test) * 100
print("Accuracy: ", nb_accuracy_pct)

[[ 83  69]
 [ 19 129]]
Accuracy:  70.66666666666667


In [None]:
# Bagging-style ensemble: random forest (RF)
from sklearn.ensemble import RandomForestClassifier

clf2 = RandomForestClassifier(max_depth=2, random_state=0)
clf2.fit(X_train, y_train)
y_pred2 = clf2.predict(X_test)

# Confusion matrix and test-set accuracy (in percent) for the forest.
cm2 = confusion_matrix(y_test, y_pred2)
print(cm2)
rf_accuracy_pct = clf2.score(X_test, y_test) * 100
print("Accuracy: ", rf_accuracy_pct)

[[ 76  76]
 [ 19 129]]
Accuracy:  68.33333333333333


In [None]:
# Boosting model: gradient boosting (GB), with max_features tuned by cross-validation.
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score

# Candidate values 100, 200, ... up to the number of available features (inclusive).
# If there are fewer than 100 features, fall back to using all of them.
n_features = X_train.shape[1]
candidates = list(range(100, n_features + 1, 100)) or [n_features]

# Score every candidate on the TRAINING data only (5-fold CV) and remember the best one.
# Selecting on the test set would leak it into model selection and inflate the
# accuracy reported below.
best_max_features, best_cv_accuracy = None, -1.0
for i in candidates:
  candidate = GradientBoostingClassifier(n_estimators=50, learning_rate=0.9, max_features=i, max_depth=1, random_state=0)
  cv_accuracy = cross_val_score(candidate, X_train, y_train, cv=5).mean()
  if cv_accuracy > best_cv_accuracy: # strict '>' keeps the smallest value on ties
    best_max_features, best_cv_accuracy = i, cv_accuracy

print("Best max_features: ", best_max_features)
print("Mean CV accuracy: ", best_cv_accuracy*100)

# Refit once on the full training set with the selected setting, then evaluate on the
# held-out test set.
clf3 = GradientBoostingClassifier(n_estimators=50, learning_rate=0.9, max_features=best_max_features, max_depth=1, random_state=0).fit(X_train, y_train)

y_pred3 = clf3.predict(X_test)
cm3 = confusion_matrix(y_test, y_pred3)
print(cm3)
print("Accuracy: ", clf3.score(X_test, y_test)*100)

Accuracy:  79.33333333333333


In [None]:
# Print the per-class precision / recall / F1 report for every fitted model,
# each under its own heading so it is clear which model a report belongs to.
from sklearn.metrics import classification_report
target_labels = ["Negative", "Positive"] # display names for labels 0 and 1
model_predictions = [
  ("Naive Bayes", y_pred),
  ("Random Forest", y_pred2),
  ("Gradient Boosting", y_pred3),
]
for model_name, predictions in model_predictions:
  print(model_name)
  print(classification_report(y_test, predictions, target_names=target_labels))

              precision    recall  f1-score   support

    Negative       0.81      0.55      0.65       152
    Positive       0.65      0.87      0.75       148

    accuracy                           0.71       300
   macro avg       0.73      0.71      0.70       300
weighted avg       0.73      0.71      0.70       300



In [None]:
|