In [12]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords

import re
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV

In [4]:
# Load the tab-separated restaurant reviews into a DataFrame.
# The path uses a forward slash: a backslash before "R" in a normal string
# literal is an invalid escape sequence (Python warns about it), and a
# backslash separator only resolves on Windows. Forward slashes work on
# every platform, Windows included.
ds = pd.read_csv("Datasets/Restaurant_Reviews.tsv",
                sep = "\t")

In [31]:
# Preview the first rows to confirm the file was parsed into the expected columns.
ds.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [5]:
# Load NLTK's English stop-word list. On a machine where the "stopwords"
# corpus has not been downloaded yet, NLTK raises LookupError; fetch it once
# and retry so the notebook runs from a fresh environment.
try:
    stops = stopwords.words("english")
except LookupError:
    nltk.download("stopwords")
    stops = stopwords.words("english")

In [6]:
# Porter stemmer instance; used below to reduce each review word to its stem.
ps = PorterStemmer()

In [8]:
# Build the cleaned corpus: one lower-cased, stop-word-free, stemmed string
# per review, in the same order as the rows of `ds`.

# Build the stop-word set once. Constructing it inside the comprehension
# repeats the same work for every single word without changing the result.
stop_set = set(stops)

corpus = []

# Iterate over the column values directly instead of looking rows up by
# integer label, so this also works if `ds` does not have a default 0..n-1 index.
for raw_review in ds["Review"]:
    # Replace every character that is not an ASCII letter with a space,
    # then lower-case and split on whitespace.
    tokens = re.sub("[^a-zA-Z]", " ", raw_review).lower().split()

    # Drop stop words, then stem what remains.
    # NOTE(review): the NLTK English list presumably contains negations such
    # as "not" - confirm that removing them is intended for sentiment data.
    stems = [ps.stem(word) for word in tokens if word not in stop_set]

    corpus.append(" ".join(stems))

In [16]:
# Reference notes, taken from help(CountVectorizer):
#
# vectorizer = CountVectorizer(max_features = 800)
#
# max_features : int, default=None
#     If not None, build a vocabulary that only considers the top
#     max_features ordered by term frequency across the corpus.
#
#     This parameter is ignored if vocabulary is not None.

In [23]:
# Bag-of-words features: fit the vectorizer on the cleaned corpus, capped at
# 800 vocabulary terms, and convert the resulting count matrix to a dense array.
vtz = CountVectorizer(max_features=800)
X = vtz.fit_transform(corpus).toarray()

# Target labels come from the "Liked" column.
y = ds["Liked"]

# Show the vocabulary the vectorizer ended up with.
print(vtz.get_feature_names_out())

['absolut' 'acknowledg' 'actual' 'ad' 'ago' 'almost' 'also' 'although'
 'alway' 'amaz' 'ambianc' 'ambienc' 'amount' 'anoth' 'anyon' 'anyth'
 'anytim' 'anyway' 'apolog' 'appet' 'area' 'around' 'arriv' 'articl' 'ask'
 'assur' 'ate' 'atmospher' 'atroci' 'attach' 'attack' 'attent' 'attitud'
 'auju' 'authent' 'averag' 'avocado' 'avoid' 'aw' 'away' 'awesom' 'babi'
 'bachi' 'back' 'bacon' 'bad' 'bagel' 'bakeri' 'bar' 'bare' 'bartend'
 'basic' 'bathroom' 'batter' 'bay' 'bean' 'beat' 'beauti' 'becom' 'beef'
 'beer' 'behind' 'believ' 'belli' 'best' 'better' 'beyond' 'big' 'bill'
 'biscuit' 'bisqu' 'bit' 'bite' 'black' 'bland' 'blow' 'boba' 'boot'
 'bother' 'bowl' 'box' 'boy' 'boyfriend' 'bread' 'break' 'breakfast'
 'brick' 'bring' 'brought' 'brunch' 'buck' 'buffet' 'bug' 'build' 'bunch'
 'burger' 'busi' 'butter' 'bye' 'cafe' 'cake' 'call' 'came' 'cannot'
 'cant' 'car' 'care' 'cashier' 'char' 'charcoal' 'charg' 'cheap' 'check'
 'chees' 'cheeseburg' 'chef' 'chewi' 'chicken' 'chines' 'chip' 'choos'

In [24]:
# Base random-forest classifier; random_state is fixed so repeated runs are comparable.
rf = RandomForestClassifier(random_state = 2022)

In [25]:
# Stratified 5-fold splitter used for cross-validation; rows are shuffled
# before splitting, with a fixed seed so the folds are the same on every run.
kfold = StratifiedKFold(n_splits = 5,
                       shuffle = True,
                       random_state = 2022)

In [26]:
# List the classifier's current hyperparameters to see what can be tuned.
print(rf.get_params())

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 2022, 'verbose': 0, 'warm_start': False}


In [27]:
# Grid of candidate max_features values for the random forest to search over.
params = {"max_features": [10, 50, 100, 200]}

In [28]:
# Exhaustive search over `params` for the random forest, scoring every
# candidate by ROC AUC on the folds produced by `kfold`. verbose=3 logs the
# score and timing of each individual fit.
gcv = GridSearchCV(
    estimator=rf,
    param_grid=params,
    scoring="roc_auc",
    cv=kfold,
    verbose=3,
)
gcv.fit(X, y)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 1/5] END ...................max_features=10;, score=0.828 total time=   0.6s
[CV 2/5] END ...................max_features=10;, score=0.882 total time=   0.4s
[CV 3/5] END ...................max_features=10;, score=0.827 total time=   0.4s
[CV 4/5] END ...................max_features=10;, score=0.850 total time=   0.4s
[CV 5/5] END ...................max_features=10;, score=0.838 total time=   0.4s
[CV 1/5] END ...................max_features=50;, score=0.804 total time=   0.7s
[CV 2/5] END ...................max_features=50;, score=0.878 total time=   0.8s
[CV 3/5] END ...................max_features=50;, score=0.821 total time=   0.7s
[CV 4/5] END ...................max_features=50;, score=0.840 total time=   0.7s
[CV 5/5] END ...................max_features=50;, score=0.817 total time=   0.7s
[CV 1/5] END ..................max_features=100;, score=0.775 total time=   1.0s
[CV 2/5] END ..................max_features=100;,

In [29]:
# Parameter combination that achieved the best cross-validated score.
print(gcv.best_params_)

{'max_features': 10}


In [30]:
# Best cross-validated score found by the search (scoring was set to "roc_auc").
print(gcv.best_score_)

0.84489
