In [0]:
# Importing Libraries 
import numpy as np   
import pandas as pd  



In [0]:
# Import dataset 
dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter = '\t')  
print(dataset)

                                                Review  Liked
0                             Wow... Loved this place.      1
1                                   Crust is not good.      0
2            Not tasty and the texture was just nasty.      0
3    Stopped by during the late May bank holiday of...      1
4    The selection on the menu was great and so wer...      1
..                                                 ...    ...
995  I think food should have flavor and texture an...      0
996                           Appetite instantly gone.      0
997  Overall I was not impressed and would not go b...      0
998  The whole experience was underwhelming, and I ...      0
999  Then, as if I hadn't wasted enough of my life ...      0

[1000 rows x 2 columns]


**step2: Text Cleaning or Preprocessing**



In [0]:
# library to clean data 
import re 

# Natural Language Tool Kit 
import nltk 

nltk.download('stopwords') 

# to remove stopword 
from nltk.corpus import stopwords 

# for Stemming propose 
from nltk.stem.porter import PorterStemmer 

# Initialize empty array 
# to append clean text 
corpus = [] 

# 1000 (reviews) rows to clean 
for i in range(0, 1000): 
	
	# column : "Review", row ith 
	review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i]) 
	
	# convert all cases to lower cases 
	review = review.lower() 
	
	# split to array(default delimiter is " ") 
	review = review.split() 
	
	# creating PorterStemmer object to 
	# take main stem of each word 
	ps = PorterStemmer() 
	
	# loop for stemming each word 
	# in string array at ith row	 
	review = [ps.stem(word) for word in review 
				if not word in set(stopwords.words('english'))] 
				
	# rejoin all string array elements 
	# to create back into a string 
	review = ' '.join(review) 
	
	# append each string to create 
	# array of clean text 
	corpus.append(review) 


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


**Step 3: Tokenization, involves splitting sentences and words from the body of the text.**

**Step 4: Making the bag of words via sparse matrix**

In [0]:
# Creating the Bag of Words model 
from sklearn.feature_extraction.text import CountVectorizer 

# To extract max 1500 feature. 
# "max_features" is attribute to 
# experiment with to get better results 
cv = CountVectorizer(max_features = 1500) 

# X contains corpus (dependent variable) 
X = cv.fit_transform(corpus).toarray() 

# y contains answers if review 
# is positive or negative 
y = dataset.iloc[:, 1].values 


**Step 5 : Splitting Corpus into Training and Test set.** 

For this, we need class train_test_split from sklearn.cross_validation. Split can be made 70/30 or 80/20 or 85/15 or 75/25, here I choose 75/25 via “test_size”.
X is the bag of words, y is 0 or 1 (positive or negative).

In [0]:
!pip install sklearn.cross_validation

[31mERROR: Could not find a version that satisfies the requirement sklearn.cross_validation (from versions: none)[0m
[31mERROR: No matching distribution found for sklearn.cross_validation[0m


In [0]:
# Splitting the dataset into 
# the Training set and Test set 
# from sklearn.cross_validation import train_test_split { its old not work}
from sklearn.model_selection import train_test_split

# experiment with "test_size" 
# to get better results 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25) 


**Step 6: Fitting a Predictive Model (here random forest)**

Since Random fored is ensemble model (made of many trees) from sklearn.ensemble, import RandomForestClassifier class
With 501 tree or “n_estimators” and criterion as ‘entropy’
Fit the model via .fit() method with attributes X_train and y_train
filter_none

In [0]:
# Fitting Random Forest Classification 
# to the Training set 
from sklearn.ensemble import RandomForestClassifier 

# n_estimators can be said as number of 
# trees, experiment with n_estimators 
# to get better results 
model = RandomForestClassifier(n_estimators = 501, 
							criterion = 'entropy') 
							
model.fit(X_train, y_train) 


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=501,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

**Step 7: Pridicting Final Results via using .predict() method with attribute X_test**

In [0]:
# Predicting the Test set results 
y_pred = model.predict(X_test) 

y_pred 


array([0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0,
       1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1,
       1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0,
       0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 1, 0, 0, 1, 1, 0, 1])

**Step 8: To know the accuracy, confusion matrix is needed.**

Confusion Matrix is a 2X2 Matrix.

**TRUE POSITIVE**: measures the proportion of actual positives that are correctly identified.

**TRUE NEGATIVE :** measures the proportion of actual positives that are not correctly identified.

**FALSE POSITIVE :** measures the proportion of actual negatives that are correctly identified.

**FALSE NEGATIVE :** measures the proportion of actual negatives that are not correctly identified.

In [0]:
# Making the Confusion Matrix 
from sklearn.metrics import confusion_matrix 

cm = confusion_matrix(y_test, y_pred) 

cm 


array([[107,  15],
       [ 50,  78]])