In [9]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [10]:
dataset = pd.read_csv('Reviews_Movies.csv', delimiter = '\t', quoting = 3)

In [11]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.ensemble import AdaBoostClassifier

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ppercca/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
dataset=dataset[:1000]

In [13]:
corpus = []
for i in range(0,1000):
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

In [14]:
len(corpus)

1000

In [15]:
corpus[1]

'stori man unnatur feel pig start open scene terrif exampl absurd comedi formal orchestra audienc turn insan violent mob crazi chant singer unfortun stay absurd whole time gener narr eventu make put even era turn cryptic dialogu would make shakespear seem easi third grader technic level better might think good cinematographi futur great vilmo zsigmond futur star salli kirkland freder forrest seen briefli'

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, 1].values

In [17]:
type(X)

numpy.ndarray

# Random Forest

In [18]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [19]:
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier

In [20]:
clf = RandomForestClassifier(n_estimators = 100, criterion = "entropy")
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)

In [21]:
cm

array([[85, 10],
       [22, 83]])

In [22]:
acc

0.84

# naive bayes

In [23]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

GaussianNB(priors=None)

In [24]:
y_pred = classifier.predict(X_test)

In [25]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

In [26]:
cm

array([[84, 11],
       [14, 91]])

In [27]:
acc = accuracy_score(y_test, y_pred)
acc

0.875

In [28]:
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

models = []
models.append(('Logistic regression', LogisticRegression()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('Random forest', RandomForestClassifier(n_estimators = 100, 
                                                       criterion = 'entropy')))
models.append(('NB', GaussianNB()))
models.append(('KernelSVM', SVC(kernel='rbf') ))
models.append(('AdaBoost', AdaBoostClassifier(random_state=1)))

for name, model in models:
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    
    cm = confusion_matrix(y_test, y_pred)
    print ("---------- model: " + name + '-------------------')
    print("Acurracy: " + str(accuracy_score(y_test, y_pred)))
    TP, TN, FP, FN = cm[1][1], cm[0][0], cm[0][1], cm[1][0]
    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
    F1 = 2* precision * recall/(precision+recall)
    print("Precision: " + str( round(precision,2)) )
    print("Recall: " + str(round(recall,2)) )
    print("F1: " + str(round(F1,2)))

---------- model: Logistic regression-------------------
Acurracy: 0.85
Precision: 0.87
Recall: 0.84
F1: 0.85
---------- model: KNN-------------------
Acurracy: 0.64
Precision: 0.72
Recall: 0.51
F1: 0.6
---------- model: Random forest-------------------
Acurracy: 0.815
Precision: 0.82
Recall: 0.83
F1: 0.82
---------- model: NB-------------------
Acurracy: 0.875
Precision: 0.89
Recall: 0.87
F1: 0.88
---------- model: KernelSVM-------------------
Acurracy: 0.495
Precision: 0.83
Recall: 0.05
F1: 0.09
---------- model: AdaBoost-------------------
Acurracy: 0.78
Precision: 0.8
Recall: 0.78
F1: 0.79
