In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_extraction import stop_words
from sklearn.naive_bayes import MultinomialNB, GaussianNB, ComplementNB, BernoulliNB
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import BaggingClassifier



## Data loading

In [2]:
df = pd.read_csv('dataset.csv')
df

Unnamed: 0,review,rating,label3,label4
0,"in my opinion , a movie reviewer's most import...",0.1,0,0
1,"you can watch this movie , that is based on a ...",0.2,0,0
2,"this is asking a lot to believe , and though i...",0.2,0,0
3,no heroes and no story are the main attributes...,0.2,0,0
4,"this is not an art movie , yet i saw it an art...",0.2,0,0
...,...,...,...,...
5001,the conventional wisdom is that movie sequels ...,0.9,2,3
5002,nicolas roeg's mesmerizing 1971 film walkabout...,0.9,2,3
5003,the movie air force one should require a docto...,0.9,2,3
5004,""" well , jones , at least you haven't forgotte...",0.9,2,3


## Feature extraction

In [3]:
vectorizer = CountVectorizer(stop_words=stop_words.ENGLISH_STOP_WORDS)
X_counts = vectorizer.fit_transform(df['review'].values)
X_counts

<5006x41631 sparse matrix of type '<class 'numpy.int64'>'
	with 759861 stored elements in Compressed Sparse Row format>

In [4]:
tf_transformer = TfidfTransformer(use_idf=False).fit(X_counts)
X_tf = tf_transformer.transform(X_counts)
X_tf

<5006x41631 sparse matrix of type '<class 'numpy.float64'>'
	with 759861 stored elements in Compressed Sparse Row format>

## Training data

In [5]:
Y_label3 = df['label3'].values
Y_label4 = df['label4'].values
Y_label3_names = np.unique(Y_label3).tolist()
Y_label4_names = np.unique(Y_label4).tolist()

In [7]:
X_counts_train, X_counts_test, Y_label3_counts_train, Y_label3_counts_test = train_test_split(X_counts, Y_label3, test_size=0.1)
X_tf_train, X_tf_test, Y_label3_tf_train, Y_label3_tf_test = train_test_split(X_tf, Y_label3, test_size=0.1)

## Naive Bayes - auto

In [22]:
clf_nb_counts = MultinomialNB(alpha=0.5).fit(X_counts_train, Y_label3_counts_train)
clf_nb_tf = MultinomialNB().fit(X_tf_train, Y_label3_tf_train)

In [23]:
pred_nb_counts = clf_nb_counts.predict(X_counts_test)
pred_nb_tf = clf_nb_tf.predict(X_tf_test)

In [24]:
print("X_counts - Y_label3: "+ str(np.mean(pred_nb_counts == Y_label3_counts_test)))
print("X_tf - Y_label3: "+ str(np.mean(pred_nb_tf == Y_label3_tf_test)))

X_counts - Y_label3: 0.6187624750499002
X_tf - Y_label3: 0.5369261477045908


In [25]:
print(metrics.classification_report(Y_label3_counts_test, pred_nb_counts, labels=Y_label3_names))
print(metrics.confusion_matrix(Y_label3_counts_test, pred_nb_counts))

              precision    recall  f1-score   support

           0       0.64      0.51      0.57       113
           1       0.54      0.60      0.57       187
           2       0.69      0.69      0.69       201

    accuracy                           0.62       501
   macro avg       0.62      0.60      0.61       501
weighted avg       0.62      0.62      0.62       501

[[ 58  45  10]
 [ 22 113  52]
 [ 10  52 139]]


In [12]:
print(metrics.classification_report(Y_label3_tf_test, pred_nb_tf, labels=Y_label3_names))
print(metrics.confusion_matrix(Y_label3_tf_test, pred_nb_tf))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       135
           1       0.46      0.68      0.55       186
           2       0.63      0.79      0.70       180

    accuracy                           0.54       501
   macro avg       0.36      0.49      0.42       501
weighted avg       0.40      0.54      0.46       501

[[  0 110  25]
 [  0 127  59]
 [  0  38 142]]


  _warn_prf(average, modifier, msg_start, len(result))


## Naive Bayes - Grid Search

In [12]:
tuned_parameters ={'alpha': [0.01, 0.1, 0.5, 1.0, 10.0]}
scoring = {'accuracy': make_scorer(accuracy_score),
           'precision': make_scorer(precision_score, average = 'macro'),
           'recall': make_scorer(recall_score, average = 'macro'),
           'f1': make_scorer(f1_score, average = 'macro')}
clf = GridSearchCV(MultinomialNB(), tuned_parameters, scoring=scoring,
                  cv=2, verbose=3, refit='f1')

In [13]:
clf.fit(X_counts_train, Y_label3_counts_train)

Fitting 2 folds for each of 5 candidates, totalling 10 fits
[CV] alpha=0.01 ......................................................
[CV]  alpha=0.01, accuracy=0.541, f1=0.532, precision=0.545, recall=0.527, total=   0.0s
[CV] alpha=0.01 ......................................................
[CV]  alpha=0.01, accuracy=0.525, f1=0.515, precision=0.530, recall=0.509, total=   0.0s
[CV] alpha=0.1 .......................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


[CV]  alpha=0.1, accuracy=0.569, f1=0.568, precision=0.570, recall=0.567, total=   0.0s
[CV] alpha=0.1 .......................................................
[CV]  alpha=0.1, accuracy=0.563, f1=0.558, precision=0.561, recall=0.556, total=   0.0s
[CV] alpha=0.5 .......................................................
[CV]  alpha=0.5, accuracy=0.600, f1=0.593, precision=0.608, recall=0.585, total=   0.0s
[CV] alpha=0.5 .......................................................
[CV]  alpha=0.5, accuracy=0.582, f1=0.572, precision=0.590, recall=0.565, total=   0.0s
[CV] alpha=1.0 .......................................................
[CV]  alpha=1.0, accuracy=0.585, f1=0.553, precision=0.620, recall=0.550, total=   0.0s
[CV] alpha=1.0 .......................................................
[CV]  alpha=1.0, accuracy=0.586, f1=0.557, precision=0.622, recall=0.552, total=   0.0s
[CV] alpha=10.0 ......................................................
[CV]  alpha=10.0, accuracy=0.506, f1=0.374, pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.1s finished


GridSearchCV(cv=2, estimator=MultinomialNB(),
             param_grid={'alpha': [0.01, 0.1, 0.5, 1.0, 10.0]}, refit='f1',
             scoring={'accuracy': make_scorer(accuracy_score),
                      'f1': make_scorer(f1_score, average=macro),
                      'precision': make_scorer(precision_score, average=macro),
                      'recall': make_scorer(recall_score, average=macro)},
             verbose=3)

In [14]:
print("Best parameters set found on development set:")
print(clf.best_params_)

Best parameters set found on development set:
{'alpha': 0.5}


In [15]:
print("Detailed classification report:")
pred = clf.predict(X_counts_test)
print(metrics.classification_report(Y_label3_counts_test, pred))
print(metrics.confusion_matrix(Y_label3_counts_test, pred))

Detailed classification report:
              precision    recall  f1-score   support

           0       0.64      0.62      0.63       112
           1       0.55      0.59      0.57       184
           2       0.72      0.67      0.70       205

    accuracy                           0.63       501
   macro avg       0.64      0.63      0.63       501
weighted avg       0.64      0.63      0.63       501

[[ 70  34   8]
 [ 29 109  46]
 [ 10  57 138]]


## Naive Bayes - different types

In [16]:
clf_multi = MultinomialNB().fit(X_counts_train, Y_label3_counts_train)
pred_multi = clf_multi.predict(X_counts_test)
print(metrics.classification_report(Y_label3_counts_test, pred_multi, labels=Y_label3_names))
print(metrics.confusion_matrix(Y_label3_counts_test, pred_multi))

              precision    recall  f1-score   support

           0       0.66      0.49      0.56       112
           1       0.54      0.65      0.59       184
           2       0.72      0.69      0.71       205

    accuracy                           0.63       501
   macro avg       0.64      0.61      0.62       501
weighted avg       0.64      0.63      0.63       501

[[ 55  47  10]
 [ 20 119  45]
 [  8  55 142]]


In [17]:
clf_gauss = GaussianNB().fit(X_counts_train.toarray(), Y_label3_counts_train)
pred_gauss = clf_gauss.predict(X_counts_test.toarray())
print(metrics.classification_report(Y_label3_counts_test, pred_gauss, labels=Y_label3_names))
print(metrics.confusion_matrix(Y_label3_counts_test, pred_gauss))

              precision    recall  f1-score   support

           0       0.39      0.25      0.30       112
           1       0.43      0.52      0.47       184
           2       0.56      0.56      0.56       205

    accuracy                           0.48       501
   macro avg       0.46      0.44      0.44       501
weighted avg       0.47      0.48      0.47       501

[[ 28  57  27]
 [ 24  96  64]
 [ 20  71 114]]


In [18]:
clf_compl = ComplementNB().fit(X_counts_train, Y_label3_counts_train)
pred_compl = clf_compl.predict(X_counts_test)
print(metrics.classification_report(Y_label3_counts_test, pred_compl, labels=Y_label3_names))
print(metrics.confusion_matrix(Y_label3_counts_test, pred_compl))

              precision    recall  f1-score   support

           0       0.62      0.66      0.64       112
           1       0.58      0.47      0.52       184
           2       0.67      0.76      0.71       205

    accuracy                           0.63       501
   macro avg       0.62      0.63      0.62       501
weighted avg       0.62      0.63      0.62       501

[[ 74  27  11]
 [ 33  86  65]
 [ 13  36 156]]


In [19]:
clf_bern = BernoulliNB().fit(X_counts_train, Y_label3_counts_train)
pred_bern = clf_bern.predict(X_counts_test)
print(metrics.classification_report(Y_label3_counts_test, pred_bern, labels=Y_label3_names))
print(metrics.confusion_matrix(Y_label3_counts_test, pred_bern))

              precision    recall  f1-score   support

           0       0.75      0.46      0.57       112
           1       0.53      0.77      0.63       184
           2       0.81      0.66      0.73       205

    accuracy                           0.65       501
   macro avg       0.70      0.63      0.64       501
weighted avg       0.69      0.65      0.65       501

[[ 51  60   1]
 [ 13 141  30]
 [  4  66 135]]


## Naive Bayes - different types with TFIDF

In [20]:
clf_multi = MultinomialNB().fit(X_tf_train, Y_label3_tf_train)
pred_multi = clf_multi.predict(X_tf_test)
print(metrics.classification_report(Y_label3_tf_test, pred_multi, labels=Y_label3_names))
print(metrics.confusion_matrix(Y_label3_tf_test, pred_multi))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       118
           1       0.50      0.60      0.55       203
           2       0.60      0.87      0.71       180

    accuracy                           0.55       501
   macro avg       0.37      0.49      0.42       501
weighted avg       0.42      0.55      0.48       501

[[  0  95  23]
 [  0 121  82]
 [  0  24 156]]


  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
clf_gauss = GaussianNB().fit(X_tf_train.toarray(), Y_label3_counts_train)
pred_gauss = clf_gauss.predict(X_tf_test.toarray())
print(metrics.classification_report(Y_label3_tf_test, pred_gauss, labels=Y_label3_names))
print(metrics.confusion_matrix(Y_label3_tf_test, pred_gauss))

              precision    recall  f1-score   support

           0       0.29      0.19      0.23       118
           1       0.44      0.45      0.44       203
           2       0.39      0.46      0.42       180

    accuracy                           0.39       501
   macro avg       0.37      0.37      0.37       501
weighted avg       0.38      0.39      0.39       501

[[23 45 50]
 [32 91 80]
 [25 72 83]]


In [22]:
clf_compl = ComplementNB().fit(X_tf_train, Y_label3_tf_train)
pred_compl = clf_compl.predict(X_tf_test)
print(metrics.classification_report(Y_label3_tf_test, pred_compl, labels=Y_label3_names))
print(metrics.confusion_matrix(Y_label3_tf_test, pred_compl))

              precision    recall  f1-score   support

           0       1.00      0.01      0.02       118
           1       0.50      0.51      0.50       203
           2       0.56      0.89      0.69       180

    accuracy                           0.53       501
   macro avg       0.68      0.47      0.40       501
weighted avg       0.64      0.53      0.45       501

[[  1  87  30]
 [  0 104  99]
 [  0  19 161]]


In [23]:
clf_bern = BernoulliNB().fit(X_tf_train, Y_label3_tf_train)
pred_bern = clf_bern.predict(X_tf_test)
print(metrics.classification_report(Y_label3_tf_test, pred_bern, labels=Y_label3_names))
print(metrics.confusion_matrix(Y_label3_tf_test, pred_bern))

              precision    recall  f1-score   support

           0       0.71      0.34      0.46       118
           1       0.54      0.75      0.63       203
           2       0.74      0.67      0.70       180

    accuracy                           0.62       501
   macro avg       0.66      0.59      0.60       501
weighted avg       0.65      0.62      0.62       501

[[ 40  73   5]
 [ 13 152  38]
 [  3  56 121]]
