In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_extraction import stop_words
from sklearn.naive_bayes import MultinomialNB, GaussianNB, ComplementNB, BernoulliNB
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_val_predict
from sklearn import metrics
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import BaggingClassifier



## Data loading

In [3]:
df = pd.read_csv('dataset.csv')
df

Unnamed: 0,review,rating,label3,label4
0,"in my opinion , a movie reviewer's most import...",0.1,0,0
1,"you can watch this movie , that is based on a ...",0.2,0,0
2,"this is asking a lot to believe , and though i...",0.2,0,0
3,no heroes and no story are the main attributes...,0.2,0,0
4,"this is not an art movie , yet i saw it an art...",0.2,0,0
...,...,...,...,...
5001,the conventional wisdom is that movie sequels ...,0.9,2,3
5002,nicolas roeg's mesmerizing 1971 film walkabout...,0.9,2,3
5003,the movie air force one should require a docto...,0.9,2,3
5004,""" well , jones , at least you haven't forgotte...",0.9,2,3


## Feature extraction

In [4]:
vectorizer = CountVectorizer(stop_words=stop_words.ENGLISH_STOP_WORDS)
X_counts = vectorizer.fit_transform(df['review'].values)
X_counts

<5006x41631 sparse matrix of type '<class 'numpy.int64'>'
	with 759861 stored elements in Compressed Sparse Row format>

In [5]:
tf_transformer = TfidfTransformer(use_idf=False).fit(X_counts)
X_tf = tf_transformer.transform(X_counts)
X_tf

<5006x41631 sparse matrix of type '<class 'numpy.float64'>'
	with 759861 stored elements in Compressed Sparse Row format>

## Training data

In [6]:
Y_label3 = df['label3'].values
Y_label3_names = np.unique(Y_label3).tolist()

## Naive Bayes - auto

In [7]:
clf_nb_counts = cross_val_score(MultinomialNB(), X_counts, Y_label3, cv=10)
clf_nb_tf = cross_val_score(MultinomialNB(), X_tf, Y_label3, cv=10)
print("Using CountVectorizer")
print(clf_nb_counts)
print("Accuracy: %0.2f (+/- %0.2f)" % (clf_nb_counts.mean(), clf_nb_counts.std() * 2))
print("Using TfidfVectorizer")
print(clf_nb_tf)
print("Accuracy: %0.2f (+/- %0.2f)" % (clf_nb_tf.mean(), clf_nb_tf.std() * 2))

Using CountVectorizer
[0.60479042 0.500998   0.51896208 0.6506986  0.62075848 0.55688623
 0.584      0.568      0.63       0.602     ]
Accuracy: 0.58 (+/- 0.09)
Using TfidfVectorizer
[0.56287425 0.44111776 0.6247505  0.57884232 0.49700599 0.52694611
 0.512      0.562      0.544      0.51      ]
Accuracy: 0.54 (+/- 0.10)


## Naive Bayes - Grid Search

In [8]:
tuned_parameters ={'alpha': [0.01, 0.1, 0.5, 1.0, 10.0]}
scoring = {'accuracy': make_scorer(accuracy_score),
           'precision': make_scorer(precision_score, average = 'macro'),
           'recall': make_scorer(recall_score, average = 'macro'),
           'f1': make_scorer(f1_score, average = 'macro')}
clf = GridSearchCV(MultinomialNB(), tuned_parameters, scoring=scoring,
                  cv=10, verbose=3, refit='f1')

In [11]:
clf.fit(X_counts, Y_label3)

Fitting 10 folds for each of 5 candidates, totalling 50 fits
[CV] alpha=0.01 ......................................................
[CV]  alpha=0.01, accuracy=0.555, f1=0.555, precision=0.558, recall=0.551, total=   0.0s
[CV] alpha=0.01 ......................................................
[CV]  alpha=0.01, accuracy=0.471, f1=0.465, precision=0.490, recall=0.459, total=   0.0s
[CV] alpha=0.01 ......................................................
[CV]  alpha=0.01, accuracy=0.515, f1=0.489, precision=0.502, recall=0.485, total=   0.0s
[CV] alpha=0.01 ......................................................
[CV]  alpha=0.01, accuracy=0.619, f1=0.599, precision=0.626, recall=0.593, total=   0.0s
[CV] alpha=0.01 ......................................................
[CV]  alpha=0.01, accuracy=0.569, f1=0.552, precision=0.632, recall=0.542, total=   0.0s
[CV] alpha=0.01 ......................................................
[CV]  alpha=0.01, accuracy=0.523, f1=0.501, precision=0.521, recall=

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


[CV]  alpha=0.01, accuracy=0.534, f1=0.534, precision=0.538, recall=0.543, total=   0.0s
[CV] alpha=0.01 ......................................................
[CV]  alpha=0.01, accuracy=0.560, f1=0.552, precision=0.555, recall=0.549, total=   0.0s
[CV] alpha=0.01 ......................................................
[CV]  alpha=0.01, accuracy=0.562, f1=0.550, precision=0.554, recall=0.555, total=   0.0s
[CV] alpha=0.1 .......................................................
[CV]  alpha=0.1, accuracy=0.559, f1=0.560, precision=0.560, recall=0.562, total=   0.0s
[CV] alpha=0.1 .......................................................
[CV]  alpha=0.1, accuracy=0.507, f1=0.509, precision=0.521, recall=0.506, total=   0.0s
[CV] alpha=0.1 .......................................................
[CV]  alpha=0.1, accuracy=0.497, f1=0.482, precision=0.491, recall=0.477, total=   0.0s
[CV] alpha=0.1 .......................................................
[CV]  alpha=0.1, accuracy=0.649, f1=0.635, 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    1.1s finished


GridSearchCV(cv=10, estimator=MultinomialNB(),
             param_grid={'alpha': [0.01, 0.1, 0.5, 1.0, 10.0]}, refit='f1',
             scoring={'accuracy': make_scorer(accuracy_score),
                      'f1': make_scorer(f1_score, average=macro),
                      'precision': make_scorer(precision_score, average=macro),
                      'recall': make_scorer(recall_score, average=macro)},
             verbose=3)

In [60]:
print("Best parameters set found on development set:")
print(clf.best_params_)

Best parameters set found on development set:
{'alpha': 0.5}


In [61]:
best_clf = MultinomialNB(alpha=0.5)
cvl = cross_val_score(best_clf, X_counts, Y_label3, cv=10)
print("Scores")
print(cvl)
print("Accuracy: %0.2f (+/- %0.2f)" % (cvl.mean(), cvl.std() * 2))

Scores
[0.57085828 0.50698603 0.51896208 0.67265469 0.6247505  0.53892216
 0.572      0.544      0.61       0.596     ]
Accuracy: 0.58 (+/- 0.10)


## Naive Bayes - different types

In [41]:
cvl = cross_val_score(MultinomialNB(), X_counts, Y_label3, cv=10)
print("Scores")
print(cvl)
print("Accuracy: %0.2f (+/- %0.2f)" % (cvl.mean(), cvl.std() * 2))

Scores
[0.60479042 0.500998   0.51896208 0.6506986  0.62075848 0.55688623
 0.584      0.568      0.63       0.602     ]
Accuracy: 0.58 (+/- 0.09)


In [43]:
cvl = cross_val_score(ComplementNB(), X_counts, Y_label3, cv=10)
print("Scores")
print(cvl)
print("Accuracy: %0.2f (+/- %0.2f)" % (cvl.mean(), cvl.std() * 2))

Scores
[0.5748503  0.50499002 0.51297405 0.65469062 0.61277445 0.52894212
 0.548      0.538      0.586      0.574     ]
Accuracy: 0.56 (+/- 0.09)


In [45]:
cvl = cross_val_score(BernoulliNB(), X_counts, Y_label3, cv=10)
print("Scores")
print(cvl)
print("Accuracy: %0.2f (+/- %0.2f)" % (cvl.mean(), cvl.std() * 2))

Scores
[0.55888224 0.56087824 0.55888224 0.65469062 0.5748503  0.53892216
 0.546      0.552      0.652      0.618     ]
Accuracy: 0.58 (+/- 0.08)


## Naive Bayes - different types with TFIDF

In [46]:
cvl = cross_val_score(MultinomialNB(), X_tf, Y_label3, cv=10)
print("Scores")
print(cvl)
print("Accuracy: %0.2f (+/- %0.2f)" % (cvl.mean(), cvl.std() * 2))

Scores
[0.56287425 0.44111776 0.6247505  0.57884232 0.49700599 0.52694611
 0.512      0.562      0.544      0.51      ]
Accuracy: 0.54 (+/- 0.10)


In [47]:
cvl = cross_val_score(ComplementNB(), X_tf, Y_label3, cv=10)
print("Scores")
print(cvl)
print("Accuracy: %0.2f (+/- %0.2f)" % (cvl.mean(), cvl.std() * 2))

Scores
[0.57684631 0.43113772 0.61277445 0.52894212 0.46906188 0.50299401
 0.522      0.55       0.54       0.478     ]
Accuracy: 0.52 (+/- 0.10)


In [12]:
cvl = cross_val_score(BernoulliNB(), X_tf, Y_label3, cv=10)
print("Scores")
print(cvl)
print("Accuracy: %0.2f (+/- %0.2f)" % (cvl.mean(), cvl.std() * 2))

Scores
[0.55888224 0.56087824 0.55888224 0.65469062 0.5748503  0.53892216
 0.546      0.552      0.652      0.618     ]
Accuracy: 0.58 (+/- 0.08)
