In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_extraction import stop_words
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_val_predict
from sklearn import metrics
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import BaggingClassifier



## Data loading

In [2]:
df = pd.read_csv('dataset.csv')
df

Unnamed: 0,review,rating,label3,label4
0,"in my opinion , a movie reviewer's most import...",0.1,0,0
1,"you can watch this movie , that is based on a ...",0.2,0,0
2,"this is asking a lot to believe , and though i...",0.2,0,0
3,no heroes and no story are the main attributes...,0.2,0,0
4,"this is not an art movie , yet i saw it an art...",0.2,0,0
...,...,...,...,...
5001,the conventional wisdom is that movie sequels ...,0.9,2,3
5002,nicolas roeg's mesmerizing 1971 film walkabout...,0.9,2,3
5003,the movie air force one should require a docto...,0.9,2,3
5004,""" well , jones , at least you haven't forgotte...",0.9,2,3


## Feature extraction

In [3]:
vectorizer = CountVectorizer(stop_words=stop_words.ENGLISH_STOP_WORDS)
X_counts = vectorizer.fit_transform(df['review'].values)
X_counts

<5006x41631 sparse matrix of type '<class 'numpy.int64'>'
	with 759861 stored elements in Compressed Sparse Row format>

In [4]:
tf_transformer = TfidfTransformer(use_idf=False).fit(X_counts)
X_tf = tf_transformer.transform(X_counts)
X_tf

<5006x41631 sparse matrix of type '<class 'numpy.float64'>'
	with 759861 stored elements in Compressed Sparse Row format>

## Training data

In [5]:
Y_label3 = df['label3'].values
Y_label3_names = np.unique(Y_label3).tolist()

In [6]:
X_counts_train, X_counts_test, Y_label3_counts_train, Y_label3_counts_test = train_test_split(X_counts, Y_label3, test_size=0.1)
X_tf_train, X_tf_test, Y_label3_tf_train, Y_label3_tf_test = train_test_split(X_tf, Y_label3, test_size=0.1)

## SVM - auto

In [9]:
clf_svm_counts = cross_val_score(SVC(kernel='linear'), X_counts, Y_label3, cv=10)
clf_svm_tf = cross_val_score(SVC(kernel='linear'), X_tf, Y_label3, cv=10)

In [10]:
print("Using CountVectorizer")
print(clf_svm_counts)
print("Accuracy: %0.2f (+/- %0.2f)" % (clf_svm_counts.mean(), clf_svm_counts.std() * 2))
print("Using TfidfVectorizer")
print(clf_svm_tf)
print("Accuracy: %0.2f (+/- %0.2f)" % (clf_svm_tf.mean(), clf_svm_tf.std() * 2))

Using CountVectorizer
[0.51696607 0.55289421 0.5489022  0.69261477 0.64271457 0.54491018
 0.56       0.546      0.642      0.614     ]
Accuracy: 0.59 (+/- 0.11)
Using TfidfVectorizer
[0.55289421 0.59680639 0.55489022 0.73053892 0.61876248 0.56886228
 0.598      0.59       0.63       0.608     ]
Accuracy: 0.60 (+/- 0.10)


## SVM - with GridSearch - linear

In [7]:
tuned_parameters_linear = [{'kernel': ['linear'], 'C': [0.1, 1, 10, 100, 1000]}]
scoring = {'accuracy': make_scorer(accuracy_score),
           'precision': make_scorer(precision_score, average = 'macro'),
           'recall': make_scorer(recall_score, average = 'macro'),
           'f1': make_scorer(f1_score, average = 'macro')}
clf = GridSearchCV(SVC(), tuned_parameters_linear, scoring=scoring,
                  cv=2, verbose=3, refit='f1')


In [8]:
clf.fit(X_tf_train, Y_label3_tf_train)

Fitting 2 folds for each of 5 candidates, totalling 10 fits
[CV] C=0.1, kernel=linear ............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  _warn_prf(average, modifier, msg_start, len(result))
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   18.8s remaining:    0.0s


[CV]  C=0.1, kernel=linear, accuracy=0.436, f1=0.279, precision=0.440, recall=0.381, total=  18.9s
[CV] C=0.1, kernel=linear ............................................
[CV]  C=0.1, kernel=linear, accuracy=0.498, f1=0.377, precision=0.367, recall=0.437, total=  18.7s
[CV] C=1, kernel=linear ..............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   37.5s remaining:    0.0s


[CV]  C=1, kernel=linear, accuracy=0.641, f1=0.629, precision=0.649, recall=0.621, total=  17.3s
[CV] C=1, kernel=linear ..............................................
[CV]  C=1, kernel=linear, accuracy=0.635, f1=0.617, precision=0.652, recall=0.608, total=  18.1s
[CV] C=10, kernel=linear .............................................
[CV]  C=10, kernel=linear, accuracy=0.626, f1=0.621, precision=0.621, recall=0.621, total=  19.0s
[CV] C=10, kernel=linear .............................................
[CV]  C=10, kernel=linear, accuracy=0.639, f1=0.630, precision=0.639, recall=0.624, total=  19.3s
[CV] C=100, kernel=linear ............................................
[CV]  C=100, kernel=linear, accuracy=0.626, f1=0.621, precision=0.621, recall=0.621, total=  19.4s
[CV] C=100, kernel=linear ............................................
[CV]  C=100, kernel=linear, accuracy=0.639, f1=0.629, precision=0.639, recall=0.624, total=  19.8s
[CV] C=1000, kernel=linear ..............................

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:  3.1min finished


GridSearchCV(cv=2, estimator=SVC(),
             param_grid=[{'C': [0.1, 1, 10, 100, 1000], 'kernel': ['linear']}],
             refit='f1',
             scoring={'accuracy': make_scorer(accuracy_score),
                      'f1': make_scorer(f1_score, average=macro),
                      'precision': make_scorer(precision_score, average=macro),
                      'recall': make_scorer(recall_score, average=macro)},
             verbose=3)

In [20]:
print("Best parameters set found on development set:")
print(clf.best_params_)
print(clf.best_score_)

Best parameters set found on development set:
{'degree': 1, 'kernel': 'poly'}
0.623132784874169


In [11]:
cvs = cross_val_score(SVC(kernel='linear', C=10), X_tf, Y_label3, cv=10)
print("Scores for best param")
print(cvs)
print("Accuracy: %0.2f (+/- %0.2f)" % (cvs.mean(), cvs.std() * 2))

Scores for best param
[0.55489022 0.55489022 0.55289421 0.72255489 0.62674651 0.55688623
 0.614      0.556      0.646      0.614     ]
Accuracy: 0.60 (+/- 0.11)


## SVM - with GridSearch - rbf

In [13]:
tuned_parameters_rbf = [{'kernel': ['rbf'], 'gamma': [10, 1, 1e-1, 1e-2, 1e-3]}]
scoring = {'accuracy': make_scorer(accuracy_score),
           'precision': make_scorer(precision_score, average = 'macro'),
           'recall': make_scorer(recall_score, average = 'macro'),
           'f1': make_scorer(f1_score, average = 'macro')}
clf = GridSearchCV(SVC(), tuned_parameters_rbf, scoring=scoring,
                  cv=2, verbose=3, refit='f1')

In [14]:
clf.fit(X_tf_train, Y_label3_tf_train)

Fitting 2 folds for each of 5 candidates, totalling 10 fits
[CV] gamma=10, kernel=rbf ............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  gamma=10, kernel=rbf, accuracy=0.387, f1=0.198, precision=0.723, recall=0.340, total=  19.1s
[CV] gamma=10, kernel=rbf ............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   19.0s remaining:    0.0s


[CV]  gamma=10, kernel=rbf, accuracy=0.387, f1=0.197, precision=0.739, recall=0.339, total=  18.8s
[CV] gamma=1, kernel=rbf .............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   37.8s remaining:    0.0s


[CV]  gamma=1, kernel=rbf, accuracy=0.618, f1=0.585, precision=0.655, recall=0.580, total=  19.3s
[CV] gamma=1, kernel=rbf .............................................
[CV]  gamma=1, kernel=rbf, accuracy=0.616, f1=0.571, precision=0.677, recall=0.572, total=  19.4s
[CV] gamma=0.1, kernel=rbf ...........................................


  _warn_prf(average, modifier, msg_start, len(result))


[CV]  gamma=0.1, kernel=rbf, accuracy=0.531, f1=0.406, precision=0.384, recall=0.466, total=  17.9s
[CV] gamma=0.1, kernel=rbf ...........................................
[CV]  gamma=0.1, kernel=rbf, accuracy=0.528, f1=0.403, precision=0.361, recall=0.464, total=  18.2s
[CV] gamma=0.01, kernel=rbf ..........................................
[CV]  gamma=0.01, kernel=rbf, accuracy=0.381, f1=0.184, precision=0.127, recall=0.333, total=  18.0s
[CV] gamma=0.01, kernel=rbf ..........................................
[CV]  gamma=0.01, kernel=rbf, accuracy=0.381, f1=0.184, precision=0.127, recall=0.333, total=  18.2s
[CV] gamma=0.001, kernel=rbf .........................................
[CV]  gamma=0.001, kernel=rbf, accuracy=0.381, f1=0.184, precision=0.127, recall=0.333, total=  18.0s
[CV] gamma=0.001, kernel=rbf .........................................
[CV]  gamma=0.001, kernel=rbf, accuracy=0.381, f1=0.184, precision=0.127, recall=0.333, total=  18.7s


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:  3.1min finished


GridSearchCV(cv=2, estimator=SVC(),
             param_grid=[{'gamma': [10, 1, 0.1, 0.01, 0.001],
                          'kernel': ['rbf']}],
             refit='f1',
             scoring={'accuracy': make_scorer(accuracy_score),
                      'f1': make_scorer(f1_score, average=macro),
                      'precision': make_scorer(precision_score, average=macro),
                      'recall': make_scorer(recall_score, average=macro)},
             verbose=3)

In [15]:
print("Best parameters set found on development set:")
print(clf.best_params_)
print(clf.best_score_)

Best parameters set found on development set:
{'gamma': 1, 'kernel': 'rbf'}
0.5777731862288922


In [16]:
cvs = cross_val_score(SVC(kernel='rbf', gamma=1), X_tf, Y_label3, cv=10)
print("Scores for best param")
print(cvs)
print("Accuracy: %0.2f (+/- %0.2f)" % (cvs.mean(), cvs.std() * 2))

Scores for best param
[0.55289421 0.5988024  0.60279441 0.7005988  0.5748503  0.5508982
 0.578      0.59       0.598      0.6       ]
Accuracy: 0.59 (+/- 0.08)


## SVM - poly

In [18]:
tuned_parameters_rbf = [{'kernel': ['poly'], 'degree': [1, 2, 3, 4, 5]}]
scoring = {'accuracy': make_scorer(accuracy_score),
           'precision': make_scorer(precision_score, average = 'macro'),
           'recall': make_scorer(recall_score, average = 'macro'),
           'f1': make_scorer(f1_score, average = 'macro')}
clf = GridSearchCV(SVC(), tuned_parameters_rbf, scoring=scoring,
                  cv=2, verbose=3, refit='f1')

In [19]:
clf.fit(X_tf_train, Y_label3_tf_train)

Fitting 2 folds for each of 5 candidates, totalling 10 fits
[CV] degree=1, kernel=poly ...........................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  degree=1, kernel=poly, accuracy=0.641, f1=0.629, precision=0.649, recall=0.621, total=  18.3s
[CV] degree=1, kernel=poly ...........................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   18.2s remaining:    0.0s


[CV]  degree=1, kernel=poly, accuracy=0.635, f1=0.617, precision=0.653, recall=0.609, total=  19.0s
[CV] degree=2, kernel=poly ...........................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   37.2s remaining:    0.0s


[CV]  degree=2, kernel=poly, accuracy=0.611, f1=0.580, precision=0.638, recall=0.575, total=  20.5s
[CV] degree=2, kernel=poly ...........................................
[CV]  degree=2, kernel=poly, accuracy=0.613, f1=0.565, precision=0.658, recall=0.568, total=  20.7s
[CV] degree=3, kernel=poly ...........................................
[CV]  degree=3, kernel=poly, accuracy=0.563, f1=0.488, precision=0.603, recall=0.509, total=  20.3s
[CV] degree=3, kernel=poly ...........................................
[CV]  degree=3, kernel=poly, accuracy=0.542, f1=0.451, precision=0.580, recall=0.486, total=  20.8s
[CV] degree=4, kernel=poly ...........................................
[CV]  degree=4, kernel=poly, accuracy=0.513, f1=0.409, precision=0.565, recall=0.455, total=  20.9s
[CV] degree=4, kernel=poly ...........................................
[CV]  degree=4, kernel=poly, accuracy=0.489, f1=0.386, precision=0.527, recall=0.434, total=  20.9s
[CV] degree=5, kernel=poly ..................

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:  3.4min finished


GridSearchCV(cv=2, estimator=SVC(),
             param_grid=[{'degree': [1, 2, 3, 4, 5], 'kernel': ['poly']}],
             refit='f1',
             scoring={'accuracy': make_scorer(accuracy_score),
                      'f1': make_scorer(f1_score, average=macro),
                      'precision': make_scorer(precision_score, average=macro),
                      'recall': make_scorer(recall_score, average=macro)},
             verbose=3)

In [21]:
print("Best parameters set found on development set:")
print(clf.best_params_)
print(clf.best_score_)

Best parameters set found on development set:
{'degree': 1, 'kernel': 'poly'}
0.623132784874169


In [24]:
cvs = cross_val_score(SVC(kernel='poly', degree=1), X_tf, Y_label3, cv=10)
print("Scores for best param")
print(cvs)
print("Accuracy: %0.2f (+/- %0.2f)" % (cvs.mean(), cvs.std() * 2))

KeyboardInterrupt: 

In [25]:
bi_vectorizer = CountVectorizer(stop_words=stop_words.ENGLISH_STOP_WORDS, ngram_range=(1, 2))
X_counts_bi = bi_vectorizer.fit_transform(df['review'].values)
X_counts_bi

<5006x665764 sparse matrix of type '<class 'numpy.int64'>'
	with 1644608 stored elements in Compressed Sparse Row format>

In [28]:
cvs = cross_val_score(SVC(kernel='linear'), X_counts_bi, Y_label3, cv=10)
print("Scores for best param")
print(cvs)
print("Accuracy: %0.2f (+/- %0.2f)" % (cvs.mean(), cvs.std() * 2))

Scores for best param
[0.53493014 0.58483034 0.54690619 0.71656687 0.62075848 0.58283433
 0.558      0.584      0.656      0.624     ]
Accuracy: 0.60 (+/- 0.10)
