In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_extraction import stop_words
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import BaggingClassifier



## Data loading

In [2]:
df = pd.read_csv('dataset.csv')
df

Unnamed: 0,review,rating,label3,label4
0,"in my opinion , a movie reviewer's most import...",0.1,0,0
1,"you can watch this movie , that is based on a ...",0.2,0,0
2,"this is asking a lot to believe , and though i...",0.2,0,0
3,no heroes and no story are the main attributes...,0.2,0,0
4,"this is not an art movie , yet i saw it an art...",0.2,0,0
...,...,...,...,...
5001,the conventional wisdom is that movie sequels ...,0.9,2,3
5002,nicolas roeg's mesmerizing 1971 film walkabout...,0.9,2,3
5003,the movie air force one should require a docto...,0.9,2,3
5004,""" well , jones , at least you haven't forgotte...",0.9,2,3


## Feature extraction

In [3]:
vectorizer = CountVectorizer(stop_words=stop_words.ENGLISH_STOP_WORDS)
X_counts = vectorizer.fit_transform(df['review'].values)
X_counts

<5006x41631 sparse matrix of type '<class 'numpy.int64'>'
	with 759861 stored elements in Compressed Sparse Row format>

In [4]:
tf_transformer = TfidfTransformer(use_idf=False).fit(X_counts)
X_tf = tf_transformer.transform(X_counts)
X_tf

<5006x41631 sparse matrix of type '<class 'numpy.float64'>'
	with 759861 stored elements in Compressed Sparse Row format>

## Training data

In [5]:
Y_label3 = df['label3'].values
Y_label4 = df['label4'].values
Y_label3_names = np.unique(Y_label3).tolist()
Y_label4_names = np.unique(Y_label4).tolist()

In [6]:
X_counts_train, X_counts_test, Y_label3_counts_train, Y_label3_counts_test = train_test_split(X_counts, Y_label3, test_size=0.1)
X_tf_train, X_tf_test, Y_label3_tf_train, Y_label3_tf_test = train_test_split(X_tf, Y_label3, test_size=0.1)

## SVM - auto

In [7]:
n_estimators = 10
clf_svm_counts = SVC(kernel='linear').fit(X_counts_train, Y_label3_counts_train)
clf_svm_tf = SVC(kernel='linear').fit(X_tf_train, Y_label3_tf_train)

In [8]:
pred_svm_counts = clf_svm_counts.predict(X_counts_test)
pred_svm_tf = clf_svm_tf.predict(X_tf_test)

In [9]:
print("X_counts - Y_label3: "+ str(np.mean(pred_svm_counts == Y_label3_counts_test)))
print("X_tf - Y_label3: "+ str(np.mean(pred_svm_tf == Y_label3_tf_test)))

X_counts - Y_label3: 0.6087824351297405
X_tf - Y_label3: 0.688622754491018


In [10]:
print(metrics.classification_report(Y_label3_counts_test, pred_svm_counts, labels=Y_label3_names))
print(metrics.confusion_matrix(Y_label3_counts_test, pred_svm_counts))

              precision    recall  f1-score   support

           0       0.57      0.60      0.58       121
           1       0.54      0.53      0.53       196
           2       0.71      0.70      0.71       184

    accuracy                           0.61       501
   macro avg       0.61      0.61      0.61       501
weighted avg       0.61      0.61      0.61       501

[[ 73  42   6]
 [ 47 103  46]
 [  9  46 129]]


In [11]:
print(metrics.classification_report(Y_label3_tf_test, pred_svm_tf, labels=Y_label3_names))
print(metrics.confusion_matrix(Y_label3_tf_test, pred_svm_tf))

              precision    recall  f1-score   support

           0       0.68      0.48      0.56       126
           1       0.58      0.72      0.64       186
           2       0.84      0.79      0.82       189

    accuracy                           0.69       501
   macro avg       0.70      0.67      0.67       501
weighted avg       0.70      0.69      0.69       501

[[ 61  62   3]
 [ 26 134  26]
 [  3  36 150]]


## SVM - with GridSearch - linear

In [12]:
tuned_parameters_linear = [{'kernel': ['linear'], 'C': [0.1, 1, 10, 100, 1000]}]
scoring = {'accuracy': make_scorer(accuracy_score),
           'precision': make_scorer(precision_score, average = 'macro'),
           'recall': make_scorer(recall_score, average = 'macro'),
           'f1': make_scorer(f1_score, average = 'macro')}
clf = GridSearchCV(SVC(), tuned_parameters_linear, scoring=scoring,
                  cv=2, verbose=3, refit='f1')


In [13]:
clf.fit(X_counts_train, Y_label3_counts_train)

Fitting 2 folds for each of 5 candidates, totalling 10 fits
[CV] C=0.1, kernel=linear ............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  C=0.1, kernel=linear, accuracy=0.616, f1=0.609, precision=0.618, recall=0.604, total=  34.0s
[CV] C=0.1, kernel=linear ............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   33.9s remaining:    0.0s


[CV]  C=0.1, kernel=linear, accuracy=0.631, f1=0.624, precision=0.630, recall=0.621, total=  33.2s
[CV] C=1, kernel=linear ..............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.1min remaining:    0.0s


[CV]  C=1, kernel=linear, accuracy=0.611, f1=0.605, precision=0.610, recall=0.601, total=  33.7s
[CV] C=1, kernel=linear ..............................................
[CV]  C=1, kernel=linear, accuracy=0.633, f1=0.627, precision=0.631, recall=0.624, total=  33.9s
[CV] C=10, kernel=linear .............................................
[CV]  C=10, kernel=linear, accuracy=0.611, f1=0.605, precision=0.610, recall=0.601, total=  32.7s
[CV] C=10, kernel=linear .............................................
[CV]  C=10, kernel=linear, accuracy=0.633, f1=0.627, precision=0.631, recall=0.624, total=  33.1s
[CV] C=100, kernel=linear ............................................
[CV]  C=100, kernel=linear, accuracy=0.611, f1=0.605, precision=0.610, recall=0.601, total=  33.7s
[CV] C=100, kernel=linear ............................................
[CV]  C=100, kernel=linear, accuracy=0.633, f1=0.627, precision=0.631, recall=0.624, total=  35.9s
[CV] C=1000, kernel=linear ..............................

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:  5.7min finished


GridSearchCV(cv=2, estimator=SVC(),
             param_grid=[{'C': [0.1, 1, 10, 100, 1000], 'kernel': ['linear']}],
             refit='f1',
             scoring={'accuracy': make_scorer(accuracy_score),
                      'f1': make_scorer(f1_score, average=macro),
                      'precision': make_scorer(precision_score, average=macro),
                      'recall': make_scorer(recall_score, average=macro)},
             verbose=3)

In [14]:
print("Best parameters set found on development set:")
print(clf.best_params_)
print(clf.best_score_)

Best parameters set found on development set:
{'C': 0.1, 'kernel': 'linear'}
0.616851425944893


In [15]:
pred = clf.predict(X_counts_test)
print(metrics.classification_report(Y_label3_counts_test, pred))
print(metrics.confusion_matrix(Y_label3_counts_test, pred))

              precision    recall  f1-score   support

           0       0.57      0.60      0.59       121
           1       0.54      0.52      0.53       196
           2       0.72      0.72      0.72       184

    accuracy                           0.61       501
   macro avg       0.61      0.61      0.61       501
weighted avg       0.61      0.61      0.61       501

[[ 73  42   6]
 [ 48 102  46]
 [  7  45 132]]


## SVM - with GridSearch - rbf

In [16]:
tuned_parameters_rbf = [{'kernel': ['rbf'], 'gamma': [10, 1, 1e-1, 1e-2, 1e-3]}]
scoring = {'accuracy': make_scorer(accuracy_score),
           'precision': make_scorer(precision_score, average = 'macro'),
           'recall': make_scorer(recall_score, average = 'macro'),
           'f1': make_scorer(f1_score, average = 'macro')}
clf = GridSearchCV(SVC(), tuned_parameters_rbf, scoring=scoring,
                  cv=2, verbose=3, refit='f1')

In [17]:
clf.fit(X_counts_train, Y_label3_counts_train)

Fitting 2 folds for each of 5 candidates, totalling 10 fits
[CV] gamma=10, kernel=rbf ............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  gamma=10, kernel=rbf, accuracy=0.386, f1=0.194, precision=0.794, recall=0.338, total=  40.6s
[CV] gamma=10, kernel=rbf ............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   40.5s remaining:    0.0s


[CV]  gamma=10, kernel=rbf, accuracy=0.386, f1=0.195, precision=0.794, recall=0.339, total=  39.4s
[CV] gamma=1, kernel=rbf .............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.3min remaining:    0.0s


[CV]  gamma=1, kernel=rbf, accuracy=0.386, f1=0.194, precision=0.794, recall=0.338, total=  38.5s
[CV] gamma=1, kernel=rbf .............................................
[CV]  gamma=1, kernel=rbf, accuracy=0.386, f1=0.195, precision=0.794, recall=0.339, total=  37.3s
[CV] gamma=0.1, kernel=rbf ...........................................
[CV]  gamma=0.1, kernel=rbf, accuracy=0.387, f1=0.198, precision=0.747, recall=0.340, total=  35.9s
[CV] gamma=0.1, kernel=rbf ...........................................
[CV]  gamma=0.1, kernel=rbf, accuracy=0.398, f1=0.227, precision=0.646, recall=0.350, total=  35.6s
[CV] gamma=0.01, kernel=rbf ..........................................
[CV]  gamma=0.01, kernel=rbf, accuracy=0.496, f1=0.393, precision=0.620, recall=0.440, total=  40.0s
[CV] gamma=0.01, kernel=rbf ..........................................
[CV]  gamma=0.01, kernel=rbf, accuracy=0.480, f1=0.385, precision=0.606, recall=0.428, total=  39.3s
[CV] gamma=0.001, kernel=rbf ..................

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:  6.3min finished


GridSearchCV(cv=2, estimator=SVC(),
             param_grid=[{'gamma': [10, 1, 0.1, 0.01, 0.001],
                          'kernel': ['rbf']}],
             refit='f1',
             scoring={'accuracy': make_scorer(accuracy_score),
                      'f1': make_scorer(f1_score, average=macro),
                      'precision': make_scorer(precision_score, average=macro),
                      'recall': make_scorer(recall_score, average=macro)},
             verbose=3)

In [18]:
print("Best parameters set found on development set:")
print(clf.best_params_)

Best parameters set found on development set:
{'gamma': 0.001, 'kernel': 'rbf'}


In [19]:
print("Detailed classification report:")
pred = clf.predict(X_counts_test)
print(metrics.classification_report(Y_label3_counts_test, pred))
print(metrics.confusion_matrix(Y_label3_counts_test, pred))

Detailed classification report:
              precision    recall  f1-score   support

           0       0.78      0.26      0.40       121
           1       0.52      0.78      0.62       196
           2       0.76      0.67      0.71       184

    accuracy                           0.62       501
   macro avg       0.68      0.57      0.58       501
weighted avg       0.67      0.62      0.60       501

[[ 32  84   5]
 [  8 153  35]
 [  1  59 124]]


## SVM - poly

In [20]:
tuned_parameters_rbf = [{'kernel': ['poly'], 'degree': [1, 2, 3, 4, 5]}]
scoring = {'accuracy': make_scorer(accuracy_score),
           'precision': make_scorer(precision_score, average = 'macro'),
           'recall': make_scorer(recall_score, average = 'macro'),
           'f1': make_scorer(f1_score, average = 'macro')}
clf = GridSearchCV(SVC(), tuned_parameters_rbf, scoring=scoring,
                  cv=2, verbose=3, refit='f1')

In [21]:
clf.fit(X_counts_train, Y_label3_counts_train)

Fitting 2 folds for each of 5 candidates, totalling 10 fits
[CV] degree=1, kernel=poly ...........................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  degree=1, kernel=poly, accuracy=0.604, f1=0.574, precision=0.642, recall=0.568, total=  35.9s
[CV] degree=1, kernel=poly ...........................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   35.8s remaining:    0.0s


[CV]  degree=1, kernel=poly, accuracy=0.621, f1=0.593, precision=0.666, recall=0.585, total=  34.9s
[CV] degree=2, kernel=poly ...........................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.2min remaining:    0.0s


[CV]  degree=2, kernel=poly, accuracy=0.530, f1=0.453, precision=0.640, recall=0.479, total=  23.9s
[CV] degree=2, kernel=poly ...........................................
[CV]  degree=2, kernel=poly, accuracy=0.512, f1=0.446, precision=0.600, recall=0.466, total=  20.9s
[CV] degree=3, kernel=poly ...........................................
[CV]  degree=3, kernel=poly, accuracy=0.440, f1=0.313, precision=0.563, recall=0.391, total=  21.0s
[CV] degree=3, kernel=poly ...........................................
[CV]  degree=3, kernel=poly, accuracy=0.429, f1=0.316, precision=0.538, recall=0.384, total=  21.6s
[CV] degree=4, kernel=poly ...........................................
[CV]  degree=4, kernel=poly, accuracy=0.402, f1=0.237, precision=0.566, recall=0.355, total=  20.6s
[CV] degree=4, kernel=poly ...........................................
[CV]  degree=4, kernel=poly, accuracy=0.397, f1=0.246, precision=0.536, recall=0.353, total=  20.7s
[CV] degree=5, kernel=poly ..................

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:  4.0min finished


GridSearchCV(cv=2, estimator=SVC(),
             param_grid=[{'degree': [1, 2, 3, 4, 5], 'kernel': ['poly']}],
             refit='f1',
             scoring={'accuracy': make_scorer(accuracy_score),
                      'f1': make_scorer(f1_score, average=macro),
                      'precision': make_scorer(precision_score, average=macro),
                      'recall': make_scorer(recall_score, average=macro)},
             verbose=3)

In [22]:
print("Best parameters set found on development set:")
print(clf.best_params_)

Best parameters set found on development set:
{'degree': 1, 'kernel': 'poly'}


In [23]:
print("Detailed classification report:")
pred = clf.predict(X_counts_test)
print(metrics.classification_report(Y_label3_counts_test, pred))
print(metrics.confusion_matrix(Y_label3_counts_test, pred))

Detailed classification report:
              precision    recall  f1-score   support

           0       0.69      0.44      0.54       121
           1       0.55      0.71      0.62       196
           2       0.76      0.70      0.73       184

    accuracy                           0.64       501
   macro avg       0.67      0.62      0.63       501
weighted avg       0.66      0.64      0.64       501

[[ 53  64   4]
 [ 20 140  36]
 [  4  52 128]]


## SVM - with GridSearch - linear TFIDF

In [24]:
tuned_parameters_linear = [{'kernel': ['linear'], 'C': [0.1, 1, 10, 100, 1000]}]
scoring = {'accuracy': make_scorer(accuracy_score),
           'precision': make_scorer(precision_score, average = 'macro'),
           'recall': make_scorer(recall_score, average = 'macro'),
           'f1': make_scorer(f1_score, average = 'macro')}
clf = GridSearchCV(SVC(), tuned_parameters_linear, scoring=scoring,
                  cv=2, verbose=3, refit='f1')


In [25]:
clf.fit(X_tf_train, Y_label3_tf_train)

Fitting 2 folds for each of 5 candidates, totalling 10 fits
[CV] C=0.1, kernel=linear ............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  _warn_prf(average, modifier, msg_start, len(result))
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   19.0s remaining:    0.0s


[CV]  C=0.1, kernel=linear, accuracy=0.467, f1=0.324, precision=0.429, recall=0.407, total=  19.1s
[CV] C=0.1, kernel=linear ............................................
[CV]  C=0.1, kernel=linear, accuracy=0.468, f1=0.326, precision=0.424, recall=0.408, total=  19.9s
[CV] C=1, kernel=linear ..............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   38.9s remaining:    0.0s


[CV]  C=1, kernel=linear, accuracy=0.650, f1=0.635, precision=0.668, recall=0.625, total=  17.0s
[CV] C=1, kernel=linear ..............................................
[CV]  C=1, kernel=linear, accuracy=0.634, f1=0.620, precision=0.656, recall=0.609, total=  17.5s
[CV] C=10, kernel=linear .............................................
[CV]  C=10, kernel=linear, accuracy=0.636, f1=0.633, precision=0.634, recall=0.632, total=  19.0s
[CV] C=10, kernel=linear .............................................
[CV]  C=10, kernel=linear, accuracy=0.624, f1=0.616, precision=0.623, recall=0.612, total=  20.0s
[CV] C=100, kernel=linear ............................................
[CV]  C=100, kernel=linear, accuracy=0.636, f1=0.632, precision=0.633, recall=0.631, total=  18.9s
[CV] C=100, kernel=linear ............................................
[CV]  C=100, kernel=linear, accuracy=0.624, f1=0.616, precision=0.623, recall=0.612, total=  19.0s
[CV] C=1000, kernel=linear ..............................

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:  3.1min finished


GridSearchCV(cv=2, estimator=SVC(),
             param_grid=[{'C': [0.1, 1, 10, 100, 1000], 'kernel': ['linear']}],
             refit='f1',
             scoring={'accuracy': make_scorer(accuracy_score),
                      'f1': make_scorer(f1_score, average=macro),
                      'precision': make_scorer(precision_score, average=macro),
                      'recall': make_scorer(recall_score, average=macro)},
             verbose=3)

In [26]:
print("Best parameters set found on development set:")
print(clf.best_params_)
print(clf.best_score_)

Best parameters set found on development set:
{'C': 1, 'kernel': 'linear'}
0.6275176177163068


In [27]:
pred = clf.predict(X_tf_test)
print(metrics.classification_report(Y_label3_tf_test, pred))
print(metrics.confusion_matrix(Y_label3_tf_test, pred))

              precision    recall  f1-score   support

           0       0.68      0.48      0.56       126
           1       0.58      0.72      0.64       186
           2       0.84      0.79      0.82       189

    accuracy                           0.69       501
   macro avg       0.70      0.67      0.67       501
weighted avg       0.70      0.69      0.69       501

[[ 61  62   3]
 [ 26 134  26]
 [  3  36 150]]


## SVM - with GridSearch - rbf TFIDF

In [28]:
tuned_parameters_rbf = [{'kernel': ['rbf'], 'gamma': [10, 1, 1e-1, 1e-2, 1e-3]}]
scoring = {'accuracy': make_scorer(accuracy_score),
           'precision': make_scorer(precision_score, average = 'macro'),
           'recall': make_scorer(recall_score, average = 'macro'),
           'f1': make_scorer(f1_score, average = 'macro')}
clf = GridSearchCV(SVC(), tuned_parameters_rbf, scoring=scoring,
                  cv=2, verbose=3, refit='f1')

In [29]:
clf.fit(X_tf_train, Y_label3_tf_train)

Fitting 2 folds for each of 5 candidates, totalling 10 fits
[CV] gamma=10, kernel=rbf ............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  gamma=10, kernel=rbf, accuracy=0.391, f1=0.203, precision=0.744, recall=0.341, total=  20.1s
[CV] gamma=10, kernel=rbf ............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   20.0s remaining:    0.0s


[CV]  gamma=10, kernel=rbf, accuracy=0.391, f1=0.201, precision=0.795, recall=0.341, total=  20.0s
[CV] gamma=1, kernel=rbf .............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   40.1s remaining:    0.0s


[CV]  gamma=1, kernel=rbf, accuracy=0.611, f1=0.567, precision=0.680, recall=0.566, total=  20.0s
[CV] gamma=1, kernel=rbf .............................................
[CV]  gamma=1, kernel=rbf, accuracy=0.623, f1=0.589, precision=0.674, recall=0.583, total=  20.4s
[CV] gamma=0.1, kernel=rbf ...........................................


  _warn_prf(average, modifier, msg_start, len(result))


[CV]  gamma=0.1, kernel=rbf, accuracy=0.525, f1=0.402, precision=0.379, recall=0.459, total=  18.8s
[CV] gamma=0.1, kernel=rbf ...........................................
[CV]  gamma=0.1, kernel=rbf, accuracy=0.543, f1=0.414, precision=0.380, recall=0.474, total=  19.9s
[CV] gamma=0.01, kernel=rbf ..........................................
[CV]  gamma=0.01, kernel=rbf, accuracy=0.384, f1=0.185, precision=0.128, recall=0.333, total=  18.8s
[CV] gamma=0.01, kernel=rbf ..........................................
[CV]  gamma=0.01, kernel=rbf, accuracy=0.384, f1=0.185, precision=0.128, recall=0.333, total=  18.9s
[CV] gamma=0.001, kernel=rbf .........................................
[CV]  gamma=0.001, kernel=rbf, accuracy=0.384, f1=0.185, precision=0.128, recall=0.333, total=  19.6s
[CV] gamma=0.001, kernel=rbf .........................................
[CV]  gamma=0.001, kernel=rbf, accuracy=0.384, f1=0.185, precision=0.128, recall=0.333, total=  20.2s


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:  3.3min finished


GridSearchCV(cv=2, estimator=SVC(),
             param_grid=[{'gamma': [10, 1, 0.1, 0.01, 0.001],
                          'kernel': ['rbf']}],
             refit='f1',
             scoring={'accuracy': make_scorer(accuracy_score),
                      'f1': make_scorer(f1_score, average=macro),
                      'precision': make_scorer(precision_score, average=macro),
                      'recall': make_scorer(recall_score, average=macro)},
             verbose=3)

In [30]:
print("Best parameters set found on development set:")
print(clf.best_params_)

Best parameters set found on development set:
{'gamma': 1, 'kernel': 'rbf'}


In [31]:
print("Detailed classification report:")
pred = clf.predict(X_tf_test)
print(metrics.classification_report(Y_label3_tf_test, pred))
print(metrics.confusion_matrix(Y_label3_tf_test, pred))

Detailed classification report:
              precision    recall  f1-score   support

           0       0.75      0.30      0.43       126
           1       0.54      0.81      0.65       186
           2       0.83      0.76      0.79       189

    accuracy                           0.66       501
   macro avg       0.71      0.62      0.62       501
weighted avg       0.70      0.66      0.65       501

[[ 38  83   5]
 [ 11 150  25]
 [  2  43 144]]


## SVM - poly TFIDF

In [32]:
tuned_parameters_rbf = [{'kernel': ['poly'], 'degree': [1, 2, 3, 4, 5]}]
scoring = {'accuracy': make_scorer(accuracy_score),
           'precision': make_scorer(precision_score, average = 'macro'),
           'recall': make_scorer(recall_score, average = 'macro'),
           'f1': make_scorer(f1_score, average = 'macro')}
clf = GridSearchCV(SVC(), tuned_parameters_rbf, scoring=scoring,
                  cv=2, verbose=3, refit='f1')

In [33]:
clf.fit(X_tf_train, Y_label3_tf_train)

Fitting 2 folds for each of 5 candidates, totalling 10 fits
[CV] degree=1, kernel=poly ...........................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  degree=1, kernel=poly, accuracy=0.650, f1=0.635, precision=0.668, recall=0.625, total=  19.9s
[CV] degree=1, kernel=poly ...........................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   19.8s remaining:    0.0s


[CV]  degree=1, kernel=poly, accuracy=0.633, f1=0.619, precision=0.655, recall=0.609, total=  18.6s
[CV] degree=2, kernel=poly ...........................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   38.4s remaining:    0.0s


[CV]  degree=2, kernel=poly, accuracy=0.613, f1=0.573, precision=0.669, recall=0.570, total=  20.2s
[CV] degree=2, kernel=poly ...........................................
[CV]  degree=2, kernel=poly, accuracy=0.623, f1=0.589, precision=0.665, recall=0.583, total=  21.4s
[CV] degree=3, kernel=poly ...........................................
[CV]  degree=3, kernel=poly, accuracy=0.542, f1=0.454, precision=0.625, recall=0.485, total=  21.7s
[CV] degree=3, kernel=poly ...........................................
[CV]  degree=3, kernel=poly, accuracy=0.577, f1=0.495, precision=0.617, recall=0.519, total=  21.2s
[CV] degree=4, kernel=poly ...........................................
[CV]  degree=4, kernel=poly, accuracy=0.487, f1=0.376, precision=0.627, recall=0.430, total=  20.3s
[CV] degree=4, kernel=poly ...........................................
[CV]  degree=4, kernel=poly, accuracy=0.548, f1=0.447, precision=0.604, recall=0.486, total=  20.1s
[CV] degree=5, kernel=poly ..................

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:  3.4min finished


GridSearchCV(cv=2, estimator=SVC(),
             param_grid=[{'degree': [1, 2, 3, 4, 5], 'kernel': ['poly']}],
             refit='f1',
             scoring={'accuracy': make_scorer(accuracy_score),
                      'f1': make_scorer(f1_score, average=macro),
                      'precision': make_scorer(precision_score, average=macro),
                      'recall': make_scorer(recall_score, average=macro)},
             verbose=3)

In [34]:
print("Best parameters set found on development set:")
print(clf.best_params_)

Best parameters set found on development set:
{'degree': 1, 'kernel': 'poly'}


In [35]:
print("Detailed classification report:")
pred = clf.predict(X_tf_test)
print(metrics.classification_report(Y_label3_tf_test, pred))
print(metrics.confusion_matrix(Y_label3_tf_test, pred))

Detailed classification report:
              precision    recall  f1-score   support

           0       0.68      0.48      0.56       126
           1       0.58      0.72      0.64       186
           2       0.84      0.79      0.82       189

    accuracy                           0.69       501
   macro avg       0.70      0.67      0.67       501
weighted avg       0.70      0.69      0.69       501

[[ 61  62   3]
 [ 26 134  26]
 [  3  36 150]]
