In [1]:
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, cross_val_score

corpus = pd.read_csv('corpus.csv.gz', compression='gzip')

In [2]:
# fix labels to binary
lb = preprocessing.LabelBinarizer(neg_label=1, pos_label=2)
target = lb.fit_transform(corpus['qual_a_melhor_classificao_para_esse_texto'].values)
c, r = target.shape
target = target.reshape(c,)

In [3]:
data = TfidfVectorizer(max_features=1600).fit_transform(corpus.content)
data.shape

(1000, 1600)

In [4]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB

model = MultinomialNB()

f1 = cross_val_score(model, data.toarray(), target, cv=10, scoring='f1').mean()
acc = cross_val_score(model, data.toarray(), target, cv=10, scoring='accuracy').mean()
recall = cross_val_score(model, data.toarray(), target, cv=10, scoring='recall').mean()
print(f1)
print(acc)
print(recall)

0.794815845342
0.685354835484
0.960788690476


In [5]:
model = svm.LinearSVC()

f1 = cross_val_score(model, data, target, cv=10, scoring='f1').mean()
acc = cross_val_score(model, data, target, cv=10, scoring='accuracy').mean()
recall = cross_val_score(model, data, target, cv=10, scoring='recall').mean()
print(f1)
print(acc)
print(recall)

0.770867539044
0.692515251525
0.817385912698


## Grid Search

In [6]:
c_range = np.logspace(-3,4,8)
gamma = np.logspace(-2,-1,2)
param_grid = [
    {'kernel': ['rbf', 'sigmoid'], 'gamma': gamma, 'C': c_range},
    {'kernel': ['linear'], 'C': c_range},
]
grid_search = GridSearchCV(svm.SVC(), param_grid, cv=10, verbose=3, n_jobs=10)
grid_search.fit(data, target)

Fitting 10 folds for each of 40 candidates, totalling 400 fits
[CV] C=0.001, gamma=0.01, kernel=rbf .................................
[CV] C=0.001, gamma=0.01, kernel=rbf .................................
[CV] C=0.001, gamma=0.01, kernel=rbf .................................
[CV] C=0.001, gamma=0.01, kernel=rbf .................................
[CV] C=0.001, gamma=0.01, kernel=rbf .................................
[CV] C=0.001, gamma=0.01, kernel=rbf .................................
[CV] C=0.001, gamma=0.01, kernel=rbf .................................
[CV] C=0.001, gamma=0.01, kernel=rbf .................................
[CV] C=0.001, gamma=0.01, kernel=rbf .................................
[CV] C=0.001, gamma=0.01, kernel=rbf .................................
[CV] .. C=0.001, gamma=0.01, kernel=rbf, score=0.633663, total=   3.4s
[CV] .. C=0.001, gamma=0.01, kernel=rbf, score=0.633663, total=   3.4s
[CV] .. C=0.001, gamma=0.01, kernel=rbf, score=0.633663, total=   3.3s
[CV] C=0.001, 

[Parallel(n_jobs=10)]: Done  12 tasks      | elapsed:   12.9s


[CV]  C=0.001, gamma=0.01, kernel=sigmoid, score=0.636364, total=   3.6s
[CV]  C=0.001, gamma=0.01, kernel=sigmoid, score=0.636364, total=   3.6s
[CV] C=0.001, gamma=0.1, kernel=rbf ..................................
[CV]  C=0.001, gamma=0.01, kernel=sigmoid, score=0.636364, total=   3.5s
[CV]  C=0.001, gamma=0.01, kernel=sigmoid, score=0.636364, total=   3.6s
[CV] C=0.001, gamma=0.1, kernel=rbf ..................................
[CV] C=0.001, gamma=0.1, kernel=rbf ..................................
[CV] C=0.001, gamma=0.1, kernel=rbf ..................................
[CV] ... C=0.001, gamma=0.1, kernel=rbf, score=0.630000, total=   2.7s
[CV] ... C=0.001, gamma=0.1, kernel=rbf, score=0.633663, total=   2.8s
[CV] C=0.001, gamma=0.1, kernel=sigmoid ..............................
[CV] ... C=0.001, gamma=0.1, kernel=rbf, score=0.633663, total=   2.8s
[CV] C=0.001, gamma=0.1, kernel=sigmoid ..............................
[CV] C=0.001, gamma=0.1, kernel=sigmoid .............................

[CV] C=0.1, gamma=0.01, kernel=rbf ...................................
[CV]  C=0.01, gamma=0.1, kernel=sigmoid, score=0.630000, total=   2.7s
[CV] C=0.1, gamma=0.01, kernel=rbf ...................................
[CV]  C=0.01, gamma=0.1, kernel=sigmoid, score=0.630000, total=   2.7s
[CV] C=0.1, gamma=0.01, kernel=rbf ...................................
[CV]  C=0.01, gamma=0.1, kernel=sigmoid, score=0.636364, total=   2.7s
[CV]  C=0.01, gamma=0.1, kernel=sigmoid, score=0.636364, total=   2.6s
[CV] C=0.1, gamma=0.01, kernel=rbf ...................................
[CV] C=0.1, gamma=0.01, kernel=rbf ...................................
[CV]  C=0.01, gamma=0.1, kernel=sigmoid, score=0.636364, total=   2.7s
[CV] C=0.1, gamma=0.01, kernel=rbf ...................................
[CV]  C=0.01, gamma=0.1, kernel=sigmoid, score=0.636364, total=   2.6s
[CV] C=0.1, gamma=0.01, kernel=rbf ...................................
[CV] .... C=0.1, gamma=0.01, kernel=rbf, score=0.633663, total=   2.9s
[CV] C

[Parallel(n_jobs=10)]: Done 108 tasks      | elapsed:  1.1min


[CV] . C=0.1, gamma=0.1, kernel=sigmoid, score=0.633663, total=   3.2s
[CV] . C=0.1, gamma=0.1, kernel=sigmoid, score=0.633663, total=   3.3s
[CV] . C=0.1, gamma=0.1, kernel=sigmoid, score=0.633663, total=   3.3s
[CV] C=1.0, gamma=0.01, kernel=rbf ...................................
[CV] C=1.0, gamma=0.01, kernel=rbf ...................................
[CV] C=1.0, gamma=0.01, kernel=rbf ...................................
[CV] . C=0.1, gamma=0.1, kernel=sigmoid, score=0.633663, total=   3.4s
[CV] C=1.0, gamma=0.01, kernel=rbf ...................................
[CV] . C=0.1, gamma=0.1, kernel=sigmoid, score=0.630000, total=   3.3s
[CV] C=1.0, gamma=0.01, kernel=rbf ...................................
[CV] . C=0.1, gamma=0.1, kernel=sigmoid, score=0.630000, total=   3.3s
[CV] C=1.0, gamma=0.01, kernel=rbf ...................................
[CV] . C=0.1, gamma=0.1, kernel=sigmoid, score=0.636364, total=   3.3s
[CV] C=1.0, gamma=0.01, kernel=rbf ...................................
[CV] .

[CV] ... C=10.0, gamma=0.01, kernel=rbf, score=0.646465, total=   3.0s
[CV] C=10.0, gamma=0.01, kernel=sigmoid ..............................
[CV] C=10.0, gamma=0.01, kernel=sigmoid ..............................
[CV] C=10.0, gamma=0.01, kernel=sigmoid ..............................
[CV]  C=10.0, gamma=0.01, kernel=sigmoid, score=0.633663, total=   2.9s
[CV]  C=10.0, gamma=0.01, kernel=sigmoid, score=0.633663, total=   2.9s
[CV]  C=10.0, gamma=0.01, kernel=sigmoid, score=0.633663, total=   3.0s
[CV] C=10.0, gamma=0.1, kernel=rbf ...................................
[CV] C=10.0, gamma=0.1, kernel=rbf ...................................
[CV] C=10.0, gamma=0.1, kernel=rbf ...................................
[CV]  C=10.0, gamma=0.01, kernel=sigmoid, score=0.633663, total=   3.0s
[CV] C=10.0, gamma=0.1, kernel=rbf ...................................
[CV]  C=10.0, gamma=0.01, kernel=sigmoid, score=0.630000, total=   3.0s
[CV] C=10.0, gamma=0.1, kernel=rbf ...................................
[

[CV] C=100.0, gamma=0.1, kernel=sigmoid ..............................
[CV] ... C=100.0, gamma=0.1, kernel=rbf, score=0.676768, total=   3.4s
[CV] C=100.0, gamma=0.1, kernel=sigmoid ..............................
[CV] ... C=100.0, gamma=0.1, kernel=rbf, score=0.737374, total=   3.5s
[CV] ... C=100.0, gamma=0.1, kernel=rbf, score=0.717172, total=   3.4s
[CV] C=100.0, gamma=0.1, kernel=sigmoid ..............................
[CV] C=100.0, gamma=0.1, kernel=sigmoid ..............................
[CV] ... C=100.0, gamma=0.1, kernel=rbf, score=0.727273, total=   3.5s
[CV] C=100.0, gamma=0.1, kernel=sigmoid ..............................
[CV]  C=100.0, gamma=0.1, kernel=sigmoid, score=0.564356, total=   3.4s
[CV] C=1000.0, gamma=0.01, kernel=rbf ................................
[CV]  C=100.0, gamma=0.1, kernel=sigmoid, score=0.613861, total=   3.4s
[CV] C=1000.0, gamma=0.01, kernel=rbf ................................
[CV]  C=100.0, gamma=0.1, kernel=sigmoid, score=0.584158, total=   3.4s
[CV

[Parallel(n_jobs=10)]: Done 268 tasks      | elapsed:  2.7min


[CV] .. C=1000.0, gamma=0.1, kernel=rbf, score=0.707071, total=   3.5s
[CV] C=1000.0, gamma=0.1, kernel=sigmoid .............................
[CV]  C=1000.0, gamma=0.1, kernel=sigmoid, score=0.564356, total=   3.4s
[CV] C=10000.0, gamma=0.01, kernel=rbf ...............................
[CV]  C=1000.0, gamma=0.1, kernel=sigmoid, score=0.574257, total=   3.5s
[CV]  C=1000.0, gamma=0.1, kernel=sigmoid, score=0.594059, total=   3.5s
[CV] C=10000.0, gamma=0.01, kernel=rbf ...............................
[CV]  C=1000.0, gamma=0.1, kernel=sigmoid, score=0.643564, total=   3.4s
[CV] C=10000.0, gamma=0.01, kernel=rbf ...............................
[CV] C=10000.0, gamma=0.01, kernel=rbf ...............................
[CV]  C=1000.0, gamma=0.1, kernel=sigmoid, score=0.670000, total=   3.5s
[CV] C=10000.0, gamma=0.01, kernel=rbf ...............................
[CV]  C=1000.0, gamma=0.1, kernel=sigmoid, score=0.680000, total=   3.5s
[CV] C=10000.0, gamma=0.01, kernel=rbf ..........................

[CV] C=0.01, kernel=linear ...........................................
[CV] ........... C=0.001, kernel=linear, score=0.636364, total=   3.6s
[CV] C=0.01, kernel=linear ...........................................
[CV] ........... C=0.001, kernel=linear, score=0.636364, total=   3.7s
[CV] C=0.01, kernel=linear ...........................................
[CV] ........... C=0.001, kernel=linear, score=0.636364, total=   3.9s
[CV] C=0.01, kernel=linear ...........................................
[CV] ............ C=0.01, kernel=linear, score=0.633663, total=   4.0s
[CV] C=0.1, kernel=linear ............................................
[CV] ............ C=0.01, kernel=linear, score=0.633663, total=   4.4s
[CV] C=0.1, kernel=linear ............................................
[CV] ............ C=0.01, kernel=linear, score=0.633663, total=   4.3s
[CV] C=0.1, kernel=linear ............................................
[CV] ............ C=0.01, kernel=linear, score=0.633663, total=   4.4s
[CV] C

[CV] C=10000.0, kernel=linear ........................................
[CV] .......... C=1000.0, kernel=linear, score=0.680000, total=   3.4s
[CV] C=10000.0, kernel=linear ........................................
[CV] .......... C=1000.0, kernel=linear, score=0.717172, total=   3.4s
[CV] .......... C=1000.0, kernel=linear, score=0.656566, total=   3.4s
[CV] C=10000.0, kernel=linear ........................................
[CV] C=10000.0, kernel=linear ........................................
[CV] .......... C=1000.0, kernel=linear, score=0.707071, total=   3.5s
[CV] C=10000.0, kernel=linear ........................................
[CV] ......... C=10000.0, kernel=linear, score=0.564356, total=   3.3s
[CV] C=10000.0, kernel=linear ........................................
[CV] .......... C=1000.0, kernel=linear, score=0.686869, total=   4.0s
[CV] ......... C=10000.0, kernel=linear, score=0.643564, total=   3.3s
[CV] ......... C=10000.0, kernel=linear, score=0.574257, total=   3.3s
[CV] .

[Parallel(n_jobs=10)]: Done 400 out of 400 | elapsed:  3.9min finished


GridSearchCV(cv=10, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=10,
       param_grid=[{'kernel': ['rbf', 'sigmoid'], 'gamma': array([ 0.01,  0.1 ]), 'C': array([  1.00000e-03,   1.00000e-02,   1.00000e-01,   1.00000e+00,
         1.00000e+01,   1.00000e+02,   1.00000e+03,   1.00000e+04])}, {'kernel': ['linear'], 'C': array([  1.00000e-03,   1.00000e-02,   1.00000e-01,   1.00000e+00,
         1.00000e+01,   1.00000e+02,   1.00000e+03,   1.00000e+04])}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=3)

In [7]:
print(grid_search.best_estimator_)
print(grid_search.best_score_)
print(grid_search.best_params_)

SVC(C=10.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.10000000000000001,
  kernel='sigmoid', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)
0.708
{'C': 10.0, 'gamma': 0.10000000000000001, 'kernel': 'sigmoid'}


In [8]:
model = svm.SVC(kernel='rbf',C=10,gamma=0.1)

f1 = cross_val_score(model, data, target, cv=10, scoring='f1').mean()
acc = cross_val_score(model, data, target, cv=10, scoring='accuracy').mean()
recall = cross_val_score(model, data, target, cv=10, scoring='recall').mean()
print(f1)
print(acc)
print(recall)

0.766047018667
0.685544954495
0.814310515873


In [9]:
nb_params = { 'alpha': np.logspace(-3, 3, 7) }

grid_search = GridSearchCV(MultinomialNB(), nb_params, cv=10, verbose=3, n_jobs=10)
grid_search.fit(data, target)

Fitting 10 folds for each of 7 candidates, totalling 70 fits
[CV] alpha=0.001 .....................................................
[CV] alpha=0.001 .....................................................
[CV] alpha=0.001 .....................................................
[CV] alpha=0.001 .....................................................
[CV] alpha=0.001 .....................................................
[CV] alpha=0.001 .....................................................
[CV] alpha=0.001 .....................................................
[CV] alpha=0.001 .....................................................
[CV] ...................... alpha=0.001, score=0.603960, total=   0.0s
[CV] ...................... alpha=0.001, score=0.633663, total=   0.0s
[CV] ...................... alpha=0.001, score=0.690000, total=   0.0s
[CV] ...................... alpha=0.001, score=0.787879, total=   0.0s
[CV] ...................... alpha=0.001, score=0.673267, total=   0.0s
[CV] ...........

[Parallel(n_jobs=10)]: Done  12 tasks      | elapsed:    0.2s


[CV] ........................ alpha=1.0, score=0.727273, total=   0.0s
[CV] alpha=1.0 .......................................................
[CV] alpha=1.0 .......................................................
[CV] ........................ alpha=1.0, score=0.707071, total=   0.0s
[CV] alpha=1.0 .......................................................
[CV] ........................ alpha=0.1, score=0.777778, total=   0.0s
[CV] ........................ alpha=1.0, score=0.633663, total=   0.0s
[CV] ........................ alpha=1.0, score=0.643564, total=   0.0s
[CV] ....................... alpha=10.0, score=0.633663, total=   0.0s
[CV] ....................... alpha=10.0, score=0.633663, total=   0.1s
[CV] ........................ alpha=1.0, score=0.680000, total=   0.0s
[CV] alpha=1.0 .......................................................
[CV] alpha=10.0 ......................................................
[CV] ........................ alpha=1.0, score=0.757576, total=   0.0s
[CV] a

[Parallel(n_jobs=10)]: Done  70 out of  70 | elapsed:    0.7s finished


GridSearchCV(cv=10, error_score='raise',
       estimator=MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
       fit_params={}, iid=True, n_jobs=10,
       param_grid={'alpha': array([  1.00000e-03,   1.00000e-02,   1.00000e-01,   1.00000e+00,
         1.00000e+01,   1.00000e+02,   1.00000e+03])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=3)

In [10]:
print(grid_search.best_estimator_)
print(grid_search.best_score_)
print(grid_search.best_params_)

MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)
0.719
{'alpha': 0.01}


In [11]:
from sklearn.naive_bayes import BernoulliNB

nb_params = { 'alpha': np.logspace(-3, 3, 7) }

grid_search = GridSearchCV(BernoulliNB(), nb_params, cv=10, verbose=3, n_jobs=10)
grid_search.fit(data, target)

Fitting 10 folds for each of 7 candidates, totalling 70 fits
[CV] alpha=0.001 .....................................................
[CV] alpha=0.001 .....................................................
[CV] alpha=0.001 .....................................................
[CV] alpha=0.001 .....................................................
[CV] alpha=0.001 .....................................................
[CV] alpha=0.001 .....................................................
[CV] ...................... alpha=0.001, score=0.603960, total=   0.1s
[CV] ...................... alpha=0.001, score=0.554455, total=   0.1s
[CV] ...................... alpha=0.001, score=0.633663, total=   0.0s
[CV] ...................... alpha=0.001, score=0.564356, total=   0.0s
[CV] ...................... alpha=0.001, score=0.680000, total=   0.0s
[CV] alpha=0.001 .....................................................
[CV] ...................... alpha=0.001, score=0.750000, total=   0.0s
[CV] alpha=0.001

[Parallel(n_jobs=10)]: Done  12 tasks      | elapsed:    0.3s


[CV] alpha=10.0 ......................................................
[CV] ........................ alpha=1.0, score=0.747475, total=   0.0s
[CV] alpha=10.0 ......................................................
[CV] ........................ alpha=1.0, score=0.808081, total=   0.0s
[CV] ........................ alpha=1.0, score=0.544554, total=   0.0s
[CV] ........................ alpha=1.0, score=0.680000, total=   0.0s
[CV] alpha=10.0 ......................................................
[CV] alpha=1.0 .......................................................
[CV] alpha=1.0 .......................................................
[CV] alpha=1.0 .......................................................
[CV] ....................... alpha=10.0, score=0.594059, total=   0.0s
[CV] alpha=10.0 ......................................................
[CV] ....................... alpha=10.0, score=0.623762, total=   0.0s
[CV] alpha=10.0 ......................................................
[CV] a

[Parallel(n_jobs=10)]: Done  70 out of  70 | elapsed:    0.8s finished


GridSearchCV(cv=10, error_score='raise',
       estimator=BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True),
       fit_params={}, iid=True, n_jobs=10,
       param_grid={'alpha': array([  1.00000e-03,   1.00000e-02,   1.00000e-01,   1.00000e+00,
         1.00000e+01,   1.00000e+02,   1.00000e+03])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=3)

In [12]:
print(grid_search.best_estimator_)
print(grid_search.best_score_)
print(grid_search.best_params_)

BernoulliNB(alpha=0.001, binarize=0.0, class_prior=None, fit_prior=True)
0.694
{'alpha': 0.001}


In [13]:
from sklearn.tree import DecisionTreeClassifier 

model = DecisionTreeClassifier()

f1 = cross_val_score(model, data, target, cv=10, scoring='f1').mean()
acc = cross_val_score(model, data, target, cv=10, scoring='accuracy').mean()
recall = cross_val_score(model, data, target, cv=10, scoring='recall').mean()
print(f1)
print(acc)
print(recall)

0.671140458999
0.607078507851
0.703645833333


In [14]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()

f1 = cross_val_score(model, data, target, cv=10, scoring='f1').mean()
acc = cross_val_score(model, data, target, cv=10, scoring='accuracy').mean()
recall = cross_val_score(model, data, target, cv=10, scoring='recall').mean()
precision = cross_val_score(model, data, target, cv=10, scoring='precision').mean()
print(f1)
print(acc)
print(recall)
print(precision)

0.763598622633
0.667353035304
0.87876984127
0.672976601236


In [15]:
from sklearn.linear_model import SGDClassifier

model = SGDClassifier()

f1 = cross_val_score(model, data, target, cv=10, scoring='f1').mean()
acc = cross_val_score(model, data, target, cv=10, scoring='accuracy').mean()
recall = cross_val_score(model, data, target, cv=10, scoring='recall').mean()
print(f1)
print(acc)
print(recall)

0.740469722352
0.646611461146
0.777678571429


In [None]:
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier()

f1 = cross_val_score(model, data, target, cv=10, scoring='f1').mean()
acc = cross_val_score(model, data, target, cv=10, scoring='accuracy').mean()
recall = cross_val_score(model, data, target, cv=10, scoring='recall').mean()
print(f1)
print(acc)
print(recall)

0.772567154278
0.672304130413
0.88033234127


In [4]:
from sklearn.tree import ExtraTreeClassifier

model = ExtraTreeClassifier()

f1 = cross_val_score(model, data, target, cv=10, scoring='f1').mean()
acc = cross_val_score(model, data, target, cv=10, scoring='accuracy').mean()
recall = cross_val_score(model, data, target, cv=10, scoring='recall').mean()
print(f1)
print(acc)
print(recall)

0.670851810375
0.589218521852
0.627951388889
