In [9]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_validate

# Load the gzip-compressed corpus CSV into a DataFrame.
corpus = pd.read_csv(filepath_or_buffer='corpus.csv.gz', compression='gzip')

In [2]:
# Encode the annotated class column as a numeric target that takes the
# values 1 (neg_label) and 2 (pos_label).
# NOTE(review): the 'f1' and 'recall' scorers used in later cells score the
# label 1 by default, i.e. the class encoded here as neg_label -- confirm
# that this is the class the reported metrics are meant to describe.
lb = preprocessing.LabelBinarizer(neg_label=1, pos_label=2)
raw_labels = corpus['qual_a_melhor_classificao_para_esse_texto'].values
encoded = lb.fit_transform(raw_labels)

# The binarizer output is 2-D (rows, columns); collapse it to a 1-D vector
# with one entry per row.
n_rows, _n_cols = encoded.shape
target = encoded.reshape(n_rows,)

In [3]:
# Vectorise the document text as TF-IDF features, capped at 1000 features.
# NOTE(review): the vectorizer is fitted on the whole corpus before any
# cross-validation split, so held-out folds contribute to the vocabulary and
# IDF weights -- consider fitting it inside a Pipeline per fold.
vectorizer = TfidfVectorizer(max_features=1000)
data = vectorizer.fit_transform(corpus.content)
data.shape

(1064, 1000)

In [4]:
from sklearn.naive_bayes import MultinomialNB

# Baseline: multinomial Naive Bayes with its default settings on the TF-IDF
# features.
model = MultinomialNB()

# Score all three metrics in a single 10-fold pass: the model is fitted once
# per fold (10 fits instead of 30) and every metric is computed from the
# same fitted models.
scores = cross_validate(model, data, target, cv=10,
                        scoring=['f1', 'accuracy', 'recall'],
                        return_train_score=False)
f1 = scores['test_f1'].mean()
acc = scores['test_accuracy'].mean()
recall = scores['test_recall'].mean()
print(f1)
print(acc)
print(recall)

0.809668290531
0.717016072676
0.952304653205


In [5]:
# Linear SVM baseline. The seed is fixed so that re-running the cell
# reproduces the same scores.
model = svm.LinearSVC(random_state=0)

# Score all three metrics in a single 10-fold pass: the model is fitted once
# per fold (10 fits instead of 30) and every metric is computed from the
# same fitted models rather than from three independent training runs.
scores = cross_validate(model, data, target, cv=10,
                        scoring=['f1', 'accuracy', 'recall'],
                        return_train_score=False)
f1 = scores['test_f1'].mean()
acc = scores['test_accuracy'].mean()
recall = scores['test_recall'].mean()
print(f1)
print(acc)
print(recall)

0.774358821162
0.702061495458
0.811062335382


## Grid Search

In [12]:
# Regularisation strengths to try: 1e-3 ... 1e4, one value per decade.
c_range = np.logspace(start=-3, stop=4, num=8)
# Kernel coefficients tried for the non-linear kernels: 0.01 and 0.1.
gamma = np.logspace(start=-2, stop=-1, num=2)

# Two sub-grids: the linear kernel has no gamma, so it only sweeps C.
nonlinear_grid = {'kernel': ['rbf', 'sigmoid'], 'gamma': gamma, 'C': c_range}
linear_grid = {'kernel': ['linear'], 'C': c_range}
param_grid = [nonlinear_grid, linear_grid]

# Exhaustive 10-fold search over both sub-grids, run on 10 parallel workers.
grid_search = GridSearchCV(
    estimator=svm.SVC(),
    param_grid=param_grid,
    cv=10,
    verbose=3,
    n_jobs=10,
)
grid_search.fit(data, target)

Fitting 10 folds for each of 40 candidates, totalling 400 fits
[CV] kernel=rbf, C=0.001, gamma=0.01 .................................
[CV] kernel=rbf, C=0.001, gamma=0.01 .................................
[CV] kernel=rbf, C=0.001, gamma=0.01 .................................
[CV] kernel=rbf, C=0.001, gamma=0.01 .................................
[CV] kernel=rbf, C=0.001, gamma=0.01 .................................
[CV] kernel=rbf, C=0.001, gamma=0.01 .................................
[CV] kernel=rbf, C=0.001, gamma=0.01 .................................
[CV] kernel=rbf, C=0.001, gamma=0.01 .................................
[CV] kernel=rbf, C=0.001, gamma=0.01 .................................
[CV] kernel=rbf, C=0.001, gamma=0.01 .................................
[CV] .. kernel=rbf, C=0.001, gamma=0.01, score=0.629630, total=   2.6s
[CV] kernel=sigmoid, C=0.001, gamma=0.01 .............................
[CV] .. kernel=rbf, C=0.001, gamma=0.01, score=0.629630, total=   2.7s
[CV] kernel=si

[Parallel(n_jobs=10)]: Done  12 tasks      | elapsed:   10.8s


[CV]  kernel=sigmoid, C=0.001, gamma=0.01, score=0.632075, total=   3.0s
[CV] kernel=rbf, C=0.001, gamma=0.1 ..................................
[CV]  kernel=sigmoid, C=0.001, gamma=0.01, score=0.632075, total=   2.8s
[CV] kernel=rbf, C=0.001, gamma=0.1 ..................................
[CV]  kernel=sigmoid, C=0.001, gamma=0.01, score=0.632075, total=   3.0s
[CV]  kernel=sigmoid, C=0.001, gamma=0.01, score=0.632075, total=   3.0s
[CV] kernel=rbf, C=0.001, gamma=0.1 ..................................
[CV]  kernel=sigmoid, C=0.001, gamma=0.01, score=0.632075, total=   2.8s
[CV]  kernel=sigmoid, C=0.001, gamma=0.01, score=0.632075, total=   2.9s
[CV] kernel=rbf, C=0.001, gamma=0.1 ..................................
[CV] kernel=rbf, C=0.001, gamma=0.1 ..................................
[CV]  kernel=sigmoid, C=0.001, gamma=0.01, score=0.632075, total=   2.9s
[CV] kernel=rbf, C=0.001, gamma=0.1 ..................................
[CV] kernel=rbf, C=0.001, gamma=0.1 ...........................

[Parallel(n_jobs=10)]: Done 108 tasks      | elapsed:  1.1min


[CV] . kernel=sigmoid, C=0.1, gamma=0.1, score=0.629630, total=   3.1s
[CV] kernel=rbf, C=1.0, gamma=0.01 ...................................
[CV] . kernel=sigmoid, C=0.1, gamma=0.1, score=0.629630, total=   3.2s
[CV] kernel=rbf, C=1.0, gamma=0.01 ...................................
[CV] . kernel=sigmoid, C=0.1, gamma=0.1, score=0.632075, total=   3.2s
[CV] . kernel=sigmoid, C=0.1, gamma=0.1, score=0.632075, total=   3.1s
[CV] kernel=rbf, C=1.0, gamma=0.01 ...................................
[CV] . kernel=sigmoid, C=0.1, gamma=0.1, score=0.632075, total=   3.1s
[CV] . kernel=sigmoid, C=0.1, gamma=0.1, score=0.632075, total=   3.2s
[CV] kernel=rbf, C=1.0, gamma=0.01 ...................................
[CV] kernel=rbf, C=1.0, gamma=0.01 ...................................
[CV] kernel=rbf, C=1.0, gamma=0.01 ...................................
[CV] . kernel=sigmoid, C=0.1, gamma=0.1, score=0.632075, total=   3.2s
[CV] kernel=rbf, C=1.0, gamma=0.01 ...................................
[CV] .

[Parallel(n_jobs=10)]: Done 268 tasks      | elapsed:  2.7min


[CV] .. kernel=rbf, C=1000.0, gamma=0.1, score=0.773585, total=   3.3s
[CV] kernel=sigmoid, C=1000.0, gamma=0.1 .............................
[CV]  kernel=sigmoid, C=1000.0, gamma=0.1, score=0.620370, total=   3.5s
[CV] kernel=rbf, C=10000.0, gamma=0.01 ...............................
[CV]  kernel=sigmoid, C=1000.0, gamma=0.1, score=0.698113, total=   3.5s
[CV] kernel=rbf, C=10000.0, gamma=0.01 ...............................
[CV]  kernel=sigmoid, C=1000.0, gamma=0.1, score=0.703704, total=   3.6s
[CV] kernel=rbf, C=10000.0, gamma=0.01 ...............................
[CV]  kernel=sigmoid, C=1000.0, gamma=0.1, score=0.584906, total=   3.4s
[CV]  kernel=sigmoid, C=1000.0, gamma=0.1, score=0.603774, total=   3.4s
[CV] kernel=rbf, C=10000.0, gamma=0.01 ...............................
[CV] kernel=rbf, C=10000.0, gamma=0.01 ...............................
[CV]  kernel=sigmoid, C=1000.0, gamma=0.1, score=0.716981, total=   3.4s
[CV] kernel=rbf, C=10000.0, gamma=0.01 ..........................

[Parallel(n_jobs=10)]: Done 400 out of 400 | elapsed:  4.0min finished


GridSearchCV(cv=10, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=10,
       param_grid=[{'kernel': ['rbf', 'sigmoid'], 'C': array([  1.00000e-03,   1.00000e-02,   1.00000e-01,   1.00000e+00,
         1.00000e+01,   1.00000e+02,   1.00000e+03,   1.00000e+04]), 'gamma': array([ 0.01,  0.1 ])}, {'kernel': ['linear'], 'C': array([  1.00000e-03,   1.00000e-02,   1.00000e-01,   1.00000e+00,
         1.00000e+01,   1.00000e+02,   1.00000e+03,   1.00000e+04])}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=3)

In [13]:
# Summarise the SVC grid search: the refitted best estimator, its
# cross-validated score, and the parameter combination that produced it.
print(
    grid_search.best_estimator_,
    grid_search.best_score_,
    grid_search.best_params_,
    sep='\n',
)

SVC(C=10.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.10000000000000001,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)
0.723684210526
{'kernel': 'rbf', 'C': 10.0, 'gamma': 0.10000000000000001}


In [18]:
# RBF-kernel SVM using the best parameters reported by the SVC grid search
# (kernel='rbf', C=10, gamma=0.1).
model = svm.SVC(kernel='rbf', C=10, gamma=0.1)

# Score all three metrics in a single 10-fold pass: the model is fitted once
# per fold (10 fits instead of 30) and every metric is computed from the
# same fitted models.
scores = cross_validate(model, data, target, cv=10,
                        scoring=['f1', 'accuracy', 'recall'],
                        return_train_score=False)
f1 = scores['test_f1'].mean()
acc = scores['test_accuracy'].mean()
recall = scores['test_recall'].mean()
print(f1)
print(acc)
print(recall)

0.794915849316
0.723602375961
0.848266022827


In [14]:
# Smoothing strengths to try: 1e-3 ... 1e3, one value per decade.
nb_params = dict(alpha=np.logspace(start=-3, stop=3, num=7))

# 10-fold grid search over alpha for multinomial Naive Bayes.
grid_search = GridSearchCV(
    estimator=MultinomialNB(),
    param_grid=nb_params,
    cv=10,
    verbose=3,
    n_jobs=10,
)
grid_search.fit(data, target)

Fitting 10 folds for each of 7 candidates, totalling 70 fits
[CV] alpha=0.001 .....................................................
[CV] alpha=0.001 .....................................................
[CV] ...................... alpha=0.001, score=0.759259, total=   0.0s
[CV] alpha=0.001 .....................................................
[CV] alpha=0.001 .....................................................
[CV] alpha=0.001 .....................................................
[CV] ...................... alpha=0.001, score=0.688679, total=   0.0s
[CV] alpha=0.001 .....................................................
[CV] ...................... alpha=0.001, score=0.712963, total=   0.0s
[CV] alpha=0.001 .....................................................
[CV] ...................... alpha=0.001, score=0.726415, total=   0.0s
[CV] alpha=0.001 .....................................................
[CV] ...................... alpha=0.001, score=0.660377, total=   0.0s
[CV] ...........

[Parallel(n_jobs=10)]: Done  70 out of  70 | elapsed:    0.6s finished


GridSearchCV(cv=10, error_score='raise',
       estimator=MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
       fit_params={}, iid=True, n_jobs=10,
       param_grid={'alpha': array([  1.00000e-03,   1.00000e-02,   1.00000e-01,   1.00000e+00,
         1.00000e+01,   1.00000e+02,   1.00000e+03])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=3)

In [15]:
# Summarise the MultinomialNB grid search: the refitted best estimator, its
# cross-validated score, and the selected alpha.
print(
    grid_search.best_estimator_,
    grid_search.best_score_,
    grid_search.best_params_,
    sep='\n',
)

MultinomialNB(alpha=0.001, class_prior=None, fit_prior=True)
0.718984962406
{'alpha': 0.001}


In [16]:
from sklearn.naive_bayes import BernoulliNB

# Smoothing strengths to try: 1e-3 ... 1e3, one value per decade.
nb_params = dict(alpha=np.logspace(start=-3, stop=3, num=7))

# 10-fold grid search over alpha for Bernoulli Naive Bayes.
grid_search = GridSearchCV(
    estimator=BernoulliNB(),
    param_grid=nb_params,
    cv=10,
    verbose=3,
    n_jobs=10,
)
grid_search.fit(data, target)

Fitting 10 folds for each of 7 candidates, totalling 70 fits
[CV] alpha=0.001 .....................................................
[CV] alpha=0.001 .....................................................
[CV] alpha=0.001 .....................................................
[CV] alpha=0.001 .....................................................
[CV] alpha=0.001 .....................................................
[CV] alpha=0.001 .....................................................
[CV] ...................... alpha=0.001, score=0.773585, total=   0.0s
[CV] alpha=0.001 .....................................................
[CV] alpha=0.001 .....................................................
[CV] alpha=0.001 .....................................................
[CV] alpha=0.001 .....................................................
[CV] ...................... alpha=0.001, score=0.641509, total=   0.0s
[CV] ...................... alpha=0.001, score=0.740741, total=   0.1s
[CV] ...........

[Parallel(n_jobs=10)]: Done  12 tasks      | elapsed:    0.3s


[CV] alpha=0.1 .......................................................
[CV] ........................ alpha=0.1, score=0.773585, total=   0.0s
[CV] ........................ alpha=1.0, score=0.740741, total=   0.0s
[CV] ........................ alpha=0.1, score=0.783019, total=   0.0s
[CV] alpha=0.1 .......................................................
[CV] alpha=1.0 .......................................................
[CV] alpha=1.0 .......................................................
[CV] ........................ alpha=1.0, score=0.722222, total=   0.0s
[CV] ........................ alpha=0.1, score=0.707547, total=   0.0s
[CV] alpha=1.0 .......................................................
[CV] ........................ alpha=1.0, score=0.707547, total=   0.0s
[CV] alpha=1.0 .......................................................
[CV] ........................ alpha=1.0, score=0.603774, total=   0.0s
[CV] alpha=1.0 .......................................................
[CV] a

[Parallel(n_jobs=10)]: Done  70 out of  70 | elapsed:    0.9s finished


GridSearchCV(cv=10, error_score='raise',
       estimator=BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True),
       fit_params={}, iid=True, n_jobs=10,
       param_grid={'alpha': array([  1.00000e-03,   1.00000e-02,   1.00000e-01,   1.00000e+00,
         1.00000e+01,   1.00000e+02,   1.00000e+03])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=3)

In [17]:
# Summarise the BernoulliNB grid search: the refitted best estimator, its
# cross-validated score, and the selected alpha.
print(
    grid_search.best_estimator_,
    grid_search.best_score_,
    grid_search.best_params_,
    sep='\n',
)

BernoulliNB(alpha=0.10000000000000001, binarize=0.0, class_prior=None,
      fit_prior=True)
0.701127819549
{'alpha': 0.10000000000000001}
