In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from nltk.corpus import stopwords

from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.naive_bayes import MultinomialNB

corpus = pd.read_csv('corpus.csv.gz', compression='gzip')
stopwords = stopwords.words("portuguese")

In [2]:
n_features = 1600

tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, strip_accents='unicode',
                                   max_features=n_features,
                                   stop_words=stopwords)
tfidf = tfidf_vectorizer.fit_transform(corpus.content)

tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, strip_accents='unicode',
                                max_features=n_features,
                                stop_words=stopwords)
tf = tf_vectorizer.fit_transform(corpus.content)

In [3]:
# fix labels to binary
lb = preprocessing.LabelBinarizer(neg_label=1, pos_label=2)
target = lb.fit_transform(corpus['qual_a_melhor_classificao_para_esse_texto'].values)
c, r = target.shape
target = target.reshape(c,)

In [4]:
model = MultinomialNB()

for n_topics in range(10,60,10):
    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0)
    data = lda.fit_transform(tf)

    f1 = cross_val_score(model, data, target, cv=10, scoring='f1').mean()
    acc = cross_val_score(model, data, target, cv=10, scoring='accuracy').mean()
    recall = cross_val_score(model, data, target, cv=10, scoring='recall').mean()
    precision = cross_val_score(model, data, target, cv=10, scoring='precision').mean()
    
    print(str(n_topics) + ': ' + 'f1(' + str(round(f1,4)) 
          + '), acc(' + str(round(acc,4)) 
          + '), precision(' + str(round(precision,4)) 
          + '), recall(' + str(round(recall,4)) + ')')

10: f1(0.776), acc(0.634), precision(0.634), recall(1.0)
20: f1(0.7766), acc(0.637), precision(0.6367), recall(0.9953)
30: f1(0.7713), acc(0.63), precision(0.6342), recall(0.9843)
40: f1(0.7779), acc(0.6421), precision(0.6411), recall(0.989)
50: f1(0.7757), acc(0.635), precision(0.6355), recall(0.9953)


In [5]:
from sklearn import svm

model = svm.LinearSVC()

precision = cross_val_score(model, data, target, cv=10, scoring='precision').mean()
acc = cross_val_score(model, data, target, cv=10, scoring='accuracy').mean()
recall = cross_val_score(model, data, target, cv=10, scoring='recall').mean()
print(precision)
print(acc)
print(recall)

0.649376701579
0.641141714171
0.943452380952


In [6]:
c_range = np.logspace(-3,3,7)
param_grid = [
    {'kernel': ['rbf','linear'], 'C': c_range},
]
grid_search = GridSearchCV(svm.SVC(), param_grid, cv=10, verbose=3, n_jobs=10)
grid_search.fit(data, target)

Fitting 10 folds for each of 14 candidates, totalling 140 fits
[CV] C=0.001, kernel=rbf .............................................
[CV] C=0.001, kernel=rbf .............................................
[CV] C=0.001, kernel=rbf .............................................
[CV] C=0.001, kernel=rbf .............................................
[CV] C=0.001, kernel=rbf .............................................
[CV] C=0.001, kernel=rbf .............................................
[CV] C=0.001, kernel=rbf .............................................
[CV] C=0.001, kernel=rbf .............................................
[CV] C=0.001, kernel=rbf .............................................
[CV] C=0.001, kernel=rbf .............................................
[CV] .............. C=0.001, kernel=rbf, score=0.633663, total=   0.2s
[CV] .............. C=0.001, kernel=rbf, score=0.633663, total=   0.2s
[CV] C=0.001, kernel=linear ..........................................
[CV] C=0.001, 

[Parallel(n_jobs=10)]: Done  12 tasks      | elapsed:    0.8s


[CV] C=0.01, kernel=rbf ..............................................
[CV] ........... C=0.001, kernel=linear, score=0.636364, total=   0.1s
[CV] C=0.01, kernel=rbf ..............................................
[CV] ............... C=0.01, kernel=rbf, score=0.633663, total=   0.3s
[CV] ............... C=0.01, kernel=rbf, score=0.633663, total=   0.3s
[CV] C=0.01, kernel=linear ...........................................
[CV] C=0.01, kernel=linear ...........................................
[CV] ............... C=0.01, kernel=rbf, score=0.633663, total=   0.3s
[CV] ............... C=0.01, kernel=rbf, score=0.636364, total=   0.2s
[CV] ............... C=0.01, kernel=rbf, score=0.630000, total=   0.2s
[CV] C=0.01, kernel=linear ...........................................
[CV] ............... C=0.01, kernel=rbf, score=0.636364, total=   0.2s
[CV] ............... C=0.01, kernel=rbf, score=0.636364, total=   0.2s
[CV] ............... C=0.01, kernel=rbf, score=0.630000, total=   0.2s
[CV] C

[CV] C=10.0, kernel=rbf ..............................................
[CV] ............. C=1.0, kernel=linear, score=0.646465, total=   0.2s
[CV] ............. C=1.0, kernel=linear, score=0.636364, total=   0.2s
[CV] C=10.0, kernel=rbf ..............................................
[CV] C=10.0, kernel=rbf ..............................................
[CV] C=10.0, kernel=rbf ..............................................
[CV] C=10.0, kernel=rbf ..............................................
[CV] ............... C=10.0, kernel=rbf, score=0.623762, total=   0.3s
[CV] ............... C=10.0, kernel=rbf, score=0.633663, total=   0.3s
[CV] C=10.0, kernel=linear ...........................................
[CV] C=10.0, kernel=linear ...........................................
[CV] ............... C=10.0, kernel=rbf, score=0.633663, total=   0.3s
[CV] ............... C=10.0, kernel=rbf, score=0.636364, total=   0.2s
[CV] ............... C=10.0, kernel=rbf, score=0.633663, total=   0.2s
[CV] C

[Parallel(n_jobs=10)]: Done 108 tasks      | elapsed:    5.3s


[CV] ........... C=100.0, kernel=linear, score=0.653465, total=   0.4s
[CV] C=1000.0, kernel=rbf ............................................
[CV] ........... C=100.0, kernel=linear, score=0.643564, total=   0.4s
[CV] ........... C=100.0, kernel=linear, score=0.653465, total=   0.3s
[CV] C=1000.0, kernel=rbf ............................................
[CV] C=1000.0, kernel=rbf ............................................
[CV] ........... C=100.0, kernel=linear, score=0.633663, total=   0.4s
[CV] ........... C=100.0, kernel=linear, score=0.676768, total=   0.3s
[CV] C=1000.0, kernel=rbf ............................................
[CV] C=1000.0, kernel=rbf ............................................
[CV] ........... C=100.0, kernel=linear, score=0.616162, total=   0.3s
[CV] ........... C=100.0, kernel=linear, score=0.640000, total=   0.3s
[CV] ........... C=100.0, kernel=linear, score=0.656566, total=   0.2s
[CV] C=1000.0, kernel=rbf ............................................
[CV] C

[Parallel(n_jobs=10)]: Done 140 out of 140 | elapsed:   13.8s finished


GridSearchCV(cv=10, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=10,
       param_grid=[{'kernel': ['rbf', 'linear'], 'C': array([  1.00000e-03,   1.00000e-02,   1.00000e-01,   1.00000e+00,
         1.00000e+01,   1.00000e+02,   1.00000e+03])}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=3)

In [7]:
print(grid_search.best_estimator_)
print(grid_search.best_score_)
print(grid_search.best_params_)

SVC(C=100.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
0.647
{'C': 100.0, 'kernel': 'linear'}
