In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from nltk.corpus import stopwords

from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm

corpus = pd.read_csv('corpus.csv.gz', compression='gzip')
corpus = corpus[corpus['qual_a_melhor_classificao_para_esse_texto:confidence'] == 1]
stopwords = stopwords.words("portuguese")

In [2]:
n_features = 1600

tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, strip_accents='unicode',
                                   max_features=n_features,
                                   stop_words=stopwords)
tfidf = tfidf_vectorizer.fit_transform(corpus.content)

tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, strip_accents='unicode',
                                max_features=n_features,
                                stop_words=stopwords)
tf = tf_vectorizer.fit_transform(corpus.content)

In [3]:
# fix labels to binary
lb = preprocessing.LabelBinarizer(neg_label=1, pos_label=2)
target = lb.fit_transform(corpus['qual_a_melhor_classificao_para_esse_texto'].values)
c, r = target.shape
target = target.reshape(c,)

In [4]:
model = MultinomialNB()

for n_topics in range(10,60,10):
    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0)
    data = lda.fit_transform(tf)

    f1 = cross_val_score(model, data, target, cv=10, scoring='f1').mean()
    acc = cross_val_score(model, data, target, cv=10, scoring='accuracy').mean()
    recall = cross_val_score(model, data, target, cv=10, scoring='recall').mean()
    precision = cross_val_score(model, data, target, cv=10, scoring='precision').mean()
    
    print(str(n_topics) + ': ' + 'f1(' + str(round(f1,4)) 
          + '), acc(' + str(round(acc,4)) 
          + '), precision(' + str(round(precision,4)) 
          + '), recall(' + str(round(recall,4)) + ')')

10: f1(0.8094), acc(0.7004), precision(0.6903), recall(0.98)
20: f1(0.8353), acc(0.7548), precision(0.7425), recall(0.9566)
30: f1(0.835), acc(0.7566), precision(0.747), recall(0.9482)
40: f1(0.7844), acc(0.6481), precision(0.6502), recall(0.9886)
50: f1(0.8103), acc(0.7022), precision(0.691), recall(0.9798)


In [5]:
model = svm.LinearSVC()

precision = cross_val_score(model, data, target, cv=10, scoring='precision').mean()
acc = cross_val_score(model, data, target, cv=10, scoring='accuracy').mean()
recall = cross_val_score(model, data, target, cv=10, scoring='recall').mean()
print(precision)
print(acc)
print(recall)

0.722667577845
0.730255872709
0.951092436975


In [6]:
c_range = np.logspace(-3,3,7)
param_grid = [
    {'kernel': ['rbf','linear'], 'C': c_range},
]
grid_search = GridSearchCV(svm.SVC(), param_grid, cv=10, verbose=3, n_jobs=10)
grid_search.fit(data, target)

Fitting 10 folds for each of 14 candidates, totalling 140 fits
[CV] C=0.001, kernel=rbf .............................................
[CV] C=0.001, kernel=rbf .............................................
[CV] C=0.001, kernel=rbf .............................................
[CV] C=0.001, kernel=rbf .............................................
[CV] C=0.001, kernel=rbf .............................................
[CV] C=0.001, kernel=rbf .............................................
[CV] C=0.001, kernel=rbf .............................................
[CV] .............. C=0.001, kernel=rbf, score=0.648148, total=   0.0s
[CV] .............. C=0.001, kernel=rbf, score=0.648148, total=   0.0s
[CV] C=0.001, kernel=rbf .............................................
[CV] .............. C=0.001, kernel=rbf, score=0.648148, total=   0.0s
[CV] C=0.001, kernel=rbf .............................................
[CV] .............. C=0.001, kernel=rbf, score=0.648148, total=   0.0s
[CV] C=0.001, 

[Parallel(n_jobs=10)]: Done  12 tasks      | elapsed:    0.3s


[CV] ............... C=0.01, kernel=rbf, score=0.641509, total=   0.1s
[CV] C=0.01, kernel=rbf ..............................................
[CV] C=0.01, kernel=rbf ..............................................
[CV] ............... C=0.01, kernel=rbf, score=0.648148, total=   0.1s
[CV] ............... C=0.01, kernel=rbf, score=0.648148, total=   0.1s
[CV] ............ C=0.01, kernel=linear, score=0.641509, total=   0.1s
[CV] C=0.01, kernel=rbf ..............................................
[CV] ............ C=0.01, kernel=linear, score=0.648148, total=   0.0s
[CV] ............ C=0.01, kernel=linear, score=0.648148, total=   0.0s
[CV] C=0.01, kernel=linear ...........................................
[CV] ............... C=0.01, kernel=rbf, score=0.653846, total=   0.1s
[CV] C=0.01, kernel=rbf ..............................................
[CV] ............ C=0.01, kernel=linear, score=0.648148, total=   0.0s
[CV] C=0.01, kernel=linear ...........................................
[CV] C

[CV] C=10.0, kernel=linear ...........................................
[CV] C=10.0, kernel=linear ...........................................
[CV] C=10.0, kernel=linear ...........................................
[CV] C=10.0, kernel=linear ...........................................
[CV] C=10.0, kernel=linear ...........................................
[CV] ............ C=10.0, kernel=linear, score=0.685185, total=   0.0s
[CV] ............... C=10.0, kernel=rbf, score=0.722222, total=   0.1s
[CV] ............... C=10.0, kernel=rbf, score=0.722222, total=   0.1s
[CV] ............... C=10.0, kernel=rbf, score=0.735849, total=   0.1s
[CV] C=10.0, kernel=linear ...........................................
[CV] C=10.0, kernel=rbf ..............................................
[CV] C=10.0, kernel=rbf ..............................................
[CV] ............ C=10.0, kernel=linear, score=0.735849, total=   0.0s
[CV] ............ C=10.0, kernel=linear, score=0.740741, total=   0.0s
[CV] .

[Parallel(n_jobs=10)]: Done 140 out of 140 | elapsed:    2.3s finished


GridSearchCV(cv=10, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=10,
       param_grid=[{'kernel': ['rbf', 'linear'], 'C': array([  1.00000e-03,   1.00000e-02,   1.00000e-01,   1.00000e+00,
         1.00000e+01,   1.00000e+02,   1.00000e+03])}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=3)

In [7]:
print(grid_search.best_estimator_)
print(grid_search.best_score_)
print(grid_search.best_params_)

SVC(C=100.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
0.734082397004
{'C': 100.0, 'kernel': 'linear'}


## Labels

In [8]:
tfidf_vectorizer = TfidfVectorizer(strip_accents='unicode', stop_words=stopwords)
data = tfidf_vectorizer.fit_transform(corpus['labels'])

In [9]:
model = MultinomialNB()

precision = cross_val_score(model, data, target, cv=10, scoring='precision').mean()
acc = cross_val_score(model, data, target, cv=10, scoring='accuracy').mean()
recall = cross_val_score(model, data, target, cv=10, scoring='recall').mean()
print(precision)
print(acc)
print(recall)

0.664332695342
0.665087620276
0.976974789916


In [10]:
model = svm.LinearSVC()

precision = cross_val_score(model, data, target, cv=10, scoring='precision').mean()
acc = cross_val_score(model, data, target, cv=10, scoring='accuracy').mean()
recall = cross_val_score(model, data, target, cv=10, scoring='recall').mean()
print(precision)
print(acc)
print(recall)

0.668333100874
0.649990592915
0.913193277311


## Titles

In [11]:
tfidf_vectorizer = TfidfVectorizer(strip_accents='unicode', stop_words=stopwords)
data = tfidf_vectorizer.fit_transform(corpus['title'])

In [12]:
model = MultinomialNB()

precision = cross_val_score(model, data, target, cv=10, scoring='precision').mean()
acc = cross_val_score(model, data, target, cv=10, scoring='accuracy').mean()
recall = cross_val_score(model, data, target, cv=10, scoring='recall').mean()
print(precision)
print(acc)
print(recall)

0.649580447176
0.646036929527
0.985462184874


In [13]:
model = svm.LinearSVC()

precision = cross_val_score(model, data, target, cv=10, scoring='precision').mean()
acc = cross_val_score(model, data, target, cv=10, scoring='accuracy').mean()
recall = cross_val_score(model, data, target, cv=10, scoring='recall').mean()
print(precision)
print(acc)
print(recall)

0.643274977858
0.613986991345
0.907226890756
