In [11]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.naive_bayes import MultinomialNB, GaussianNB

# Load the annotated corpus from the gzip-compressed CSV that sits next to
# the notebook.
corpus = pd.read_csv(filepath_or_buffer='corpus.csv.gz', compression='gzip')

# Rebinds `stopwords` from the nltk corpus reader imported above to the
# Portuguese stop-word list it returns; later cells pass this name to
# TfidfVectorizer. Because the name is reused, this line only works when the
# import in this cell has just been (re-)executed.
stopwords = stopwords.words("portuguese")

In [2]:
# Classifier shared by the evaluation cells below.
model = MultinomialNB()

# Encode the annotation column as a flat label vector whose two classes are
# represented by the integers 1 (neg_label) and 2 (pos_label).
# NOTE(review): the 'f1' and 'recall' scorers used later presumably treat
# label 1 as the positive class by default, i.e. the class encoded here as
# neg_label -- confirm that this is the class the metrics should describe.
lb = preprocessing.LabelBinarizer(neg_label=1, pos_label=2)
annotations = corpus['qual_a_melhor_classificao_para_esse_texto'].values
label_column = lb.fit_transform(annotations)

# fit_transform returns a 2-D column; flatten it to one label per document.
c, r = label_column.shape
target = label_column.reshape(c,)

In [14]:
# Sweep the TF-IDF vocabulary size from 500 to 4900 in steps of 100 and print
# the mean 10-fold cross-validated F1, accuracy and recall of `model` for
# each size.
#
# NOTE(review): the vectorizer is fitted on the whole corpus before
# cross-validation, so the vocabulary and IDF weights are computed with the
# held-out folds included. Wrapping vectorizer + model in a Pipeline would
# avoid that, but it changes the reported numbers, so it is left as is here.
for n_features in range(500, 5000, 100):
    data = TfidfVectorizer(max_features=n_features, strip_accents='unicode',
                           stop_words=stopwords).fit_transform(corpus.content)

    # Score all three metrics in a single cross-validation pass instead of
    # refitting the model in three separate cross_val_score runs, and convert
    # the sparse matrix to dense only once per vocabulary size.
    scores = cross_validate(model, data.toarray(), target, cv=10,
                            scoring=['f1', 'accuracy', 'recall'],
                            return_train_score=False)
    f1 = scores['test_f1'].mean()
    acc = scores['test_accuracy'].mean()
    recall = scores['test_recall'].mean()

    print('{}: f1({}), acc({}), recall({})'.format(
        n_features, round(f1, 2), round(acc, 2), round(recall, 2)))

500: f1(0.8), acc(0.7), recall(0.94)
600: f1(0.81), acc(0.71), recall(0.94)
700: f1(0.8), acc(0.71), recall(0.94)
800: f1(0.8), acc(0.71), recall(0.94)
900: f1(0.81), acc(0.72), recall(0.94)
1000: f1(0.81), acc(0.72), recall(0.93)
1100: f1(0.81), acc(0.72), recall(0.93)
1200: f1(0.81), acc(0.72), recall(0.93)
1300: f1(0.81), acc(0.72), recall(0.94)
1400: f1(0.81), acc(0.72), recall(0.94)
1500: f1(0.81), acc(0.72), recall(0.94)
1600: f1(0.82), acc(0.73), recall(0.95)
1700: f1(0.82), acc(0.73), recall(0.95)
1800: f1(0.81), acc(0.72), recall(0.95)
1900: f1(0.81), acc(0.72), recall(0.95)
2000: f1(0.81), acc(0.72), recall(0.95)
2100: f1(0.81), acc(0.72), recall(0.96)
2200: f1(0.81), acc(0.72), recall(0.96)
2300: f1(0.81), acc(0.72), recall(0.96)
2400: f1(0.81), acc(0.72), recall(0.96)
2500: f1(0.81), acc(0.72), recall(0.96)
2600: f1(0.81), acc(0.72), recall(0.96)
2700: f1(0.81), acc(0.71), recall(0.96)
2800: f1(0.81), acc(0.72), recall(0.97)
2900: f1(0.81), acc(0.71), recall(0.97)
3000: f1(

In [12]:
# Fit one TF-IDF vectorizer at a fixed vocabulary size and report the mean
# 10-fold cross-validated F1, accuracy and recall of `model` on it.
# `vectorizer` and `data` are kept as notebook-level names so that later
# cells can inspect the fitted vocabulary.
max_features = 500
vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=(1,1),
                             strip_accents='unicode', stop_words=stopwords)
data = vectorizer.fit_transform(corpus.content)

# One cross-validation pass yields all three metrics; the sparse matrix is
# converted to dense only once.
scores = cross_validate(model, data.toarray(), target, cv=10,
                        scoring=['f1', 'accuracy', 'recall'],
                        return_train_score=False)
f1 = scores['test_f1'].mean()
acc = scores['test_accuracy'].mean()
recall = scores['test_recall'].mean()

# Label the line with the vocabulary size actually used; the label was
# previously hard-coded to 1000 while the vectorizer was built with 500.
print('{}: f1({}), acc({}), recall({})'.format(
    max_features, round(f1, 2), round(acc, 2), round(recall, 2)))

1000: f1(0.8), acc(0.7), recall(0.94)


In [13]:
# List the `top_n` vocabulary terms with the largest IDF weight in the fitted
# `vectorizer`. A larger IDF corresponds to a term found in fewer documents,
# so this shows the rarest terms that still made it into the vocabulary --
# not the terms the classifier finds most informative.
# NOTE(review): newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out(); confirm against the installed version.
indices = np.argsort(vectorizer.idf_)[::-1]  # positions sorted by descending IDF
features = vectorizer.get_feature_names()
top_n = 100
top_features = [features[i] for i in indices[:top_n]]
# Function-call form prints the same list under Python 2 and also parses
# under Python 3, matching the print(...) calls in the other cells.
print(top_features)

[u'aprendinoenem', u'you', u'clique', u'natal', u'empresa', u'leite', u'produto', u'velho', u'cristo', u'casamento', u'cabelos', u'produtos', u'facebook', u'cores', u'leitura', u'jogo', u'estilo', u'loja', u'pele', u'personagem', u'criancas', u'autor', u'boca', u'festa', u'viagem', u'obrigada', u'linha', u'sonho', u'base', u'ceu', u'jesus', u'musicas', u'igreja', u'morte', u'comeca', u'projeto', u'rua', u'esperanca', u'detalhes', u'mulheres', u'11', u'papel', u'rio', u'estado', u'cabelo', u'internet', u'sol', u'rosto', u'carro', u'quarto', u'pontos', u'felicidade', u'crianca', u'alma', u'www', u'fazia', u'uso', u'fe', u'paz', u'sucesso', u'continuar', u'real', u'terra', u'alto', u'personagens', u'paulo', u'rs', u'leve', u'livros', u'pegar', u'sentimentos', u'amar', u'vivo', u'http', u'relacao', u'questao', u'umas', u'naquele', u'unico', u'tamanho', u'feira', u'problemas', u'grupo', u'ontem', u'alegria', u'ultima', u'passo', u'triste', u'visto', u'30', u'criar', u'faca', u'desejo', u'pi