In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB

corpus = pd.read_csv('corpus.csv.gz', compression='gzip')

In [5]:
# fix labels to binary
lb = preprocessing.LabelBinarizer(neg_label=1, pos_label=2)
target = lb.fit_transform(corpus['qual_a_melhor_classificao_para_esse_texto'].values)
c, r = target.shape
target = target.reshape(c,)

model = MultinomialNB()

In [19]:
for i in range(500,5000,100):
    data = TfidfVectorizer(max_features=i).fit_transform(corpus.content)

    f1 = cross_val_score(model, data.toarray(), target, cv=10, scoring='f1').mean()
    acc = cross_val_score(model, data.toarray(), target, cv=10, scoring='accuracy').mean()
    recall = cross_val_score(model, data.toarray(), target, cv=10, scoring='recall').mean()
    
    print(str(i) + ': ' + 'f1(' + str(round(f1,2)) 
          + '), acc(' + str(round(acc,2)) 
          + '), recall(' + str(round(recall,2)) + ')') 

500: f1(0.8), acc(0.7), recall(0.96)
600: f1(0.8), acc(0.7), recall(0.96)
700: f1(0.8), acc(0.7), recall(0.95)
800: f1(0.81), acc(0.71), recall(0.95)
900: f1(0.8), acc(0.71), recall(0.95)
1000: f1(0.81), acc(0.72), recall(0.95)
1100: f1(0.81), acc(0.71), recall(0.96)
1200: f1(0.81), acc(0.71), recall(0.96)
1300: f1(0.81), acc(0.71), recall(0.96)
1400: f1(0.81), acc(0.71), recall(0.96)
1500: f1(0.81), acc(0.71), recall(0.96)
1600: f1(0.81), acc(0.71), recall(0.97)
1700: f1(0.81), acc(0.71), recall(0.96)
1800: f1(0.81), acc(0.71), recall(0.97)
1900: f1(0.81), acc(0.71), recall(0.97)
2000: f1(0.81), acc(0.71), recall(0.98)
2100: f1(0.81), acc(0.71), recall(0.98)
2200: f1(0.81), acc(0.7), recall(0.98)
2300: f1(0.81), acc(0.7), recall(0.98)
2400: f1(0.81), acc(0.71), recall(0.98)
2500: f1(0.81), acc(0.7), recall(0.98)
2600: f1(0.81), acc(0.7), recall(0.98)
2700: f1(0.81), acc(0.7), recall(0.98)
2800: f1(0.81), acc(0.7), recall(0.99)
2900: f1(0.81), acc(0.7), recall(0.99)
3000: f1(0.81), acc

In [36]:
vectorizer = TfidfVectorizer(max_features=500, ngram_range=(1,1), strip_accents='unicode')
data = vectorizer.fit_transform(corpus.content)

f1 = cross_val_score(model, data.toarray(), target, cv=10, scoring='f1').mean()
acc = cross_val_score(model, data.toarray(), target, cv=10, scoring='accuracy').mean()
recall = cross_val_score(model, data.toarray(), target, cv=10, scoring='recall').mean()

print(str(1000) + ': ' + 'f1(' + str(round(f1,2)) 
      + '), acc(' + str(round(acc,2)) 
          + '), recall(' + str(round(recall,2)) + ')') 

1000: f1(0.8), acc(0.69), recall(0.96)


In [37]:
vectorizer.get_feature_names() #important terms?

[u'10',
 u'12',
 u'15',
 u'abaixo',
 u'achei',
 u'acho',
 u'acredito',
 u'afinal',
 u'agora',
 u'agua',
 u'ah',
 u'ai',
 u'ainda',
 u'ajuda',
 u'ajudar',
 u'alem',
 u'algo',
 u'alguem',
 u'algum',
 u'alguma',
 u'algumas',
 u'alguns',
 u'ali',
 u'amar',
 u'amiga',
 u'amigo',
 u'amigos',
 u'amo',
 u'amor',
 u'ano',
 u'anos',
 u'antes',
 u'ao',
 u'aos',
 u'apenas',
 u'apesar',
 u'apos',
 u'aquela',
 u'aquele',
 u'aqueles',
 u'aqui',
 u'aquilo',
 u'as',
 u'assim',
 u'assunto',
 u'ate',
 u'atencao',
 u'atras',
 u'atraves',
 u'base',
 u'bastante',
 u'bem',
 u'blog',
 u'boa',
 u'bom',
 u'brasil',
 u'cabeca',
 u'cabelo',
 u'cabelos',
 u'cada',
 u'caminho',
 u'cara',
 u'casa',
 u'caso',
 u'causa',
 u'certa',
 u'certeza',
 u'certo',
 u'chegar',
 u'chegou',
 u'cidade',
 u'cima',
 u'claro',
 u'coisa',
 u'coisas',
 u'colocar',
 u'com',
 u'comeca',
 u'comecar',
 u'comecei',
 u'comecou',
 u'comigo',
 u'como',
 u'comprar',
 u'conhecer',
 u'consigo',
 u'conta',
 u'contar',
 u'contra',
 u'cor',
 u'corac