In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from nltk.corpus import stopwords

# Read the gzip-compressed corpus CSV into a DataFrame.
corpus = pd.read_csv(
    'corpus.csv.gz',
    compression='gzip',
)

# NOTE: this rebinds the name `stopwords` from the imported nltk corpus
# reader to the Portuguese stop-word collection it returns. Later cells pass
# this value as `stop_words=`; the rebinding is only re-runnable because the
# import lives in the same cell.
stopwords = stopwords.words("portuguese")

In [2]:
# Encode the annotated class column as integers. LabelBinarizer is configured
# to emit 1 (neg_label) and 2 (pos_label) rather than its default 0/1.
# NOTE(review): scikit-learn's binary 'f1' / 'recall' / 'precision' scorers
# default to pos_label=1, i.e. the class encoded here as neg_label -- confirm
# that this is the class the reported metrics are meant to describe.
lb = preprocessing.LabelBinarizer(neg_label=1, pos_label=2)
raw_labels = corpus['qual_a_melhor_classificao_para_esse_texto'].values
target = lb.fit_transform(raw_labels)

# The binarizer output is 2-D; flatten it to a 1-D label vector. The reshape
# only succeeds when there is a single column, so anything other than a
# one-column encoding fails loudly here.
n_rows, n_cols = target.shape
target = target.reshape(n_rows,)

# One estimator instance, reused by the evaluation and inspection cells below.
model = MultinomialNB()

In [27]:
# Sweep the TF-IDF vocabulary size (500, 600, ..., 4900) and report the mean
# 10-fold cross-validated f1 / accuracy / precision / recall for each size.
#
# NOTE(review): the vectorizer is fitted on the whole corpus before
# cross-validation, so the vocabulary and IDF weights have seen the held-out
# folds. Wrapping vectorizer + model in a Pipeline would avoid that.
for i in range(500,5000,100):
    data = TfidfVectorizer(max_features=i, strip_accents='unicode', stop_words=stopwords).fit_transform(corpus.content)

    # Densify once per vocabulary size instead of once per metric.
    dense = data.toarray()

    scores = {}
    for metric in ('f1', 'accuracy', 'recall', 'precision'):
        scores[metric] = cross_val_score(model, dense, target, cv=10, scoring=metric).mean()

    # Keep the individual names bound, as the original cell did.
    f1 = scores['f1']
    acc = scores['accuracy']
    recall = scores['recall']
    precision = scores['precision']

    print('{}: f1({}), acc({}), precision({}), recall({})'.format(
        i,
        str(round(f1,2)),
        str(round(acc,2)),
        str(round(precision,2)),
        str(round(recall,2))))

500: f1(0.8), acc(0.7), precision(0.69), recall(0.94)
600: f1(0.81), acc(0.71), precision(0.7), recall(0.94)
700: f1(0.8), acc(0.71), precision(0.7), recall(0.94)
800: f1(0.8), acc(0.71), precision(0.7), recall(0.94)
900: f1(0.81), acc(0.72), precision(0.71), recall(0.94)
1000: f1(0.81), acc(0.72), precision(0.71), recall(0.93)
1100: f1(0.81), acc(0.72), precision(0.71), recall(0.93)
1200: f1(0.81), acc(0.72), precision(0.72), recall(0.93)
1300: f1(0.81), acc(0.72), precision(0.71), recall(0.94)
1400: f1(0.81), acc(0.72), precision(0.71), recall(0.94)
1500: f1(0.81), acc(0.72), precision(0.71), recall(0.94)
1600: f1(0.82), acc(0.73), precision(0.71), recall(0.95)
1700: f1(0.82), acc(0.73), precision(0.71), recall(0.95)
1800: f1(0.81), acc(0.72), precision(0.71), recall(0.95)
1900: f1(0.81), acc(0.72), precision(0.71), recall(0.95)
2000: f1(0.81), acc(0.72), precision(0.7), recall(0.95)
2100: f1(0.81), acc(0.72), precision(0.7), recall(0.96)
2200: f1(0.81), acc(0.72), precision(0.71), r

In [32]:
# Evaluate the model with a fixed TF-IDF vocabulary size (mean 10-fold
# cross-validated f1 / accuracy / recall). `vectorizer` and `data` stay bound
# for the following cells, which inspect the fitted features.
max_features = 1600  # single source of truth for the vectorizer and the report
vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=(1,1),
                             strip_accents='unicode', stop_words=stopwords)
data = vectorizer.fit_transform(corpus.content)

# Densify once and reuse the array for every metric.
dense = data.toarray()

scores = {}
for metric in ('f1', 'accuracy', 'recall'):
    scores[metric] = cross_val_score(model, dense, target, cv=10, scoring=metric).mean()

f1 = scores['f1']
acc = scores['accuracy']
recall = scores['recall']

print('{}: f1({}), acc({}), recall({})'.format(
    str(max_features),
    str(round(f1,2)),
    str(round(acc,2)),
    str(round(recall,2))))

1600: f1(0.82), acc(0.73), recall(0.95)


In [33]:
# Fit on the full corpus and list the n lowest- and n highest-weighted terms.
model.fit(data.toarray(),target)
n = 15

class_labels = ['outro','diario']  # presumably ordered like the 1/2 encoding; verify against lb.classes_

# `get_feature_names()` no longer exists in recent scikit-learn releases;
# prefer `get_feature_names_out()` when available and fall back otherwise.
feature_names = (vectorizer.get_feature_names_out()
                 if hasattr(vectorizer, 'get_feature_names_out')
                 else vectorizer.get_feature_names())

# The last row of `feature_log_prob_` holds the values that `model.coef_[0]`
# exposed for this binary target; unlike `coef_`, the attribute is available
# across scikit-learn versions.
weights = model.feature_log_prob_[-1]

# Sort once and slice both ends, instead of sorting the same pairs twice.
ranked = sorted(zip(weights, feature_names))
topn_class1 = ranked[:n]
topn_class2 = ranked[-n:]

# NOTE(review): both lists are ranked by the same class's log-probabilities,
# so the first list holds the terms that are rarest for that class rather than
# terms characteristic of the other class -- confirm this is the intended
# reading of the output.
for coef, feat in topn_class1:
    print (class_labels[0], coef, feat)

print()

for coef, feat in reversed(topn_class2):
    print (class_labels[1], coef, feat)

outro -8.42840062262 cu
outro -8.42840062262 fila
outro -8.34170450478 sentada
outro -8.31877614505 delicia
outro -8.30412835281 arroz
outro -8.28116258082 estavamos
outro -8.27818574845 barra
outro -8.26333103888 pau
outro -8.25738320847 rsrs
outro -8.23837942924 passe
outro -8.23520115028 paris
outro -8.23197234622 experiencias
outro -8.22789515148 mandou
outro -8.19827767411 procurando
outro -8.194199319 saiba

diario -4.7558889968 nao
diario -5.38686687879 voce
diario -5.60623869012 deus
diario -5.77658655193 ser
diario -5.87346286987 pra
diario -5.91141740959 vida
diario -5.93281533584 sao
diario -5.98216293634 ja
diario -5.99317315418 sobre
diario -6.02986949287 tambem
diario -6.05648081852 bem
diario -6.06118712102 tudo
diario -6.06154549924 so
diario -6.10294784233 dia
diario -6.10774054733 aqui


In [45]:
# Build a reduced vocabulary from the n lowest- and n highest-weighted terms
# of a 1600-feature model, then re-evaluate the classifier on that vocabulary.
#
# NOTE(review): the vocabulary is chosen from a model fitted on the whole
# corpus (labels included) and then scored by cross-validation on the same
# corpus, so the selection step has seen the held-out folds.
vectorizer = TfidfVectorizer(max_features=1600, ngram_range=(1,1),
                             strip_accents='unicode', stop_words=stopwords)
data = vectorizer.fit_transform(corpus.content)

model.fit(data.toarray(),target)
n = 400

class_labels = ['outro','diario']

# `get_feature_names()` no longer exists in recent scikit-learn releases;
# prefer `get_feature_names_out()` when available and fall back otherwise.
feature_names = (vectorizer.get_feature_names_out()
                 if hasattr(vectorizer, 'get_feature_names_out')
                 else vectorizer.get_feature_names())

# The last row of `feature_log_prob_` holds the values that `model.coef_[0]`
# exposed for this binary target; unlike `coef_`, the attribute is available
# across scikit-learn versions.
weights = model.feature_log_prob_[-1]

# Sort once and slice both ends, instead of sorting the same pairs twice.
ranked = sorted(zip(weights, feature_names))
topn_class1 = ranked[:n]
topn_class2 = ranked[-n:]

# Order-preserving de-duplication: lowest-weighted terms first (ascending),
# then highest-weighted terms (descending). A set gives O(1) membership
# checks instead of scanning the growing list.
vocabulary = []
seen = set()
for coef, feat in topn_class1 + topn_class2[::-1]:
    if feat not in seen:
        seen.add(feat)
        vocabulary.append(feat)

vectorizer = TfidfVectorizer(ngram_range=(1,1), strip_accents='unicode',
                             stop_words=stopwords, vocabulary=vocabulary)
data = vectorizer.fit_transform(corpus.content)

# Densify once and reuse the array for every metric.
dense = data.toarray()

scores = {}
for metric in ('f1', 'accuracy', 'recall'):
    scores[metric] = cross_val_score(model, dense, target, cv=10, scoring=metric).mean()

f1 = scores['f1']
acc = scores['accuracy']
recall = scores['recall']

# The fitted vocabulary mapping has one entry per feature, so its size is the
# number of feature names the vectorizer reports.
print('{}: f1({}), acc({}), recall({})'.format(
    str(len(vectorizer.vocabulary_)),
    str(round(f1,2)),
    str(round(acc,2)),
    str(round(recall,2))))

800: f1(0.82), acc(0.73), recall(0.94)


In [46]:
# Refit on the current `data` and list the n lowest- and n highest-weighted
# terms of the current `vectorizer`.
model.fit(data.toarray(),target)
n = 15

class_labels = ['outro','diario']  # presumably ordered like the 1/2 encoding; verify against lb.classes_

# `get_feature_names()` no longer exists in recent scikit-learn releases;
# prefer `get_feature_names_out()` when available and fall back otherwise.
feature_names = (vectorizer.get_feature_names_out()
                 if hasattr(vectorizer, 'get_feature_names_out')
                 else vectorizer.get_feature_names())

# The last row of `feature_log_prob_` holds the values that `model.coef_[0]`
# exposed for this binary target; unlike `coef_`, the attribute is available
# across scikit-learn versions.
weights = model.feature_log_prob_[-1]

# Sort once and slice both ends, instead of sorting the same pairs twice.
ranked = sorted(zip(weights, feature_names))
topn_class1 = ranked[:n]
topn_class2 = ranked[-n:]

# NOTE(review): both lists are ranked by the same class's log-probabilities,
# so the first list holds the terms that are rarest for that class rather than
# terms characteristic of the other class -- confirm this is the intended
# reading of the output.
for coef, feat in topn_class1:
    print (class_labels[0], coef, feat)

print()

for coef, feat in reversed(topn_class2):
    print (class_labels[1], coef, feat)

outro -8.08988280427 cu
outro -8.08988280427 fila
outro -7.98351493146 sentada
outro -7.92703328594 arroz
outro -7.89059145104 barra
outro -7.88473954565 pau
outro -7.87436582665 rsrs
outro -7.86612067399 experiencias
outro -7.86033072728 mandou
outro -7.85431620812 delicia
outro -7.83669833205 paris
outro -7.83376406304 passe
outro -7.81637439594 estavamos
outro -7.80231057735 procurando
outro -7.78411372006 saiba

diario -4.13416892021 nao
diario -4.7803952574 voce
diario -5.0616782835 deus
diario -5.15135249475 ser
diario -5.24638918025 pra
diario -5.30946818547 sao
diario -5.33918782585 vida
diario -5.36797221208 ja
diario -5.396421643 sobre
diario -5.41043837627 tambem
diario -5.44594277303 so
diario -5.45216460513 bem
diario -5.47079139909 tudo
diario -5.49493630236 aqui
diario -5.50209285846 dia


In [None]:
# Assemble a table with one TF-IDF column per vocabulary term plus the
# original annotated class label.
# `get_feature_names()` no longer exists in recent scikit-learn releases;
# prefer `get_feature_names_out()` when available and fall back otherwise.
columns = (vectorizer.get_feature_names_out()
           if hasattr(vectorizer, 'get_feature_names_out')
           else vectorizer.get_feature_names())
pdCSV = pd.DataFrame(data.toarray(),columns=columns)
pdCSV['class'] = corpus['qual_a_melhor_classificao_para_esse_texto'].values
#pdCSV.to_csv('tfidfFeatures.csv')