In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from nltk.corpus import stopwords

# Load the corpus from the gzip-compressed CSV in the working directory.
corpus = pd.read_csv('corpus.csv.gz', compression='gzip')
# NOTE: this rebinds the name `stopwords` from the imported nltk object to the
# Portuguese stop-word list it returns; the import above has to run again
# before this line can be re-executed.
stopwords = stopwords.words("portuguese")

In [2]:
# Keep only the rows whose `_trusted_judgments` value is exactly 3, then
# renumber them from zero.
# (Alternative filter that was tried: corpus[corpus['_golden'] == False].)
# NOTE(review): reset_index() without drop=True keeps the previous index as an
# extra column, which is counted in the shape shown below -- confirm that the
# extra column is wanted.
corpus = corpus[corpus['_trusted_judgments'] == 3].reset_index()
corpus.shape

(916, 28)

In [3]:
# Encode the annotated class of every text with the values 1 (negative label)
# and 2 (positive label), then flatten the result into a 1-D target vector.
label_column = 'qual_a_melhor_classificao_para_esse_texto'
lb = preprocessing.LabelBinarizer(neg_label=1, pos_label=2)
target = lb.fit_transform(corpus[label_column].values)

# Unpacking the shape requires a 2-D result; the reshape below only succeeds
# when that result has a single column.
n_samples, n_columns = target.shape
target = target.reshape(n_samples,)

# Classifier instance reused by the evaluation and inspection cells below.
model = MultinomialNB()

In [4]:
# Sweep the TF-IDF vocabulary size from 500 to 4900 in steps of 100 and report
# the mean 10-fold cross-validated F1, accuracy, precision and recall of
# `model` for each size.
#
# NOTE(review): the vectorizer is fitted on the whole corpus before
# cross-validation, so every fold's vocabulary and IDF weights have already
# seen that fold's test documents -- confirm this is acceptable for the
# numbers reported here.
for i in range(500, 5000, 100):
    tfidf = TfidfVectorizer(max_features=i, strip_accents='unicode', stop_words=stopwords)
    data = tfidf.fit_transform(corpus.content)

    # Densify once per vocabulary size instead of once per metric.
    dense = data.toarray()

    scores = {}
    for metric in ('f1', 'accuracy', 'recall', 'precision'):
        scores[metric] = cross_val_score(model, dense, target, cv=10, scoring=metric).mean()

    # Keep the individual names bound, as later cells reuse some of them.
    f1 = scores['f1']
    acc = scores['accuracy']
    recall = scores['recall']
    precision = scores['precision']

    print('{}: f1({}), acc({}), precision({}), recall({})'.format(
        i, round(f1, 2), round(acc, 2), round(precision, 2), round(recall, 2)))

500: f1(0.79), acc(0.69), precision(0.68), recall(0.95)
600: f1(0.8), acc(0.69), precision(0.68), recall(0.95)
700: f1(0.79), acc(0.69), precision(0.69), recall(0.94)
800: f1(0.79), acc(0.69), precision(0.69), recall(0.94)
900: f1(0.79), acc(0.69), precision(0.69), recall(0.94)
1000: f1(0.8), acc(0.69), precision(0.69), recall(0.94)
1100: f1(0.8), acc(0.7), precision(0.69), recall(0.94)
1200: f1(0.8), acc(0.7), precision(0.69), recall(0.94)
1300: f1(0.8), acc(0.7), precision(0.7), recall(0.95)
1400: f1(0.8), acc(0.7), precision(0.69), recall(0.95)
1500: f1(0.8), acc(0.71), precision(0.69), recall(0.95)
1600: f1(0.8), acc(0.7), precision(0.69), recall(0.95)
1700: f1(0.81), acc(0.71), precision(0.69), recall(0.96)
1800: f1(0.8), acc(0.7), precision(0.69), recall(0.96)
1900: f1(0.81), acc(0.71), precision(0.69), recall(0.97)
2000: f1(0.8), acc(0.7), precision(0.69), recall(0.97)
2100: f1(0.8), acc(0.7), precision(0.69), recall(0.96)
2200: f1(0.8), acc(0.7), precision(0.69), recall(0.97)
2

In [5]:
# Evaluate the classifier with one fixed vocabulary size and keep the fitted
# vectorizer and matrix (`vectorizer`, `data`) for the inspection cell below.
n_features = 1600  # single source for both the vectorizer and the printed label

vectorizer = TfidfVectorizer(max_features=n_features, ngram_range=(1, 1),
                             strip_accents='unicode', stop_words=stopwords)
data = vectorizer.fit_transform(corpus.content)

# Densify once and share the array across the three scoring runs.
dense = data.toarray()

f1 = cross_val_score(model, dense, target, cv=10, scoring='f1').mean()
acc = cross_val_score(model, dense, target, cv=10, scoring='accuracy').mean()
recall = cross_val_score(model, dense, target, cv=10, scoring='recall').mean()

print('{}: f1({}), acc({}), recall({})'.format(
    n_features, round(f1, 2), round(acc, 2), round(recall, 2)))

1600: f1(0.8), acc(0.7), recall(0.95)


In [6]:
# Fit the classifier on the full matrix and print the `n` features at each end
# of the ascending coefficient ranking.
#
# NOTE(review): both lists are read from the same row, model.coef_[0]; the
# 'outro' entries are the lowest-valued features on that row, not features
# scored on a separate row -- confirm this reading is the intended one.
model.fit(data.toarray(), target)
n = 15

class_labels = ['outro', 'diario']
feature_names = vectorizer.get_feature_names()

# Sort (coefficient, feature) pairs once; ties fall back to the feature name.
ranked = sorted(zip(model.coef_[0], feature_names))
topn_class1 = ranked[:n]
topn_class2 = ranked[-n:]

for weight, term in topn_class1:
    print(class_labels[0], weight, term)

print()

# Highest coefficient first.
for weight, term in topn_class2[::-1]:
    print(class_labels[1], weight, term)

outro -8.3314569455 cu
outro -8.3314569455 fila
outro -8.3314569455 podcast
outro -8.24566859883 batom
outro -8.22148513779 delicia
outro -8.20982899133 arroz
outro -8.19055682527 estavamos
outro -8.1897518293 barra
outro -8.17309011169 pau
outro -8.16502696116 farinha
outro -8.15722875092 rsrs
outro -8.14581052258 passe
outro -8.14063547612 paris
outro -8.13505403655 gostoso
outro -8.13381430931 experiencias

diario -4.79547441757 nao
diario -5.45376614409 voce
diario -5.61075264425 deus
diario -5.81147719894 ser
diario -5.90382633514 pra
diario -5.93320886633 vida
diario -5.96328795858 sao
diario -5.98791821716 ja
diario -6.0220089237 sobre
diario -6.05815878779 bem
diario -6.06380554662 tambem
diario -6.07123447412 aqui
diario -6.09793698797 tudo
diario -6.12162943247 so
diario -6.16403256526 livro


In [7]:
# Rebuild the 1600-feature TF-IDF matrix, keep the `n` lowest- and `n`
# highest-coefficient features as a fixed vocabulary, and re-evaluate the
# classifier on that reduced vocabulary.
#
# NOTE(review): the vocabulary is chosen from a model fitted on the whole
# corpus and then scored by cross-validation on that same corpus -- confirm
# this is acceptable for the numbers reported here.
vectorizer = TfidfVectorizer(max_features=1600, ngram_range=(1, 1),
                             strip_accents='unicode', stop_words=stopwords)
data = vectorizer.fit_transform(corpus.content)

model.fit(data.toarray(), target)
n = 400

class_labels = ['outro', 'diario']
feature_names = vectorizer.get_feature_names()

# Sort (coefficient, feature) pairs once and slice both ends of the ranking.
ranked = sorted(zip(model.coef_[0], feature_names))
topn_class1 = ranked[:n]
topn_class2 = ranked[-n:]

# Lowest-coefficient features first, then the highest ones in descending
# order; a feature present in both slices is kept only once.
vocabulary = []
for coef, feat in topn_class1 + topn_class2[::-1]:
    if feat not in vocabulary:
        vocabulary.append(feat)

vectorizer = TfidfVectorizer(ngram_range=(1, 1), strip_accents='unicode',
                             stop_words=stopwords, vocabulary=vocabulary)
data = vectorizer.fit_transform(corpus.content)

# Densify once and share the array across the three scoring runs.
dense = data.toarray()

f1 = cross_val_score(model, dense, target, cv=10, scoring='f1').mean()
acc = cross_val_score(model, dense, target, cv=10, scoring='accuracy').mean()
recall = cross_val_score(model, dense, target, cv=10, scoring='recall').mean()

print('{}: f1({}), acc({}), recall({})'.format(
    len(vectorizer.get_feature_names()), round(f1, 2), round(acc, 2), round(recall, 2)))

800: f1(0.8), acc(0.71), recall(0.94)


In [8]:
# Refit on the current matrix and print the `n` features at each end of the
# ascending coefficient ranking.
#
# NOTE(review): both lists are read from the same row, model.coef_[0]; the
# 'outro' entries are the lowest-valued features on that row, not features
# scored on a separate row -- confirm this reading is the intended one.
model.fit(data.toarray(), target)
n = 15

class_labels = ['outro', 'diario']
feature_names = vectorizer.get_feature_names()

# Sort (coefficient, feature) pairs once; ties fall back to the feature name.
ranked = sorted(zip(model.coef_[0], feature_names))
topn_class1 = ranked[:n]
topn_class2 = ranked[-n:]

for weight, term in topn_class1:
    print(class_labels[0], weight, term)

print()

# Highest coefficient first.
for weight, term in topn_class2[::-1]:
    print(class_labels[1], weight, term)

outro -7.98182893797 cu
outro -7.98182893797 fila
outro -7.98182893797 podcast
outro -7.87646370047 batom
outro -7.82105757546 arroz
outro -7.77939172504 barra
outro -7.76418063199 estavamos
outro -7.76335818658 rsrs
outro -7.76150822999 pau
outro -7.75791720278 experiencias
outro -7.74621922094 jo
outro -7.7457922427 paris
outro -7.74431932426 delicia
outro -7.74311631012 passe
outro -7.71593987422 farinha

diario -4.15502104615 nao
diario -4.83747497164 voce
diario -5.03017690904 deus
diario -5.16257061156 ser
diario -5.27183773675 pra
diario -5.33186763052 vida
diario -5.33248771114 sao
diario -5.37117021303 ja
diario -5.41227709869 sobre
diario -5.42346262203 tambem
diario -5.4280941392 aqui
diario -5.43572795079 bem
diario -5.4892861405 tudo
diario -5.4984318563 so
diario -5.54837421244 ate


In [11]:
# Write the current `feature_names` list to feature_names.csv as a one-column
# table (pandas also writes its default row index).
# NOTE(review): this cell's execution count is not consecutive with the cell
# above -- confirm `feature_names` still holds the intended list on a fresh
# top-to-bottom run.
feature_table = pd.DataFrame(feature_names)
feature_table.to_csv('feature_names.csv')