In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from nltk.corpus import stopwords

corpus = pd.read_csv('corpus.csv.gz', compression='gzip')
stopwords = stopwords.words("portuguese")

In [2]:
#corpus = corpus[corpus['_golden'] == False]
corpus = corpus[corpus['qual_a_melhor_classificao_para_esse_texto:confidence'] == 1]
corpus = corpus[corpus['_trusted_judgments'] == 3]
corpus = corpus.reset_index()
corpus.shape

(496, 28)

In [3]:
# fix labels to binary
lb = preprocessing.LabelBinarizer(neg_label=1, pos_label=2)
target = lb.fit_transform(corpus['qual_a_melhor_classificao_para_esse_texto'].values)
c, r = target.shape
target = target.reshape(c,)

model = MultinomialNB()

In [4]:
for i in range(500,5000,100):
    data = TfidfVectorizer(max_features=i, strip_accents='unicode', stop_words=stopwords).fit_transform(corpus.content)

    f1 = cross_val_score(model, data.toarray(), target, cv=10, scoring='f1').mean()
    acc = cross_val_score(model, data.toarray(), target, cv=10, scoring='accuracy').mean()
    recall = cross_val_score(model, data.toarray(), target, cv=10, scoring='recall').mean()
    precision = cross_val_score(model, data.toarray(), target, cv=10, scoring='precision').mean()
    
    print(str(i) + ': ' + 'f1(' + str(round(f1,2)) 
          + '), acc(' + str(round(acc,2)) 
          + '), precision(' + str(round(precision,2)) 
          + '), recall(' + str(round(recall,2)) + ')') 

500: f1(0.83), acc(0.74), precision(0.73), recall(0.96)
600: f1(0.83), acc(0.74), precision(0.73), recall(0.97)
700: f1(0.83), acc(0.74), precision(0.73), recall(0.97)
800: f1(0.84), acc(0.75), precision(0.73), recall(0.97)
900: f1(0.83), acc(0.74), precision(0.73), recall(0.97)
1000: f1(0.84), acc(0.75), precision(0.73), recall(0.98)
1100: f1(0.84), acc(0.75), precision(0.73), recall(0.98)
1200: f1(0.84), acc(0.75), precision(0.73), recall(0.98)
1300: f1(0.83), acc(0.74), precision(0.73), recall(0.98)
1400: f1(0.83), acc(0.74), precision(0.73), recall(0.98)
1500: f1(0.83), acc(0.74), precision(0.72), recall(0.98)
1600: f1(0.83), acc(0.74), precision(0.72), recall(0.99)
1700: f1(0.83), acc(0.73), precision(0.72), recall(0.99)
1800: f1(0.83), acc(0.74), precision(0.72), recall(0.99)
1900: f1(0.83), acc(0.73), precision(0.71), recall(0.98)
2000: f1(0.83), acc(0.73), precision(0.71), recall(0.99)
2100: f1(0.83), acc(0.73), precision(0.71), recall(0.99)
2200: f1(0.82), acc(0.72), precision

In [10]:
vectorizer = TfidfVectorizer(max_features=1100, ngram_range=(1,1), 
                             strip_accents='unicode', stop_words=stopwords)
data = vectorizer.fit_transform(corpus.content)

f1 = cross_val_score(model, data.toarray(), target, cv=10, scoring='f1').mean()
acc = cross_val_score(model, data.toarray(), target, cv=10, scoring='accuracy').mean()
recall = cross_val_score(model, data.toarray(), target, cv=10, scoring='recall').mean()

print(str(len(vectorizer.get_feature_names())) + ': ' + 'f1(' + str(round(f1,2)) 
      + '), acc(' + str(round(acc,2)) 
          + '), recall(' + str(round(recall,2)) + ')') 

1100: f1(0.84), acc(0.75), recall(0.98)


In [11]:
model.fit(data.toarray(),target)
n = 15 

class_labels = ['outro','diario']
feature_names = vectorizer.get_feature_names()
topn_class1 = sorted(zip(model.coef_[0], feature_names))[:n]
topn_class2 = sorted(zip(model.coef_[0], feature_names))[-n:]

for coef, feat in topn_class1:
    print (class_labels[0], coef, feat)

print()

for coef, feat in reversed(topn_class2):
    print (class_labels[1], coef, feat)

outro -7.74422554188 argila
outro -7.74422554188 chorei
outro -7.72677609178 conversando
outro -7.72176894178 look
outro -7.7194942341 vc
outro -7.70313391712 decidi
outro -7.68997116739 aulas
outro -7.68717598913 estavamos
outro -7.68274339436 voltei
outro -7.67484173552 antiga
outro -7.67255919657 confesso
outro -7.67072366437 pq
outro -7.662440396 roupa
outro -7.65687078523 bolsa
outro -7.6444918728 manga

diario -4.81661701098 nao
diario -5.38366401185 voce
diario -5.38778789457 deus
diario -5.74824233259 ser
diario -5.80614988863 vida
diario -5.83152803134 senhor
diario -5.84590104802 sao
diario -5.93211670774 dia
diario -5.94944110256 sobre
diario -6.00312170711 tambem
diario -6.00822310276 pra
diario -6.01884216113 ja
diario -6.03747837896 jesus
diario -6.05110053498 vai
diario -6.05155231736 bem


In [12]:
vectorizer = TfidfVectorizer(max_features=1100, ngram_range=(1,1), 
                             strip_accents='unicode', stop_words=stopwords)
data = vectorizer.fit_transform(corpus.content)

model.fit(data.toarray(),target)
n = 400 

class_labels = ['outro','diario']
feature_names = vectorizer.get_feature_names()
topn_class1 = sorted(zip(model.coef_[0], feature_names))[:n]
topn_class2 = sorted(zip(model.coef_[0], feature_names))[-n:]

vocabulary = []

for coef, feat in topn_class1:
    if feat not in vocabulary: 
        vocabulary.append(feat)
for coef, feat in reversed(topn_class2):
    if feat not in vocabulary: 
        vocabulary.append(feat)
    
vectorizer = TfidfVectorizer(ngram_range=(1,1), strip_accents='unicode', 
                             stop_words=stopwords, vocabulary=vocabulary)
data = vectorizer.fit_transform(corpus.content)

f1 = cross_val_score(model, data.toarray(), target, cv=10, scoring='f1').mean()
acc = cross_val_score(model, data.toarray(), target, cv=10, scoring='accuracy').mean()
recall = cross_val_score(model, data.toarray(), target, cv=10, scoring='recall').mean()

print(str(len(vectorizer.get_feature_names())) + ': ' + 'f1(' + str(round(f1,2)) 
      + '), acc(' + str(round(acc,2)) 
          + '), recall(' + str(round(recall,2)) + ')') 

800: f1(0.84), acc(0.76), recall(0.97)


In [13]:
model.fit(data.toarray(),target)
n = 15 

class_labels = ['outro','diario']
feature_names = vectorizer.get_feature_names()
topn_class1 = sorted(zip(model.coef_[0], feature_names))[:n]
topn_class2 = sorted(zip(model.coef_[0], feature_names))[-n:]

for coef, feat in topn_class1:
    print (class_labels[0], coef, feat)

print()

for coef, feat in reversed(topn_class2):
    print (class_labels[1], coef, feat)

outro -7.54447756112 argila
outro -7.54447756112 chorei
outro -7.51989601982 conversando
outro -7.51942717661 vc
outro -7.51509042099 look
outro -7.5011637049 decidi
outro -7.48683537044 aulas
outro -7.47357190688 voltei
outro -7.47021031394 estavamos
outro -7.47005032276 pq
outro -7.45595183547 antiga
outro -7.45474685532 confesso
outro -7.4502548188 bolsa
outro -7.44049125571 manga
outro -7.42998615274 roupa

diario -4.49473800204 nao
diario -5.0522965882 voce
diario -5.12066520556 deus
diario -5.44148558974 ser
diario -5.50255986074 vida
diario -5.53508327053 sao
diario -5.54459172381 senhor
diario -5.60108253307 dia
diario -5.63829309562 sobre
diario -5.68715235505 tambem
diario -5.68911175799 pra
diario -5.70453103937 ja
diario -5.71444744759 vai
diario -5.75375333114 bem
diario -5.76293665763 pode


In [14]:
pd.DataFrame(feature_names).to_csv('feature_names.csv')