In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from nltk.corpus import stopwords

corpus = pd.read_csv('corpus.csv.gz', compression='gzip')
stopwords = stopwords.words("portuguese")

In [2]:
#corpus = corpus[corpus['_golden'] == False]
#corpus = corpus[corpus['qual_a_melhor_classificao_para_esse_texto:confidence'] == 1]
corpus = corpus.reset_index()
corpus.shape

(1000, 23)

In [3]:
# fix labels to binary
lb = preprocessing.LabelBinarizer(neg_label=1, pos_label=2)
target = lb.fit_transform(corpus['qual_a_melhor_classificao_para_esse_texto'].values)
c, r = target.shape
target = target.reshape(c,)

model = MultinomialNB()

In [4]:
import nltk.stem
portuguese_stemmer = nltk.stem.RSLPStemmer()
class StemmedTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        analyzer = super(TfidfVectorizer,self).build_analyzer()
        return lambda doc: (portuguese_stemmer.stem(w) for w in analyzer(doc))

In [5]:
for i in range(500,5000,100):
    data = TfidfVectorizer(max_features=i, strip_accents='unicode', stop_words=stopwords).fit_transform(corpus.content)

    f1 = cross_val_score(model, data.toarray(), target, cv=10, scoring='f1').mean()
    acc = cross_val_score(model, data.toarray(), target, cv=10, scoring='accuracy').mean()
    recall = cross_val_score(model, data.toarray(), target, cv=10, scoring='recall').mean()
    precision = cross_val_score(model, data.toarray(), target, cv=10, scoring='precision').mean()
    
    print(str(i) + ': ' + 'f1(' + str(round(f1,4)) 
          + '), acc(' + str(round(acc,4)) 
          + '), precision(' + str(round(precision,4)) 
          + '), recall(' + str(round(recall,4)) + ')') 

500: f1(0.7935), acc(0.6894), precision(0.6862), recall(0.9419)
600: f1(0.796), acc(0.6935), precision(0.69), recall(0.9419)
700: f1(0.801), acc(0.7024), precision(0.6976), recall(0.9418)
800: f1(0.7975), acc(0.6974), precision(0.6947), recall(0.9371)
900: f1(0.8005), acc(0.7024), precision(0.6975), recall(0.9403)
1000: f1(0.8012), acc(0.7034), precision(0.6987), recall(0.9403)
1100: f1(0.7987), acc(0.7004), precision(0.6964), recall(0.9372)
1200: f1(0.7985), acc(0.7004), precision(0.6972), recall(0.9356)
1300: f1(0.8006), acc(0.7025), precision(0.6977), recall(0.9403)
1400: f1(0.8034), acc(0.7065), precision(0.7005), recall(0.9435)
1500: f1(0.803), acc(0.7055), precision(0.6989), recall(0.945)
1600: f1(0.8068), acc(0.7105), precision(0.7012), recall(0.9513)
1700: f1(0.8074), acc(0.7104), precision(0.7003), recall(0.9545)
1800: f1(0.8064), acc(0.7074), precision(0.6962), recall(0.9592)
1900: f1(0.8026), acc(0.7004), precision(0.6905), recall(0.9592)
2000: f1(0.8069), acc(0.7074), preci

In [11]:
vectorizer = TfidfVectorizer(max_features=1600, ngram_range=(1,1), 
                             strip_accents='unicode', stop_words=stopwords)
data = vectorizer.fit_transform(corpus.content)

f1 = cross_val_score(model, data.toarray(), target, cv=10, scoring='f1').mean()
acc = cross_val_score(model, data.toarray(), target, cv=10, scoring='accuracy').mean()
recall = cross_val_score(model, data.toarray(), target, cv=10, scoring='recall').mean()

print(str(len(vectorizer.get_feature_names())) + ': ' + 'f1(' + str(round(f1,2)) 
      + '), acc(' + str(round(acc,2)) 
          + '), recall(' + str(round(recall,2)) + ')') 

1600: f1(0.81), acc(0.71), recall(0.95)


In [12]:
model.fit(data.toarray(),target)
n = 15 

class_labels = ['outro','diario']
feature_names = vectorizer.get_feature_names()
topn_class1 = sorted(zip(model.coef_[0], feature_names))[:n]
topn_class2 = sorted(zip(model.coef_[0], feature_names))[-n:]

for coef, feat in topn_class1:
    print (class_labels[0], coef, feat)

print()

for coef, feat in reversed(topn_class2):
    print (class_labels[1], coef, feat)

outro -8.38445320537 cu
outro -8.38445320537 fila
outro -8.38445320537 podcast
outro -8.27412197174 delicia
outro -8.26178137935 arroz
outro -8.24618722404 estavamos
outro -8.23383076293 barra
outro -8.22442789639 pau
outro -8.21081773286 rsrs
outro -8.19486017784 passe
outro -8.19053972054 paris
outro -8.1895357691 experiencias
outro -8.18679348584 gostoso
outro -8.18541359566 mandou
outro -8.1543390274 jb

diario -4.77160886057 nao
diario -5.40434529263 voce
diario -5.59401028028 deus
diario -5.79692213857 ser
diario -5.90947887938 vida
diario -5.9098196713 pra
diario -5.94173508893 sao
diario -5.97898827289 ja
diario -5.99335615051 sobre
diario -6.04260040949 tambem
diario -6.04761185269 bem
diario -6.07067402539 tudo
diario -6.08741312006 aqui
diario -6.09325653826 so
diario -6.14125458545 dia


In [13]:
vectorizer = TfidfVectorizer(max_features=1600, ngram_range=(1,1), 
                             strip_accents='unicode', stop_words=stopwords)
data = vectorizer.fit_transform(corpus.content)

model.fit(data.toarray(),target)
n = 400 

class_labels = ['outro','diario']
feature_names = vectorizer.get_feature_names()
topn_class1 = sorted(zip(model.coef_[0], feature_names))[:n]
topn_class2 = sorted(zip(model.coef_[0], feature_names))[-n:]

vocabulary = []

for coef, feat in topn_class1:
    if feat not in vocabulary: 
        vocabulary.append(feat)
for coef, feat in reversed(topn_class2):
    if feat not in vocabulary: 
        vocabulary.append(feat)
    
vectorizer = TfidfVectorizer(ngram_range=(1,1), strip_accents='unicode', 
                             stop_words=stopwords, vocabulary=vocabulary)
data = vectorizer.fit_transform(corpus.content)

f1 = cross_val_score(model, data.toarray(), target, cv=10, scoring='f1').mean()
acc = cross_val_score(model, data.toarray(), target, cv=10, scoring='accuracy').mean()
recall = cross_val_score(model, data.toarray(), target, cv=10, scoring='recall').mean()

print(str(len(vectorizer.get_feature_names())) + ': ' + 'f1(' + str(round(f1,2)) 
      + '), acc(' + str(round(acc,2)) 
          + '), recall(' + str(round(recall,2)) + ')') 

800: f1(0.81), acc(0.72), recall(0.95)


In [14]:
model.fit(data.toarray(),target)
n = 15 

class_labels = ['outro','diario']
feature_names = vectorizer.get_feature_names()
topn_class1 = sorted(zip(model.coef_[0], feature_names))[:n]
topn_class2 = sorted(zip(model.coef_[0], feature_names))[-n:]

for coef, feat in topn_class1:
    print (class_labels[0], coef, feat)

print()

for coef, feat in reversed(topn_class2):
    print (class_labels[1], coef, feat)

outro -8.03938211904 cu
outro -8.03938211904 fila
outro -8.03938211904 podcast
outro -7.87755730878 arroz
outro -7.84285834641 barra
outro -7.83346441799 pau
outro -7.82218850845 rsrs
outro -7.82194431472 estavamos
outro -7.81760556428 experiencias
outro -7.81403395644 mandou
outro -7.7983243095 delicia
outro -7.77246093595 paris
outro -7.75845342645 passe
outro -7.74309932011 saiba
outro -7.73541633094 gostoso

diario -4.14095885698 nao
diario -4.79233326742 voce
diario -5.03745839088 deus
diario -5.1606207569 ser
diario -5.27084299377 pra
diario -5.31173627995 sao
diario -5.31776964024 vida
diario -5.36145346127 ja
diario -5.38242925381 sobre
diario -5.41668447114 tambem
diario -5.43509443726 bem
diario -5.46978656757 so
diario -5.4701320931 aqui
diario -5.47147489154 tudo
diario -5.5278746145 ate


In [15]:
pd.DataFrame(feature_names).to_csv('feature_names.csv')