In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from nltk.corpus import stopwords

corpus = pd.read_csv('corpus.csv.gz', compression='gzip')
stopwords = stopwords.words("portuguese")

In [2]:
#corpus = corpus[corpus['_golden'] == False]
corpus = corpus[corpus['qual_a_melhor_classificao_para_esse_texto:confidence'] == 1]
corpus = corpus.reset_index()
corpus.shape

(534, 23)

In [3]:
# fix labels to binary
lb = preprocessing.LabelBinarizer(neg_label=1, pos_label=2)
target = lb.fit_transform(corpus['qual_a_melhor_classificao_para_esse_texto'].values)
c, r = target.shape
target = target.reshape(c,)

model = MultinomialNB()

In [4]:
import nltk.stem
portuguese_stemmer = nltk.stem.RSLPStemmer()
class StemmedTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        analyzer = super(TfidfVectorizer,self).build_analyzer()
        return lambda doc: (portuguese_stemmer.stem(w) for w in analyzer(doc))

In [5]:
for i in range(500,5000,100):
    data = TfidfVectorizer(max_features=i, strip_accents='unicode', stop_words=stopwords).fit_transform(corpus.content)

    f1 = cross_val_score(model, data.toarray(), target, cv=10, scoring='f1').mean()
    acc = cross_val_score(model, data.toarray(), target, cv=10, scoring='accuracy').mean()
    recall = cross_val_score(model, data.toarray(), target, cv=10, scoring='recall').mean()
    precision = cross_val_score(model, data.toarray(), target, cv=10, scoring='precision').mean()
    
    print(str(i) + ': ' + 'f1(' + str(round(f1,4)) 
          + '), acc(' + str(round(acc,4)) 
          + '), precision(' + str(round(precision,4)) 
          + '), recall(' + str(round(recall,4)) + ')') 

500: f1(0.8354), acc(0.7529), precision(0.7391), recall(0.9624)
600: f1(0.835), acc(0.751), precision(0.7351), recall(0.9682)
700: f1(0.8356), acc(0.753), precision(0.7376), recall(0.9653)
800: f1(0.8409), acc(0.7624), precision(0.7459), recall(0.9653)
900: f1(0.8458), acc(0.7698), precision(0.7501), recall(0.9711)
1000: f1(0.8408), acc(0.7624), precision(0.7458), recall(0.9654)
1100: f1(0.8366), acc(0.7549), precision(0.7391), recall(0.9654)
1200: f1(0.839), acc(0.7568), precision(0.736), recall(0.9771)
1300: f1(0.8373), acc(0.7531), precision(0.7314), recall(0.9799)
1400: f1(0.8439), acc(0.7643), precision(0.7397), recall(0.9828)
1500: f1(0.8397), acc(0.7567), precision(0.7334), recall(0.9828)
1600: f1(0.8329), acc(0.7456), precision(0.7264), recall(0.9771)
1700: f1(0.8316), acc(0.7417), precision(0.7214), recall(0.9828)
1800: f1(0.832), acc(0.7417), precision(0.7203), recall(0.9856)
1900: f1(0.833), acc(0.7436), precision(0.7218), recall(0.9856)
2000: f1(0.8306), acc(0.7399), precis

In [5]:
vectorizer = TfidfVectorizer(max_features=900, ngram_range=(1,1), 
                             strip_accents='unicode', stop_words=stopwords)
data = vectorizer.fit_transform(corpus.content)

precision = cross_val_score(model, data.toarray(), target, cv=10, scoring='precision').mean()
acc = cross_val_score(model, data.toarray(), target, cv=10, scoring='accuracy').mean()
recall = cross_val_score(model, data.toarray(), target, cv=10, scoring='recall').mean()

print(str(len(vectorizer.get_feature_names())) + ': ' + 'precision(' + str(round(precision,2)) 
      + '), acc(' + str(round(acc,2)) 
          + '), recall(' + str(round(recall,2)) + ')') 

900: precision(0.75), acc(0.77), recall(0.97)


In [6]:
model.fit(data.toarray(),target)
n = 15 

class_labels = ['outro','diario']
feature_names = vectorizer.get_feature_names()
topn_class1 = sorted(zip(model.coef_[0], feature_names))[:n]
topn_class2 = sorted(zip(model.coef_[0], feature_names))[-n:]

for coef, feat in topn_class1:
    print (class_labels[0], coef, feat)

print()

for coef, feat in reversed(topn_class2):
    print (class_labels[1], coef, feat)

outro -7.66660825055 vc
outro -7.66304838567 look
outro -7.64695014061 decidi
outro -7.61550917588 pq
outro -7.60347721246 roupa
outro -7.57225586371 00
outro -7.57153146072 aulas
outro -7.56967055418 mamae
outro -7.54859480393 banheiro
outro -7.53229558289 maximo
outro -7.52708949314 horario
outro -7.49453147458 estrada
outro -7.49306493376 cm
outro -7.48436758109 ficamos
outro -7.48096061286 falo

diario -4.60320035234 nao
diario -5.16502189351 voce
diario -5.26292503491 deus
diario -5.5592107611 ser
diario -5.62084727574 vida
diario -5.66646626693 sao
diario -5.6777980488 senhor
diario -5.72753092471 dia
diario -5.75305033976 sobre
diario -5.80573797248 ja
diario -5.80859557195 pra
diario -5.80895899002 tambem
diario -5.87279862689 bem
diario -5.87528752613 vai
diario -5.88612558354 jesus


In [7]:
vectorizer = TfidfVectorizer(max_features=900, ngram_range=(1,1), 
                             strip_accents='unicode', stop_words=stopwords)
data = vectorizer.fit_transform(corpus.content)

model.fit(data.toarray(),target)
n = 400 

class_labels = ['outro','diario']
feature_names = vectorizer.get_feature_names()
topn_class1 = sorted(zip(model.coef_[0], feature_names))[:n]
topn_class2 = sorted(zip(model.coef_[0], feature_names))[-n:]

vocabulary = []

for coef, feat in topn_class1:
    if feat not in vocabulary: 
        vocabulary.append(feat)
for coef, feat in reversed(topn_class2):
    if feat not in vocabulary: 
        vocabulary.append(feat)
    
vectorizer = TfidfVectorizer(ngram_range=(1,1), strip_accents='unicode', 
                             stop_words=stopwords, vocabulary=vocabulary)
data = vectorizer.fit_transform(corpus.content)

precision = cross_val_score(model, data.toarray(), target, cv=10, scoring='precision').mean()
acc = cross_val_score(model, data.toarray(), target, cv=10, scoring='accuracy').mean()
recall = cross_val_score(model, data.toarray(), target, cv=10, scoring='recall').mean()

print(str(len(vectorizer.get_feature_names())) + ': ' + 'precision(' + str(round(precision,2)) 
      + '), acc(' + str(round(acc,2)) 
          + '), recall(' + str(round(recall,2)) + ')') 

800: precision(0.75), acc(0.77), recall(0.98)


In [8]:
model.fit(data.toarray(),target)
n = 15 

class_labels = ['outro','diario']
feature_names = vectorizer.get_feature_names()
topn_class1 = sorted(zip(model.coef_[0], feature_names))[:n]
topn_class2 = sorted(zip(model.coef_[0], feature_names))[-n:]

for coef, feat in topn_class1:
    print (class_labels[0], coef, feat)

print()

for coef, feat in reversed(topn_class2):
    print (class_labels[1], coef, feat)

outro -7.59429862308 vc
outro -7.59001189419 look
outro -7.57368148819 decidi
outro -7.54292604045 pq
outro -7.528582848 roupa
outro -7.4981474268 00
outro -7.49679374177 aulas
outro -7.49361204544 mamae
outro -7.4694280946 banheiro
outro -7.45299712197 maximo
outro -7.44083181051 horario
outro -7.41903858078 estrada
outro -7.41282346653 cm
outro -7.40534343004 ficamos
outro -7.39444551039 loja

diario -4.48420373812 nao
diario -5.05351659125 voce
diario -5.13579392755 deus
diario -5.43694104236 ser
diario -5.50804165338 vida
diario -5.55459971551 sao
diario -5.57192447499 senhor
diario -5.59665784872 dia
diario -5.64336783483 sobre
diario -5.6880585823 ja
diario -5.69142037497 tambem
diario -5.69852604584 pra
diario -5.76337732965 vai
diario -5.76478595105 bem
diario -5.77483979999 jesus


In [9]:
pd.DataFrame(feature_names).to_csv('feature_names.csv')