In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from nltk.corpus import stopwords

corpus = pd.read_csv('corpus.csv.gz', compression='gzip')
stopwords = stopwords.words("portuguese")

In [2]:
#corpus = corpus[corpus['_golden'] == False]
corpus = corpus[corpus['qual_a_melhor_classificao_para_esse_texto:confidence'] == 1]
corpus = corpus.reset_index()
corpus.shape

(534, 23)

In [3]:
# fix labels to binary
def classFit(x):
    if x['qual_a_melhor_classificao_para_esse_texto'] == "diario":
        return 1
    else:
        return -1
    
corpus['class'] = corpus.apply(classFit,axis=1)
target = corpus['class'].values

print(corpus['qual_a_melhor_classificao_para_esse_texto'].values[:2])
print(corpus['class'][:2])

model = MultinomialNB(alpha=0.001)

['diario' 'outro']
0    1
1   -1
Name: class, dtype: int64


In [None]:
import nltk.stem
portuguese_stemmer = nltk.stem.RSLPStemmer()
class StemmedTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        analyzer = super(TfidfVectorizer,self).build_analyzer()
        return lambda doc: (portuguese_stemmer.stem(w) for w in analyzer(doc))

In [5]:
for i in range(500,5000,100):
    data = TfidfVectorizer(max_features=i, strip_accents='unicode', stop_words=stopwords).fit_transform(corpus.content)

    f1 = cross_val_score(model, data.toarray(), target, cv=10, scoring='f1').mean()
    acc = cross_val_score(model, data.toarray(), target, cv=10, scoring='accuracy').mean()
    recall = cross_val_score(model, data.toarray(), target, cv=10, scoring='recall').mean()
    precision = cross_val_score(model, data.toarray(), target, cv=10, scoring='precision').mean()
    
    print(str(i) + ': ' + 'f1(' + str(round(f1,4)) 
          + '), acc(' + str(round(acc,4)) 
          + '), precision(' + str(round(precision,4)) 
          + '), recall(' + str(round(recall,4)) + ')') 

500: f1(0.8385), acc(0.7641), precision(0.7565), recall(0.9422)
600: f1(0.835), acc(0.7585), precision(0.7522), recall(0.9393)
700: f1(0.837), acc(0.764), precision(0.7593), recall(0.9334)
800: f1(0.8362), acc(0.7659), precision(0.7664), recall(0.9218)
900: f1(0.8499), acc(0.7882), precision(0.7882), recall(0.9246)
1000: f1(0.8494), acc(0.7884), precision(0.7891), recall(0.9218)
1100: f1(0.8575), acc(0.7996), precision(0.7971), recall(0.9303)
1200: f1(0.8544), acc(0.7958), precision(0.795), recall(0.9246)
1300: f1(0.855), acc(0.7977), precision(0.8003), recall(0.9189)
1400: f1(0.8506), acc(0.7904), precision(0.7926), recall(0.919)
1500: f1(0.854), acc(0.7941), precision(0.7946), recall(0.9247)
1600: f1(0.8479), acc(0.7849), precision(0.7857), recall(0.9218)
1700: f1(0.8457), acc(0.7831), precision(0.7882), recall(0.9134)
1800: f1(0.8447), acc(0.7813), precision(0.7862), recall(0.9135)
1900: f1(0.8494), acc(0.787), precision(0.788), recall(0.9222)
2000: f1(0.8426), acc(0.7794), precisio

In [17]:
vectorizer = TfidfVectorizer(max_features=1300, ngram_range=(1,1), 
                             strip_accents='unicode', stop_words=stopwords)
data = vectorizer.fit_transform(corpus.content)

precision = cross_val_score(model, data.toarray(), target, cv=10, scoring='precision').mean()
acc = cross_val_score(model, data.toarray(), target, cv=10, scoring='accuracy').mean()
recall = cross_val_score(model, data.toarray(), target, cv=10, scoring='recall').mean()

print(str(len(vectorizer.get_feature_names())) + ': ' + 'precision(' + str(round(precision,3)) 
      + '), acc(' + str(round(acc,3)) 
          + '), recall(' + str(round(recall,3)) + ')') 

1300: precision(0.8), acc(0.798), recall(0.919)


In [7]:
feature_names = vectorizer.get_feature_names()
pd.DataFrame(feature_names).to_csv('feature_names.csv')

In [12]:
model.fit(data.toarray(),target)
n = 15 

class_labels = ['outro','diario']
feature_names = vectorizer.get_feature_names()
topn_class1 = sorted(zip(model.coef_[0], feature_names))[:n]
topn_class2 = sorted(zip(model.coef_[0], feature_names))[-n:]

for coef, feat in topn_class1:
    print (class_labels[0], coef, feat)

print()

for coef, feat in reversed(topn_class2):
    print (class_labels[1], coef, feat)

outro -14.7878978495 arthur
outro -14.7878978495 atitudes
outro -14.7878978495 comunidade
outro -14.7878978495 fulo
outro -14.7878978495 lohraine
outro -14.7878978495 now
outro -14.7878978495 plus
outro -14.7878978495 selena
outro -10.547722584 evangelho
outro -10.4039007241 mundial
outro -10.0293200155 advogado
outro -9.98360378444 mr
outro -9.77741076852 fas
outro -9.75317446918 salvacao
outro -9.71612308875 en

diario -4.21819974347 nao
diario -4.90847526335 pra
diario -4.93069608442 voce
diario -5.33995847914 ja
diario -5.34153141504 dia
diario -5.36732774429 tudo
diario -5.38342771941 ser
diario -5.40606660952 so
diario -5.46632612062 ate
diario -5.49844306499 bem
diario -5.51739591374 vida
diario -5.5354423855 aqui
diario -5.58016078776 sempre
diario -5.61213366781 sao
diario -5.62191566613 vou


In [13]:
vectorizer = TfidfVectorizer(max_features=1300, ngram_range=(1,1), 
                             strip_accents='unicode', stop_words=stopwords)
data = vectorizer.fit_transform(corpus.content)

model.fit(data.toarray(),target)
n = 400 

class_labels = ['outro','diario']
feature_names = vectorizer.get_feature_names()
topn_class1 = sorted(zip(model.coef_[0], feature_names))[:n]
topn_class2 = sorted(zip(model.coef_[0], feature_names))[-n:]

vocabulary = []

for coef, feat in topn_class1:
    if feat not in vocabulary: 
        vocabulary.append(feat)
for coef, feat in reversed(topn_class2):
    if feat not in vocabulary: 
        vocabulary.append(feat)
    
vectorizer = TfidfVectorizer(ngram_range=(1,1), strip_accents='unicode', 
                             stop_words=stopwords, vocabulary=vocabulary)
data = vectorizer.fit_transform(corpus.content)

precision = cross_val_score(model, data.toarray(), target, cv=10, scoring='precision').mean()
acc = cross_val_score(model, data.toarray(), target, cv=10, scoring='accuracy').mean()
recall = cross_val_score(model, data.toarray(), target, cv=10, scoring='recall').mean()

print(str(len(vectorizer.get_feature_names())) + ': ' + 'precision(' + str(round(precision,2)) 
      + '), acc(' + str(round(acc,2)) 
          + '), recall(' + str(round(recall,2)) + ')') 

800: precision(0.8), acc(0.79), recall(0.91)


In [14]:
model.fit(data.toarray(),target)
n = 15 

class_labels = ['outro','diario']
feature_names = vectorizer.get_feature_names()
topn_class1 = sorted(zip(model.coef_[0], feature_names))[:n]
topn_class2 = sorted(zip(model.coef_[0], feature_names))[-n:]

for coef, feat in topn_class1:
    print (class_labels[0], coef, feat)

print()

for coef, feat in reversed(topn_class2):
    print (class_labels[1], coef, feat)

outro -14.6627577022 arthur
outro -14.6627577022 atitudes
outro -14.6627577022 comunidade
outro -14.6627577022 fulo
outro -14.6627577022 lohraine
outro -14.6627577022 now
outro -14.6627577022 plus
outro -14.6627577022 selena
outro -10.0362471343 mundial
outro -10.0129388666 evangelho
outro -9.75480798621 advogado
outro -9.73384698486 mr
outro -9.50777508087 salvacao
outro -9.43977702604 en
outro -9.3167891835 entrevista

diario -3.90016723557 nao
diario -4.59419454387 pra
diario -4.6271610713 voce
diario -4.99835696309 ja
diario -5.02619837016 dia
diario -5.04501682262 tudo
diario -5.07540497046 ser
diario -5.08322100641 so
diario -5.13554177006 ate
diario -5.1701492655 bem
diario -5.2060493309 aqui
diario -5.21538525439 vida
diario -5.25538076449 sempre
diario -5.2751581612 sao
diario -5.3005574071 vou


In [11]:
pd.DataFrame(feature_names).to_csv('feature_names.csv')