In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from nltk.corpus import stopwords

corpus = pd.read_csv('corpus.csv.gz', compression='gzip')
stopwords = stopwords.words("portuguese")

In [2]:
#corpus = corpus[corpus['_golden'] == False]
corpus = corpus[corpus['qual_a_melhor_classificao_para_esse_texto:confidence'] == 1]
corpus = corpus.reset_index()
corpus.shape

(534, 23)

In [3]:
# fix labels to binary
def classFit(x):
    if x['qual_a_melhor_classificao_para_esse_texto'] == "diario":
        return 1
    else:
        return -1
    
corpus['class'] = corpus.apply(classFit,axis=1)
target = corpus['class'].values

print(corpus['qual_a_melhor_classificao_para_esse_texto'].values[:2])
print(corpus['class'][:2])

model = MultinomialNB()

['diario' 'outro']
0    1
1   -1
Name: class, dtype: int64


In [None]:
import nltk.stem
portuguese_stemmer = nltk.stem.RSLPStemmer()
class StemmedTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        analyzer = super(TfidfVectorizer,self).build_analyzer()
        return lambda doc: (portuguese_stemmer.stem(w) for w in analyzer(doc))

In [10]:
for i in range(500,5000,100):
    data = TfidfVectorizer(max_features=i, strip_accents='unicode', stop_words=stopwords).fit_transform(corpus.content)

    f1 = cross_val_score(model, data.toarray(), target, cv=10, scoring='f1').mean()
    acc = cross_val_score(model, data.toarray(), target, cv=10, scoring='accuracy').mean()
    recall = cross_val_score(model, data.toarray(), target, cv=10, scoring='recall').mean()
    precision = cross_val_score(model, data.toarray(), target, cv=10, scoring='precision').mean()
    
    print(str(i) + ': ' + 'f1(' + str(round(f1,4)) 
          + '), acc(' + str(round(acc,4)) 
          + '), precision(' + str(round(precision,4)) 
          + '), recall(' + str(round(recall,4)) + ')') 

500: f1(0.8419), acc(0.7639), precision(0.7477), recall(0.9637)
600: f1(0.8396), acc(0.7603), precision(0.7459), recall(0.961)
700: f1(0.8407), acc(0.7604), precision(0.7427), recall(0.9693)
800: f1(0.8458), acc(0.7695), precision(0.7507), recall(0.9694)
900: f1(0.8458), acc(0.7696), precision(0.7506), recall(0.9694)
1000: f1(0.8489), acc(0.775), precision(0.7553), recall(0.9694)
1100: f1(0.8494), acc(0.775), precision(0.7544), recall(0.9721)
1200: f1(0.8485), acc(0.7713), precision(0.748), recall(0.9806)
1300: f1(0.8474), acc(0.7695), precision(0.7464), recall(0.9806)
1400: f1(0.8488), acc(0.7713), precision(0.747), recall(0.9833)
1500: f1(0.8474), acc(0.7694), precision(0.7466), recall(0.9805)
1600: f1(0.8434), acc(0.7622), precision(0.7404), recall(0.9805)
1700: f1(0.8417), acc(0.7586), precision(0.7361), recall(0.9833)
1800: f1(0.8401), acc(0.755), precision(0.7321), recall(0.9861)
1900: f1(0.8381), acc(0.7514), precision(0.729), recall(0.9861)
2000: f1(0.8397), acc(0.755), precisi

In [17]:
vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1,1), 
                             strip_accents='unicode', stop_words=stopwords)
data = vectorizer.fit_transform(corpus.content)

precision = cross_val_score(model, data.toarray(), target, cv=10, scoring='precision').mean()
acc = cross_val_score(model, data.toarray(), target, cv=10, scoring='accuracy').mean()
recall = cross_val_score(model, data.toarray(), target, cv=10, scoring='recall').mean()

print(str(len(vectorizer.get_feature_names())) + ': ' + 'precision(' + str(round(precision,3)) 
      + '), acc(' + str(round(acc,3)) 
          + '), recall(' + str(round(recall,3)) + ')') 

1000: precision(0.755), acc(0.775), recall(0.969)


In [18]:
feature_names = vectorizer.get_feature_names()
pd.DataFrame(feature_names).to_csv('feature_names.csv')

In [12]:
model.fit(data.toarray(),target)
n = 15 

class_labels = ['outro','diario']
feature_names = vectorizer.get_feature_names()
topn_class1 = sorted(zip(model.coef_[0], feature_names))[:n]
topn_class2 = sorted(zip(model.coef_[0], feature_names))[-n:]

for coef, feat in topn_class1:
    print (class_labels[0], coef, feat)

print()

for coef, feat in reversed(topn_class2):
    print (class_labels[1], coef, feat)

outro -8.20494650955 comunidade
outro -8.20494650955 fulo
outro -8.13604642991 evangelho
outro -8.09210655049 advogado
outro -8.03779831781 fas
outro -8.03308877664 entrevista
outro -8.01446887443 pastor
outro -7.99725375844 padre
outro -7.98529151638 go
outro -7.95079808432 oracao
outro -7.94455608341 album
outro -7.90166897046 nestle
outro -7.89980028931 autor
outro -7.89176778464 numeros
outro -7.88275931406 presidente

diario -4.42094494661 nao
diario -5.06213613734 pra
diario -5.09789174652 voce
diario -5.48163994909 dia
diario -5.49191115163 ja
diario -5.49938636786 tudo
diario -5.54233116239 so
diario -5.54613759819 ser
diario -5.59639170968 ate
diario -5.6485491156 bem
diario -5.65939553461 vida
diario -5.66902291823 aqui
diario -5.69878916473 sempre
diario -5.74668010173 sao
diario -5.75646602873 la


In [13]:
vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1,1), 
                             strip_accents='unicode', stop_words=stopwords)
data = vectorizer.fit_transform(corpus.content)

model.fit(data.toarray(),target)
n = 400 

class_labels = ['outro','diario']
feature_names = vectorizer.get_feature_names()
topn_class1 = sorted(zip(model.coef_[0], feature_names))[:n]
topn_class2 = sorted(zip(model.coef_[0], feature_names))[-n:]

vocabulary = []

for coef, feat in topn_class1:
    if feat not in vocabulary: 
        vocabulary.append(feat)
for coef, feat in reversed(topn_class2):
    if feat not in vocabulary: 
        vocabulary.append(feat)
    
vectorizer = TfidfVectorizer(ngram_range=(1,1), strip_accents='unicode', 
                             stop_words=stopwords, vocabulary=vocabulary)
data = vectorizer.fit_transform(corpus.content)

precision = cross_val_score(model, data.toarray(), target, cv=10, scoring='precision').mean()
acc = cross_val_score(model, data.toarray(), target, cv=10, scoring='accuracy').mean()
recall = cross_val_score(model, data.toarray(), target, cv=10, scoring='recall').mean()

print(str(len(vectorizer.get_feature_names())) + ': ' + 'precision(' + str(round(precision,2)) 
      + '), acc(' + str(round(acc,2)) 
          + '), recall(' + str(round(recall,2)) + ')') 

800: precision(0.75), acc(0.77), recall(0.97)


In [14]:
model.fit(data.toarray(),target)
n = 15 

class_labels = ['outro','diario']
feature_names = vectorizer.get_feature_names()
topn_class1 = sorted(zip(model.coef_[0], feature_names))[:n]
topn_class2 = sorted(zip(model.coef_[0], feature_names))[-n:]

for coef, feat in topn_class1:
    print (class_labels[0], coef, feat)

print()

for coef, feat in reversed(topn_class2):
    print (class_labels[1], coef, feat)

outro -8.10070322691 comunidade
outro -8.10070322691 fulo
outro -8.02846293865 evangelho
outro -7.97586418792 advogado
outro -7.91363912711 fas
outro -7.89800450082 pastor
outro -7.89364444599 entrevista
outro -7.8822569793 padre
outro -7.85987611475 go
outro -7.82261665925 oracao
outro -7.81482705556 album
outro -7.77486436848 autor
outro -7.76548218148 numeros
outro -7.76419758771 presidente
outro -7.70299136354 sangue

diario -4.22935883539 nao
diario -4.87026259117 pra
diario -4.9226554497 voce
diario -5.28509092831 dia
diario -5.28666217261 ja
diario -5.31020657062 tudo
diario -5.35384120852 so
diario -5.36786923933 ser
diario -5.39593473969 ate
diario -5.45554107675 bem
diario -5.47682802421 vida
diario -5.47706558337 aqui
diario -5.50857771893 sempre
diario -5.55975295292 la
diario -5.5603006546 sao


In [15]:
pd.DataFrame(feature_names).to_csv('feature_names.csv')