In [10]:
import pandas as pd
import numpy as np

In [11]:
# Load the news dataset from a path relative to the notebook.
# Fields are separated by single spaces and quoted with '|'. Column names are
# supplied explicitly, so the first line of the file is read as data.
path = 'data/noticias.csv'
news = pd.read_csv(
    path,
    sep=' ',
    quotechar='|',
    names=['index', 'message', 'risk', 'category'],
)

In [12]:
# Preview the first 10 rows to sanity-check how the file was parsed
news.head(n=10)

Unnamed: 0,index,message,risk,category
0,0,Mas quem dirá se será pacto democrático ou lut...,Medio,Manifestacao
1,1,"Assinado pela presidente nacional do partido, ...",Medio,Politica
2,2,PolíticaO deputado federal Vicente Cândido (PT...,Alto,Corrupcao
3,3,Trata-se do célebre caso do apartamento tríple...,Medio,Processo-Juridico
4,4,Pesquisa divulgada na semana passada pelo site...,Baixo,Eleicoes
5,5,Uma hipotética condenação de Lula teria como ú...,Medio,Politica
6,6,O que nós queremos é que não inviabilizem o no...,Medio,Manifestacao
7,7,"""Eu estou quase falando: 'Moro, meu amigo Moro...",Alto,Crime
8,8,"Repetiu que, ""se for necessário, será candidat...",Alto,Processo-Juridico
9,9,O MPF pedirá o aumento da pena de Palocci e do...,Alto,Corrupcao


In [13]:
# Convert the string category labels to integer class ids.
# Several labels share an id and are therefore merged into one class:
#   2 = Corrupcao / Processo-Juridico / Crime, 4 = Eleicoes / Politica.
# Labels that are not listed here become NaN (Series.map behaviour).
category_codes = {
    'Manifestacao': 1,
    'Corrupcao': 2,
    'Processo-Juridico': 2,
    'Crime': 2,
    'Economia': 3,
    'Eleicoes': 4,
    'Politica': 4,
    'Dano-Ambiental': 5,
    'Positiva': 6,
}
# Guard against re-running this cell: mapping an already-encoded column would
# look integers up in a dict keyed by strings and turn every value into NaN.
if not pd.api.types.is_numeric_dtype(news['category']):
    news['category'] = news['category'].map(category_codes)

In [14]:
# Preview the first 10 rows again; `category` should now hold integer codes
news.head(n=10)

Unnamed: 0,index,message,risk,category
0,0,Mas quem dirá se será pacto democrático ou lut...,Medio,1
1,1,"Assinado pela presidente nacional do partido, ...",Medio,4
2,2,PolíticaO deputado federal Vicente Cândido (PT...,Alto,2
3,3,Trata-se do célebre caso do apartamento tríple...,Medio,2
4,4,Pesquisa divulgada na semana passada pelo site...,Baixo,4
5,5,Uma hipotética condenação de Lula teria como ú...,Medio,4
6,6,O que nós queremos é que não inviabilizem o no...,Medio,1
7,7,"""Eu estou quase falando: 'Moro, meu amigo Moro...",Alto,2
8,8,"Repetiu que, ""se for necessário, será candidat...",Alto,2
9,9,O MPF pedirá o aumento da pena de Palocci e do...,Alto,2


In [15]:
# Features and target for CountVectorizer: X is the 1-D series of raw message
# text (not a 2-D frame) and y is the encoded category column.
X = news['message']
y = news['category']
print(X.shape)
print(y.shape)

(434,)
(434,)


In [16]:
# Split X and y into training and testing sets.
# No test_size is passed, so scikit-learn's default proportion applies;
# random_state pins the shuffle so the split is repeatable.
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=3)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(325,)
(109,)
(325,)
(109,)


In [17]:
# Import and instantiate CountVectorizer. All parameters are left at their
# defaults except min_df=2, which drops terms that occur in fewer than two
# documents from the vocabulary.

from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer(min_df=2)

In [31]:
# Learn the vocabulary from the training messages and build the training
# document-term matrix in a single step (same result as fit() then transform()).
X_train_dtm = vect.fit_transform(X_train)

# Persist the fitted vectorizer to disk.
# sklearn.externals.joblib was removed in scikit-learn 0.23, so prefer the
# standalone joblib package and fall back only on legacy installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
joblib.dump(vect, "cat_voc.cls")

# Prepare test data: transform only, reusing the vocabulary learned from the
# training set (the vectorizer is not re-fitted on test messages).
X_test_dtm = vect.transform(X_test)

## Machine Learning

In [19]:
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import linear_model
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import LinearSVC
from sklearn.calibration import calibration_curve
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import OrthogonalMatchingPursuit

## Gaussian Naive Bayes

In [20]:
gauss = GaussianNB()
%time gauss.fit(X_train_dtm.toarray(), y_train)
y_pred_gauss = gauss.predict(X_test_dtm.toarray())
metrics.accuracy_score(y_test, y_pred_gauss)

CPU times: user 40 ms, sys: 8 ms, total: 48 ms
Wall time: 45 ms


0.64220183486238536

## KNeighborsClassifier

In [21]:
KNC = KNeighborsClassifier(n_neighbors=9)
%time KNC.fit(X_train_dtm.toarray(), y_train)
y_pred_KNC = KNC.predict(X_test_dtm.toarray())
metrics.accuracy_score(y_test, y_pred_KNC)

CPU times: user 20 ms, sys: 0 ns, total: 20 ms
Wall time: 16.8 ms


0.56880733944954132

## Random Forest

In [22]:
randForest = RandomForestClassifier(n_estimators=500, n_jobs=5)
%time randForest.fit(X_train_dtm, y_train)
y_pred_randForest = randForest.predict(X_test_dtm)
metrics.accuracy_score(y_test, y_pred_randForest)

CPU times: user 2.47 s, sys: 60 ms, total: 2.53 s
Wall time: 1.6 s


0.69724770642201839

## Decision Tree

In [23]:
dt = DecisionTreeClassifier()
%time dt.fit(X_train_dtm.toarray(), y_train)
y_pred_dt = dt.predict(X_test_dtm.toarray())
metrics.accuracy_score(y_test, y_pred_dt)

CPU times: user 64 ms, sys: 4 ms, total: 68 ms
Wall time: 66.6 ms


0.54128440366972475

## Gradient Boosting

In [24]:
GBC = GradientBoostingClassifier()
%time GBC.fit(X_train_dtm.toarray(), y_train)
y_pred_GBC = GBC.predict(X_test_dtm.toarray())
metrics.accuracy_score(y_test, y_pred_GBC)

CPU times: user 10.3 s, sys: 0 ns, total: 10.3 s
Wall time: 10.3 s


0.61467889908256879

## Linear SVC

In [25]:
svc = LinearSVC(C=0.9, max_iter=500)
%time svc.fit(X_train_dtm.toarray(), y_train)
y_pred_svc = svc.predict(X_test_dtm.toarray())
metrics.accuracy_score(y_test, y_pred_svc)

CPU times: user 100 ms, sys: 0 ns, total: 100 ms
Wall time: 97.8 ms


0.59633027522935778

## SGD Classifier

In [26]:
# Linear classifier trained with stochastic gradient descent on the sparse
# document-term matrices.
# SGD shuffles the training data between epochs, so random_state is fixed
# (same seed as the train/test split) to make the reported accuracy repeatable.
clf = linear_model.SGDClassifier(random_state=3)
clf.fit(X_train_dtm, y_train)
y_pred_clf = clf.predict(X_test_dtm)
# Test-set accuracy
metrics.accuracy_score(y_test, y_pred_clf)

0.66055045871559637

## Orthogonal Matching Pursuit

In [27]:
ort = OrthogonalMatchingPursuit()
%time ort.fit(X_train_dtm.toarray(), y_train)
y_pred_ort = ort.predict(X_test_dtm.toarray())
metrics.accuracy_score(y_test, y_pred_ort.round().clip(1,3))

CPU times: user 224 ms, sys: 0 ns, total: 224 ms
Wall time: 112 ms


0.28440366972477066

## Logistic Regression

In [28]:
lr = LogisticRegression(C=0.23)
lrCV = LogisticRegressionCV()
%time lr.fit(X_train_dtm.toarray(), y_train)
y_pred_lr = lr.predict(X_test_dtm.toarray())
metrics.accuracy_score(y_test, y_pred_lr)
#%time lrCV.fit(X_train_dtm.toarray(), y_train)
#y_pred_lrCV = lrCV.predict(X_test_dtm.toarray())
#metrics.accuracy_score(y_test, y_pred_lrCV)

CPU times: user 52 ms, sys: 0 ns, total: 52 ms
Wall time: 26.5 ms


0.65137614678899081

In [29]:
# Persist the trained random forest to disk.
# sklearn.externals.joblib was removed in scikit-learn 0.23, so prefer the
# standalone joblib package and fall back only on legacy installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
joblib.dump(randForest, "category_classifier.cls")

['category_classifier.cls']