In [22]:
import pandas as pd
import numpy as np


In [23]:
# load the news dataset; the path is relative to the notebook's working directory
path = 'data/noticias.csv'
news = pd.read_csv(
    path,
    sep=' ',        # fields are separated by a single space
    quotechar='|',  # fields containing spaces are wrapped in pipes
    names=['index', 'message', 'risk', 'category'],
)

In [24]:
# preview the first ten records to sanity-check the parsed columns
news.head(n=10)

Unnamed: 0,index,message,risk,category
0,0,Mas quem dirá se será pacto democrático ou lut...,Medio,Manifestacao
1,1,"Assinado pela presidente nacional do partido, ...",Medio,Politica
2,2,PolíticaO deputado federal Vicente Cândido (PT...,Alto,Corrupcao
3,3,Trata-se do célebre caso do apartamento tríple...,Medio,Processo-Juridico
4,4,Pesquisa divulgada na semana passada pelo site...,Baixo,Eleicoes
5,5,Uma hipotética condenação de Lula teria como ú...,Medio,Politica
6,6,O que nós queremos é que não inviabilizem o no...,Medio,Manifestacao
7,7,"""Eu estou quase falando: 'Moro, meu amigo Moro...",Alto,Crime
8,8,"Repetiu que, ""se for necessário, será candidat...",Alto,Processo-Juridico
9,9,O MPF pedirá o aumento da pena de Palocci e do...,Alto,Corrupcao


In [25]:
# Convert the textual risk label to an ordinal integer code (Baixo < Medio < Alto).
# The column is overwritten in place, so the mapping is guarded: re-running this
# cell on already-numeric values would otherwise turn every label into NaN.
if not pd.api.types.is_numeric_dtype(news['risk']):
    news['risk'] = news['risk'].map({'Baixo': 1, 'Medio': 2, 'Alto': 3})

In [26]:
# check that the `risk` column now shows numeric codes instead of text labels
news.head(n=5)

Unnamed: 0,index,message,risk,category
0,0,Mas quem dirá se será pacto democrático ou lut...,2,Manifestacao
1,1,"Assinado pela presidente nacional do partido, ...",2,Politica
2,2,PolíticaO deputado federal Vicente Cândido (PT...,3,Corrupcao
3,3,Trata-se do célebre caso do apartamento tríple...,2,Processo-Juridico
4,4,Pesquisa divulgada na semana passada pelo site...,1,Eleicoes


## Separate train and test data 

In [27]:
# Features and target taken from the news frame.
# X stays a 1-D Series of raw text because it is fed to CountVectorizer later on.
X = news['message']
y = news['risk']
print(X.shape, y.shape, sep='\n')

(434,)
(434,)


In [28]:
# Split X and y into training and testing sets (default test share; seeded for
# reproducibility).
# `sklearn.cross_validation` was removed in scikit-learn 0.20; `train_test_split`
# lives in `sklearn.model_selection`, which the later import cell already uses.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(325,)
(109,)
(325,)
(109,)


## Initialize the vectorizer

In [44]:
# import the text vectorizers and the persistence helper
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# `sklearn.externals.joblib` no longer exists in recent scikit-learn releases;
# prefer the standalone package and fall back to the vendored copy on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# bag-of-words counts; min_df=2 ignores terms seen in fewer than two documents
# (so this is NOT the default configuration)
vect = CountVectorizer(min_df=2)

# TF-IDF alternative limited to 2500 features; not used by the cells shown below
vectorizer = TfidfVectorizer(max_features=2500,
                                 min_df=2, use_idf=True)


In [45]:
# `sklearn.externals.joblib` no longer exists in recent scikit-learn releases;
# prefer the standalone package and fall back to the vendored copy on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# learn the vocabulary from the training text and build its document-term matrix
# in a single step
X_train_dtm = vect.fit_transform(X_train)

# persist the fitted vectorizer so the same vocabulary can be reused later
joblib.dump(vect, "risk_voc.cls")

# prepare test data: transform only, reusing the vocabulary learned on the training set
X_test_dtm = vect.transform(X_test)

## Machine Learning Algorithms 

In [31]:
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import linear_model
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import LinearSVC
from sklearn.calibration import calibration_curve
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import OrthogonalMatchingPursuit

## Gaussian Naive Bayes

In [32]:
gauss = GaussianNB()
%time gauss.fit(X_train_dtm.toarray(), y_train)
y_pred_gauss = gauss.predict(X_test_dtm.toarray())
metrics.accuracy_score(y_test, y_pred_gauss)

CPU times: user 32 ms, sys: 0 ns, total: 32 ms
Wall time: 33.6 ms


0.65137614678899081

## KNeighborsClassifier

In [33]:
KNC = KNeighborsClassifier(n_neighbors=100)
%time KNC.fit(X_train_dtm.toarray(), y_train)
y_pred_KNC = KNC.predict(X_test_dtm.toarray())
metrics.accuracy_score(y_test, y_pred_KNC)

CPU times: user 16 ms, sys: 0 ns, total: 16 ms
Wall time: 16.6 ms


0.55045871559633031

## Random Forest

In [34]:
randForest = RandomForestClassifier(n_estimators=87, n_jobs=300)
%time randForest.fit(X_train_dtm, y_train)
y_pred_randForest = randForest.predict(X_test_dtm)
metrics.accuracy_score(y_test, y_pred_randForest)

CPU times: user 500 ms, sys: 84 ms, total: 584 ms
Wall time: 493 ms


0.66972477064220182

## Decision Tree

In [35]:
dt = DecisionTreeClassifier()
%time dt.fit(X_train_dtm.toarray(), y_train)
y_pred_dt = dt.predict(X_test_dtm.toarray())
metrics.accuracy_score(y_test, y_pred_dt)

CPU times: user 44 ms, sys: 4 ms, total: 48 ms
Wall time: 50.7 ms


0.55045871559633031

## Gradient Boosting Classifier

In [36]:
GBC = GradientBoostingClassifier()
%time GBC.fit(X_train_dtm.toarray(), y_train)
y_pred_GBC = GBC.predict(X_test_dtm.toarray())
metrics.accuracy_score(y_test, y_pred_GBC)

CPU times: user 4.73 s, sys: 0 ns, total: 4.73 s
Wall time: 4.73 s


0.68807339449541283

## Linear SVC

In [37]:
svc = LinearSVC(C=5e-1)
%time svc.fit(X_train_dtm.toarray(), y_train)
y_pred_svc = svc.predict(X_test_dtm.toarray())
metrics.accuracy_score(y_test, y_pred_svc)

CPU times: user 116 ms, sys: 0 ns, total: 116 ms
Wall time: 115 ms


0.74311926605504586

## SGD Classifier

In [38]:
# Linear model trained with stochastic gradient descent, fitted directly on the
# sparse document-term matrix.
# SGD shuffles the training data on every epoch, so random_state is fixed to make
# the reported accuracy reproducible.
clf = linear_model.SGDClassifier(random_state=1)
clf.fit(X_train_dtm, y_train)
y_pred_clf = clf.predict(X_test_dtm)
metrics.accuracy_score(y_test, y_pred_clf)

0.69724770642201839

## Orthogonal Matching Pursuit

In [39]:
ort = OrthogonalMatchingPursuit()
%time ort.fit(X_train_dtm.toarray(), y_train)
y_pred_ort = ort.predict(X_test_dtm.toarray())
metrics.accuracy_score(y_test, y_pred_ort.round().clip(1,3))

CPU times: user 248 ms, sys: 0 ns, total: 248 ms
Wall time: 124 ms


0.55045871559633031

## LogisticRegression

In [40]:
lr = LogisticRegression(C=20)
lrCV = LogisticRegressionCV()
%time lr.fit(X_train_dtm.toarray(), y_train)
y_pred_lr = lr.predict(X_test_dtm.toarray())
metrics.accuracy_score(y_test, y_pred_lr)
#%time lrCV.fit(X_train_dtm.toarray(), y_train)
#y_pred_lrCV = lrCV.predict(X_test_dtm.toarray())
#metrics.accuracy_score(y_test, y_pred_lrCV)

CPU times: user 56 ms, sys: 4 ms, total: 60 ms
Wall time: 30.8 ms


0.76146788990825687

In [41]:
# Persist the fitted logistic-regression model to disk.
# `sklearn.externals.joblib` no longer exists in recent scikit-learn releases;
# prefer the standalone package and fall back to the vendored copy on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
joblib.dump(lr, "risk_classifier.cls")

['risk_classifier.cls']

In [42]:
# Round-trip check: reload the persisted classifier and score it on the same test
# matrix; the accuracy should match the in-memory model's.
# NOTE: joblib files are pickles - only load files that you created yourself.
model = joblib.load("risk_classifier.cls")
y_pred_model = model.predict(X=X_test_dtm.toarray())
metrics.accuracy_score(y_true=y_test, y_pred=y_pred_model)

0.76146788990825687