In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import GridSearchCV, cross_val_score

# load corpus
corpus = pd.read_csv('corpus.csv.gz', compression='gzip')
corpus = corpus[corpus['qual_a_melhor_classificao_para_esse_texto:confidence'] == 1]
corpus = corpus.reset_index()

corpus_feat = pd.read_csv('corpus_liwc_mtx.csv.gz', compression='gzip')
corpus_feat.shape

(534, 68)

In [2]:
import re

def wc(x): 
    try:
        return len(re.findall(r'\w+', x['content']))
    except:
        return 0
    
corpus['wc'] = corpus.apply(wc,axis=1)
corpus_feat['wc'] = corpus['wc']

In [3]:
corpus_feat.drop('Unnamed: 0', axis=1,inplace=True)
corpus_feat.drop('confidence', axis=1,inplace=True)

In [4]:
# fix labels to binary
def classFit(x):
    if x['class'] == "diario":
        return 1
    else:
        return -1
    
corpus_feat['class_'] = corpus_feat.apply(classFit,axis=1)
target = corpus_feat['class_'].values

print(corpus_feat['class'].values[:2])
print(corpus_feat['class_'][:2])

['diario' 'outro']
0    1
1   -1
Name: class_, dtype: int64


In [5]:
wc_vector = corpus_feat['wc']
class_vector = corpus_feat['class']

corpus_feat.drop('class',axis=1,inplace=True)
corpus_feat.drop('class_',axis=1,inplace=True)
corpus_feat.drop('wc',axis=1,inplace=True)

In [6]:
data = corpus_feat.as_matrix().astype(float) / wc_vector.as_matrix().astype(float)[:, np.newaxis]
data[np.isnan(data)] = 0
data[data >= 1E308] = 0
data.shape

  if __name__ == '__main__':
  if __name__ == '__main__':


(534, 64)

## Evaluating SVM

In [22]:
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
grid_search = GridSearchCV(SVC(), parameters, cv=3, n_jobs=3, verbose=3, scoring='accuracy')

grid_search.fit(data, target)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV] C=1, kernel=linear ..............................................
[CV] C=1, kernel=linear ..............................................
[CV] C=1, kernel=linear ..............................................
[CV] ............... C=1, kernel=linear, score=0.636872, total=   0.0s
[CV] ............... C=1, kernel=linear, score=0.646067, total=   0.0s
[CV] C=1, kernel=rbf .................................................
[CV] ............... C=1, kernel=linear, score=0.649718, total=   0.0s
[CV] C=1, kernel=rbf .................................................
[CV] C=1, kernel=rbf .................................................
[CV] .................. C=1, kernel=rbf, score=0.648045, total=   0.0s
[CV] C=10, kernel=linear .............................................
[CV] .................. C=1, kernel=rbf, score=0.646067, total=   0.0s
[CV] .................. C=1, kernel=rbf, score=0.649718, total=   0.1s
[CV] ............

[Parallel(n_jobs=3)]: Done  12 out of  12 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=3)]: Done  12 out of  12 | elapsed:    0.4s finished


GridSearchCV(cv=3, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=3,
       param_grid={'kernel': ('linear', 'rbf'), 'C': [1, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=3)

In [23]:
print(grid_search.best_estimator_)
print(grid_search.best_score_)
print(grid_search.best_params_)

SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
0.75468164794
{'C': 10, 'kernel': 'linear'}


## Evaluating Naive Bayses

In [24]:
from sklearn.naive_bayes import GaussianNB

model = GaussianNB()

acc = cross_val_score(model, data, target, cv=10, scoring='accuracy').mean()
precision = cross_val_score(model, data, target, cv=10, scoring='precision').mean()
recall = cross_val_score(model, data, target, cv=10, scoring='recall').mean()
print(precision)
print(acc)
print(recall)

0.792230333256
0.691121055744
0.716302521008


In [25]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()

precision = cross_val_score(model, data, target, cv=10, scoring='precision').mean()
acc = cross_val_score(model, data, target, cv=10, scoring='accuracy').mean()
recall = cross_val_score(model, data, target, cv=10, scoring='recall').mean()
print(precision)
print(acc)
print(recall)

0.647960006451
0.647960006451
1.0


In [34]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score
import oll
import numpy as np

## manual 10-fold cross-validation
kf = KFold(n_splits=2, random_state=None, shuffle=False)

methods = ["P" ,"AP" ,"PA" ,"PA1","PA2" ,"PAK","CW" ,"AL"]

for m in methods:
    accuracy = []
    precision = []
    recall = []
    
    for train_index, test_index in kf.split(data):
        model = oll.oll(m, C=1)
            
        X_train, X_test = data[train_index], data[test_index]
        y_train, y_test = target[train_index], target[test_index]
        model.fit(X_train, y_train)
        predicted = model.predict(X_test)

        accuracy.append(accuracy_score(y_test, predicted))
        precision.append(precision_score(y_test, predicted))
        recall.append(recall_score(y_test, predicted))

    print(m + ': acc(' + str(np.mean(accuracy)) 
          + '), prec(' + str(np.mean(precision))
          + '), rec(' + str(np.mean(recall)) + ')'
         )

P: acc(0.644194756554), prec(0.646993064753), rec(0.991017964072)
AP: acc(0.681647940075), prec(0.707075030359), rec(0.912805673569)
PA: acc(0.64606741573), prec(0.647236067697), rec(0.997005988024)
PA1: acc(0.64606741573), prec(0.647236067697), rec(0.997005988024)
PA2: acc(0.64606741573), prec(0.647236067697), rec(0.997005988024)
PAK: acc(0.644194756554), prec(0.646526747226), rec(0.994011976048)
CW: acc(0.666666666667), prec(0.66296826533), rec(0.991017964072)
AL: acc(0.644194756554), prec(0.646526747226), rec(0.994011976048)


In [17]:
from sklearn.ensemble import GradientBoostingClassifier

model = GradientBoostingClassifier(learning_rate=0.5, max_depth=5, n_estimators=300, loss='deviance')

acc = cross_val_score(model, data, target, cv=10, scoring='accuracy').mean()
fscore = cross_val_score(model, data, target, cv=10, scoring='f1').mean()
print(acc)
print(fscore)

0.696815029834
0.778309422345


In [15]:
param_grid = {'learning_rate': [0.01, 0.1, 0.1, 0.5, 1.0],
              'max_depth':[1, 3, 5, 7, 9],
              'n_estimators': [100, 300, 500],
              'loss' : ['deviance', 'exponential'],
              'subsample':[0.2, 0.5, 0.8, 1],
              'max_features': [0.5, 1]}

grid_search = GridSearchCV(GradientBoostingClassifier(), param_grid=param_grid, cv=5, verbose=3, n_jobs=3, scoring='accuracy')
grid_search.fit(data, target)

Fitting 5 folds for each of 150 candidates, totalling 750 fits
[CV] n_estimators=100, learning_rate=0.01, loss=deviance, max_depth=1 
[CV] n_estimators=100, learning_rate=0.01, loss=deviance, max_depth=1 
[CV] n_estimators=100, learning_rate=0.01, loss=deviance, max_depth=1 
[CV]  n_estimators=100, learning_rate=0.01, loss=deviance, max_depth=1, score=0.648148, total=   0.1s
[CV]  n_estimators=100, learning_rate=0.01, loss=deviance, max_depth=1, score=0.644860, total=   0.1s
[CV] n_estimators=100, learning_rate=0.01, loss=deviance, max_depth=1 
[CV] n_estimators=100, learning_rate=0.01, loss=deviance, max_depth=1 
[CV] n_estimators=300, learning_rate=0.01, loss=deviance, max_depth=1 
[CV]  n_estimators=100, learning_rate=0.01, loss=deviance, max_depth=1, score=0.644860, total=   0.1s
[CV]  n_estimators=100, learning_rate=0.01, loss=deviance, max_depth=1, score=0.650943, total=   0.1s
[CV] n_estimators=300, learning_rate=0.01, loss=deviance, max_depth=1 
[CV]  n_estimators=100, learning

[Parallel(n_jobs=3)]: Done  65 tasks      | elapsed:   21.5s


[CV]  n_estimators=300, learning_rate=0.01, loss=deviance, max_depth=9, score=0.564815, total=   1.8s
[CV] n_estimators=300, learning_rate=0.01, loss=deviance, max_depth=9 
[CV]  n_estimators=300, learning_rate=0.01, loss=deviance, max_depth=9, score=0.616822, total=   2.2s
[CV] n_estimators=300, learning_rate=0.01, loss=deviance, max_depth=9 
[CV]  n_estimators=300, learning_rate=0.01, loss=deviance, max_depth=9, score=0.682243, total=   2.3s
[CV] n_estimators=500, learning_rate=0.01, loss=deviance, max_depth=9 
[CV]  n_estimators=300, learning_rate=0.01, loss=deviance, max_depth=9, score=0.613208, total=   2.1s
[CV] n_estimators=500, learning_rate=0.01, loss=deviance, max_depth=9 
[CV]  n_estimators=300, learning_rate=0.01, loss=deviance, max_depth=9, score=0.641509, total=   2.0s
[CV] n_estimators=500, learning_rate=0.01, loss=deviance, max_depth=9 
[CV]  n_estimators=500, learning_rate=0.01, loss=deviance, max_depth=9, score=0.574074, total=   3.1s
[CV] n_estimators=500, learning_r

[Parallel(n_jobs=3)]: Done 161 tasks      | elapsed:  1.0min


[CV]  n_estimators=500, learning_rate=0.1, loss=deviance, max_depth=1, score=0.738318, total=   0.3s
[CV] n_estimators=500, learning_rate=0.1, loss=deviance, max_depth=1 .
[CV]  n_estimators=500, learning_rate=0.1, loss=deviance, max_depth=1, score=0.688679, total=   0.3s
[CV] n_estimators=100, learning_rate=0.1, loss=deviance, max_depth=3 .
[CV]  n_estimators=100, learning_rate=0.1, loss=deviance, max_depth=3, score=0.694444, total=   0.2s
[CV] n_estimators=100, learning_rate=0.1, loss=deviance, max_depth=3 .
[CV]  n_estimators=500, learning_rate=0.1, loss=deviance, max_depth=1, score=0.650943, total=   0.3s
[CV] n_estimators=100, learning_rate=0.1, loss=deviance, max_depth=3 .
[CV]  n_estimators=100, learning_rate=0.1, loss=deviance, max_depth=3, score=0.719626, total=   0.2s
[CV] n_estimators=100, learning_rate=0.1, loss=deviance, max_depth=3 .
[CV]  n_estimators=100, learning_rate=0.1, loss=deviance, max_depth=3, score=0.691589, total=   0.2s
[CV] n_estimators=100, learning_rate=0.

[Parallel(n_jobs=3)]: Done 321 tasks      | elapsed:  1.6min


[CV]  n_estimators=300, learning_rate=0.1, loss=deviance, max_depth=3, score=0.679245, total=   0.6s
[CV] n_estimators=500, learning_rate=0.1, loss=deviance, max_depth=3 .
[CV]  n_estimators=300, learning_rate=0.1, loss=deviance, max_depth=3, score=0.707547, total=   0.6s
[CV] n_estimators=500, learning_rate=0.1, loss=deviance, max_depth=3 .
[CV]  n_estimators=500, learning_rate=0.1, loss=deviance, max_depth=3, score=0.675926, total=   0.9s
[CV] n_estimators=500, learning_rate=0.1, loss=deviance, max_depth=3 .
[CV]  n_estimators=500, learning_rate=0.1, loss=deviance, max_depth=3, score=0.682243, total=   0.9s
[CV] n_estimators=500, learning_rate=0.1, loss=deviance, max_depth=3 .
[CV]  n_estimators=500, learning_rate=0.1, loss=deviance, max_depth=3, score=0.719626, total=   0.9s
[CV] n_estimators=100, learning_rate=0.1, loss=deviance, max_depth=5 .
[CV]  n_estimators=100, learning_rate=0.1, loss=deviance, max_depth=5, score=0.675926, total=   0.4s
[CV] n_estimators=100, learning_rate=0.

[Parallel(n_jobs=3)]: Done 545 tasks      | elapsed:  2.3min


[CV]  n_estimators=300, learning_rate=0.5, loss=exponential, max_depth=3, score=0.700935, total=   0.3s
[CV] n_estimators=300, learning_rate=0.5, loss=exponential, max_depth=3 
[CV]  n_estimators=300, learning_rate=0.5, loss=exponential, max_depth=3, score=0.663551, total=   0.5s
[CV] n_estimators=500, learning_rate=0.5, loss=exponential, max_depth=3 
[CV]  n_estimators=300, learning_rate=0.5, loss=exponential, max_depth=3, score=0.716981, total=   0.4s
[CV] n_estimators=500, learning_rate=0.5, loss=exponential, max_depth=3 
[CV]  n_estimators=500, learning_rate=0.5, loss=exponential, max_depth=3, score=0.657407, total=   0.4s
[CV] n_estimators=500, learning_rate=0.5, loss=exponential, max_depth=3 
[CV]  n_estimators=300, learning_rate=0.5, loss=exponential, max_depth=3, score=0.688679, total=   0.5s
[CV] n_estimators=500, learning_rate=0.5, loss=exponential, max_depth=3 
[CV]  n_estimators=500, learning_rate=0.5, loss=exponential, max_depth=3, score=0.691589, total=   0.4s
[CV] n_esti

[Parallel(n_jobs=3)]: Done 750 out of 750 | elapsed:  2.8min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=3,
       param_grid={'n_estimators': [100, 300, 500], 'learning_rate': [0.01, 0.1, 0.1, 0.5, 1.0], 'loss': ['deviance', 'exponential'], 'max_depth': [1, 3, 5, 7, 9]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=3)

In [16]:
print(grid_search.best_estimator_)
print(grid_search.best_score_)
print(grid_search.best_params_)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.5, loss='deviance', max_depth=5,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=300, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False)
0.713483146067
{'n_estimators': 300, 'learning_rate': 0.5, 'loss': 'deviance', 'max_depth': 5}
