In [33]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_validate
from sklearn.svm import SVC, LinearSVC

# Read the two gzip-compressed CSV inputs: the annotated corpus and its
# companion feature table (presumably LIWC category values, going by the
# file name -- confirm against the data).
corpus, corpus_feat = (
    pd.read_csv(csv_path, compression='gzip')
    for csv_path in ('corpus.csv.gz', 'corpus_liwc_mtx.csv.gz')
)

In [34]:
# Copy the columns needed for filtering and labelling from the raw corpus
# onto the feature table, under shorter names. pandas aligns the assignment
# on the row index, so both frames are assumed to share the same index.
for short_name, corpus_column in (
    ('class', 'qual_a_melhor_classificao_para_esse_texto'),
    ('confidence', 'qual_a_melhor_classificao_para_esse_texto:confidence'),
    ('wc', 'contentcount'),
    ('judges', '_trusted_judgments'),
):
    corpus_feat[short_name] = corpus[corpus_column]

In [35]:
# Discard the two leading columns of the feature table. The second drop
# re-reads columns[0] so it removes whatever is first after the first drop.
# (Presumably these are index/identifier columns from the CSV export --
# TODO confirm.) Non-inplace drops are used throughout: calling
# drop(..., inplace=True) on a frame produced by boolean filtering is the
# pattern that triggers pandas' SettingWithCopyWarning.
features = corpus_feat.drop(corpus_feat.columns[0], axis=1)
features = features.drop(features.columns[0], axis=1)

# Keep only rows whose `wc` value is a string of digits and whose `judges`
# value equals 3.
has_numeric_wc = features['wc'].map(str).str.isnumeric()
has_three_judges = features['judges'] == 3
features = features[has_numeric_wc & has_three_judges]

# Pull out the per-row word counts and class labels before the helper
# columns are removed from the feature matrix.
wc_vector = features['wc']
class_vector = features['class']

# What remains in corpus_feat is the pure feature table used downstream.
corpus_feat = features.drop(['class', 'judges', 'confidence', 'wc'], axis=1)

In [36]:
# Normalise each row's feature values by that row's word count.
# `.values` replaces `as_matrix()`, which pandas deprecated and later removed.
# The trailing np.newaxis turns the word counts into a column vector so the
# division broadcasts across every feature column of the same row.
# NOTE(review): a word count of 0 would produce inf/NaN here -- confirm the
# corpus contains none.
feature_values = corpus_feat.values.astype(float)
word_counts = wc_vector.values.astype(float)
data = feature_values / word_counts[:, np.newaxis]

In [37]:
# Encode the class labels as integers: the binarizer is configured to emit
# 1 for the negative label and 2 for the positive one.
lb = preprocessing.LabelBinarizer(neg_label=1, pos_label=2)
lb.fit(class_vector.values)
target = lb.transform(class_vector.values)

# The binarizer returns a 2-D array; collapse it to a 1-D label vector.
# The reshape only succeeds when that array has a single column.
c, r = target.shape
target = target.reshape(c)

## Evaluating SVM

In [38]:
# Candidate SVC settings: both kernels, each at two values of C.
parameters = {
    'kernel': ('linear', 'rbf'),
    'C': [1, 10],
}

# Exhaustive search over the grid, scored by accuracy with 3-fold
# cross-validation, run on three parallel jobs with verbose per-fold logging.
grid_search = GridSearchCV(
    estimator=SVC(),
    param_grid=parameters,
    scoring='accuracy',
    cv=3,
    n_jobs=3,
    verbose=3,
)

grid_search.fit(data, target)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV] kernel=linear, C=1 ..............................................
[CV] kernel=linear, C=1 ..............................................
[CV] kernel=linear, C=1 ..............................................
[CV] ............... kernel=linear, C=1, score=0.634868, total=   0.0s
[CV] ............... kernel=linear, C=1, score=0.634868, total=   0.0s
[CV] kernel=rbf, C=1 .................................................
[CV] ............... kernel=linear, C=1, score=0.635762, total=   0.0s
[CV] kernel=rbf, C=1 .................................................
[CV] kernel=rbf, C=1 .................................................
[CV] .................. kernel=rbf, C=1, score=0.634868, total=   0.1s
[CV] kernel=linear, C=10 .............................................
[CV] .................. kernel=rbf, C=1, score=0.634868, total=   0.1s
[CV] .................. kernel=rbf, C=1, score=0.635762, total=   0.1s
[CV] ............

[Parallel(n_jobs=3)]: Done  12 out of  12 | elapsed:    0.5s remaining:    0.0s
[Parallel(n_jobs=3)]: Done  12 out of  12 | elapsed:    0.5s finished


GridSearchCV(cv=3, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=3,
       param_grid={'kernel': ('linear', 'rbf'), 'C': [1, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=3)

In [39]:
# Report the outcome of the search, one item per line: the best estimator,
# its cross-validated score, and the parameter combination that produced it.
print(
    grid_search.best_estimator_,
    grid_search.best_score_,
    grid_search.best_params_,
    sep='\n',
)

SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
0.654945054945
{'kernel': 'linear', 'C': 10}


## Evaluating Naive Bayes

In [40]:
from sklearn.naive_bayes import GaussianNB

model = GaussianNB()

# Score both metrics in a single 10-fold cross-validation pass instead of
# re-running the whole cross-validation once per metric. Train scores are
# not needed, so they are switched off explicitly.
# NOTE(review): the binary 'recall' scorer treats label 1 as the positive
# class; if `target` uses the 1/2 encoding built earlier in this notebook,
# that is the binarizer's *negative* label -- confirm this is intended.
cv_scores = cross_validate(
    model, data, target,
    cv=10,
    scoring=('accuracy', 'recall'),
    return_train_score=False,
)
acc = cv_scores['test_accuracy'].mean()
recall = cv_scores['test_recall'].mean()
print(acc)
print(recall)

0.372533312099
0.0173018753781


In [41]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()

# Score all three metrics in a single 10-fold cross-validation pass instead
# of re-running the whole cross-validation once per metric. Train scores
# are not needed, so they are switched off explicitly.
# NOTE(review): the binary 'f1' and 'recall' scorers treat label 1 as the
# positive class; if `target` uses the 1/2 encoding built earlier in this
# notebook, that is the binarizer's *negative* label -- confirm this is
# intended before interpreting the near-perfect recall.
cv_scores = cross_validate(
    model, data, target,
    cv=10,
    scoring=('f1', 'accuracy', 'recall'),
    return_train_score=False,
)
f1 = cv_scores['test_f1'].mean()
acc = cv_scores['test_accuracy'].mean()
recall = cv_scores['test_recall'].mean()
print(f1)
print(acc)
print(recall)

0.77522092864
0.632973403408
0.996551724138
