In [78]:
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.model_selection import GridSearchCV, cross_val_score

readability = pd.read_csv('corpus_readability.csv.gz', compression='gzip')
corpus = pd.read_csv('corpus.csv.gz', compression='gzip')
corpus = corpus[corpus['qual_a_melhor_classificao_para_esse_texto:confidence'] == 1]
readability = readability.ix[corpus.index.values]

In [79]:
# fix labels to binary
def classFit(x):
    if x['class'] == "diario":
        return 1
    else:
        return -1

readability['confidence'] = corpus['qual_a_melhor_classificao_para_esse_texto:confidence']
readability['class_'] = readability.apply(classFit,axis=1)
#readability = readability[readability['confidence'] == 1]
target = readability['class_'].values

print(readability['class'].values[:2])
print(readability['class_'][:2])

['diario' 'outro']
466    1
467   -1
Name: class_, dtype: int64


In [80]:
readability.drop('class', axis=1,inplace=True)
readability.drop('class_', axis=1,inplace=True)
readability.drop('Unnamed: 0', axis=1,inplace=True)
readability.drop('confidence', axis=1,inplace=True)

In [81]:
data = readability
data = data.apply(pd.to_numeric, args=('coerce',))
data = data.replace('NaN',0)
data[data < 0] = 0
data = data.as_matrix()

In [72]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB(alpha=0.001)

precision = cross_val_score(model, data, target, cv=10, scoring='precision').mean()
acc = cross_val_score(model, data, target, cv=10, scoring='accuracy').mean()
recall = cross_val_score(model, data, target, cv=10, scoring='recall').mean()
print(precision)
print(acc)
print(recall)

0.73243077621
0.62380664409
0.663865546218


In [76]:
model = svm.LinearSVC(C=2.15)

precision = cross_val_score(model, data, target, cv=10, scoring='precision').mean()
acc = cross_val_score(model, data, target, cv=10, scoring='accuracy').mean()
recall = cross_val_score(model, data, target, cv=10, scoring='recall').mean()
print(precision)
print(acc)
print(recall)

0.688385753774
0.573351072408
0.592100840336


In [7]:
c_range = np.logspace(-3,3,7)
param_grid = [
    {'kernel': ['rbf', 'linear'], 'C': c_range},
]
grid_search = GridSearchCV(svm.SVC(), param_grid, cv=10, verbose=3, n_jobs=10)
grid_search.fit(data, target)

Fitting 10 folds for each of 14 candidates, totalling 140 fits
[CV] C=0.001, kernel=rbf .............................................
[CV] C=0.001, kernel=rbf .............................................
[CV] C=0.001, kernel=rbf .............................................
[CV] C=0.001, kernel=rbf .............................................
[CV] C=0.001, kernel=rbf .............................................
[CV] C=0.001, kernel=rbf .............................................
[CV] .............. C=0.001, kernel=rbf, score=0.648148, total=   0.0s
[CV] C=0.001, kernel=rbf .............................................
[CV] C=0.001, kernel=rbf .............................................
[CV] C=0.001, kernel=rbf .............................................
[CV] .............. C=0.001, kernel=rbf, score=0.648148, total=   0.0s
[CV] C=0.001, kernel=rbf .............................................
[CV] C=0.001, kernel=linear ..........................................
[CV] C=0.001, 

[Parallel(n_jobs=10)]: Done  12 tasks      | elapsed:    0.7s


[CV] C=0.01, kernel=rbf ..............................................
[CV] ........... C=0.001, kernel=linear, score=0.740741, total=   0.5s
[CV] C=0.01, kernel=rbf ..............................................
[CV] C=0.01, kernel=rbf ..............................................
[CV] ............... C=0.01, kernel=rbf, score=0.648148, total=   0.2s
[CV] C=0.01, kernel=rbf ..............................................
[CV] ............... C=0.01, kernel=rbf, score=0.648148, total=   0.1s
[CV] C=0.01, kernel=rbf ..............................................
[CV] ........... C=0.001, kernel=linear, score=0.826923, total=   0.6s
[CV] ............... C=0.01, kernel=rbf, score=0.641509, total=   0.1s
[CV] ............... C=0.01, kernel=rbf, score=0.641509, total=   0.1s
[CV] C=0.01, kernel=rbf ..............................................
[CV] C=0.01, kernel=linear ...........................................
[CV] C=0.01, kernel=rbf ..............................................
[CV] .

[CV] ............... C=10.0, kernel=rbf, score=0.641509, total=   0.1s
[CV] C=10.0, kernel=rbf ..............................................
[CV] ............... C=10.0, kernel=rbf, score=0.641509, total=   0.1s
[CV] C=10.0, kernel=rbf ..............................................
[CV] ............... C=10.0, kernel=rbf, score=0.641509, total=   0.1s
[CV] C=10.0, kernel=rbf ..............................................
[CV] ............... C=10.0, kernel=rbf, score=0.653846, total=   0.1s
[CV] C=10.0, kernel=linear ...........................................
[CV] ............. C=0.1, kernel=linear, score=0.754717, total=  51.5s
[CV] C=10.0, kernel=linear ...........................................
[CV] ............. C=0.1, kernel=linear, score=0.740741, total= 1.4min
[CV] C=10.0, kernel=linear ...........................................
[CV] ............. C=0.1, kernel=linear, score=0.648148, total= 1.6min
[CV] C=10.0, kernel=linear ...........................................
[CV] .

[CV] .......... C=1000.0, kernel=linear, score=0.814815, total=18.4min
[CV] ........... C=100.0, kernel=linear, score=0.685185, total=38.6min
[CV] ........... C=100.0, kernel=linear, score=0.641509, total=36.9min
[CV] .......... C=1000.0, kernel=linear, score=0.634615, total= 9.6min
[CV] .......... C=1000.0, kernel=linear, score=0.792453, total=11.8min
[CV] .......... C=1000.0, kernel=linear, score=0.622642, total=16.7min
[CV] .......... C=1000.0, kernel=linear, score=0.754717, total=24.6min
[CV] .......... C=1000.0, kernel=linear, score=0.722222, total=34.7min
[CV] .......... C=1000.0, kernel=linear, score=0.685185, total=27.8min
[CV] .......... C=1000.0, kernel=linear, score=0.660377, total=17.1min


[Parallel(n_jobs=10)]: Done 140 out of 140 | elapsed: 74.5min finished


GridSearchCV(cv=10, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=10,
       param_grid=[{'kernel': ['rbf', 'linear'], 'C': array([  1.00000e-03,   1.00000e-02,   1.00000e-01,   1.00000e+00,
         1.00000e+01,   1.00000e+02,   1.00000e+03])}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=3)

In [8]:
print(grid_search.best_estimator_)
print(grid_search.best_score_)
print(grid_search.best_params_)

SVC(C=0.01, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
0.74531835206
{'C': 0.01, 'kernel': 'linear'}


In [82]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score
import oll

## manual 10-fold cross-validation
kf = KFold(n_splits=10, random_state=None, shuffle=False)

methods = ["P" ,"AP" ,"CW" ,"PA" ,"PA1","PA2" ,"PAK" ,"AL"]

for m in methods:

    accuracy = []
    precision = []
    recall = []
    
    for train_index, test_index in kf.split(data):
        model = oll.oll(m, C=2.15)
        
        X_train, X_test = data[train_index], data[test_index]
        y_train, y_test = target[train_index], target[test_index]
        model.fit(X_train, y_train)
        predicted = model.predict(X_test)
        
        accuracy.append(accuracy_score(y_test, predicted))
        precision.append(precision_score(y_test, predicted))
        recall.append(recall_score(y_test, predicted))

    print(m + ': acc(' + str(np.mean(accuracy)) 
          + '), prec(' + str(np.mean(precision))
          + '), rec(' + str(np.mean(recall)) + ')'
         )

P: acc(0.683857442348), prec(0.685057844774), rec(0.952522482677)
AP: acc(0.685779175402), prec(0.70324185548), rec(0.902496674507)


  'precision', 'predicted', average, warn_for)


CW: acc(0.353668763103), prec(0.0), rec(0.0)
PA: acc(0.623724668064), prec(0.713477770774), rec(0.717555209883)
PA1: acc(0.623724668064), prec(0.713477770774), rec(0.717555209883)
PA2: acc(0.625576519916), prec(0.714627196061), rec(0.720680209883)
PAK: acc(0.353668763103), prec(0.0), rec(0.0)
AL: acc(0.653948287911), prec(0.65536545339), rec(0.988070175439)
