In [3]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC, SVC

corpus = pd.read_csv('corpus.csv.gz', compression='gzip')
corpus = corpus[corpus['qual_a_melhor_classificao_para_esse_texto:confidence'] == 1]

In [4]:
# fix labels to binary
def classFit(x):
    if x['qual_a_melhor_classificao_para_esse_texto'] == "diario":
        return 1
    else:
        return -1
    
corpus['class'] = corpus.apply(classFit,axis=1)
target = corpus['class'].values

print(corpus['qual_a_melhor_classificao_para_esse_texto'].values[:2])
print(corpus['class'][:2])

['diario' 'outro']
466    1
467   -1
Name: class, dtype: int64


In [5]:
data = np.random.random((len(target), 100))

In [6]:
model = MultinomialNB()

acc = cross_val_score(model, data, target, cv=10, scoring='accuracy').mean()
recall = cross_val_score(model, data, target, cv=10, scoring='recall').mean()
precision = cross_val_score(model, data, target, cv=10, scoring='precision').mean()

print('acc(' + str(round(acc,4)) 
      + '), precision(' + str(round(precision,4)) 
      + '), recall(' + str(round(recall,4)) + ')') 

acc(0.648), precision(0.648), recall(1.0)


In [7]:
model = LinearSVC()

acc = cross_val_score(model, data, target, cv=10, scoring='accuracy').mean()
recall = cross_val_score(model, data, target, cv=10, scoring='recall').mean()
precision = cross_val_score(model, data, target, cv=10, scoring='precision').mean()

print('acc(' + str(round(acc,4)) 
      + '), precision(' + str(round(precision,4)) 
      + '), recall(' + str(round(recall,4)) + ')') 

acc(0.5954), precision(0.6633), recall(0.7661)


In [8]:
model = SVC(kernel='rbf')

acc = cross_val_score(model, data, target, cv=10, scoring='accuracy').mean()
recall = cross_val_score(model, data, target, cv=10, scoring='recall').mean()
precision = cross_val_score(model, data, target, cv=10, scoring='precision').mean()

print('acc(' + str(round(acc,4)) 
      + '), precision(' + str(round(precision,4)) 
      + '), recall(' + str(round(recall,4)) + ')') 

acc(0.648), precision(0.648), recall(1.0)


In [9]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score
import oll
import numpy as np

## manual 10-fold cross-validation
kf = KFold(n_splits=2, random_state=None, shuffle=False)

methods = ["P" ,"AP" ,"PA" ,"PA1","PA2" ,"PAK","CW" ,"AL"]

for m in methods:

    accuracy = []
    precision = []
    recall = []
    
    for train_index, test_index in kf.split(data):
        model = oll.oll(m, C=1)
        
        X_train, X_test = data[train_index], data[test_index]
        y_train, y_test = target[train_index], target[test_index]
        model.fit(X_train, y_train)
        predicted = model.predict(X_test)

        accuracy.append(accuracy_score(y_test, predicted))
        precision.append(precision_score(y_test, predicted))
        recall.append(recall_score(y_test, predicted))

    print(m + ': acc(' + str(np.mean(accuracy)) 
          + '), prec(' + str(np.mean(precision))
          + '), rec(' + str(np.mean(recall)) + ')'
         )

P: acc(0.644194756554), prec(0.653914197258), rec(0.95229652427)
AP: acc(0.640449438202), prec(0.657328491939), rec(0.927156859465)
PA: acc(0.647940074906), prec(0.648413539679), rec(0.997005988024)
PA1: acc(0.647940074906), prec(0.648413539679), rec(0.997005988024)
PA2: acc(0.647940074906), prec(0.648413539679), rec(0.997005988024)


  'precision', 'predicted', average, warn_for)


PAK: acc(0.352059925094), prec(0.0), rec(0.0)
CW: acc(0.64606741573), prec(0.663002732623), rec(0.917371959991)
AL: acc(0.629213483146), prec(0.652206080172), rec(0.920382698291)
