In [111]:
import pandas as pd
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, cross_val_score
from nltk.corpus import stopwords

# Read the three gzip-compressed CSV inputs. Judging by the file names they
# hold the LIWC feature matrix, the readability metrics and the corpus itself.
corpus_feat, readability, corpus = (
    pd.read_csv(csv_path, compression='gzip')
    for csv_path in (
        'corpus_liwc_mtx.csv.gz',
        'corpus_readability.csv.gz',
        'corpus.csv.gz',
    )
)

# NOTE: this rebinds `stopwords` from the imported NLTK object to the plain
# list it returns for Portuguese; the importing name is shadowed from here on.
stopwords = stopwords.words("portuguese")

In [112]:
# Keep only the rows whose classification confidence equals 1, then renumber
# the rows from zero. reset_index() (without drop=True) stores the previous
# row labels in a new 'index' column.
corpus = corpus[
    corpus['qual_a_melhor_classificao_para_esse_texto:confidence'].eq(1)
].reset_index()

In [113]:
# Keep the readability rows whose labels match the corpus index.
# `.loc` replaces `DataFrame.ix`, which was deprecated in pandas 0.20 and
# removed in 1.0; with the default integer index `.ix` resolved by label, so
# `.loc` is the equivalent indexer.
# NOTE(review): corpus was reset_index()-ed before this point, so these labels
# are 0..len(corpus)-1 rather than the original row ids - confirm that the
# readability file is already aligned with the filtered corpus.
readability = readability.loc[corpus.index.values]

In [114]:
# fix labels to binary
def classFit(x):
    """Turn one annotated row into a binary label.

    Parameters
    ----------
    x : mapping-like (e.g. a DataFrame row)
        Must expose the key 'qual_a_melhor_classificao_para_esse_texto'.

    Returns
    -------
    int
        1 when that field equals "diario", -1 for any other value.
    """
    is_diary = x['qual_a_melhor_classificao_para_esse_texto'] == "diario"
    return 1 if is_diary else -1
    
# Label every row with classFit (row-wise, hence axis=1) and keep the labels
# as a NumPy array to pass to the estimators.
corpus['class'] = corpus.apply(classFit, axis=1)
target = corpus['class'].values

# Quick visual check: first two raw annotations next to their numeric labels.
print(corpus['qual_a_melhor_classificao_para_esse_texto'].head(2).values)
print(corpus['class'].head(2))

['diario' 'outro']
0    1
1   -1
Name: class, dtype: int64


In [115]:
from scipy.sparse import hstack

# Assemble the feature matrix: TF-IDF terms + LIWC columns + readability columns.

# --- LIWC block -------------------------------------------------------------
# Drop the bookkeeping columns. Rebinding (instead of inplace=True) combined
# with errors='ignore' keeps this cell re-runnable: a second execution no
# longer raises KeyError because the columns are already gone. The trade-off
# is that a column missing on the first run is not reported either.
corpus_feat = corpus_feat.drop(['Unnamed: 0', 'confidence', 'wc'], axis=1, errors='ignore')
# `axis` is passed by keyword; the positional form was removed in pandas 2.0.
liwc_data = corpus_feat.drop('class', axis=1)
liwc_data[liwc_data < 0] = 0  # floor negative values at zero

# --- Readability block ------------------------------------------------------
readability = (
    readability
    .drop(['Unnamed: 0', 'class'], axis=1, errors='ignore')
    # Non-numeric cells become NaN ...
    .apply(pd.to_numeric, errors='coerce')
    # ... and are then zero-filled. The previous replace('NaN', 0) matched the
    # *string* 'NaN', so real missing values were left in the matrix.
    .fillna(0)
)
readability[readability < 0] = 0  # floor negative values at zero

# --- TF-IDF block -----------------------------------------------------------
tfidf_data = TfidfVectorizer(
    max_features=900,
    strip_accents='unicode',
    stop_words=stopwords,
).fit_transform(corpus.content)

# Stack the three blocks column-wise. The original referenced an undefined
# name `readability_data` here, which raises NameError on a fresh kernel; the
# cleaned `readability` frame is what has to be stacked.
data = hstack((tfidf_data, liwc_data, readability.astype(float)))
data.shape

(534, 1003)

In [116]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB

# Gaussian naive Bayes scored with 10-fold cross-validation.
model = GaussianNB()

# The estimator is fed a dense array. Convert the sparse matrix once instead
# of once per scoring pass - the conversion is loop-invariant.
dense_data = data.toarray()

# One cross-validation pass per metric; each value is the mean over the folds.
precision, acc, recall = (
    cross_val_score(model, dense_data, target, cv=10, scoring=metric).mean()
    for metric in ('precision', 'accuracy', 'recall')
)
print(precision)
print(acc)
print(recall)

0.796435979547
0.730109390958
0.783193277311


In [117]:
# Linear SVM scored with 10-fold cross-validation.
# LinearSVC's solver uses a random number generator, and each metric below
# triggers its own round of refits. A fixed seed makes the three passes fit
# identical models, so the reported precision/accuracy/recall describe the
# same classifiers and the cell is reproducible across runs.
model = svm.LinearSVC(C=50, random_state=0)

# One cross-validation pass per metric; each value is the mean over the folds.
precision, acc, recall = (
    cross_val_score(model, data, target, cv=10, scoring=metric).mean()
    for metric in ('precision', 'accuracy', 'recall')
)
print(precision)
print(acc)
print(recall)

0.814847374847
0.674770198355
0.697731092437


In [119]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score
import numpy as np
import oll

## manual 10-fold cross-validation
# Contiguous, unshuffled folds: the split is deterministic.
kf = KFold(n_splits=10, random_state=None, shuffle=False)

# Algorithm identifiers handed to `oll.oll`, one model family per entry.
methods = ["P" ,"AP" ,"PA" ,"PA1","PA2" ,"PAK","CW" ,"AL"]

# Densify the feature matrix once. The original called data.toarray() twice
# per fold for every method (160 full conversions for 8 methods x 10 folds).
dense_data = data.toarray()

for m in methods:

    # Per-fold scores for the current method.
    accuracy = []
    precision = []
    recall = []

    for train_index, test_index in kf.split(dense_data):
        # Fresh model per fold so nothing learned on one split leaks into the next.
        model = oll.oll(m, C=1)

        X_train, X_test = dense_data[train_index], dense_data[test_index]
        y_train, y_test = target[train_index], target[test_index]
        model.fit(X_train, y_train)
        predicted = model.predict(X_test)

        accuracy.append(accuracy_score(y_test, predicted))
        precision.append(precision_score(y_test, predicted))
        recall.append(recall_score(y_test, predicted))

    # Report the mean of each metric over the ten folds.
    print(m + ': acc(' + str(np.mean(accuracy))
          + '), prec(' + str(np.mean(precision))
          + '), rec(' + str(np.mean(recall)) + ')'
         )

P: acc(0.653808525507), prec(0.653152559201), rec(0.990423387097)
AP: acc(0.648287910552), prec(0.651924152018), rec(0.98)
PA: acc(0.648183088749), prec(0.648183088749), rec(1.0)
PA1: acc(0.648183088749), prec(0.648183088749), rec(1.0)
PA2: acc(0.648183088749), prec(0.648183088749), rec(1.0)


  'precision', 'predicted', average, warn_for)


PAK: acc(0.351816911251), prec(0.0), rec(0.0)
CW: acc(0.659119496855), prec(0.720205718914), rec(0.77749752189)
AL: acc(0.648183088749), prec(0.648183088749), rec(1.0)
