In [1]:
import pandas as pd
from sklearn import svm
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, cross_val_score
from nltk.corpus import stopwords

corpus_feat = pd.read_csv('corpus_liwc_mtx.csv.gz', compression='gzip')
corpus = pd.read_csv('corpus.csv.gz', compression='gzip')
stopwords = stopwords.words("portuguese")

In [2]:
# filter corpus
corpus = corpus[corpus['qual_a_melhor_classificao_para_esse_texto:confidence'] == 1]
corpus = corpus.reset_index()

In [3]:
# fix labels to binary
def classFit(x):
    if x['qual_a_melhor_classificao_para_esse_texto'] == "diario":
        return 1
    else:
        return -1
    
corpus['class'] = corpus.apply(classFit,axis=1)
target = corpus['class'].values

print(corpus['qual_a_melhor_classificao_para_esse_texto'].values[:2])
print(corpus['class'][:2])

['diario' 'outro']
0    1
1   -1
Name: class, dtype: int64


In [4]:
from scipy.sparse import hstack

corpus_feat.drop('Unnamed: 0', axis=1,inplace=True)
corpus_feat.drop('confidence', axis=1,inplace=True)
corpus_feat.drop('wc', axis=1,inplace=True)
liwc_data = corpus_feat.drop('class', 1).values

tfidf_data = TfidfVectorizer(max_features=900, strip_accents='unicode', stop_words=stopwords).fit_transform(corpus.content)

data = hstack((tfidf_data, liwc_data))
data.shape

(534, 964)

In [10]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()

precision = cross_val_score(model, data, target, cv=10, scoring='precision').mean()
acc = cross_val_score(model, data, target, cv=10, scoring='accuracy').mean()
recall = cross_val_score(model, data, target, cv=10, scoring='recall').mean()
print(precision)
print(acc)
print(recall)

0.822620154495
0.751122130839
0.785966386555


In [17]:
model = svm.LinearSVC(C=50)

precision = cross_val_score(model, data, target, cv=10, scoring='precision').mean()
acc = cross_val_score(model, data, target, cv=10, scoring='accuracy').mean()
recall = cross_val_score(model, data, target, cv=10, scoring='recall').mean()
print(precision)
print(acc)
print(recall)

0.742708974284
0.633075310434
0.814369747899


### confidence-weighted linear classifier (Dredze et al., 2008)

In [15]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score
import numpy as np
import oll

## manual 10-fold cross-validation
kf = KFold(n_splits=2, random_state=None, shuffle=False)

methods = ["P" ,"AP" ,"PA" ,"PA1","PA2" ,"PAK","CW" ,"AL"]

for m in methods:
    model = oll.oll(m, C=1)

    accuracy = []
    precision = []
    recall = []
    
    for train_index, test_index in kf.split(data):

        X_train, X_test = data.toarray()[train_index], data.toarray()[test_index]
        y_train, y_test = target[train_index], target[test_index]
        model.fit(X_train, y_train)
        predicted = model.predict(X_test)

        accuracy.append(accuracy_score(y_test, predicted))
        precision.append(precision_score(y_test, predicted))
        recall.append(recall_score(y_test, predicted))

    print(m + ': acc(' + str(np.mean(accuracy)) 
          + '), prec(' + str(np.mean(precision))
          + '), rec(' + str(np.mean(recall)) + ')'
         )

P: acc(0.651685393258), prec(0.652326615077), rec(0.988023952096)
AP: acc(0.691011235955), prec(0.693067596974), rec(0.960291707089)
PA: acc(0.64606741573), prec(0.647236067697), rec(0.997005988024)
PA1: acc(0.64606741573), prec(0.647236067697), rec(0.997005988024)
PA2: acc(0.64606741573), prec(0.647236067697), rec(0.997005988024)


  'precision', 'predicted', average, warn_for)


PAK: acc(0.352059925094), prec(0.0), rec(0.0)
CW: acc(0.698501872659), prec(0.771896722939), rec(0.772304552905)
AL: acc(0.679775280899), prec(0.676242062606), rec(0.976850767738)


In [5]:
from sklearn.ensemble import GradientBoostingClassifier

model = GradientBoostingClassifier()

precision = cross_val_score(model, data.toarray(), target, cv=10, scoring='precision').mean()
acc = cross_val_score(model, data.toarray(), target, cv=10, scoring='accuracy').mean()
recall = cross_val_score(model, data.toarray(), target, cv=10, scoring='recall').mean()
fscore = cross_val_score(model, data.toarray(), target, cv=10, scoring='f1').mean()
print(precision)
print(fscore)

0.77768426797
0.808712052435
