In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import GridSearchCV, cross_val_score

# load corpus
corpus = pd.read_csv('corpus.csv.gz', compression='gzip')
corpus = corpus[corpus['qual_a_melhor_classificao_para_esse_texto:confidence'] == 1]
corpus = corpus.reset_index()

corpus_feat = pd.read_csv('corpus_liwc_mtx.csv.gz', compression='gzip')
corpus_feat.shape

(534, 68)

In [2]:
corpus_feat.drop('Unnamed: 0', axis=1,inplace=True)
corpus_feat.drop('confidence', axis=1,inplace=True)

In [3]:
# fix labels to binary
def classFit(x):
    if x['class'] == "diario":
        return 1
    else:
        return -1
    
corpus_feat['class_'] = corpus_feat.apply(classFit,axis=1)
target = corpus_feat['class_'].values

print(corpus_feat['class'].values[:2])
print(corpus_feat['class_'][:2])

['diario' 'outro']
0    1
1   -1
Name: class_, dtype: int64


In [4]:
class_vector = corpus_feat['class']

corpus_feat.drop('class',axis=1,inplace=True)
corpus_feat.drop('class_',axis=1,inplace=True)
corpus_feat.drop('wc',axis=1,inplace=True)

In [5]:
corpus_feat.ix[0]

funct      73.0
pronoun    25.0
ppron      13.0
i           3.0
we          0.0
you        10.0
shehe       4.0
they        5.0
ipron      16.0
article     9.0
verb       19.0
auxverb     5.0
past        8.0
present    10.0
future      2.0
adverb      2.0
preps      32.0
conj       12.0
negate      1.0
quant       6.0
number      0.0
swear       4.0
social     27.0
family      1.0
friend      2.0
humans     11.0
affect     14.0
posemo      5.0
negemo      6.0
anx         1.0
           ... 
cause       8.0
discrep     9.0
tentat      7.0
certain     5.0
inhib      11.0
incl       32.0
excl        8.0
percept    12.0
see         5.0
hear        0.0
feel        4.0
bio        11.0
body        5.0
health      3.0
sexual      4.0
ingest     11.0
relativ    38.0
motion      8.0
space      25.0
time       19.0
work        2.0
achieve     4.0
leisure     1.0
home        1.0
money       4.0
relig       0.0
death       0.0
assent      0.0
nonfl       2.0
filler      0.0
Name: 0, dtype: float64

In [6]:
corpus_feat.sum(axis=0)

funct      102564.0
pronoun     37998.0
ppron       24229.0
i            5358.0
we           1174.0
you         15268.0
shehe       14041.0
they         3999.0
ipron       26023.0
article     16791.0
verb        30461.0
auxverb     12373.0
past         8469.0
present     17143.0
future       1346.0
adverb       7617.0
preps       30952.0
conj        20695.0
negate       3412.0
quant       11134.0
number       3167.0
swear       11493.0
social      40817.0
family        874.0
friend       1499.0
humans      14024.0
affect      17833.0
posemo      11615.0
negemo       5512.0
anx          1006.0
             ...   
cause        9480.0
discrep     12908.0
tentat      20134.0
certain      5268.0
inhib        9298.0
incl        30839.0
excl        14732.0
percept     11694.0
see          3429.0
hear         2885.0
feel         4561.0
bio         13079.0
body         5507.0
health       2245.0
sexual       2777.0
ingest      13878.0
relativ     45918.0
motion      13172.0
space       21634.0


In [7]:
for col in corpus_feat.columns:
    corpus_feat[col] = corpus_feat[col]/corpus_feat[col].sum()

In [8]:
corpus_feat.ix[0]

funct      0.000712
pronoun    0.000658
ppron      0.000537
i          0.000560
we         0.000000
you        0.000655
shehe      0.000285
they       0.001250
ipron      0.000615
article    0.000536
verb       0.000624
auxverb    0.000404
past       0.000945
present    0.000583
future     0.001486
adverb     0.000263
preps      0.001034
conj       0.000580
negate     0.000293
quant      0.000539
number     0.000000
swear      0.000348
social     0.000661
family     0.001144
friend     0.001334
humans     0.000784
affect     0.000785
posemo     0.000430
negemo     0.001089
anx        0.000994
             ...   
cause      0.000844
discrep    0.000697
tentat     0.000348
certain    0.000949
inhib      0.001183
incl       0.001038
excl       0.000543
percept    0.001026
see        0.001458
hear       0.000000
feel       0.000877
bio        0.000841
body       0.000908
health     0.001336
sexual     0.001440
ingest     0.000793
relativ    0.000828
motion     0.000607
space      0.001156


In [9]:
data = corpus_feat.as_matrix().astype(float)
data.shape

(534, 64)

## Evaluating SVM

In [10]:
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
grid_search = GridSearchCV(SVC(), parameters, cv=3, n_jobs=3, verbose=3, scoring='accuracy')

grid_search.fit(data, target)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV] C=1, kernel=linear ..............................................
[CV] C=1, kernel=linear ..............................................
[CV] C=1, kernel=linear ..............................................
[CV] ............... C=1, kernel=linear, score=0.649718, total=   0.0s
[CV] ............... C=1, kernel=linear, score=0.646067, total=   0.0s
[CV] C=1, kernel=rbf .................................................
[CV] C=1, kernel=rbf .................................................
[CV] ............... C=1, kernel=linear, score=0.648045, total=   0.0s
[CV] .................. C=1, kernel=rbf, score=0.646067, total=   0.0s
[CV] C=1, kernel=rbf .................................................
[CV] C=10, kernel=linear .............................................
[CV] .................. C=1, kernel=rbf, score=0.648045, total=   0.0s
[CV] .............. C=10, kernel=linear, score=0.648045, total=   0.0s
[CV] C=10, kernel

[Parallel(n_jobs=3)]: Done  12 out of  12 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=3)]: Done  12 out of  12 | elapsed:    0.2s finished


GridSearchCV(cv=3, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=3,
       param_grid={'C': [1, 10], 'kernel': ('linear', 'rbf')},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=3)

In [11]:
print(grid_search.best_estimator_)
print(grid_search.best_score_)
print(grid_search.best_params_)

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
0.647940074906
{'C': 1, 'kernel': 'linear'}


## Evaluating Naive Bayses

In [12]:
from sklearn.naive_bayes import GaussianNB

model = GaussianNB()

acc = cross_val_score(model, data, target, cv=10, scoring='accuracy').mean()
precision = cross_val_score(model, data, target, cv=10, scoring='precision').mean()
recall = cross_val_score(model, data, target, cv=10, scoring='recall').mean()
print(precision)
print(acc)
print(recall)

0.699760764395
0.638488415847
0.774285714286


In [13]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()

precision = cross_val_score(model, data, target, cv=10, scoring='precision').mean()
acc = cross_val_score(model, data, target, cv=10, scoring='accuracy').mean()
recall = cross_val_score(model, data, target, cv=10, scoring='recall').mean()
fscore = cross_val_score(model, data, target, cv=10, scoring='f1').mean()
print(precision)
print(acc)
print(recall)
print(fscore)

0.647960006451
0.647960006451
1.0
0.786371486324


In [14]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score
import oll
import numpy as np

## manual 10-fold cross-validation
kf = KFold(n_splits=2, random_state=None, shuffle=False)

methods = ["P" ,"AP" ,"PA" ,"PA1","PA2" ,"PAK","CW" ,"AL"]

for m in methods:
    accuracy = []
    precision = []
    recall = []
    
    for train_index, test_index in kf.split(data):
        model = oll.oll(m, C=1)
            
        X_train, X_test = data[train_index], data[test_index]
        y_train, y_test = target[train_index], target[test_index]
        model.fit(X_train, y_train)
        predicted = model.predict(X_test)

        accuracy.append(accuracy_score(y_test, predicted))
        precision.append(precision_score(y_test, predicted))
        recall.append(recall_score(y_test, predicted))

    print(m + ': acc(' + str(np.mean(accuracy)) 
          + '), prec(' + str(np.mean(precision))
          + '), rec(' + str(np.mean(recall)) + ')'
         )

P: acc(0.709737827715), prec(0.718379115118), rec(0.910581072492)
AP: acc(0.724719101124), prec(0.750853825137), rec(0.87727896163)
PA: acc(0.64606741573), prec(0.647236067697), rec(0.997005988024)
PA1: acc(0.64606741573), prec(0.647236067697), rec(0.997005988024)
PA2: acc(0.64606741573), prec(0.647236067697), rec(0.997005988024)
PAK: acc(0.647940074906), prec(0.649056603774), rec(0.994212691935)
CW: acc(0.692883895131), prec(0.69054129323), rec(0.968872311243)
AL: acc(0.722846441948), prec(0.733480419195), rec(0.9051951962)


In [17]:
from sklearn.ensemble import GradientBoostingClassifier

model = GradientBoostingClassifier()

precision = cross_val_score(model, data, target, cv=10, scoring='precision').mean()
acc = cross_val_score(model, data, target, cv=10, scoring='accuracy').mean()
recall = cross_val_score(model, data, target, cv=10, scoring='recall').mean()
fscore = cross_val_score(model, data, target, cv=10, scoring='f1').mean()
print(precision)
print(fscore)

0.755301940766
0.804989391302
