In [1]:
import pandas as pd
from sklearn import preprocessing
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import GridSearchCV, cross_val_score

# load corpus
corpus_feat = pd.read_csv('corpus_liwc_mtx.csv.gz', compression='gzip')

In [2]:
# fix labels to binary
def classFit(x):
    if x['class'] == "diario":
        return 1
    else:
        return -1
    
corpus_feat['class_'] = corpus_feat.apply(classFit,axis=1)
target = corpus_feat['class_'].values

print(corpus_feat['class'].values[:2])
print(corpus_feat['class_'][:2])

['diario' 'outro']
0    1
1   -1
Name: class_, dtype: int64


In [3]:
corpus_feat.drop('Unnamed: 0', axis=1,inplace=True)
corpus_feat.drop('confidence', axis=1,inplace=True)
corpus_feat.drop('class', axis=1,inplace=True)
#corpus_feat = corpus_feat[corpus_feat.wc.apply(lambda x: str(x).isnumeric())]
wc_vector = corpus_feat['wc']
corpus_feat.drop('wc', axis=1,inplace=True)

In [4]:
# generate features matrix
data = corpus_feat.drop('class_', 1).values
data.shape

(534, 64)

## Evaluating SVM

In [6]:
parameters = {'C':[1, 10]}
grid_search = GridSearchCV(SVC(kernel='rbf'), parameters, cv=3, n_jobs=3, verbose=3, scoring='accuracy')

grid_search.fit(data, target)

Fitting 3 folds for each of 2 candidates, totalling 6 fits
[CV] C=1 .............................................................
[CV] C=1 .............................................................
[CV] C=1 .............................................................
[CV] .............................. C=1, score=0.648045, total=   0.0s
[CV] .............................. C=1, score=0.646067, total=   0.0s
[CV] C=10 ............................................................
[CV] C=10 ............................................................
[CV] .............................. C=1, score=0.649718, total=   0.0s
[CV] C=10 ............................................................
[CV] ............................. C=10, score=0.648045, total=   0.1s
[CV] ............................. C=10, score=0.649718, total=   0.1s
[CV] ............................. C=10, score=0.646067, total=   0.1s


[Parallel(n_jobs=3)]: Done   4 out of   6 | elapsed:    0.2s remaining:    0.1s
[Parallel(n_jobs=3)]: Done   6 out of   6 | elapsed:    0.2s finished


GridSearchCV(cv=3, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=3, param_grid={'C': [1, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=3)

In [7]:
print(grid_search.best_estimator_)
print(grid_search.best_score_)
print(grid_search.best_params_)

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
0.647940074906
{'C': 1}


In [8]:
model = SVC(kernel='rbf',C=1,gamma=0.1)

precision = cross_val_score(model, data, target, cv=10, scoring='precision').mean()
acc = cross_val_score(model, data, target, cv=10, scoring='accuracy').mean()
recall = cross_val_score(model, data, target, cv=10, scoring='recall').mean()
print(precision)
print(acc)
print(recall)

0.647960006451
0.647960006451
1.0


In [9]:
model = LinearSVC(C=1)

precision = cross_val_score(model, data, target, cv=10, scoring='precision').mean()
acc = cross_val_score(model, data, target, cv=10, scoring='accuracy').mean()
recall = cross_val_score(model, data, target, cv=10, scoring='recall').mean()
print(precision)
print(acc)
print(recall)

0.740516135105
0.670680804171
0.761008403361


## Evaluating Naive Bayses

In [10]:
from sklearn.naive_bayes import GaussianNB

model = GaussianNB()

precision = cross_val_score(model, data, target, cv=10, scoring='precision').mean()
acc = cross_val_score(model, data, target, cv=10, scoring='accuracy').mean()
recall = cross_val_score(model, data, target, cv=10, scoring='recall').mean()
print(precision)
print(acc)
print(recall)

0.699760764395
0.638488415847
0.774285714286


In [11]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()

precision = cross_val_score(model, data, target, cv=10, scoring='precision').mean()
acc = cross_val_score(model, data, target, cv=10, scoring='accuracy').mean()
recall = cross_val_score(model, data, target, cv=10, scoring='recall').mean()
print(precision)
print(acc)
print(recall)

0.818782558502
0.741650540235
0.774201680672


In [12]:
model.fit(data,target)
n = 15 

class_labels = ['outro','diario']
feature_names = ['funct','pronoun','ppron','i','we','you','shehe','they','ipron','article','verb','auxverb','past','present','future','adverb','preps','conj','negate','quant','number','swear','social','family','friend','humans','affect','posemo','negemo','anx','anger','sad','cogmech','insight','cause','discrep','tentat','certain','inhib','incl','excl','percept','see','hear','feel','bio','body','health','sexual','ingest','relativ','motion','space','time','work','achieve','leisure','home','money','relig','death','assent','nonfl','filler']
topn_class1 = sorted(zip(model.coef_[0], feature_names))[:n]
topn_class2 = sorted(zip(model.coef_[0], feature_names))[-n:]

for coef, feat in topn_class1:
    print (class_labels[0], coef, feat)

print()

for coef, feat in reversed(topn_class2):
    print (class_labels[1], coef, feat)

outro -7.9130506381 filler
outro -7.48738282267 death
outro -7.26423927136 assent
outro -6.89959615777 family
outro -6.80092463026 we
outro -6.68388758494 anx
outro -6.52675627698 home
outro -6.50780435156 relig
outro -6.49784162906 future
outro -6.28141201568 friend
outro -6.26458826411 anger
outro -6.02335611301 health
outro -6.0156755837 sad
outro -5.76045821478 hear
outro -5.69229556335 sexual

diario -2.15661440035 funct
diario -2.38244796028 cogmech
diario -2.94462009681 relativ
diario -3.10032718798 social
diario -3.14483360076 pronoun
diario -3.34594265292 verb
diario -3.38490476055 incl
diario -3.41659124489 preps
diario -3.54224746579 ipron
diario -3.62301063072 ppron
diario -3.75011010854 space
diario -3.75792140284 conj
diario -3.76845924721 tentat
diario -3.91325973919 affect
diario -3.94401095223 time


### confidence-weighted linear classifier (Dredze et al., 2008)

In [14]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score
import oll
import numpy as np

## manual 10-fold cross-validation
kf = KFold(n_splits=2, random_state=None, shuffle=False)

methods = ["P" ,"AP" ,"PA" ,"PA1","PA2" ,"PAK","CW" ,"AL"]

for m in methods:

    accuracy = []
    precision = []
    recall = []
    
    for train_index, test_index in kf.split(data):
        model = oll.oll(m, C=1)
        
        X_train, X_test = data[train_index], data[test_index]
        y_train, y_test = target[train_index], target[test_index]
        model.fit(X_train, y_train)
        predicted = model.predict(X_test)

        accuracy.append(accuracy_score(y_test, predicted))
        precision.append(precision_score(y_test, predicted))
        recall.append(recall_score(y_test, predicted))

    print(m + ': acc(' + str(np.mean(accuracy)) 
          + '), prec(' + str(np.mean(precision))
          + '), rec(' + str(np.mean(recall)) + ')'
         )

P: acc(0.651685393258), prec(0.652326615077), rec(0.988023952096)
AP: acc(0.666666666667), prec(0.742554564025), rec(0.831800086977)
PA: acc(0.64606741573), prec(0.647236067697), rec(0.997005988024)
PA1: acc(0.64606741573), prec(0.647236067697), rec(0.997005988024)
PA2: acc(0.64606741573), prec(0.647236067697), rec(0.997005988024)


  'precision', 'predicted', average, warn_for)


PAK: acc(0.352059925094), prec(0.0), rec(0.0)
CW: acc(0.662921348315), prec(0.695310412399), rec(0.852122570501)
AL: acc(0.647940074906), prec(0.649877807181), rec(0.988023952096)
