In [1]:
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_validate
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.svm import SVC, LinearSVC

# Read the gzip-compressed LIWC feature matrix of the corpus.
corpus_feat = pd.read_csv('corpus_liwc_mtx.csv.gz', compression='gzip')

# Encode the 'class' column with the two labels 1 (neg_label) and 2 (pos_label).
# NOTE(review): the 'precision'/'recall' scorers used further down presumably
# treat label 1 as the positive class, i.e. the neg_label here -- confirm that
# the reported precision/recall refer to the intended class.
lb = preprocessing.LabelBinarizer(neg_label=1, pos_label=2)
labels_2d = lb.fit_transform(corpus_feat['class'].values)

# The binarizer returns a 2-D array; flatten it to one label per row.
(c, r) = labels_2d.shape
target = labels_2d.reshape((c,))

In [2]:
# Remove the columns that must not be used as features.
# A single non-mutating drop with errors='ignore' keeps this cell idempotent:
# re-running it after the columns are already gone no longer raises KeyError.
corpus_feat = corpus_feat.drop(['Unnamed: 0', 'confidence', 'wc'],
                               axis=1, errors='ignore')

In [3]:
# Generate the features matrix: every remaining column except the label.
# `axis` is passed by keyword because newer pandas releases no longer accept
# it as a positional argument of DataFrame.drop.
data = corpus_feat.drop('class', axis=1).values

## Evaluating SVM

In [4]:
# Hyper-parameter grid for the SVM. Every value has to be a list-like of
# candidates: ('rbf') is just the string 'rbf' (parentheses alone do not make
# a tuple), which GridSearchCV rejects, so the kernel is given as a list.
parameters = {'kernel': ['rbf'], 'C': [1, 10]}

# 3-fold cross-validated search over the grid, selecting on accuracy and
# running up to 3 fits in parallel with progress messages enabled.
grid_search = GridSearchCV(SVC(), parameters, cv=3, n_jobs=3, verbose=3, scoring='accuracy')

grid_search.fit(data, target)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV] C=1, kernel=linear ..............................................
[CV] C=1, kernel=linear ..............................................
[CV] C=1, kernel=linear ..............................................
[CV] ............... C=1, kernel=linear, score=0.529940, total= 3.3min
[CV] C=1, kernel=rbf .................................................
[CV] .................. C=1, kernel=rbf, score=0.634731, total=   0.1s
[CV] C=1, kernel=rbf .................................................
[CV] .................. C=1, kernel=rbf, score=0.633634, total=   0.1s
[CV] C=1, kernel=rbf .................................................
[CV] .................. C=1, kernel=rbf, score=0.633634, total=   0.1s
[CV] C=10, kernel=linear .............................................
[CV] ............... C=1, kernel=linear, score=0.684685, total= 5.0min
[CV] C=10, kernel=linear .............................................
[CV] ............

Process ForkPoolWorker-1:
Traceback (most recent call last):
  File "/usr/local/Cellar/python3/3.6.1/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/usr/local/Cellar/python3/3.6.1/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)


KeyboardInterrupt: 

In [None]:
# Report the selected estimator, its mean cross-validated score and the
# parameter combination that produced it, in that order.
for attribute in ('best_estimator_', 'best_score_', 'best_params_'):
    print(getattr(grid_search, attribute))

In [11]:
# RBF-kernel SVM with fixed hyper-parameters (C=10, gamma=0.1).
model = SVC(kernel='rbf', C=10, gamma=0.1)

# A single 10-fold cross-validation pass scores all three metrics on the same
# fitted models, instead of re-fitting the classifier once per metric.
cv_scores = cross_validate(model, data, target, cv=10,
                           scoring=('precision', 'accuracy', 'recall'))
precision = cv_scores['test_precision'].mean()
acc = cv_scores['test_accuracy'].mean()
recall = cv_scores['test_recall'].mean()

# NOTE(review): the recorded output (recall 1.0, precision equal to accuracy)
# suggests the model assigns every sample to one class -- confirm whether the
# features need scaling for this kernel.
print(precision)
print(acc)
print(recall)

0.647960006451
0.647960006451
1.0


In [13]:
# Linear SVM with C=1.
model = LinearSVC(C=1)

# A single 10-fold cross-validation pass scores all three metrics on the same
# fitted models, instead of re-fitting the classifier once per metric.
cv_scores = cross_validate(model, data, target, cv=10,
                           scoring=('precision', 'accuracy', 'recall'))
precision = cv_scores['test_precision'].mean()
acc = cv_scores['test_accuracy'].mean()
recall = cv_scores['test_recall'].mean()
print(precision)
print(acc)
print(recall)

0.744628638027
0.666801053594
0.707647058824


## Evaluating Naive Bayes

In [14]:
# Gaussian Naive Bayes with default settings (GaussianNB is imported in the
# notebook's top import cell).
model = GaussianNB()

# A single 10-fold cross-validation pass scores all three metrics on the same
# fitted models, instead of re-fitting the classifier once per metric.
cv_scores = cross_validate(model, data, target, cv=10,
                           scoring=('precision', 'accuracy', 'recall'))
precision = cv_scores['test_precision'].mean()
acc = cv_scores['test_accuracy'].mean()
recall = cv_scores['test_recall'].mean()
print(precision)
print(acc)
print(recall)

0.699760764395
0.638488415847
0.774285714286


In [4]:
# Multinomial Naive Bayes with default settings (MultinomialNB is imported in
# the notebook's top import cell).
model = MultinomialNB()

# A single 10-fold cross-validation pass scores all three metrics on the same
# fitted models, instead of re-fitting the classifier once per metric.
cv_scores = cross_validate(model, data, target, cv=10,
                           scoring=('precision', 'accuracy', 'recall'))
precision = cv_scores['test_precision'].mean()
acc = cv_scores['test_accuracy'].mean()
recall = cv_scores['test_recall'].mean()
print(precision)
print(acc)
print(recall)

0.818782558502
0.741650540235
0.774201680672


In [6]:
# Refit the current `model` on the whole corpus so that its per-feature
# log-probabilities can be inspected. This relies on `model` still being the
# MultinomialNB instance created in the preceding cell.
model.fit(data, target)
n = 15

class_labels = ['outro','diario']
# NOTE(review): these names are assumed to match the columns of `data` one to
# one and in the same order; zip() below silently truncates on a length
# mismatch -- confirm against the feature columns of corpus_feat.
feature_names = ['funct','pronoun','ppron','i','we','you','shehe','they','ipron','article','verb','auxverb','past','present','future','adverb','preps','conj','negate','quant','number','swear','social','family','friend','humans','affect','posemo','negemo','anx','anger','sad','cogmech','insight','cause','discrep','tentat','certain','inhib','incl','excl','percept','see','hear','feel','bio','body','health','sexual','ingest','relativ','motion','space','time','work','achieve','leisure','home','money','relig','death','assent','nonfl','filler']

# `coef_` has been removed from MultinomialNB in recent scikit-learn releases.
# For a two-class model it held the log-probabilities of the second class,
# which `feature_log_prob_[1]` provides in every release, so the printed
# values are unchanged. The ranking is computed once and sliced twice.
ranked = sorted(zip(model.feature_log_prob_[1], feature_names))
topn_class1 = ranked[:n]
topn_class2 = ranked[-n:]

# NOTE(review): both lists come from the same class's log-probabilities, so
# the first list holds the features that are least likely under that class
# rather than features characteristic of the other class. A difference
# between feature_log_prob_[1] and feature_log_prob_[0] may be what is
# intended -- confirm.
for coef, feat in topn_class1:
    print (class_labels[0], coef, feat)

print()

for coef, feat in reversed(topn_class2):
    print (class_labels[1], coef, feat)

outro -8.75246574523 filler
outro -7.28064921067 assent
outro -7.15019632227 death
outro -6.960706276 family
outro -6.95477154048 anx
outro -6.81681209634 home
outro -6.56521134021 friend
outro -6.48439815712 future
outro -6.41478933242 we
outro -6.10199421433 sad
outro -6.00473706031 anger
outro -6.00359354961 relig
outro -5.92135545137 health
outro -5.89743335378 sexual
outro -5.68774060019 hear

diario -2.16661433869 funct
diario -2.3773192487 cogmech
diario -2.99452206876 relativ
diario -3.05477320521 social
diario -3.16675652691 pronoun
diario -3.27682440809 preps
diario -3.3289327161 incl
diario -3.41948818628 verb
diario -3.51678394967 ipron
diario -3.57460238893 ppron
diario -3.66829236923 space
diario -3.76592800271 conj
diario -3.8197919926 tentat
diario -3.89639788446 article
diario -3.90490976171 affect
