In [1]:
import pandas as pd
from sklearn import preprocessing
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import GridSearchCV, cross_val_score

# Load the raw corpus and the precomputed LIWC feature matrix.
# `corpus` is not referenced again in the visible cells; it is still loaded
# in case other cells of the notebook rely on it.
corpus = pd.read_csv('corpus.csv.gz', compression='gzip')
corpus_feat = pd.read_csv('corpus_liwc_mtx.csv.gz', compression='gzip')

# Materialise the row index as a leading 'index' column (no rows are filtered here).
corpus_feat = corpus_feat.reset_index()

# Encode the 'class' column as the integers 1 / 2.
# NOTE(review): the 'precision' / 'recall' scorers used further down treat
# label 1 as the positive class by default, i.e. the class this binarizer
# marks as *negative* -- confirm that this is the intended positive class.
lb = preprocessing.LabelBinarizer(neg_label=1, pos_label=2)
target = lb.fit_transform(corpus_feat['class'].values)
c, r = target.shape
if r != 1:
    # More than two classes produce a one-hot matrix that cannot be
    # flattened into a single label vector.
    raise ValueError(
        "expected a binary 'class' column, got a label matrix of shape %r"
        % (target.shape,)
    )
target = target.reshape(c,)

# Build the feature matrix: discard the first two columns (the 'index' column
# added by reset_index and the first column read from the CSV), then separate
# the label column from the features.
# `axis` is passed by keyword: pandas >= 2.0 no longer accepts it positionally.
corpus_feat = corpus_feat.drop(corpus_feat.columns[:2], axis=1)
data = corpus_feat.drop('class', axis=1).values

## Evaluating SVM

In [4]:
# Hyper-parameter grid for the RBF-kernel SVM.
# Every grid value must be a list: ('rbf') is just the string 'rbf' (the
# parentheses do not create a tuple) and GridSearchCV rejects a plain string.
# NOTE(review): the recorded output below lists 4 candidates including
# kernel=linear, so it appears to come from an earlier version of this grid.
parameters = {'kernel': ['rbf'], 'C': [1, 10]}

# 3-fold cross-validated search, selecting on accuracy, using 3 worker processes.
grid_search = GridSearchCV(SVC(), parameters, cv=3, n_jobs=3, verbose=3, scoring='accuracy')

grid_search.fit(data, target)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV] C=1, kernel=linear ..............................................
[CV] C=1, kernel=linear ..............................................
[CV] C=1, kernel=linear ..............................................
[CV] ............... C=1, kernel=linear, score=0.529940, total= 3.3min
[CV] C=1, kernel=rbf .................................................
[CV] .................. C=1, kernel=rbf, score=0.634731, total=   0.1s
[CV] C=1, kernel=rbf .................................................
[CV] .................. C=1, kernel=rbf, score=0.633634, total=   0.1s
[CV] C=1, kernel=rbf .................................................
[CV] .................. C=1, kernel=rbf, score=0.633634, total=   0.1s
[CV] C=10, kernel=linear .............................................
[CV] ............... C=1, kernel=linear, score=0.684685, total= 5.0min
[CV] C=10, kernel=linear .............................................
[CV] ............

Process ForkPoolWorker-1:
Traceback (most recent call last):
  File "/usr/local/Cellar/python3/3.6.1/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/usr/local/Cellar/python3/3.6.1/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)


KeyboardInterrupt: 

In [None]:
# Report the winning estimator, its mean cross-validated score and its parameters.
for attr_name in ('best_estimator_', 'best_score_', 'best_params_'):
    print(getattr(grid_search, attr_name))

In [6]:
# RBF-kernel SVM with fixed hyper-parameters.
# NOTE(review): the recorded output shows precision == accuracy and
# recall == 1.0, which suggests every sample is predicted as the class the
# scorer treats as positive -- feature scaling may be worth checking.
model = SVC(C=10, kernel='rbf', gamma=0.1)

# Mean 10-fold cross-validated score for each metric, computed in reporting order.
precision, acc, recall = (
    cross_val_score(model, data, target, cv=10, scoring=metric).mean()
    for metric in ('precision', 'accuracy', 'recall')
)
print(precision, acc, recall, sep='\n')

0.63401080108
0.63401080108
1.0


In [2]:
# Linear SVM with C=10.
model = LinearSVC(C=10)

# Mean 10-fold cross-validated score for each metric, computed in reporting order.
precision, acc, recall = (
    cross_val_score(model, data, target, cv=10, scoring=metric).mean()
    for metric in ('precision', 'accuracy', 'recall')
)
print(precision, acc, recall, sep='\n')

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


0.443601987926
0.584936693669
0.548908730159


## Evaluating Naive Bayes

In [2]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes with default settings.
model = GaussianNB()

# Mean 10-fold cross-validated score for each metric, computed in reporting order.
precision, acc, recall = (
    cross_val_score(model, data, target, cv=10, scoring=metric).mean()
    for metric in ('precision', 'accuracy', 'recall')
)
print(precision, acc, recall, sep='\n')

0.675640802726
0.621199119912
0.773090277778


In [3]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes with default settings.
model = MultinomialNB()

# Mean 10-fold cross-validated score for each metric, computed in reporting order.
precision, acc, recall = (
    cross_val_score(model, data, target, cv=10, scoring=metric).mean()
    for metric in ('precision', 'accuracy', 'recall')
)
print(precision, acc, recall, sep='\n')

0.622148678272
0.574944294429
0.592286706349
