In [4]:
import pandas as pd
from liwc import LIWC
from sklearn import svm
from sklearn.grid_search import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from scipy.stats import spearmanr
import numpy as np
from time import time

tweets_train = pd.read_csv('data/anger-ratings-0to1.train.txt', sep='\t', names=['id','tweet','emotion','score'], index_col='id')
tweets_dev = pd.read_csv('data/anger-ratings-0to1.dev.gold.txt', sep='\t', names=['id','tweet','emotion','score'], index_col='id')
tweets_train.append(tweets_dev)
tweets_test = pd.read_csv('data/anger-ratings-0to1.test.target.txt', sep='\t', names=['id','tweet','emotion','score'], index_col='id')
liwc_class = LIWC()

In [5]:
from sklearn.metrics import make_scorer
def simple_spearman(x,y): return spearmanr(x,y)[0]
spearmanr_scorer = make_scorer(simple_spearman)

### SVM Train

In [15]:
features = liwc_class.build_features(tweets_train.tweet)
print(features.shape)
features

(857, 64)


array([[ 5.,  1.,  1., ...,  0.,  0.,  0.],
       [ 8.,  2.,  1., ...,  0.,  0.,  0.],
       [ 9.,  2.,  2., ...,  0.,  0.,  0.],
       ..., 
       [ 2.,  0.,  0., ...,  0.,  0.,  0.],
       [ 4.,  0.,  0., ...,  0.,  0.,  0.],
       [ 5.,  1.,  1., ...,  0.,  0.,  0.]])

In [7]:
labels = tweets_train.score
model = svm.SVR()
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.3, random_state=1)
model.fit(X_train, y_train)
scores = cross_val_score(model, X_test, y_test, cv=5, scoring=spearmanr_scorer)
scores

array([ 0.31794891,  0.16711541,  0.22282526,  0.30894308,  0.32912688])

### Grid Search

In [8]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.3, random_state=1)
parameters = {'kernel':('linear','rbf')}

svr = svm.SVR()
clf = GridSearchCV(svr, parameters)
clf.fit(X_train, y_train)
scores = cross_val_score(clf, X_test, y_test, cv=5, scoring=spearmanr_scorer)
scores

array([ 0.31794891,  0.16711541,  0.22282526,  0.30894308,  0.32912688])

### Full Grid Search SVM

In [9]:
epsilon = np.logspace(-4,-1,4)
cc = np.logspace(-1, 2, 4)
gamma = np.logspace(-2,-1,2)

param_grid = [
    {'epsilon': epsilon, 'kernel': ['rbf', 'sigmoid'], 'gamma': gamma, 'C': cc},
    {'epsilon': epsilon, 'kernel': ['linear'], 'C': cc},
]

grid_search = GridSearchCV(svm.SVR(), param_grid=param_grid, \
                           scoring=spearmanr_scorer, cv=10, n_jobs=10, \
                           verbose=3)
start = time()

grid_search.fit(features, labels)

Fitting 10 folds for each of 80 candidates, totalling 800 fits
[CV] C=0.1, epsilon=0.0001, gamma=0.01, kernel=rbf ...................
[CV] C=0.1, epsilon=0.0001, gamma=0.01, kernel=rbf ...................
[CV] C=0.1, epsilon=0.0001, gamma=0.01, kernel=rbf ...................
[CV] C=0.1, epsilon=0.0001, gamma=0.01, kernel=rbf ...................
[CV] C=0.1, epsilon=0.0001, gamma=0.01, kernel=rbf ...................
[CV] C=0.1, epsilon=0.0001, gamma=0.01, kernel=rbf ...................
[CV] C=0.1, epsilon=0.0001, gamma=0.01, kernel=rbf ...................
[CV] C=0.1, epsilon=0.0001, gamma=0.01, kernel=rbf ...................
[CV] C=0.1, epsilon=0.0001, gamma=0.01, kernel=rbf ...................
[CV] C=0.1, epsilon=0.0001, gamma=0.01, kernel=rbf ...................
[CV]  C=0.1, epsilon=0.0001, gamma=0.01, kernel=rbf, score=0.266149 -   0.2s
[CV]  C=0.1, epsilon=0.0001, gamma=0.01, kernel=rbf, score=-0.013517 -   0.2s
[CV] C=0.1, epsilon=0.0001, gamma=0.01, kernel=sigmoid ...............
[

[Parallel(n_jobs=10)]: Done  12 tasks      | elapsed:    0.7s


[CV] C=0.1, epsilon=0.0001, gamma=0.1, kernel=rbf ....................
[CV] C=0.1, epsilon=0.0001, gamma=0.1, kernel=rbf ....................
[CV] C=0.1, epsilon=0.0001, gamma=0.1, kernel=rbf ....................
[CV] C=0.1, epsilon=0.0001, gamma=0.1, kernel=rbf ....................
[CV] C=0.1, epsilon=0.0001, gamma=0.1, kernel=rbf ....................
[CV] C=0.1, epsilon=0.0001, gamma=0.1, kernel=rbf ....................
[CV] C=0.1, epsilon=0.0001, gamma=0.1, kernel=rbf ....................
[CV]  C=0.1, epsilon=0.0001, gamma=0.1, kernel=rbf, score=0.092326 -   0.3s
[CV]  C=0.1, epsilon=0.0001, gamma=0.1, kernel=rbf, score=0.113752 -   0.3s
[CV]  C=0.1, epsilon=0.0001, gamma=0.1, kernel=rbf, score=0.082800 -   0.4s
[CV] C=0.1, epsilon=0.0001, gamma=0.1, kernel=sigmoid ................
[CV] C=0.1, epsilon=0.0001, gamma=0.1, kernel=sigmoid ................
[CV] C=0.1, epsilon=0.0001, gamma=0.1, kernel=sigmoid ................
[CV]  C=0.1, epsilon=0.0001, gamma=0.1, kernel=rbf, score=-0.1

[CV] C=0.1, epsilon=0.01, gamma=0.01, kernel=rbf .....................
[CV] C=0.1, epsilon=0.01, gamma=0.01, kernel=rbf .....................
[CV]  C=0.1, epsilon=0.001, gamma=0.1, kernel=sigmoid, score=0.005501 -   0.3s
[CV]  C=0.1, epsilon=0.001, gamma=0.1, kernel=sigmoid, score=0.059563 -   0.3s
[CV]  C=0.1, epsilon=0.001, gamma=0.1, kernel=sigmoid, score=-0.069884 -   0.3s
[CV]  C=0.1, epsilon=0.001, gamma=0.1, kernel=sigmoid, score=-0.136489 -   0.3s
[CV]  C=0.1, epsilon=0.001, gamma=0.1, kernel=sigmoid, score=-0.164926 -   0.3s
[CV] C=0.1, epsilon=0.01, gamma=0.01, kernel=rbf .....................
[CV] C=0.1, epsilon=0.01, gamma=0.01, kernel=rbf .....................
[CV] C=0.1, epsilon=0.01, gamma=0.01, kernel=rbf .....................
[CV] C=0.1, epsilon=0.01, gamma=0.01, kernel=rbf .....................
[CV] C=0.1, epsilon=0.01, gamma=0.01, kernel=rbf .....................
[CV]  C=0.1, epsilon=0.001, gamma=0.1, kernel=sigmoid, score=-0.167268 -   0.3s
[CV]  C=0.1, epsilon=0.00

[Parallel(n_jobs=10)]: Done 108 tasks      | elapsed:    4.4s


[CV]  C=0.1, epsilon=0.1, gamma=0.01, kernel=rbf, score=0.014594 -   0.1s
[CV]  C=0.1, epsilon=0.1, gamma=0.01, kernel=rbf, score=0.212818 -   0.1s
[CV]  C=0.1, epsilon=0.01, gamma=0.1, kernel=sigmoid, score=-0.135705 -   0.3s
[CV] C=0.1, epsilon=0.1, gamma=0.01, kernel=rbf ......................
[CV] C=0.1, epsilon=0.1, gamma=0.01, kernel=rbf ......................
[CV]  C=0.1, epsilon=0.1, gamma=0.01, kernel=rbf, score=0.068118 -   0.1s
[CV]  C=0.1, epsilon=0.01, gamma=0.1, kernel=sigmoid, score=-0.069884 -   0.3s
[CV] C=0.1, epsilon=0.1, gamma=0.01, kernel=rbf ......................
[CV]  C=0.1, epsilon=0.01, gamma=0.1, kernel=sigmoid, score=0.005501 -   0.3s
[CV]  C=0.1, epsilon=0.01, gamma=0.1, kernel=sigmoid, score=-0.167268 -   0.3s
[CV]  C=0.1, epsilon=0.01, gamma=0.1, kernel=sigmoid, score=0.059532 -   0.3s
[CV] C=0.1, epsilon=0.1, gamma=0.01, kernel=rbf ......................
[CV] C=0.1, epsilon=0.1, gamma=0.01, kernel=rbf ......................
[CV] C=0.1, epsilon=0.1, gamma

[CV] C=1.0, epsilon=0.0001, gamma=0.01, kernel=sigmoid ...............
[CV] C=1.0, epsilon=0.0001, gamma=0.01, kernel=sigmoid ...............
[CV]  C=1.0, epsilon=0.0001, gamma=0.01, kernel=sigmoid, score=0.063764 -   0.3s
[CV]  C=1.0, epsilon=0.0001, gamma=0.01, kernel=sigmoid, score=0.120220 -   0.3s
[CV] C=1.0, epsilon=0.0001, gamma=0.1, kernel=rbf ....................
[CV]  C=1.0, epsilon=0.0001, gamma=0.01, kernel=sigmoid, score=0.012481 -   0.3s
[CV] C=1.0, epsilon=0.0001, gamma=0.1, kernel=rbf ....................
[CV] C=1.0, epsilon=0.0001, gamma=0.1, kernel=rbf ....................
[CV]  C=1.0, epsilon=0.0001, gamma=0.01, kernel=sigmoid, score=-0.083948 -   0.3s
[CV]  C=1.0, epsilon=0.0001, gamma=0.01, kernel=sigmoid, score=0.011300 -   0.3s
[CV] C=1.0, epsilon=0.0001, gamma=0.1, kernel=rbf ....................
[CV] C=1.0, epsilon=0.0001, gamma=0.1, kernel=rbf ....................
[CV]  C=1.0, epsilon=0.0001, gamma=0.01, kernel=sigmoid, score=-0.077422 -   0.3s
[CV] C=1.0, eps

[CV]  C=1.0, epsilon=0.001, gamma=0.1, kernel=rbf, score=-0.026920 -   0.4s
[CV]  C=1.0, epsilon=0.001, gamma=0.1, kernel=rbf, score=0.061353 -   0.4s
[CV] C=1.0, epsilon=0.001, gamma=0.1, kernel=sigmoid .................
[CV]  C=1.0, epsilon=0.001, gamma=0.1, kernel=rbf, score=-0.057024 -   0.4s
[CV]  C=1.0, epsilon=0.001, gamma=0.1, kernel=rbf, score=0.157139 -   0.4s
[CV] C=1.0, epsilon=0.001, gamma=0.1, kernel=sigmoid .................
[CV] C=1.0, epsilon=0.001, gamma=0.1, kernel=sigmoid .................
[CV] C=1.0, epsilon=0.001, gamma=0.1, kernel=sigmoid .................
[CV]  C=1.0, epsilon=0.001, gamma=0.1, kernel=rbf, score=0.041252 -   0.4s
[CV]  C=1.0, epsilon=0.001, gamma=0.1, kernel=rbf, score=0.111917 -   0.4s
[CV] C=1.0, epsilon=0.001, gamma=0.1, kernel=sigmoid .................
[CV] C=1.0, epsilon=0.001, gamma=0.1, kernel=sigmoid .................
[CV]  C=1.0, epsilon=0.001, gamma=0.1, kernel=sigmoid, score=-0.144991 -   0.3s
[CV]  C=1.0, epsilon=0.001, gamma=0.1, ker

[Parallel(n_jobs=10)]: Done 268 tasks      | elapsed:   10.6s


[CV]  C=1.0, epsilon=0.01, gamma=0.1, kernel=sigmoid, score=0.059340 -   0.3s
[CV] C=1.0, epsilon=0.1, gamma=0.01, kernel=rbf ......................
[CV]  C=1.0, epsilon=0.01, gamma=0.1, kernel=sigmoid, score=-0.071361 -   0.3s
[CV]  C=1.0, epsilon=0.01, gamma=0.1, kernel=sigmoid, score=0.005501 -   0.3s
[CV] C=1.0, epsilon=0.1, gamma=0.01, kernel=rbf ......................
[CV]  C=1.0, epsilon=0.01, gamma=0.1, kernel=sigmoid, score=-0.165268 -   0.3s
[CV] C=1.0, epsilon=0.1, gamma=0.01, kernel=rbf ......................
[CV] C=1.0, epsilon=0.1, gamma=0.01, kernel=rbf ......................
[CV]  C=1.0, epsilon=0.01, gamma=0.1, kernel=sigmoid, score=-0.134922 -   0.3s
[CV] C=1.0, epsilon=0.1, gamma=0.01, kernel=rbf ......................
[CV]  C=1.0, epsilon=0.01, gamma=0.1, kernel=sigmoid, score=-0.166830 -   0.4s
[CV]  C=1.0, epsilon=0.1, gamma=0.01, kernel=rbf, score=0.041572 -   0.2s
[CV]  C=1.0, epsilon=0.01, gamma=0.1, kernel=sigmoid, score=0.107573 -   0.3s
[CV] C=1.0, epsilon=0

[CV] C=10.0, epsilon=0.0001, gamma=0.01, kernel=sigmoid ..............
[CV]  C=10.0, epsilon=0.0001, gamma=0.01, kernel=rbf, score=0.050568 -   1.2s
[CV] C=10.0, epsilon=0.0001, gamma=0.01, kernel=sigmoid ..............
[CV]  C=10.0, epsilon=0.0001, gamma=0.01, kernel=sigmoid, score=0.063211 -   0.4s
[CV]  C=10.0, epsilon=0.0001, gamma=0.01, kernel=sigmoid, score=-0.121904 -   0.3s
[CV] C=10.0, epsilon=0.0001, gamma=0.1, kernel=rbf ...................
[CV]  C=10.0, epsilon=0.0001, gamma=0.01, kernel=sigmoid, score=-0.016580 -   0.3s
[CV] C=10.0, epsilon=0.0001, gamma=0.1, kernel=rbf ...................
[CV] C=10.0, epsilon=0.0001, gamma=0.1, kernel=rbf ...................
[CV]  C=10.0, epsilon=0.0001, gamma=0.01, kernel=sigmoid, score=0.009621 -   0.3s
[CV] C=10.0, epsilon=0.0001, gamma=0.1, kernel=rbf ...................
[CV]  C=10.0, epsilon=0.0001, gamma=0.01, kernel=sigmoid, score=0.084794 -   0.3s
[CV]  C=10.0, epsilon=0.0001, gamma=0.01, kernel=sigmoid, score=-0.087689 -   0.3s
[

[CV] C=10.0, epsilon=0.001, gamma=0.1, kernel=sigmoid ................
[CV] C=10.0, epsilon=0.001, gamma=0.1, kernel=sigmoid ................
[CV] C=10.0, epsilon=0.001, gamma=0.1, kernel=sigmoid ................
[CV] C=10.0, epsilon=0.001, gamma=0.1, kernel=sigmoid ................
[CV] C=10.0, epsilon=0.001, gamma=0.1, kernel=sigmoid ................
[CV] C=10.0, epsilon=0.001, gamma=0.1, kernel=sigmoid ................
[CV]  C=10.0, epsilon=0.001, gamma=0.1, kernel=rbf, score=-0.039294 -   0.4s
[CV] C=10.0, epsilon=0.001, gamma=0.1, kernel=sigmoid ................
[CV]  C=10.0, epsilon=0.001, gamma=0.1, kernel=rbf, score=0.168707 -   0.4s
[CV] C=10.0, epsilon=0.001, gamma=0.1, kernel=sigmoid ................
[CV]  C=10.0, epsilon=0.001, gamma=0.1, kernel=rbf, score=-0.042639 -   0.5s
[CV]  C=10.0, epsilon=0.001, gamma=0.1, kernel=rbf, score=0.101555 -   0.4s
[CV] C=10.0, epsilon=0.001, gamma=0.1, kernel=sigmoid ................
[CV]  C=10.0, epsilon=0.001, gamma=0.1, kernel=sigmoid,

[CV] C=10.0, epsilon=0.1, gamma=0.01, kernel=rbf .....................
[CV]  C=10.0, epsilon=0.01, gamma=0.1, kernel=sigmoid, score=0.107573 -   0.3s
[CV]  C=10.0, epsilon=0.1, gamma=0.01, kernel=rbf, score=0.099111 -   0.3s
[CV] C=10.0, epsilon=0.1, gamma=0.01, kernel=rbf .....................
[CV] C=10.0, epsilon=0.1, gamma=0.01, kernel=sigmoid .................
[CV]  C=10.0, epsilon=0.1, gamma=0.01, kernel=rbf, score=-0.074791 -   0.3s
[CV]  C=10.0, epsilon=0.1, gamma=0.01, kernel=rbf, score=0.213449 -   0.3s
[CV] C=10.0, epsilon=0.1, gamma=0.01, kernel=sigmoid .................
[CV]  C=10.0, epsilon=0.1, gamma=0.01, kernel=rbf, score=0.090277 -   0.4s
[CV] C=10.0, epsilon=0.1, gamma=0.01, kernel=sigmoid .................
[CV]  C=10.0, epsilon=0.1, gamma=0.01, kernel=rbf, score=-0.082728 -   0.4s
[CV]  C=10.0, epsilon=0.1, gamma=0.01, kernel=rbf, score=0.140806 -   0.4s
[CV] C=10.0, epsilon=0.1, gamma=0.01, kernel=sigmoid .................
[CV] C=10.0, epsilon=0.1, gamma=0.01, kerne

[CV]  C=100.0, epsilon=0.0001, gamma=0.01, kernel=sigmoid, score=-0.087689 -   0.3s
[CV]  C=100.0, epsilon=0.0001, gamma=0.01, kernel=sigmoid, score=0.083948 -   0.3s
[CV]  C=100.0, epsilon=0.0001, gamma=0.01, kernel=sigmoid, score=-0.054694 -   0.3s
[CV] C=100.0, epsilon=0.0001, gamma=0.1, kernel=rbf ..................
[CV] C=100.0, epsilon=0.0001, gamma=0.1, kernel=rbf ..................
[CV] C=100.0, epsilon=0.0001, gamma=0.1, kernel=rbf ..................


[Parallel(n_jobs=10)]: Done 492 tasks      | elapsed:   24.7s


[CV]  C=100.0, epsilon=0.0001, gamma=0.01, kernel=sigmoid, score=0.076107 -   0.3s
[CV]  C=100.0, epsilon=0.0001, gamma=0.01, kernel=sigmoid, score=0.000044 -   0.3s
[CV]  C=100.0, epsilon=0.0001, gamma=0.01, kernel=sigmoid, score=0.076994 -   0.3s
[CV] C=100.0, epsilon=0.0001, gamma=0.1, kernel=rbf ..................
[CV] C=100.0, epsilon=0.0001, gamma=0.1, kernel=rbf ..................
[CV] C=100.0, epsilon=0.0001, gamma=0.1, kernel=rbf ..................
[CV]  C=100.0, epsilon=0.0001, gamma=0.1, kernel=rbf, score=0.137746 -   0.4s
[CV] C=100.0, epsilon=0.0001, gamma=0.1, kernel=sigmoid ..............
[CV]  C=100.0, epsilon=0.0001, gamma=0.1, kernel=rbf, score=0.116068 -   0.4s
[CV] C=100.0, epsilon=0.0001, gamma=0.1, kernel=sigmoid ..............
[CV]  C=100.0, epsilon=0.0001, gamma=0.1, kernel=rbf, score=0.095584 -   0.5s
[CV]  C=100.0, epsilon=0.0001, gamma=0.1, kernel=rbf, score=0.062879 -   0.4s
[CV] C=100.0, epsilon=0.0001, gamma=0.1, kernel=sigmoid ..............
[CV]  C=100.0

[CV]  C=100.0, epsilon=0.001, gamma=0.1, kernel=sigmoid, score=-0.035085 -   0.4s
[CV] C=100.0, epsilon=0.01, gamma=0.01, kernel=rbf ...................
[CV] C=100.0, epsilon=0.01, gamma=0.01, kernel=rbf ...................
[CV]  C=100.0, epsilon=0.001, gamma=0.1, kernel=sigmoid, score=0.058936 -   0.3s
[CV] C=100.0, epsilon=0.01, gamma=0.01, kernel=rbf ...................
[CV]  C=100.0, epsilon=0.001, gamma=0.1, kernel=sigmoid, score=-0.070623 -   0.3s
[CV] C=100.0, epsilon=0.01, gamma=0.01, kernel=rbf ...................
[CV]  C=100.0, epsilon=0.001, gamma=0.1, kernel=sigmoid, score=-0.166830 -   0.3s
[CV]  C=100.0, epsilon=0.001, gamma=0.1, kernel=sigmoid, score=-0.133334 -   0.3s
[CV] C=100.0, epsilon=0.01, gamma=0.01, kernel=rbf ...................
[CV] C=100.0, epsilon=0.01, gamma=0.01, kernel=rbf ...................
[CV]  C=100.0, epsilon=0.001, gamma=0.1, kernel=sigmoid, score=0.004654 -   0.4s
[CV]  C=100.0, epsilon=0.001, gamma=0.1, kernel=sigmoid, score=-0.165268 -   0.3s
[C

[CV] C=100.0, epsilon=0.1, gamma=0.01, kernel=sigmoid ................
[CV]  C=100.0, epsilon=0.1, gamma=0.01, kernel=sigmoid, score=0.063211 -   0.4s
[CV]  C=100.0, epsilon=0.1, gamma=0.01, kernel=sigmoid, score=-0.121062 -   0.3s
[CV] C=100.0, epsilon=0.1, gamma=0.01, kernel=sigmoid ................
[CV] C=100.0, epsilon=0.1, gamma=0.01, kernel=sigmoid ................
[CV]  C=100.0, epsilon=0.1, gamma=0.01, kernel=rbf, score=0.087071 -   0.6s
[CV]  C=100.0, epsilon=0.1, gamma=0.01, kernel=rbf, score=-0.096277 -   0.6s
[CV] C=100.0, epsilon=0.1, gamma=0.01, kernel=sigmoid ................
[CV] C=100.0, epsilon=0.1, gamma=0.1, kernel=rbf .....................
[CV]  C=100.0, epsilon=0.1, gamma=0.01, kernel=rbf, score=0.028108 -   0.6s
[CV] C=100.0, epsilon=0.1, gamma=0.1, kernel=rbf .....................
[CV]  C=100.0, epsilon=0.1, gamma=0.01, kernel=sigmoid, score=-0.015845 -   0.4s
[CV]  C=100.0, epsilon=0.1, gamma=0.01, kernel=sigmoid, score=0.010036 -   0.4s
[CV]  C=100.0, epsilon=

[CV] ..... C=0.1, epsilon=0.01, kernel=linear, score=0.043298 -   1.1s
[CV] C=0.1, epsilon=0.1, kernel=linear ...............................
[CV] ..... C=0.1, epsilon=0.01, kernel=linear, score=0.145439 -   1.1s
[CV] ..... C=0.1, epsilon=0.01, kernel=linear, score=0.330560 -   1.2s
[CV] C=0.1, epsilon=0.1, kernel=linear ...............................
[CV] C=0.1, epsilon=0.1, kernel=linear ...............................
[CV] ..... C=0.1, epsilon=0.01, kernel=linear, score=0.120264 -   1.0s
[CV] .... C=0.1, epsilon=0.01, kernel=linear, score=-0.166712 -   1.1s
[CV] C=0.1, epsilon=0.1, kernel=linear ...............................
[CV] C=0.1, epsilon=0.1, kernel=linear ...............................
[CV] ..... C=0.1, epsilon=0.01, kernel=linear, score=0.124952 -   1.1s
[CV] C=0.1, epsilon=0.1, kernel=linear ...............................
[CV] .... C=0.1, epsilon=0.01, kernel=linear, score=-0.008851 -   1.1s
[CV] C=0.1, epsilon=0.1, kernel=linear ...............................
[CV] .

[CV] .. C=10.0, epsilon=0.0001, kernel=linear, score=0.334923 - 1.2min
[CV] C=10.0, epsilon=0.001, kernel=linear ............................
[CV] .. C=10.0, epsilon=0.0001, kernel=linear, score=0.048367 - 1.2min
[CV] C=10.0, epsilon=0.001, kernel=linear ............................
[CV] . C=10.0, epsilon=0.0001, kernel=linear, score=-0.002504 - 1.3min
[CV] C=10.0, epsilon=0.001, kernel=linear ............................
[CV] .. C=10.0, epsilon=0.0001, kernel=linear, score=0.127868 - 1.4min
[CV] C=10.0, epsilon=0.001, kernel=linear ............................
[CV] . C=10.0, epsilon=0.0001, kernel=linear, score=-0.157651 - 1.4min
[CV] C=10.0, epsilon=0.001, kernel=linear ............................
[CV] .. C=10.0, epsilon=0.0001, kernel=linear, score=0.201817 - 1.4min
[CV] C=10.0, epsilon=0.001, kernel=linear ............................
[CV] .. C=10.0, epsilon=0.0001, kernel=linear, score=0.140848 - 1.4min
[CV] C=10.0, epsilon=0.001, kernel=linear ............................
[CV] .

[CV] .. C=100.0, epsilon=0.001, kernel=linear, score=0.128753 -10.8min
[CV] C=100.0, epsilon=0.01, kernel=linear ............................


[Parallel(n_jobs=10)]: Done 780 tasks      | elapsed: 30.7min


[CV] ... C=100.0, epsilon=0.01, kernel=linear, score=0.024778 - 8.5min
[CV] C=100.0, epsilon=0.1, kernel=linear .............................
[CV] ... C=100.0, epsilon=0.01, kernel=linear, score=0.006091 -10.6min
[CV] C=100.0, epsilon=0.1, kernel=linear .............................
[CV] ... C=100.0, epsilon=0.01, kernel=linear, score=0.037562 - 8.7min
[CV] C=100.0, epsilon=0.1, kernel=linear .............................
[CV] ... C=100.0, epsilon=0.01, kernel=linear, score=0.108675 -10.2min
[CV] C=100.0, epsilon=0.1, kernel=linear .............................
[CV] ... C=100.0, epsilon=0.01, kernel=linear, score=0.164785 - 8.8min
[CV] C=100.0, epsilon=0.1, kernel=linear .............................
[CV] ... C=100.0, epsilon=0.01, kernel=linear, score=0.318026 -11.9min
[CV] C=100.0, epsilon=0.1, kernel=linear .............................
[CV] .. C=100.0, epsilon=0.01, kernel=linear, score=-0.167681 -10.7min
[CV] C=100.0, epsilon=0.1, kernel=linear .............................
[CV] .

[Parallel(n_jobs=10)]: Done 800 out of 800 | elapsed: 45.9min finished


GridSearchCV(cv=10, error_score='raise',
       estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=10,
       param_grid=[{'epsilon': array([ 0.0001,  0.001 ,  0.01  ,  0.1   ]), 'kernel': ['rbf', 'sigmoid'], 'gamma': array([ 0.01,  0.1 ]), 'C': array([   0.1,    1. ,   10. ,  100. ])}, {'epsilon': array([ 0.0001,  0.001 ,  0.01  ,  0.1   ]), 'kernel': ['linear'], 'C': array([   0.1,    1. ,   10. ,  100. ])}],
       pre_dispatch='2*n_jobs', refit=True,
       scoring=make_scorer(simple_spearman), verbose=3)

In [10]:
print(grid_search.best_estimator_)
print(grid_search.best_score_)
print(grid_search.best_params_)

SVR(C=0.10000000000000001, cache_size=200, coef0=0.0, degree=3,
  epsilon=0.10000000000000001, gamma='auto', kernel='linear', max_iter=-1,
  shrinking=True, tol=0.001, verbose=False)
0.09944778114260006
{'C': 0.10000000000000001, 'epsilon': 0.10000000000000001, 'kernel': 'linear'}


### GradientBoostingRegressor

In [11]:
from sklearn.ensemble import GradientBoostingRegressor

clf = GradientBoostingRegressor(n_estimators=50, max_depth=1, loss='ls')
param_grid = {'learning_rate': [0.1, 0.05, 0.02]}
grid_search = GridSearchCV(clf, param_grid=param_grid, \
                           scoring=spearmanr_scorer, cv=5, n_jobs=5, \
                           verbose=3)
grid_search.fit(features, labels)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] learning_rate=0.1 ...............................................
[CV] learning_rate=0.1 ...............................................
[CV] learning_rate=0.1 ...............................................
[CV] learning_rate=0.1 ...............................................
[CV] learning_rate=0.1 ...............................................
[CV] ...................... learning_rate=0.1, score=0.248317 -   0.1s
[CV] ...................... learning_rate=0.1, score=0.144253 -   0.1s
[CV] ...................... learning_rate=0.1, score=0.245060 -   0.2s
[CV] ...................... learning_rate=0.1, score=0.189653 -   0.2s
[CV] learning_rate=0.05 ..............................................
[CV] learning_rate=0.05 ..............................................
[CV] learning_rate=0.05 ..............................................
[CV] ...................... learning_rate=0.1, score=0.170426 -   0.2s
[CV] learning_rat

[Parallel(n_jobs=5)]: Done   6 out of  15 | elapsed:    0.3s remaining:    0.5s
[Parallel(n_jobs=5)]: Done  15 out of  15 | elapsed:    0.5s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=1, max_features=None,
             max_leaf_nodes=None, min_impurity_split=1e-07,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=50, presort='auto',
             random_state=None, subsample=1.0, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=5,
       param_grid={'learning_rate': [0.1, 0.05, 0.02]},
       pre_dispatch='2*n_jobs', refit=True,
       scoring=make_scorer(simple_spearman), verbose=3)

In [12]:
print(grid_search.best_estimator_)
print(grid_search.best_score_)
print(grid_search.best_params_)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=1, max_features=None,
             max_leaf_nodes=None, min_impurity_split=1e-07,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=50, presort='auto',
             random_state=None, subsample=1.0, verbose=0, warm_start=False)
0.19958327910902954
{'learning_rate': 0.1}


### LinearSVR

In [13]:
from sklearn.svm import LinearSVR

clf = LinearSVR()
param_grid = {'epsilon': epsilon, 'C': cc}
grid_search = GridSearchCV(clf, param_grid=param_grid, \
                           scoring=spearmanr_scorer, cv=5, n_jobs=5, \
                           verbose=3)
grid_search.fit(features, labels)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] C=0.1, epsilon=0.0001 ...........................................
[CV] C=0.1, epsilon=0.0001 ...........................................
[CV] C=0.1, epsilon=0.0001 ...........................................
[CV] C=0.1, epsilon=0.0001 ...........................................
[CV] C=0.1, epsilon=0.0001 ...........................................
[CV] .................. C=0.1, epsilon=0.0001, score=0.125395 -   0.1s
[CV] .................. C=0.1, epsilon=0.0001, score=0.031679 -   0.1s
[CV] .................. C=0.1, epsilon=0.0001, score=0.189885 -   0.1s
[CV] C=0.1, epsilon=0.001 ............................................
[CV] .................. C=0.1, epsilon=0.0001, score=0.292125 -   0.1s
[CV] C=0.1, epsilon=0.001 ............................................
[CV] C=0.1, epsilon=0.001 ............................................
[CV] C=0.1, epsilon=0.001 ............................................
[CV] ...........

[Parallel(n_jobs=5)]: Done  34 tasks      | elapsed:    1.3s


[CV] C=10.0, epsilon=0.0001 ..........................................
[CV] ................. C=10.0, epsilon=0.0001, score=0.072949 -   0.2s
[CV] C=10.0, epsilon=0.001 ...........................................
[CV] ................. C=10.0, epsilon=0.001, score=-0.004145 -   0.1s
[CV] C=10.0, epsilon=0.001 ...........................................
[CV] .................. C=10.0, epsilon=0.001, score=0.104063 -   0.2s
[CV] C=10.0, epsilon=0.001 ...........................................
[CV] ................. C=10.0, epsilon=0.0001, score=0.075683 -   0.2s
[CV] ................. C=10.0, epsilon=0.0001, score=0.082951 -   0.2s
[CV] C=10.0, epsilon=0.01 ............................................
[CV] C=10.0, epsilon=0.01 ............................................
[CV] .................. C=10.0, epsilon=0.001, score=0.262149 -   0.2s
[CV] .................. C=10.0, epsilon=0.001, score=0.060382 -   0.2s
[CV] C=10.0, epsilon=0.01 ............................................
[CV] C

[Parallel(n_jobs=5)]: Done  80 out of  80 | elapsed:    2.9s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000,
     random_state=None, tol=0.0001, verbose=0),
       fit_params={}, iid=True, n_jobs=5,
       param_grid={'epsilon': array([ 0.0001,  0.001 ,  0.01  ,  0.1   ]), 'C': array([   0.1,    1. ,   10. ,  100. ])},
       pre_dispatch='2*n_jobs', refit=True,
       scoring=make_scorer(simple_spearman), verbose=3)

In [14]:
print(grid_search.best_estimator_)
print(grid_search.best_score_)
print(grid_search.best_params_)

LinearSVR(C=0.10000000000000001, dual=True, epsilon=0.10000000000000001,
     fit_intercept=True, intercept_scaling=1.0, loss='epsilon_insensitive',
     max_iter=1000, random_state=None, tol=0.0001, verbose=0)
0.18506907047874463
{'C': 0.10000000000000001, 'epsilon': 0.10000000000000001}


### Train and Save Prediction

In [16]:
for emotion in ['joy','anger','sadness','fear']:
    model = svm.LinearSVR(epsilon=0.1, C=0.1)
      
    tweets_train = pd.read_csv('data/' + emotion + '-ratings-0to1.train.txt', sep='\t', names=['id','tweet','emotion','score'], index_col='id')
    tweets_dev = pd.read_csv('data/' + emotion + '-ratings-0to1.dev.gold.txt', sep='\t', names=['id','tweet','emotion','score'], index_col='id')
    tweets_train.append(tweets_dev)
    features = liwc_class.build_features(tweets_train.tweet)
    labels = tweets_train.score
    model.fit(features, labels)

    tweets_test = pd.read_csv('data/' + emotion + '-ratings-0to1.test.target.txt', sep='\t', names=['id','tweet','emotion','score'], index_col='id')
    test_features = liwc_class.build_features(tweets_test.tweet)
    tweets_test['score'] = model.predict(test_features)
    tweets_test.to_csv('submission-svm/' + emotion + '-pred.txt', sep='\t', header=False)