In [1]:
import pandas as pd
from liwc import LIWC
from sklearn import svm
from sklearn.grid_search import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from scipy.stats import spearmanr
import numpy as np
from time import time

tweets_train = pd.read_csv('data/anger-ratings-0to1.train.txt', sep='\t', names=['id','tweet','emotion','score'], index_col='id')
tweets_test = pd.read_csv('data/anger-ratings-0to1.dev.target.txt', sep='\t', names=['id','tweet','emotion','score'], index_col='id')
liwc_class = LIWC()



In [2]:
from sklearn.metrics import make_scorer
def simple_spearman(x,y): return spearmanr(x,y)[0]
spearmanr_scorer = make_scorer(simple_spearman)

### SVM Train

In [3]:
features = liwc_class.build_features(tweets_train.tweet)
features

array([[  4.,   1.,   1., ...,   0.,   0.,   0.],
       [ 11.,   6.,   6., ...,   0.,   0.,   0.],
       [  8.,   4.,   4., ...,   0.,   0.,   0.],
       ..., 
       [  0.,   0.,   0., ...,   0.,   0.,   0.],
       [  6.,   3.,   3., ...,   0.,   0.,   0.],
       [  4.,   2.,   2., ...,   0.,   0.,   0.]])

In [4]:
labels = tweets_train.score
model = svm.SVR()
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.3, random_state=1)
model.fit(X_train, y_train)
scores = cross_val_score(model, X_test, y_test, cv=5, scoring=spearmanr_scorer)
scores

array([-0.13835567, -0.09737484, -0.07454812,  0.22508702, -0.12807968])

### Grid Search

In [5]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.3, random_state=1)
parameters = {'kernel':('linear','rbf')}

svr = svm.SVR()
clf = GridSearchCV(svr, parameters)
clf.fit(X_train, y_train)
scores = cross_val_score(clf, X_test, y_test, cv=5, scoring=spearmanr_scorer)
scores

array([ 0.03925187, -0.01563636, -0.14625448, -0.08287429, -0.1569095 ])

### Full Grid Search SVM

In [6]:
epsilon = np.logspace(-4,-1,4)
cc = np.logspace(-1, 2, 4)
gamma = np.logspace(-2,-1,2)

param_grid = [
    {'epsilon': epsilon, 'kernel': ['rbf', 'sigmoid'], 'gamma': gamma, 'C': cc},
    {'epsilon': epsilon, 'kernel': ['linear'], 'C': cc},
]

grid_search = GridSearchCV(svm.SVR(), param_grid=param_grid, \
                           scoring=spearmanr_scorer, cv=10, n_jobs=10, \
                           verbose=3)
start = time()

grid_search.fit(features, labels)

Fitting 10 folds for each of 80 candidates, totalling 800 fits
[CV] epsilon=0.0001, gamma=0.01, C=0.1, kernel=rbf ...................
[CV] epsilon=0.0001, gamma=0.01, C=0.1, kernel=rbf ...................
[CV] epsilon=0.0001, gamma=0.01, C=0.1, kernel=rbf ...................
[CV] epsilon=0.0001, gamma=0.01, C=0.1, kernel=rbf ...................
[CV] epsilon=0.0001, gamma=0.01, C=0.1, kernel=rbf ...................
[CV] epsilon=0.0001, gamma=0.01, C=0.1, kernel=rbf ...................
[CV] epsilon=0.0001, gamma=0.01, C=0.1, kernel=rbf ...................
[CV] epsilon=0.0001, gamma=0.01, C=0.1, kernel=rbf ...................
[CV] epsilon=0.0001, gamma=0.01, C=0.1, kernel=rbf ...................
[CV] epsilon=0.0001, gamma=0.01, C=0.1, kernel=rbf ...................
[CV]  epsilon=0.0001, gamma=0.01, C=0.1, kernel=rbf, score=-0.299291 -   0.2s
[CV]  epsilon=0.0001, gamma=0.01, C=0.1, kernel=rbf, score=-0.107314 -   0.2s
[CV] epsilon=0.0001, gamma=0.01, C=0.1, kernel=sigmoid ...............


[Parallel(n_jobs=10)]: Done  12 tasks      | elapsed:    0.4s


[CV]  epsilon=0.0001, gamma=0.1, C=0.1, kernel=rbf, score=0.211252 -   0.2s
[CV] epsilon=0.0001, gamma=0.1, C=0.1, kernel=rbf ....................
[CV]  epsilon=0.0001, gamma=0.1, C=0.1, kernel=sigmoid, score=0.006289 -   0.1s
[CV] epsilon=0.0001, gamma=0.1, C=0.1, kernel=sigmoid ................
[CV]  epsilon=0.0001, gamma=0.1, C=0.1, kernel=rbf, score=-0.043149 -   0.1s
[CV]  epsilon=0.0001, gamma=0.1, C=0.1, kernel=rbf, score=-0.132176 -   0.3s
[CV] epsilon=0.001, gamma=0.01, C=0.1, kernel=rbf ....................
[CV] epsilon=0.0001, gamma=0.1, C=0.1, kernel=rbf ....................
[CV]  epsilon=0.0001, gamma=0.1, C=0.1, kernel=sigmoid, score=0.038043 -   0.2s
[CV]  epsilon=0.0001, gamma=0.1, C=0.1, kernel=sigmoid, score=0.075043 -   0.1s
[CV]  epsilon=0.0001, gamma=0.1, C=0.1, kernel=rbf, score=0.134187 -   0.3s
[CV] epsilon=0.0001, gamma=0.1, C=0.1, kernel=rbf ....................
[CV]  epsilon=0.0001, gamma=0.1, C=0.1, kernel=rbf, score=-0.199484 -   0.3s
[CV]  epsilon=0.0001, 

[Parallel(n_jobs=10)]: Done 196 tasks      | elapsed:    4.4s


[CV] epsilon=0.001, gamma=0.01, C=1.0, kernel=sigmoid ................
[CV]  epsilon=0.001, gamma=0.01, C=1.0, kernel=rbf, score=0.124772 -   0.3s
[CV] epsilon=0.001, gamma=0.01, C=1.0, kernel=rbf ....................
[CV]  epsilon=0.001, gamma=0.01, C=1.0, kernel=rbf, score=-0.056657 -   0.2s
[CV] epsilon=0.001, gamma=0.1, C=1.0, kernel=rbf .....................
[CV]  epsilon=0.001, gamma=0.01, C=1.0, kernel=sigmoid, score=0.011765 -   0.1s
[CV] epsilon=0.001, gamma=0.1, C=1.0, kernel=rbf .....................
[CV]  epsilon=0.001, gamma=0.01, C=1.0, kernel=rbf, score=-0.091072 -   0.2s
[CV] epsilon=0.001, gamma=0.1, C=1.0, kernel=rbf .....................
[CV]  epsilon=0.001, gamma=0.01, C=1.0, kernel=sigmoid, score=0.031796 -   0.2s
[CV]  epsilon=0.001, gamma=0.01, C=1.0, kernel=sigmoid, score=0.169312 -   0.2s
[CV] epsilon=0.001, gamma=0.1, C=1.0, kernel=rbf .....................
[CV]  epsilon=0.001, gamma=0.01, C=1.0, kernel=rbf, score=-0.333954 -   0.3s
[CV]  epsilon=0.001, gamma=

[Parallel(n_jobs=10)]: Done 516 tasks      | elapsed:   14.7s


[CV]  epsilon=0.0001, gamma=0.1, C=100.0, kernel=rbf, score=-0.055128 -   1.5s
[CV]  epsilon=0.001, gamma=0.01, C=100.0, kernel=sigmoid, score=0.169312 -   0.2s
[CV] epsilon=0.001, gamma=0.1, C=100.0, kernel=rbf ...................
[CV]  epsilon=0.001, gamma=0.01, C=100.0, kernel=sigmoid, score=0.060866 -   0.2s
[CV] epsilon=0.001, gamma=0.1, C=100.0, kernel=rbf ...................
[CV]  epsilon=0.001, gamma=0.01, C=100.0, kernel=rbf, score=0.025955 -   0.6s
[CV] epsilon=0.001, gamma=0.01, C=100.0, kernel=sigmoid ..............
[CV]  epsilon=0.001, gamma=0.01, C=100.0, kernel=sigmoid, score=-0.025822 -   0.2s
[CV] epsilon=0.001, gamma=0.1, C=100.0, kernel=rbf ...................
[CV] epsilon=0.001, gamma=0.1, C=100.0, kernel=rbf ...................
[CV]  epsilon=0.001, gamma=0.01, C=100.0, kernel=sigmoid, score=0.043561 -   0.3s
[CV]  epsilon=0.001, gamma=0.01, C=100.0, kernel=rbf, score=-0.099598 -   1.0s
[CV] epsilon=0.001, gamma=0.01, C=100.0, kernel=sigmoid ..............
[CV]  eps

[Parallel(n_jobs=10)]: Done 757 tasks      | elapsed:  1.2min


[CV] .... epsilon=0.1, C=10.0, kernel=linear, score=-0.048364 -   8.9s
[CV] epsilon=0.0001, C=100.0, kernel=linear ..........................
[CV] .... epsilon=0.1, C=10.0, kernel=linear, score=-0.031486 -   9.7s
[CV] epsilon=0.0001, C=100.0, kernel=linear ..........................
[CV] ..... epsilon=0.1, C=10.0, kernel=linear, score=0.117422 -   7.9s
[CV] epsilon=0.0001, C=100.0, kernel=linear ..........................
[CV] . epsilon=0.0001, C=100.0, kernel=linear, score=0.199668 -  58.1s
[CV] epsilon=0.001, C=100.0, kernel=linear ...........................
[CV] . epsilon=0.0001, C=100.0, kernel=linear, score=0.026355 - 1.5min
[CV] epsilon=0.001, C=100.0, kernel=linear ...........................
[CV] . epsilon=0.0001, C=100.0, kernel=linear, score=0.050170 - 1.5min
[CV] epsilon=0.001, C=100.0, kernel=linear ...........................
[CV] . epsilon=0.0001, C=100.0, kernel=linear, score=0.058725 - 1.6min
[CV] epsilon=0.001, C=100.0, kernel=linear ...........................
[CV]  

[Parallel(n_jobs=10)]: Done 800 out of 800 | elapsed:  9.2min finished


GridSearchCV(cv=10, error_score='raise',
       estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=10,
       param_grid=[{'epsilon': array([ 0.0001,  0.001 ,  0.01  ,  0.1   ]), 'gamma': array([ 0.01,  0.1 ]), 'C': array([   0.1,    1. ,   10. ,  100. ]), 'kernel': ['rbf', 'sigmoid']}, {'epsilon': array([ 0.0001,  0.001 ,  0.01  ,  0.1   ]), 'C': array([   0.1,    1. ,   10. ,  100. ]), 'kernel': ['linear']}],
       pre_dispatch='2*n_jobs', refit=True,
       scoring=make_scorer(simple_spearman), verbose=3)

In [7]:
print(grid_search.best_estimator_)
print(grid_search.best_score_)
print(grid_search.best_params_)

SVR(C=0.10000000000000001, cache_size=200, coef0=0.0, degree=3,
  epsilon=0.0001, gamma=0.01, kernel='sigmoid', max_iter=-1,
  shrinking=True, tol=0.001, verbose=False)
0.054604609616613856
{'epsilon': 0.0001, 'gamma': 0.01, 'C': 0.10000000000000001, 'kernel': 'sigmoid'}


### GradientBoostingRegressor

In [8]:
from sklearn.ensemble import GradientBoostingRegressor

clf = GradientBoostingRegressor(n_estimators=50, max_depth=1, loss='ls')
param_grid = {'learning_rate': [0.1, 0.05, 0.02]}
grid_search = GridSearchCV(clf, param_grid=param_grid, \
                           scoring=spearmanr_scorer, cv=5, n_jobs=5, \
                           verbose=3)
grid_search.fit(features, labels)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] learning_rate=0.1 ...............................................
[CV] learning_rate=0.1 ...............................................
[CV] learning_rate=0.1 ...............................................
[CV] learning_rate=0.1 ...............................................
[CV] learning_rate=0.1 ...............................................
[CV] ...................... learning_rate=0.1, score=0.021092 -   0.0s
[CV] ..................... learning_rate=0.1, score=-0.104335 -   0.0s
[CV] learning_rate=0.05 ..............................................
[CV] ..................... learning_rate=0.1, score=-0.116172 -   0.0s
[CV] ...................... learning_rate=0.1, score=0.022159 -   0.0s
[CV] learning_rate=0.05 ..............................................
[CV] ...................... learning_rate=0.1, score=0.014715 -   0.0s
[CV] .................... learning_rate=0.05, score=-0.051568 -   0.0s
[CV] learning_rat

[Parallel(n_jobs=5)]: Done   6 out of  15 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=5)]: Done  15 out of  15 | elapsed:    0.1s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=1, max_features=None,
             max_leaf_nodes=None, min_impurity_split=1e-07,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=50, presort='auto',
             random_state=None, subsample=1.0, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=5,
       param_grid={'learning_rate': [0.1, 0.05, 0.02]},
       pre_dispatch='2*n_jobs', refit=True,
       scoring=make_scorer(simple_spearman), verbose=3)

In [9]:
print(grid_search.best_estimator_)
print(grid_search.best_score_)
print(grid_search.best_params_)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.05, loss='ls', max_depth=1, max_features=None,
             max_leaf_nodes=None, min_impurity_split=1e-07,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=50, presort='auto',
             random_state=None, subsample=1.0, verbose=0, warm_start=False)
-0.015690007630196748
{'learning_rate': 0.05}


### Train and Save Prediction

In [11]:
for emotion in ['joy','anger','sadness','fear']:
    model = svm.SVR(kernel='sigmoid', epsilon=0.0001, C=0.1)
    tweets_train = pd.read_csv('data/' + emotion + '-ratings-0to1.train.txt', sep='\t', names=['id','tweet','emotion','score'], index_col='id')
    features = liwc_class.build_features(tweets_train.tweet)
    labels = tweets_train.score
    model.fit(features, labels)

    tweets_test = pd.read_csv('data/' + emotion + '-ratings-0to1.dev.target.txt', sep='\t', names=['id','tweet','emotion','score'], index_col='id')
    test_features = liwc_class.build_features(tweets_test.tweet)
    tweets_test['score'] = model.predict(test_features)
    tweets_test.to_csv('submission-svm/' + emotion + '-pred.txt', sep='\t', header=False)