In [1]:
import pandas as pd
from liwc import LIWC
from sklearn import svm
from sklearn.grid_search import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from scipy.stats import spearmanr
import numpy as np
from time import time

tweets_train = pd.read_csv('data/anger-ratings-0to1.train.txt', sep='\t', names=['id','tweet','emotion','score'], index_col='id')
tweets_test = pd.read_csv('data/anger-ratings-0to1.dev.target.txt', sep='\t', names=['id','tweet','emotion','score'], index_col='id')
liwc_class = LIWC()



In [2]:
from sklearn.metrics import make_scorer
def simple_spearman(x,y): return spearmanr(x,y)[0]
spearmanr_scorer = make_scorer(simple_spearman)

### SVM Train

In [3]:
features = liwc_class.build_features(tweets_train.tweet)
features

array([[ 5.,  1.,  1., ...,  0.,  0.,  0.],
       [ 8.,  2.,  1., ...,  0.,  0.,  0.],
       [ 9.,  2.,  2., ...,  0.,  0.,  0.],
       ..., 
       [ 2.,  0.,  0., ...,  0.,  0.,  0.],
       [ 4.,  0.,  0., ...,  0.,  0.,  0.],
       [ 5.,  1.,  1., ...,  0.,  0.,  0.]])

In [4]:
labels = tweets_train.score
model = svm.SVR()
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.3, random_state=1)
model.fit(X_train, y_train)
scores = cross_val_score(model, X_test, y_test, cv=5, scoring=spearmanr_scorer)
scores

array([ 0.31794891,  0.16711541,  0.22282526,  0.30894308,  0.32912688])

### Grid Search

In [5]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.3, random_state=1)
parameters = {'kernel':('linear','rbf')}

svr = svm.SVR()
clf = GridSearchCV(svr, parameters)
clf.fit(X_train, y_train)
scores = cross_val_score(clf, X_test, y_test, cv=5, scoring=spearmanr_scorer)
scores

array([ 0.31794891,  0.16711541,  0.22282526,  0.30894308,  0.32912688])

### Full Grid Search SVM

In [6]:
epsilon = np.logspace(-4,-1,4)
cc = np.logspace(-1, 2, 4)
gamma = np.logspace(-2,-1,2)

param_grid = [
    {'epsilon': epsilon, 'kernel': ['rbf', 'sigmoid'], 'gamma': gamma, 'C': cc},
    {'epsilon': epsilon, 'kernel': ['linear'], 'C': cc},
]

grid_search = GridSearchCV(svm.SVR(), param_grid=param_grid, \
                           scoring=spearmanr_scorer, cv=10, n_jobs=10, \
                           verbose=3)
start = time()

grid_search.fit(features, labels)

Fitting 10 folds for each of 80 candidates, totalling 800 fits
[CV] C=0.1, kernel=rbf, epsilon=0.0001, gamma=0.01 ...................
[CV] C=0.1, kernel=rbf, epsilon=0.0001, gamma=0.01 ...................
[CV] C=0.1, kernel=rbf, epsilon=0.0001, gamma=0.01 ...................
[CV] C=0.1, kernel=rbf, epsilon=0.0001, gamma=0.01 ...................
[CV] C=0.1, kernel=rbf, epsilon=0.0001, gamma=0.01 ...................
[CV] C=0.1, kernel=rbf, epsilon=0.0001, gamma=0.01 ...................
[CV] C=0.1, kernel=rbf, epsilon=0.0001, gamma=0.01 ...................
[CV] C=0.1, kernel=rbf, epsilon=0.0001, gamma=0.01 ...................
[CV] C=0.1, kernel=rbf, epsilon=0.0001, gamma=0.01 ...................
[CV] C=0.1, kernel=rbf, epsilon=0.0001, gamma=0.01 ...................
[CV]  C=0.1, kernel=rbf, epsilon=0.0001, gamma=0.01, score=-0.009727 -   0.2s
[CV] C=0.1, kernel=sigmoid, epsilon=0.0001, gamma=0.01 ...............
[CV]  C=0.1, kernel=rbf, epsilon=0.0001, gamma=0.01, score=0.176475 -   0.3s
[

[Parallel(n_jobs=10)]: Done  12 tasks      | elapsed:    0.5s


[CV]  C=0.1, kernel=rbf, epsilon=0.0001, gamma=0.1, score=0.082800 -   0.2s
[CV] C=0.1, kernel=sigmoid, epsilon=0.0001, gamma=0.1 ................
[CV]  C=0.1, kernel=rbf, epsilon=0.0001, gamma=0.1, score=0.092326 -   0.2s
[CV] C=0.1, kernel=sigmoid, epsilon=0.0001, gamma=0.1 ................
[CV]  C=0.1, kernel=rbf, epsilon=0.0001, gamma=0.1, score=0.015670 -   0.2s
[CV]  C=0.1, kernel=rbf, epsilon=0.0001, gamma=0.1, score=0.113752 -   0.3s
[CV] C=0.1, kernel=sigmoid, epsilon=0.0001, gamma=0.1 ................
[CV] C=0.1, kernel=sigmoid, epsilon=0.0001, gamma=0.1 ................
[CV]  C=0.1, kernel=rbf, epsilon=0.0001, gamma=0.1, score=0.090073 -   0.2s
[CV]  C=0.1, kernel=rbf, epsilon=0.0001, gamma=0.1, score=-0.110698 -   0.2s
[CV]  C=0.1, kernel=rbf, epsilon=0.0001, gamma=0.1, score=0.067874 -   0.3s
[CV]  C=0.1, kernel=rbf, epsilon=0.0001, gamma=0.1, score=0.210089 -   0.2s
[CV] C=0.1, kernel=sigmoid, epsilon=0.0001, gamma=0.1 ................
[CV]  C=0.1, kernel=rbf, epsilon=0.0

[Parallel(n_jobs=10)]: Done 108 tasks      | elapsed:    2.9s


[CV]  C=0.1, kernel=sigmoid, epsilon=0.01, gamma=0.1, score=-0.035391 -   0.3s
[CV]  C=0.1, kernel=sigmoid, epsilon=0.01, gamma=0.1, score=-0.167268 -   0.2s
[CV] C=0.1, kernel=rbf, epsilon=0.1, gamma=0.01 ......................
[CV] C=0.1, kernel=rbf, epsilon=0.1, gamma=0.01 ......................
[CV]  C=0.1, kernel=sigmoid, epsilon=0.01, gamma=0.1, score=0.059532 -   0.2s
[CV] C=0.1, kernel=rbf, epsilon=0.1, gamma=0.01 ......................
[CV]  C=0.1, kernel=rbf, epsilon=0.1, gamma=0.01, score=0.212818 -   0.1s
[CV] C=0.1, kernel=rbf, epsilon=0.1, gamma=0.01 ......................
[CV]  C=0.1, kernel=sigmoid, epsilon=0.01, gamma=0.1, score=0.015942 -   0.3s
[CV]  C=0.1, kernel=sigmoid, epsilon=0.01, gamma=0.1, score=-0.069884 -   0.2s
[CV]  C=0.1, kernel=rbf, epsilon=0.1, gamma=0.01, score=0.014594 -   0.1s
[CV] C=0.1, kernel=rbf, epsilon=0.1, gamma=0.01 ......................
[CV]  C=0.1, kernel=sigmoid, epsilon=0.01, gamma=0.1, score=-0.164926 -   0.2s
[CV] C=0.1, kernel=rbf, e

[Parallel(n_jobs=10)]: Done 268 tasks      | elapsed:    7.1s


[CV]  C=1.0, kernel=rbf, epsilon=0.1, gamma=0.01, score=0.041572 -   0.1s
[CV]  C=1.0, kernel=sigmoid, epsilon=0.01, gamma=0.1, score=0.059340 -   0.3s
[CV]  C=1.0, kernel=sigmoid, epsilon=0.01, gamma=0.1, score=-0.071361 -   0.3s
[CV]  C=1.0, kernel=sigmoid, epsilon=0.01, gamma=0.1, score=-0.166830 -   0.2s
[CV]  C=1.0, kernel=sigmoid, epsilon=0.01, gamma=0.1, score=-0.165268 -   0.2s
[CV] C=1.0, kernel=rbf, epsilon=0.1, gamma=0.01 ......................
[CV] C=1.0, kernel=rbf, epsilon=0.1, gamma=0.01 ......................
[CV]  C=1.0, kernel=sigmoid, epsilon=0.01, gamma=0.1, score=0.107573 -   0.2s
[CV] C=1.0, kernel=rbf, epsilon=0.1, gamma=0.01 ......................
[CV] C=1.0, kernel=sigmoid, epsilon=0.1, gamma=0.01 ..................
[CV] C=1.0, kernel=rbf, epsilon=0.1, gamma=0.01 ......................
[CV] C=1.0, kernel=rbf, epsilon=0.1, gamma=0.01 ......................
[CV]  C=1.0, kernel=rbf, epsilon=0.1, gamma=0.01, score=0.221218 -   0.2s
[CV] C=1.0, kernel=sigmoid, epsil

[Parallel(n_jobs=10)]: Done 492 tasks      | elapsed:   15.5s


[CV]  C=100.0, kernel=rbf, epsilon=0.0001, gamma=0.1, score=0.137746 -   0.3s
[CV] C=100.0, kernel=sigmoid, epsilon=0.0001, gamma=0.1 ..............
[CV]  C=100.0, kernel=rbf, epsilon=0.0001, gamma=0.1, score=0.095584 -   0.3s
[CV]  C=100.0, kernel=rbf, epsilon=0.0001, gamma=0.1, score=0.116068 -   0.3s
[CV] C=100.0, kernel=sigmoid, epsilon=0.0001, gamma=0.1 ..............
[CV]  C=100.0, kernel=rbf, epsilon=0.0001, gamma=0.1, score=0.169010 -   0.2s
[CV] C=100.0, kernel=sigmoid, epsilon=0.0001, gamma=0.1 ..............
[CV] C=100.0, kernel=sigmoid, epsilon=0.0001, gamma=0.1 ..............
[CV]  C=100.0, kernel=rbf, epsilon=0.0001, gamma=0.1, score=0.062879 -   0.3s
[CV] C=100.0, kernel=sigmoid, epsilon=0.0001, gamma=0.1 ..............
[CV]  C=100.0, kernel=rbf, epsilon=0.0001, gamma=0.1, score=-0.022479 -   0.3s
[CV]  C=100.0, kernel=rbf, epsilon=0.0001, gamma=0.1, score=0.052066 -   0.3s
[CV] C=100.0, kernel=sigmoid, epsilon=0.0001, gamma=0.1 ..............
[CV]  C=100.0, kernel=rbf, 

[Parallel(n_jobs=10)]: Done 780 tasks      | elapsed: 20.4min


[CV] ... C=100.0, kernel=linear, epsilon=0.01, score=0.024778 - 5.7min
[CV] C=100.0, kernel=linear, epsilon=0.1 .............................
[CV] ... C=100.0, kernel=linear, epsilon=0.01, score=0.006091 - 7.1min
[CV] C=100.0, kernel=linear, epsilon=0.1 .............................
[CV] ... C=100.0, kernel=linear, epsilon=0.01, score=0.037562 - 5.8min
[CV] C=100.0, kernel=linear, epsilon=0.1 .............................
[CV] ... C=100.0, kernel=linear, epsilon=0.01, score=0.108675 - 6.7min
[CV] C=100.0, kernel=linear, epsilon=0.1 .............................
[CV] ... C=100.0, kernel=linear, epsilon=0.01, score=0.164785 - 5.8min
[CV] C=100.0, kernel=linear, epsilon=0.1 .............................
[CV] ... C=100.0, kernel=linear, epsilon=0.01, score=0.318026 - 7.9min
[CV] C=100.0, kernel=linear, epsilon=0.1 .............................
[CV] .. C=100.0, kernel=linear, epsilon=0.01, score=-0.167681 - 7.2min
[CV] C=100.0, kernel=linear, epsilon=0.1 .............................
[CV] .

[Parallel(n_jobs=10)]: Done 800 out of 800 | elapsed: 30.7min finished


GridSearchCV(cv=10, error_score='raise',
       estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=10,
       param_grid=[{'C': array([   0.1,    1. ,   10. ,  100. ]), 'kernel': ['rbf', 'sigmoid'], 'epsilon': array([ 0.0001,  0.001 ,  0.01  ,  0.1   ]), 'gamma': array([ 0.01,  0.1 ])}, {'C': array([   0.1,    1. ,   10. ,  100. ]), 'kernel': ['linear'], 'epsilon': array([ 0.0001,  0.001 ,  0.01  ,  0.1   ])}],
       pre_dispatch='2*n_jobs', refit=True,
       scoring=make_scorer(simple_spearman), verbose=3)

In [7]:
print(grid_search.best_estimator_)
print(grid_search.best_score_)
print(grid_search.best_params_)

SVR(C=0.10000000000000001, cache_size=200, coef0=0.0, degree=3,
  epsilon=0.10000000000000001, gamma='auto', kernel='linear', max_iter=-1,
  shrinking=True, tol=0.001, verbose=False)
0.09944778114260006
{'C': 0.10000000000000001, 'kernel': 'linear', 'epsilon': 0.10000000000000001}


### GradientBoostingRegressor

In [8]:
from sklearn.ensemble import GradientBoostingRegressor

clf = GradientBoostingRegressor(n_estimators=50, max_depth=1, loss='ls')
param_grid = {'learning_rate': [0.1, 0.05, 0.02]}
grid_search = GridSearchCV(clf, param_grid=param_grid, \
                           scoring=spearmanr_scorer, cv=5, n_jobs=5, \
                           verbose=3)
grid_search.fit(features, labels)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] learning_rate=0.1 ...............................................
[CV] learning_rate=0.1 ...............................................
[CV] learning_rate=0.1 ...............................................
[CV] learning_rate=0.1 ...............................................
[CV] learning_rate=0.1 ...............................................
[CV] ...................... learning_rate=0.1, score=0.189653 -   0.0s
[CV] ...................... learning_rate=0.1, score=0.144253 -   0.0s
[CV] learning_rate=0.05 ..............................................
[CV] ...................... learning_rate=0.1, score=0.245060 -   0.0s
[CV] learning_rate=0.05 ..............................................
[CV] ...................... learning_rate=0.1, score=0.170426 -   0.0s
[CV] learning_rate=0.05 ..............................................
[CV] ...................... learning_rate=0.1, score=0.248317 -   0.0s
[CV] learning_rat

[Parallel(n_jobs=5)]: Done   6 out of  15 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=5)]: Done  15 out of  15 | elapsed:    0.2s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=1, max_features=None,
             max_leaf_nodes=None, min_impurity_split=1e-07,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=50, presort='auto',
             random_state=None, subsample=1.0, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=5,
       param_grid={'learning_rate': [0.1, 0.05, 0.02]},
       pre_dispatch='2*n_jobs', refit=True,
       scoring=make_scorer(simple_spearman), verbose=3)

In [9]:
print(grid_search.best_estimator_)
print(grid_search.best_score_)
print(grid_search.best_params_)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=1, max_features=None,
             max_leaf_nodes=None, min_impurity_split=1e-07,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=50, presort='auto',
             random_state=None, subsample=1.0, verbose=0, warm_start=False)
0.19958327910902954
{'learning_rate': 0.1}


### Train and Save Prediction

In [11]:
for emotion in ['joy','anger','sadness','fear']:
    model = svm.SVR(kernel='linear', epsilon=0.1, C=0.1)
    tweets_train = pd.read_csv('data/' + emotion + '-ratings-0to1.train.txt', sep='\t', names=['id','tweet','emotion','score'], index_col='id')
    features = liwc_class.build_features(tweets_train.tweet)
    labels = tweets_train.score
    model.fit(features, labels)

    tweets_test = pd.read_csv('data/' + emotion + '-ratings-0to1.dev.target.txt', sep='\t', names=['id','tweet','emotion','score'], index_col='id')
    test_features = liwc_class.build_features(tweets_test.tweet)
    tweets_test['score'] = model.predict(test_features)
    tweets_test.to_csv('submission-svm/' + emotion + '-pred.txt', sep='\t', header=False)