In [11]:
import pandas as pd
from liwc import LIWC
from sklearn import svm
from sklearn.svm import SVR, LinearSVR
from sklearn.linear_model import LinearRegression
from sklearn import grid_search
from sklearn.grid_search import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split, cross_val_score
from scipy.stats import spearmanr
import numpy as np
from time import time

tweets_train = pd.read_csv('data/anger-ratings-0to1.train.txt', sep='\t', names=['id','tweet','emotion','score'], index_col='id')
tweets_test = pd.read_csv('data/anger-ratings-0to1.dev.target.txt', sep='\t', names=['id','tweet','emotion','score'], index_col='id')
liwc_class = LIWC()

In [12]:
from sklearn.metrics import make_scorer
def simple_spearman(x,y): return spearmanr(x,y)[0]
spearmanr_scorer = make_scorer(simple_spearman)

### SVM Train

In [18]:
features = liwc_class.build_features(tweets_train.tweet)
features

array([[  4.,   1.,   1., ...,   0.,   0.,   0.],
       [ 11.,   6.,   6., ...,   0.,   0.,   0.],
       [  8.,   4.,   4., ...,   0.,   0.,   0.],
       ..., 
       [  0.,   0.,   0., ...,   0.,   0.,   0.],
       [  6.,   3.,   3., ...,   0.,   0.,   0.],
       [  4.,   2.,   2., ...,   0.,   0.,   0.]])

In [14]:
labels = tweets_train.score
#features = TfidfVectorizer().fit_transform(tweets_train.tweet)
model = svm.SVR()
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.3, random_state=1)
model.fit(X_train, y_train)
scores = cross_val_score(model, X_test, y_test, cv=5, scoring=spearmanr_scorer)
scores

array([-0.13835567, -0.09737484, -0.07454812,  0.22508702, -0.12807968])

### Grid Search

In [15]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.3, random_state=1)
parameters = {'kernel':('linear','rbf')}

svr = svm.SVR()
clf = grid_search.GridSearchCV(svr, parameters)
clf.fit(X_train, y_train)
scores = cross_val_score(clf, X_test, y_test, cv=5, scoring=spearmanr_scorer)
scores

array([ 0.03925187, -0.01563636, -0.14625448, -0.08287429, -0.1569095 ])

### Full Grid Search

In [16]:
param_grid = [
    {'epsilon':[10**i for i in range(-4,-1)]},
]
grid_search = GridSearchCV(LinearSVR(), param_grid=param_grid, scoring=spearmanr_scorer, cv=5, n_jobs=5, verbose=3)
start = time()
grid_search.fit(features, labels)
print("GridSearchCV took %.2f seconds for %d candidates" % (time() - start, len(grid_search.grid_scores_)))
grid_search.grid_scores_

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] epsilon=0.0001 ..................................................
[CV] epsilon=0.0001 ..................................................
[CV] epsilon=0.0001 ..................................................
[CV] epsilon=0.0001 ..................................................
[CV] epsilon=0.0001 ..................................................
[CV] ........................ epsilon=0.0001, score=-0.116101 -   0.0s
[CV] epsilon=0.001 ...................................................
[CV] ......................... epsilon=0.0001, score=0.022865 -   0.1s
[CV] epsilon=0.001 ...................................................
[CV] ........................ epsilon=0.0001, score=-0.085778 -   0.1s
[CV] ......................... epsilon=0.0001, score=0.058363 -   0.1s
[CV] ......................... epsilon=0.0001, score=0.067240 -   0.1s
[CV] epsilon=0.001 ...................................................
[CV] ............

[Parallel(n_jobs=5)]: Done   6 out of  15 | elapsed:    0.1s remaining:    0.2s
[Parallel(n_jobs=5)]: Done  15 out of  15 | elapsed:    0.3s finished


GridSearchCV took 0.51 seconds for 3 candidates


[mean: -0.01056, std: 0.07579, params: {'epsilon': 0.0001},
 mean: -0.01621, std: 0.08201, params: {'epsilon': 0.001},
 mean: -0.04385, std: 0.06140, params: {'epsilon': 0.01}]

### Predict Test Set

In [19]:
test_features = liwc_class.build_features(tweets_test.tweet)
tweets_test['scores'] = clf.predict(test_features)

In [20]:
tweets_test.to_csv('submission-svm/anger-pred.txt', sep='\t', header=False)