In [76]:
import pandas as pd
from sklearn import svm
from sklearn.svm import SVR, LinearSVR
from sklearn.linear_model import LinearRegression
from sklearn import cross_validation
from sklearn import grid_search
from sklearn.grid_search import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVR
from sklearn.cross_validation import train_test_split, cross_val_score
from scipy.stats import spearmanr
import numpy as np
from time import time

tweets = pd.read_csv('data/anger-ratings-0to1.train.txt', sep='\t', names=['id','tweet','emotion','score'], index_col='id')

In [77]:
from sklearn.metrics import make_scorer
def simple_spearman(x,y): return spearmanr(x,y)[0]
spearmanr_scorer = make_scorer(simple_spearman)

In [78]:
labels = tweets.score
features = TfidfVectorizer().fit_transform(tweets.tweet)
model = svm.SVR()
X_train, X_test, y_train, y_test = cross_validation.train_test_split(features, labels, test_size=0.3, random_state=1)
model.fit(X_train, y_train)
scores = cross_validation.cross_val_score(model, X_test, y_test, cv=5, scoring=spearmanr_scorer)
scores

array([ 0.24269246,  0.34029842,  0.18334481,  0.2366946 ,  0.40252897])

In [79]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(features, labels, test_size=0.3, random_state=1)
parameters = {'kernel':('linear','rbf')}

svr = svm.SVR()
clf = grid_search.GridSearchCV(svr, parameters)
clf.fit(X_train, y_train)
scores = cross_validation.cross_val_score(clf, X_test, y_test, cv=5, scoring=spearmanr_scorer)
scores

array([ 0.31115404,  0.17925061,  0.30383709,  0.21488841,  0.57304755])

In [80]:
param_grid = [
    {'epsilon':[10**i for i in range(-4,-1)]},
]
grid_search = GridSearchCV(LinearSVR(), param_grid=param_grid, scoring=spearmanr_scorer, cv=5, n_jobs=5, verbose=3)
start = time()
grid_search.fit(features, labels)
print("GridSearchCV took %.2f seconds for %d candidates" % (time() - start, len(grid_search.grid_scores_)))
grid_search.grid_scores_

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] epsilon=0.0001 ..................................................
[CV] epsilon=0.0001 ..................................................
[CV] epsilon=0.0001 ..................................................
[CV] epsilon=0.0001 ..................................................
[CV] epsilon=0.0001 ..................................................
[CV] ......................... epsilon=0.0001, score=0.098570 -   0.1s
[CV] epsilon=0.001 ...................................................
[CV] ......................... epsilon=0.0001, score=0.226878 -   0.1s
[CV] ......................... epsilon=0.0001, score=0.270985 -   0.1s
[CV] ......................... epsilon=0.0001, score=0.160148 -   0.1s
[CV] epsilon=0.001 ...................................................
[CV] epsilon=0.001 ...................................................
[CV] ......................... epsilon=0.0001, score=0.049899 -   0.1s
[CV] epsilon=0.00

[Parallel(n_jobs=5)]: Done   6 out of  15 | elapsed:    0.1s remaining:    0.2s
[Parallel(n_jobs=5)]: Done  15 out of  15 | elapsed:    0.3s finished


GridSearchCV took 0.44 seconds for 3 candidates


[mean: 0.16124, std: 0.08084, params: {'epsilon': 0.0001},
 mean: 0.16112, std: 0.08182, params: {'epsilon': 0.001},
 mean: 0.15756, std: 0.09378, params: {'epsilon': 0.01}]