In [1]:
%matplotlib inline
import numpy as np
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR
import pandas as pd
import seaborn as sns
# Seaborn look-and-feel for every figure drawn in this notebook.
sns.set_style("whitegrid")
sns.set_context("poster")

# pandas display options, applied one (name, value) pair at a time.
for option_name, option_value in (
    ('display.width', 500),
    ('display.max_columns', 100),
    ('display.notebook_repr_html', True),
):
    pd.set_option(option_name, option_value)



In [3]:
def cv_optimize(clf, parameters, X, y, n_jobs=1, score_func=None, verbose=0):
    """Grid-search ``parameters`` for ``clf`` and return the best estimator.

    Parameters
    ----------
    clf : estimator
        Estimator handed to ``GridSearchCV``.
    parameters : dict or list of dicts
        Forwarded to ``GridSearchCV`` as ``param_grid``.
    X, y : array-like
        Data passed to ``GridSearchCV.fit``.
    n_jobs : int, optional
        Forwarded to ``GridSearchCV`` unchanged.
    score_func : str, callable or None, optional
        Forwarded to ``GridSearchCV`` as ``scoring``. Any falsy value is
        passed on as ``None``, which is ``GridSearchCV``'s own default.
    verbose : int, optional
        Forwarded to ``GridSearchCV`` unchanged.

    Returns
    -------
    The ``best_estimator_`` attribute of the fitted search.
    """
    # The search object used to be created only when ``score_func`` was
    # truthy, so a call relying on the default (``score_func=None``) failed
    # with UnboundLocalError at ``gs.fit``. Build it unconditionally instead.
    gs = GridSearchCV(clf, param_grid=parameters, n_jobs=n_jobs,
                      scoring=score_func or None, verbose=verbose)
    gs.fit(X, y)
    # Single-argument print calls emit the same text under Python 2's print
    # statement rules and are also valid Python 3.
    print("BEST %s %s %s %s" % (gs.best_params_, gs.best_score_,
                                gs.grid_scores_, gs.scorer_))
    print("Best score:  %s" % (gs.best_score_,))
    return gs.best_estimator_

In [4]:
# Read ../data/final.csv (relative to the notebook's working directory) into
# dftaxi and show its (rows, columns) shape as a quick sanity check.
dftaxi = pd.read_csv("../data/final.csv")
print(dftaxi.shape)

(23409, 19)


In [6]:
# Randomly assign 60% of the row positions of dftaxi to the training set.
# A fixed seed keeps the split (and every score computed from it) identical
# on each Restart & Run All; np.arange replaces the Python-2-only xrange.
SPLIT_SEED = 42
itrain, itest = train_test_split(np.arange(dftaxi.shape[0]), train_size=0.6,
                                 random_state=SPLIT_SEED)

# Boolean row mask: True marks a training row, False a test row.
# Built directly as booleans instead of filling an int array with ones,
# overwriting it, and comparing against 1.
mask = np.zeros(dftaxi.shape[0], dtype=bool)
mask[itrain] = True

In [7]:
# Feature matrix: the columns of dftaxi at positions 3 through 17, selected
# by position rather than by name.
X = dftaxi.iloc[:,3:18]
# Regression target: log10 of (freq + 1). The +1 presumably keeps rows with
# freq == 0 finite under the logarithm -- TODO confirm such rows exist.
y = np.log10(dftaxi['freq']+1)

In [8]:
# Rows where mask is True form the training split; the remaining rows form
# the test split. Features and target are sliced with the same mask so they
# stay aligned.
Xtrain, ytrain = X[mask], y[mask]
Xtest, ytest = X[~mask], y[~mask]

# Show the training shape, then preview the first training rows.
print(Xtrain.shape)
Xtrain.head()

(14045, 15)


Unnamed: 0,hour,weekday,ZIPCODE,Battery Park City,Commerical,Manufacturing,Manufacturing/Residential,Park,Residential,precipitation,snow_depth,snowfall,max_temp,min_temp,avg_wind
1,12,6,10035,0.0,12.0,0.0,7.0,13.0,16.0,0.0,0.0,0.0,9.0,7.4,1.62
4,17,0,10035,0.0,12.0,0.0,7.0,13.0,16.0,0.0,0.0,0.0,9.4,8.2,1.62
5,8,4,10035,0.0,12.0,0.0,7.0,13.0,16.0,0.0,0.0,0.0,8.7,7.0,0.72
7,10,3,10035,0.0,12.0,0.0,7.0,13.0,16.0,0.0,0.0,0.0,8.0,6.9,0.396
8,18,4,10035,0.0,12.0,0.0,7.0,13.0,16.0,0.0,0.0,0.0,8.2,6.6,1.044


In [9]:
# Grid of candidate values for the SVR penalty parameter C: 1, 3, 5, 7, 9.
parameters = {'C': list(range(1, 10, 2))}

In [10]:
# Support-vector regressor created with the library's default settings; the
# candidate values for C are supplied separately through the `parameters` grid.
svr = SVR()

In [None]:
# Search the C grid for the SVR using the training split only, scoring each
# candidate with 'neg_mean_squared_error' and logging every fit (verbose=3).
best = cv_optimize(
    svr,
    parameters,
    Xtrain,
    ytrain,
    score_func='neg_mean_squared_error',
    verbose=3,
)

Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV] C=1 .............................................................
[CV] ................................... C=1, score=-0.760979 -   4.5s
[CV] C=1 .............................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.5s remaining:    0.0s


[CV] ................................... C=1, score=-0.341760 -   3.7s
[CV] C=1 .............................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    8.2s remaining:    0.0s


[CV] ................................... C=1, score=-1.229924 -   3.9s
[CV] C=3 .............................................................
[CV] ................................... C=3, score=-0.757792 -   5.6s
[CV] C=3 .............................................................
[CV] ................................... C=3, score=-0.341775 -   4.8s
[CV] C=3 .............................................................
[CV] ................................... C=3, score=-1.231158 -   5.7s
[CV] C=5 .............................................................
[CV] ................................... C=5, score=-0.758246 -   6.6s
[CV] C=5 .............................................................
[CV] ................................... C=5, score=-0.341844 -   5.8s
[CV] C=5 .............................................................
[CV] ................................... C=5, score=-1.231040 -   6.2s
[CV] C=7 .............................................................
[CV] .

[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:  1.5min finished


In [None]:
# Fit the selected estimator on the training split and score it on both
# splits. For a regressor, .score() reports R^2, as the printed labels say;
# the *_accuracy variable names are kept as they were.
reg = best.fit(Xtrain, ytrain)
training_accuracy = reg.score(Xtrain, ytrain)
test_accuracy = reg.score(Xtest, ytest)

# Emit the three report lines in one call; the text is unchanged.
print("\n".join((
    "############# based on standard predict ################",
    "R^2 on training data: %0.4f" % training_accuracy,
    "R^2 on test data:     %0.4f" % test_accuracy,
)))

In [None]:
# Pair every test-set prediction with its observed target value, then draw
# them against each other. Both axes are on the log10(freq + 1) scale of y.
preds = pd.DataFrame({
    "prediction": reg.predict(Xtest),
    "observation": ytest,
})
preds.plot(
    kind="scatter",
    x="prediction",
    y="observation",
    title="Prediction pickup numbers V.S. Observation pickup numbers",
)