In [1]:
import pandas as pd
import matplotlib as pl
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier  
from sklearn.tree import DecisionTreeRegressor 
from sklearn.svm import LinearSVR
from sklearn import svm
import numpy as np


In [2]:
# Read the training set from a CSV file located relative to the working directory.
data = pd.read_csv(filepath_or_buffer="bikes_train.csv")

In [3]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [4]:
# Parse the timestamp strings, then expose their calendar components as
# separate numeric columns.
data["datetime"] = pd.to_datetime(data["datetime"])
data = data.assign(
    dayofweek=data["datetime"].dt.dayofweek,  # Monday=0, Sunday=6
    year=data["datetime"].dt.year,
    month=data["datetime"].dt.month,
    day=data["datetime"].dt.day,
    hour=data["datetime"].dt.hour,
)

In [5]:
# Per-group rental counts; their sum is used as the regression target.
y_casual = data["casual"]
# Read from the "registered" column so that y_casual + y_registered is the
# total rental count rather than twice the casual count.
y_registered = data["registered"]

# Model inputs: the weather/calendar columns from the CSV plus the calendar
# fields derived from "datetime".
features = [
    "season", "holiday", "workingday", "weather",
    "temp", "atemp", "humidity", "windspeed",
    "dayofweek", "year", "month", "day", "hour",
]
X = data[features]

In [6]:
# Preview the first five rows of the feature matrix.
X.head(n=5)

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,dayofweek,year,month,day,hour
0,1,0,0,1,9.84,14.395,81,0.0,5,2011,1,1,0
1,1,0,0,1,9.02,13.635,80,0.0,5,2011,1,1,1
2,1,0,0,1,9.02,13.635,80,0.0,5,2011,1,1,2
3,1,0,0,1,9.84,14.395,75,0.0,5,2011,1,1,3
4,1,0,0,1,9.84,14.395,75,0.0,5,2011,1,1,4


In [7]:
# Hold out 20% of the rows for evaluation (seeded split); the target is the
# element-wise sum of the casual and registered series.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_casual.add(y_registered),
    test_size=0.2,
    random_state=42,
)

In [None]:
# Hyper-parameter grids: powers of two for C (2^-8 .. 2^10) and for
# epsilon (2^-12 .. 2^4).
Cvals = [2**exponent for exponent in range(-8, 11)]
Evals = [2**exponent for exponent in range(-12, 5)]
SVRparameters = {
    "C": Cvals,
    "loss": ["epsilon_insensitive", "squared_epsilon_insensitive"],
    "epsilon": Evals,
}

# Seed the estimator so that repeated runs of the search are reproducible,
# matching the seeded train/test split.
SVRegressor = GridSearchCV(
    estimator=LinearSVR(random_state=42),
    param_grid=SVRparameters,
    return_train_score=True,
    n_jobs=4,
)
SVRegressor.fit(X=X_train, y=y_train)
SVRegressor.best_estimator_

In [None]:
def rmsle(y_pred, y_true):
    """Root mean squared logarithmic error between predictions and targets.

    Parameters
    ----------
    y_pred : array-like
        Predicted values. Negative predictions are clipped to 0 before the
        logarithm is taken, so an unconstrained regressor cannot turn the
        score into NaN.
    y_true : array-like
        Observed values, same length as ``y_pred``.

    Returns
    -------
    float
        ``sqrt(mean((log(1 + y_pred) - log(1 + y_true)) ** 2))``.

    Raises
    ------
    AssertionError
        If the two inputs differ in length.
    """
    assert len(y_true) == len(y_pred), "y_pred and y_true must have the same length"
    # Plain float arrays make the subtraction positional; pandas objects
    # would otherwise be aligned on their index labels.
    predicted = np.clip(np.asarray(y_pred, dtype=float), 0, None)
    observed = np.asarray(y_true, dtype=float)
    # log1p(x) == log(1 + x), but stays accurate for values close to zero.
    log_diff = np.log1p(predicted) - np.log1p(observed)
    return np.sqrt(np.mean(log_diff ** 2))

In [None]:
# Predict on the held-out rows with the fitted search object and score the result.
y_pred = SVRegressor.predict(X_test)
rmsle(y_pred=y_pred, y_true=y_test)

In [None]:
# Observed target on the x-axis against the predicted value on the y-axis.
plt.scatter(x=y_test, y=y_pred)

In [None]:
# Persist the held-out predictions as a one-column CSV.
pd.DataFrame(data=y_pred).to_csv("y_pred.csv")

In [None]:
# Number of feature columns (not rows). The gamma grid is centred on
# 1 / n_features, so it must be sized by the column count; sizing it by the
# row count would produce thousands of candidate values.
nfeat = X_train.shape[1]
# Candidates 1/k for k in [nfeat/2, 2*nfeat); the lower bound is kept >= 1 to
# avoid a division by zero when there is a single feature.
GVals = [1 / divisor for divisor in range(max(1, nfeat // 2), 2 * nfeat)]

# Cvals and Evals are the C and epsilon grids defined for the linear search.
SVRparameters = {"C": Cvals, "epsilon": Evals, "gamma": GVals}
SVRegressor = GridSearchCV(
    estimator=svm.SVR(),
    param_grid=SVRparameters,
    return_train_score=True,
    n_jobs=4,
)
SVRegressor.fit(X=X_train, y=y_train)
SVRegressor.best_estimator_

In [None]:
# Re-predict the held-out rows with the currently fitted search object and score them.
y_pred = SVRegressor.predict(X_test)
rmsle(y_pred=y_pred, y_true=y_test)

In [None]:
# Observed target on the x-axis against the predicted value on the y-axis.
plt.scatter(x=y_test, y=y_pred)

In [None]:
# Persist the held-out predictions as a one-column CSV.
# NOTE(review): this is the same output path used by the earlier export cell,
# so that file is overwritten — confirm this is intended.
pd.DataFrame(data=y_pred).to_csv("y_pred.csv")