In [1]:
%matplotlib inline

import logging
import math
import time
import traceback
from datetime import datetime

import cPickle as pickle
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pytz
import seaborn as sns

In [2]:
%run src/data/helper.py

In [3]:
# Load the pickled station metadata and the cleaned readings, timing both loads.
# NOTE(review): pickle.load can execute arbitrary code from the file; only open
# pickles that were produced by this project's own parsing step.
start_time = time.time()

with open('data/parsed/stations_dataset_final.p', 'rb') as f:
    stations = pickle.load(f)

with open('data/parsed/readings_clean.p', "rb") as f:
    readings = pickle.load(f)

end_time = time.time()
# Elapsed wall-clock seconds for the two loads combined.
print 'Opening redistribution data took %s' % (end_time - start_time)

Opening redistribution data took 217.062959909


## Split Dataset

In [4]:
def split_training(df):
    """Return the training window of ``df``: 15 May 2016 through 12 June 2016.

    ``df`` must be indexed by timestamp so that it can be sliced by date.
    """
    return df[datetime(2016, 5, 15, 0, 0, 0, 0):datetime(2016, 6, 12, 23, 59, 59, 999999)]


def split_validation(df):
    """Return the validation window of ``df``: 13 June 2016 through 19 June 2016.

    ``df`` must be indexed by timestamp so that it can be sliced by date.
    """
    return df[datetime(2016, 6, 13, 0, 0, 0, 0):datetime(2016, 6, 19, 23, 59, 59, 999999)]


def split_test(df):
    """Return the test window of ``df``: 20 June 2016 through 26 June 2016.

    The window used to start on 20 May, which made the test set contain most
    of the training data and all of the validation data.  It now starts the
    day after the validation window ends.
    """
    return df[datetime(2016, 6, 20, 0, 0, 0, 0):datetime(2016, 6, 26, 23, 59, 59, 999999)]

In [5]:
def split_datasets(df, station_id):
    """Pick one station's rows out of ``df`` and cut them into the three windows.

    ``df`` is looked up with ``df.loc[station_id]``; the result is passed to
    ``split_training``, ``split_validation`` and ``split_test``.

    Returns a ``(training, validation, test)`` tuple.
    """
    per_station = df.loc[station_id]
    return (
        split_training(per_station),
        split_validation(per_station),
        split_test(per_station),
    )

## Model Definitions

In [6]:
import sys

def clip_and_round(arr):
    """Turn raw model predictions into valid bike counts.

    Negative values are raised to zero and every value is rounded to the
    nearest whole number (numpy rounds halves to the nearest even value).

    The upper bound is left open (``None``) instead of ``sys.maxint``: that
    constant only exists on Python 2 and served no purpose other than to
    satisfy ``np.clip``'s signature.

    Returns a numpy array with the same shape as ``arr``.
    """
    return np.round(np.clip(arr, 0, None))

In [7]:
# Module-level placeholder. Nothing visible in this notebook reads it;
# GAMRegressor keeps its own class-level ``last_data``.
last_data = None

In [21]:
import inspect

from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.metrics import mean_squared_error

from rpy2 import robjects
from rpy2.robjects import pandas2ri
from rpy2.robjects.packages import importr
from rpy2.robjects import IntVector, Formula

# Let rpy2 convert pandas objects when they are handed to R functions.
pandas2ri.activate()

# Handles to the embedded R session and to the R packages used by the
# estimators below (mgcv supplies gam; stats supplies lm and predict).
r = robjects.r
base = importr('base')
stats = importr('stats')
mgcv = importr('mgcv')

class GAMRegressor(BaseEstimator, RegressorMixin):
    """scikit-learn style wrapper around R's ``mgcv::gam``.

    The model formula is either passed verbatim as ``formula_str`` or built
    from ``features`` together with the per-feature keyword arguments.  Those
    keywords are named ``<feature>_<parameter>`` where ``<parameter>`` is one
    of ``sp``, ``k``, ``by`` or ``parametric``.

    Feature naming rules used when building the formula:

    * ``AAndB`` becomes the multivariate smooth ``s(A,B)``;
    * a ``_<suffix>`` on a smooth feature is dropped, so ``TimeOfDay_1`` and
      ``TimeOfDay_2`` both produce a smooth of ``TimeOfDay`` (typically with
      different ``by`` variables);
    * a feature flagged ``parametric`` enters the formula as a plain term.

    The response is always the ``NbBikes`` column, which must be present in
    the frame given to ``fit`` and ``score``.

    After ``fit`` the formula that was actually used is available as
    ``formula_`` and the fitted R model as ``gam``.
    """

    # Most recent training frame handed to ``fit`` on any instance.
    last_data = None

    def __init__(self, features=None, formula_str=None,
                 FogTMinus2_parametric=None, RainTMinus2_parametric=None,
                 DistNbBikes_parametric=None, CollNbBikes_parametric=None,
                 DistNbBikesCum6_parametric=None, CollNbBikesCum6_parametric=None,
                 TempTMinus2AndHumidityTMinus2_sp=None,
                 TempTMinus12AndHumidityTMinus12_sp=None,
                 NbBikesTMinus2_sp=None, NbBikesTMinus3_sp=None,
                 NbBikesTMinus12_sp=None, NbBikesTMinus18_sp=None, HistAvg_sp=None, HistAvg_k=None,
                 TimeOfDay_1_sp=None, TimeOfDay_2_sp=None, TimeOfDay_3_sp=None,
                 TimeOfDay_1_by=None, TimeOfDay_2_by=None, TimeOfDay_3_by=None):
        # Store every constructor argument as an attribute of the same name
        # (what scikit-learn's get_params/clone expect) without writing one
        # assignment per argument.
        args, _, _, values = inspect.getargvalues(inspect.currentframe())

        # Names of the per-feature tuning arguments, i.e. everything except
        # ``features`` and ``formula_str``.
        self.args_to_set = []
        for arg in args:
            if arg == 'self':
                continue
            if arg != 'features' and arg != 'formula_str':
                self.args_to_set.append(arg)
            setattr(self, arg, values[arg])

    def fit(self, X, y=None):
        """Fit the GAM on the data frame ``X`` (which includes ``NbBikes``).

        ``y`` is ignored; it exists for scikit-learn API compatibility.
        Returns ``self``.
        """
        # Resolve the formula into a local instead of overwriting the
        # ``formula_str`` constructor parameter: otherwise a later
        # ``set_params(...)`` followed by ``fit`` would silently keep using
        # the formula generated by the first fit.
        formula_str = self.formula_str
        if formula_str is None:
            formula_str = self.build_formula_str(self.build_features_dicts())
        self.formula_ = formula_str

        GAMRegressor.last_data = X

        self.gam = mgcv.gam(Formula(formula_str), data=X)

        return self

    def predict(self, X):
        """Predict ``NbBikes`` for the rows of ``X``, clipped at 0 and rounded."""
        # getattr so that an unfitted estimator trips the assertion instead of
        # raising AttributeError before the message can be shown.
        assert (getattr(self, 'gam', None) is not None), "GAM must be set"
        return clip_and_round(stats.predict(self.gam, newdata=X))

    def score(self, X, y=None):
        """Return the negative RMSE of the predictions for ``X``.

        The sign is flipped because scikit-learn model selection maximises the
        score.  The true values are taken from ``X.NbBikes`` unless ``y`` is
        given explicitly.
        """
        p_val = self.predict(X)
        y_val = X.NbBikes if y is None else y
        rmse = mean_squared_error(y_val, p_val)**0.5
        return rmse * (-1)

    def build_features_dicts(self):
        """Build one settings dict per feature from the constructor arguments.

        Returns a dict mapping feature name to a dict with the keys ``name``,
        ``bs``, ``sp``, ``by``, ``k`` and ``parametric``.

        Raises ``KeyError`` when a tuning argument is set for a feature that
        is not listed in ``features``.
        """
        assert (self.features is not None), "features must be set"

        # defaults: thin-plate smooth with nothing else specified
        features_dicts = {}
        for feature in self.features:
            features_dicts[feature] = {
                'name': feature,
                'bs': 'tp',
                'sp': None,
                'by': None,
                'k': None,
                'parametric': False
            }

        # overlay the tuning arguments that were actually supplied
        for arg in self.args_to_set:
            val = getattr(self, arg)
            if val is None:
                continue
            # 'TimeOfDay_1_sp' -> ('TimeOfDay_1', 'sp')
            feature, parameter = arg.rsplit('_', 1)
            if feature not in features_dicts:
                raise KeyError("%s is set but %r is not listed in features" % (arg, feature))
            features_dicts[feature][parameter] = val

        return features_dicts

    def build_formula_str(self, features_dicts):
        """Render ``features_dicts`` as an R formula string for ``NbBikes``.

        Terms are joined with ``+`` in the iteration order of
        ``features_dicts``.
        """
        terms = []
        for feature_dict in features_dicts.values():
            if feature_dict['parametric']:
                terms.append('%(name)s' % feature_dict)
                continue

            # drop any '_<suffix>' and expand 'AAndB' into 'A,B'
            variables = feature_dict['name'].split('_')[0].replace('And', ',')
            term = "s(%s" % variables

            if feature_dict['bs'] is not None:
                term += ", bs='%s'" % feature_dict['bs']
            if feature_dict['sp'] is not None:
                # repr keeps full precision; '%f' would turn e.g. 1e-7 into 0.000000
                term += ", sp=%s" % repr(float(feature_dict['sp']))
            if feature_dict['by'] is not None:
                term += ", by=%s" % feature_dict['by']
            if feature_dict['k'] is not None:
                term += ", k=%s" % feature_dict['k']

            terms.append(term + ")")
        return 'NbBikes ~ ' + '+'.join(terms)
    
class LRegressor(BaseEstimator, RegressorMixin):
    """scikit-learn style wrapper around R's ``stats::lm``.

    ``formula_str`` is the R formula to fit.  ``score`` reads the true values
    from the ``NbBikes`` column of the frame it is given.
    """

    def __init__(self, formula_str):
        # This line was indented with a tab inside a space-indented class,
        # which Python 3 rejects; it now uses spaces like the rest.
        self.formula_str = formula_str

    def fit(self, X, y=None):
        """Fit the linear model on the data frame ``X``; ``y`` is ignored.

        Returns ``self``.
        """
        self.lr = stats.lm(Formula(self.formula_str), data=X)
        return self

    def predict(self, X):
        """Predict for the rows of ``X``, clipped at 0 and rounded."""
        # getattr so that an unfitted estimator trips the assertion instead of
        # raising AttributeError before the message can be shown.
        assert (getattr(self, 'lr', None) is not None), "LR must be set"
        return clip_and_round(stats.predict(self.lr, newdata=X))

    def score(self, X, y=None):
        """Return the negative RMSE of the predictions for ``X``.

        The true values are taken from ``X.NbBikes`` unless ``y`` is given.
        """
        p_val = self.predict(X)
        y_val = X.NbBikes if y is None else y
        rmse = mean_squared_error(y_val, p_val)**0.5
        return rmse * (-1)

In [9]:
def fit_and_predict_gam(training, validation, formula):
    """Fit a GAM with the given formula on ``training`` and predict ``validation``.

    Returns ``(fitted_estimator, predictions)``; the predictions are passed
    through ``clip_and_round``.
    """
    fitted = GAMRegressor(formula_str=formula)
    fitted.fit(training)
    predictions = clip_and_round(fitted.predict(validation))
    return fitted, predictions

In [10]:
def model(df, station_ids, gam_formula_reg, gam_formula_unreg, pred_col):
    """Compare a regularized and an unregularized GAM station by station.

    For every id in ``station_ids`` both formulas are fitted on that station's
    training window and evaluated on its validation window.

    Parameters
    ----------
    df : frame whose first index level is the station id
    station_ids : iterable of station ids to evaluate
    gam_formula_reg, gam_formula_unreg : R formula strings
    pred_col : name of the column holding the true values

    Returns
    -------
    list of dicts with the keys ``'Id'``, ``'GAM-R-ERR'`` and ``'GAM-UR-ERR'``
    (validation RMSE).  An error value is NaN when the corresponding fit
    failed; the failure is logged with its traceback.
    """
    results = []

    for station_id in station_ids:
        print('Fitting %s' % station_id)

        training, validation, _ = split_datasets(df, station_id)
        y_val = validation[pred_col]

        # Reset for every station: previously a failed fit either raised
        # NameError below or silently reported the previous station's errors.
        gam_r_rmse = gam_ur_rmse = float('nan')

        try:
            _, gam_r_pred = fit_and_predict_gam(training, validation, gam_formula_reg)
            gam_r_rmse = mean_squared_error(y_val, gam_r_pred)**0.5

            _, gam_ur_pred = fit_and_predict_gam(training, validation, gam_formula_unreg)
            gam_ur_rmse = mean_squared_error(y_val, gam_ur_pred)**0.5
        except Exception:
            # Best effort: keep going with the remaining stations.
            logging.error('Fitting %s failed:\n%s', station_id, traceback.format_exc())

        results.append({'Id': station_id, 'GAM-R-ERR': gam_r_rmse, 'GAM-UR-ERR': gam_ur_rmse})

    return results

In [11]:
def convert_results_to_df(results, name):
    """Combine the results of several experiments into one wide frame.

    ``results`` is a list of experiments, each a list of per-station dicts
    containing an ``'Id'`` key.  Every experiment becomes a frame indexed by
    ``Id`` whose columns are renamed to ``<name>-EXP<i>-<column>``; the frames
    are then concatenated side by side.
    """
    frames = [pd.DataFrame(experiment).set_index('Id') for experiment in results]
    exp_no = 0
    for frame in frames:
        frame.columns = ['%s-EXP%d-%s' % (name, exp_no, col) for col in frame.columns]
        exp_no += 1
    return pd.concat(frames, axis=1)

## Use Samples?

In [12]:
# True: run on the station sample stored on disk.
# False: run on every station found in the first index level of ``readings``.
use_samples = True

if use_samples:
    #stations_to_use = np.random.choice(readings.index.get_level_values(0).unique(), 100).tolist()
    # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
    with open("data/parsed/stations_sample.p", "rb") as f:
        stations_to_use = pickle.load(f)
else:
    stations_to_use = readings.index.get_level_values(0).unique().tolist()
    
# The hyper-parameter searches below are run on this single station only.
station_id=stations_to_use[0]

# Regularization Short Term

## Broad Search

In [33]:
# choose the columns to use in the model
redistribution_cols = ['CollNbBikes', 'DistNbBikes']
boolean_cols_short = ['Weekday', 'Weekend', 'Holiday', 'RainTMinus2', 'FogTMinus2']
numeric_cols_short = ['HumidityTMinus2', 'TempTMinus2', 'TimeOfDay',
                      'NbBikesTMinus2', 'NbBikesTMinus3']
pred_col_short = 'NbBikes'

feature_cols_short = numeric_cols_short + boolean_cols_short + redistribution_cols
cols_short = [pred_col_short] + feature_cols_short

# Select the chosen columns for the chosen stations and drop incomplete rows.
# dropna() returns a new frame, so the selection is never mutated in place
# (avoids SettingWithCopyWarning and keeps the cell safe to re-run).
readings_short = readings.loc[stations_to_use][cols_short].dropna()

In [None]:
from sklearn.grid_search import GridSearchCV

# Broad search on one station: training and validation windows are pooled and
# GridSearchCV re-splits them into its own folds.
training, validation, test = split_datasets(readings_short, station_id)
search_dataset = pd.concat([training, validation])

# features of the model
features = ['TempTMinus2AndHumidityTMinus2', 'TimeOfDay_1', 'TimeOfDay_2', 'TimeOfDay_3', 
            'NbBikesTMinus2', 'NbBikesTMinus3', 'RainTMinus2', 'FogTMinus2', 'CollNbBikes', 'DistNbBikes']

# parameters to tune via cross validation
# Six smoothing parameters with five candidate values each: 5**6 = 15625
# combinations, each fitted once per fold.
parameters = [{'TimeOfDay_1_by': ['Weekday'], 'TimeOfDay_2_by': ['Weekend'], 'TimeOfDay_3_by': ['Holiday'], 
               'FogTMinus2_parametric': [True], 'RainTMinus2_parametric': [True], 
               'CollNbBikes_parametric': [True], 'DistNbBikes_parametric': [True], 
               'NbBikesTMinus2_sp': [0.2,0.6,1,5,10],
               'NbBikesTMinus3_sp': [0.2,0.6,1,5,10],
               'TimeOfDay_1_sp': [0.2,0.6,1,5,10],
               'TimeOfDay_2_sp': [0.2,0.6,1,5,10],
               'TimeOfDay_3_sp': [0.2,0.6,1,5,10],
               'TempTMinus2AndHumidityTMinus2_sp': [0.2,0.6,1,5,10],
               'features': [features]}]

# tuning hyper parameters 
# No scoring argument is given, so candidates are ranked by
# GAMRegressor.score (negative RMSE). fit() gets no y because the response
# column is part of the frame.
clf = GridSearchCV(GAMRegressor(), parameters, cv=3)
clf.fit(search_dataset)

In [None]:
# Show the parameter combination with the best mean cross-validation score.
print 'Best parameters set found on dev set:'
print clf.best_params_

In [None]:
# One line per candidate: mean score, +/- two standard deviations of the
# per-fold scores, and the parameters that produced them.
print 'Grid scores on development set:'
for params, mean_score, scores in clf.grid_scores_:
    print '%0.3f (+/-%0.03f) for %r' % (mean_score, scores.std() * 2, params)

**The broad search experiment gave the following regularization parameters**

{'TempTMinus2AndHumidityTMinus2_sp': 10, 'FogTMinus2_parametric': True, 'TimeOfDay_2_sp': 10, 'features': ['TempTMinus2AndHumidityTMinus2', 'TimeOfDay_1', 'TimeOfDay_2', 'TimeOfDay_3', 'NbBikesTMinus2', 'NbBikesTMinus3', 'RainTMinus2', 'FogTMinus2', 'CollNbBikes', 'DistNbBikes'], 'NbBikesTMinus3_sp': 10, 'DistNbBikes_parametric': True, 'TimeOfDay_3_sp': 0.2, 'TimeOfDay_1_sp': 0.6, 'CollNbBikes_parametric': True, 'TimeOfDay_3_by': 'Holiday', 'TimeOfDay_1_by': 'Weekday', 'RainTMinus2_parametric': True, 'TimeOfDay_2_by': 'Weekend', 'NbBikesTMinus2_sp': 5}

These values were then fine-tuned to obtain the regularized formula used in the comparison below.

## Comparison

In [34]:
# Short-term formula with explicit smoothing parameters, one term per line.
# CollNbBikes and DistNbBikes are selected into readings_short but are not
# part of this formula.
gam_formula_regularized = (
    "NbBikes ~ s(TempTMinus2, HumidityTMinus2, bs='tp', sp=30.0) "
    "+ s(TimeOfDay, by=Weekday, bs='tp', sp=1.1) "
    "+ s(TimeOfDay, by=Weekend, bs='tp', sp=50.0) "
    "+ s(TimeOfDay, by=Holiday, bs='tp', sp=0.2) "
    "+ s(NbBikesTMinus2, bs='tp', sp=8.0) "
    "+ s(NbBikesTMinus3, bs='tp', sp=11.0) "
    "+ RainTMinus2 + FogTMinus2 "
)

In [35]:
# Same short-term terms without any sp argument.
gam_formula_unregularized = (
    "NbBikes ~ s(TempTMinus2, HumidityTMinus2, bs='tp') "
    "+ s(TimeOfDay, by=Weekday, bs='tp') "
    "+ s(TimeOfDay, by=Weekend, bs='tp') "
    "+ s(TimeOfDay, by=Holiday, bs='tp') "
    "+ s(NbBikesTMinus2, bs='tp') "
    "+ s(NbBikesTMinus3, bs='tp') "
    "+ RainTMinus2 + FogTMinus2 "
)

In [36]:
# Fit both short-term formulas for every selected station. The result is
# wrapped in a list because convert_results_to_df expects a list of experiments.
short_comparison = [model(readings_short, stations_to_use, gam_formula_regularized, gam_formula_unregularized, 'NbBikes')]

Fitting BikePoints_161
Fitting BikePoints_286
Fitting BikePoints_595
Fitting BikePoints_159
Fitting BikePoints_723
Fitting BikePoints_798
Fitting BikePoints_305
Fitting BikePoints_741
Fitting BikePoints_139
Fitting BikePoints_223
Fitting BikePoints_716
Fitting BikePoints_642
Fitting BikePoints_268
Fitting BikePoints_661
Fitting BikePoints_547
Fitting BikePoints_188
Fitting BikePoints_521
Fitting BikePoints_468
Fitting BikePoints_311
Fitting BikePoints_519
Fitting BikePoints_536
Fitting BikePoints_367
Fitting BikePoints_137
Fitting BikePoints_387
Fitting BikePoints_615
Fitting BikePoints_756
Fitting BikePoints_121
Fitting BikePoints_516
Fitting BikePoints_705
Fitting BikePoints_666
Fitting BikePoints_105
Fitting BikePoints_770
Fitting BikePoints_665
Fitting BikePoints_782
Fitting BikePoints_584
Fitting BikePoints_425
Fitting BikePoints_276
Fitting BikePoints_520
Fitting BikePoints_386
Fitting BikePoints_151
Fitting BikePoints_559
Fitting BikePoints_38
Fitting BikePoints_690
Fitting Bike

In [37]:
# Per-station validation RMSE as a frame, then the mean over stations.
short_df = convert_results_to_df(short_comparison, 'SHORT')
short_df[['SHORT-EXP0-GAM-R-ERR', 'SHORT-EXP0-GAM-UR-ERR']].mean()

SHORT-EXP0-GAM-R-ERR     0.928878
SHORT-EXP0-GAM-UR-ERR    0.929181
dtype: float64

# Regularization Mid Term

## Broad Search

In [38]:
# choose the columns to use in the model
boolean_cols_mid = ['Weekday', 'Weekend', 'Holiday']
numeric_cols_mid = ['HumidityTMinus12', 'TempTMinus12', 'TimeOfDay', 'NbBikesTMinus12', 'NbBikesTMinus18']
pred_col_mid = 'NbBikes'

feature_cols_mid = numeric_cols_mid + boolean_cols_mid
cols_mid = [pred_col_mid] + feature_cols_mid

# Select the chosen columns for the chosen stations and drop incomplete rows.
# dropna() returns a new frame, so the selection is never mutated in place
# (avoids SettingWithCopyWarning and keeps the cell safe to re-run).
readings_mid = readings.loc[stations_to_use][cols_mid].dropna()

In [13]:
from sklearn.grid_search import GridSearchCV

# Broad search on one station: training and validation windows are pooled and
# GridSearchCV re-splits them into its own folds.
training, validation, test = split_datasets(readings_mid, station_id)
search_dataset = pd.concat([training, validation])

# features of the model
features = ['TempTMinus12AndHumidityTMinus12', 'TimeOfDay_1', 'TimeOfDay_2', 'TimeOfDay_3', 
            'NbBikesTMinus12', 'NbBikesTMinus18']

# parameters to tune via cross validation
# Six smoothing parameters with five candidate values each: 5**6 = 15625
# combinations, each fitted once per fold.
parameters = [{'TimeOfDay_1_by': ['Weekday'], 'TimeOfDay_2_by': ['Weekend'], 'TimeOfDay_3_by': ['Holiday'],                
               'NbBikesTMinus12_sp': [0.2,0.6,1,5,10],
               'NbBikesTMinus18_sp': [0.2,0.6,1,5,10],
               'TimeOfDay_1_sp': [0.2,0.6,1,5,10],
               'TimeOfDay_2_sp': [0.2,0.6,1,5,10],
               'TimeOfDay_3_sp': [0.2,0.6,1,5,10],
               'TempTMinus12AndHumidityTMinus12_sp': [0.2,0.6,1,5,10],
               'features': [features]}]

# tuning hyper parameters 
# No scoring argument is given, so candidates are ranked by
# GAMRegressor.score (negative RMSE).
clf = GridSearchCV(GAMRegressor(), parameters, cv=2)
clf.fit(search_dataset)

GridSearchCV(cv=2, error_score='raise',
       estimator=GAMRegressor(CollNbBikes_parametric=None, DistNbBikes_parametric=None,
       FogTMinus2_parametric=None, NbBikesTMinus12_sp=None,
       NbBikesTMinus18_sp=None, NbBikesTMinus2_sp=None,
       NbBikesTMinus3_sp=None, RainTMinus2_parametric=None,
       TempTMinus12AndHumidityTMinus12_sp=N..._2_sp=None,
       TimeOfDay_3_by=None, TimeOfDay_3_sp=None, features=None,
       formula_str=None),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'TimeOfDay_2_sp': [0.2, 0.6, 1, 5, 10], 'TimeOfDay_3_by': ['Holiday'], 'NbBikesTMinus18_sp': [0.2, 0.6, 1, 5, 10], 'TimeOfDay_1_by': ['Weekday'], 'NbBikesTMinus12_sp': [0.2, 0.6, 1, 5, 10], 'TempTMinus12AndHumidityTMinus12_sp': [0.2, 0.6, 1, 5, 10], 'TimeOfDay_3_sp': [0.2, 0.6, 1, 5, 10...dityTMinus12', 'TimeOfDay_1', 'TimeOfDay_2', 'TimeOfDay_3', 'NbBikesTMinus12', 'NbBikesTMinus18']]}],
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [14]:
# Show the parameter combination with the best mean cross-validation score.
print 'Best parameters set found on dev set:'
print clf.best_params_

Best parameters set found on dev set:
{'TimeOfDay_2_sp': 0.2, 'NbBikesTMinus12_sp': 10, 'TimeOfDay_3_sp': 1, 'TimeOfDay_1_sp': 10, 'TimeOfDay_3_by': 'Holiday', 'NbBikesTMinus18_sp': 10, 'TimeOfDay_1_by': 'Weekday', 'TempTMinus12AndHumidityTMinus12_sp': 10, 'TimeOfDay_2_by': 'Weekend', 'features': ['TempTMinus12AndHumidityTMinus12', 'TimeOfDay_1', 'TimeOfDay_2', 'TimeOfDay_3', 'NbBikesTMinus12', 'NbBikesTMinus18']}


In [15]:
# One line per candidate: mean score, +/- two standard deviations of the
# per-fold scores, and the parameters that produced them.
print 'Grid scores on development set:'
for params, mean_score, scores in clf.grid_scores_:
    print '%0.3f (+/-%0.03f) for %r' % (mean_score, scores.std() * 2, params)

Grid scores on development set:
-1.843 (+/-0.279) for {'TimeOfDay_2_sp': 0.2, 'NbBikesTMinus12_sp': 0.2, 'TimeOfDay_3_sp': 0.2, 'TimeOfDay_1_sp': 0.2, 'TimeOfDay_3_by': 'Holiday', 'NbBikesTMinus18_sp': 0.2, 'TimeOfDay_1_by': 'Weekday', 'TempTMinus12AndHumidityTMinus12_sp': 0.2, 'TimeOfDay_2_by': 'Weekend', 'features': ['TempTMinus12AndHumidityTMinus12', 'TimeOfDay_1', 'TimeOfDay_2', 'TimeOfDay_3', 'NbBikesTMinus12', 'NbBikesTMinus18']}
-1.844 (+/-0.282) for {'TimeOfDay_2_sp': 0.2, 'NbBikesTMinus12_sp': 0.2, 'TimeOfDay_3_sp': 0.6, 'TimeOfDay_1_sp': 0.2, 'TimeOfDay_3_by': 'Holiday', 'NbBikesTMinus18_sp': 0.2, 'TimeOfDay_1_by': 'Weekday', 'TempTMinus12AndHumidityTMinus12_sp': 0.2, 'TimeOfDay_2_by': 'Weekend', 'features': ['TempTMinus12AndHumidityTMinus12', 'TimeOfDay_1', 'TimeOfDay_2', 'TimeOfDay_3', 'NbBikesTMinus12', 'NbBikesTMinus18']}
-1.844 (+/-0.282) for {'TimeOfDay_2_sp': 0.2, 'NbBikesTMinus12_sp': 0.2, 'TimeOfDay_3_sp': 1, 'TimeOfDay_1_sp': 0.2, 'TimeOfDay_3_by': 'Holiday', 'NbBik

## Narrow Search

In [40]:
from sklearn.grid_search import GridSearchCV

# Narrow search around the best values found by the broad mid-term search.
training, validation, test = split_datasets(readings_mid, station_id)
search_dataset = pd.concat([training, validation])

# features of the model
features = ['TempTMinus12AndHumidityTMinus12', 'TimeOfDay_1', 'TimeOfDay_2', 'TimeOfDay_3', 
            'NbBikesTMinus12', 'NbBikesTMinus18']

# parameters to tune via cross validation
# NOTE(review): the saved output of this cell lists single-value grids
# (e.g. TimeOfDay_1_sp = [35.0]) that do not appear in the lists below, so the
# output was produced by a different version of this cell. Re-run to refresh.
parameters = [{'TimeOfDay_1_by': ['Weekday'], 'TimeOfDay_2_by': ['Weekend'], 'TimeOfDay_3_by': ['Holiday'],                
               'NbBikesTMinus12_sp': [8.0, 7],
               'NbBikesTMinus18_sp': [9.0, 10.0],
               'TimeOfDay_1_sp': [15.0, 30.0],
               'TimeOfDay_2_sp': [0.4, 0.5],
               'TimeOfDay_3_sp': [3.0, 2.0],
               'TempTMinus12AndHumidityTMinus12_sp': [10.0, 15.0],
               'features': [features]}]

# tuning hyper parameters 
clf = GridSearchCV(GAMRegressor(), parameters, cv=2)
clf.fit(search_dataset)

GridSearchCV(cv=2, error_score='raise',
       estimator=GAMRegressor(CollNbBikes_parametric=None, DistNbBikes_parametric=None,
       FogTMinus2_parametric=None, NbBikesTMinus12_sp=None,
       NbBikesTMinus18_sp=None, NbBikesTMinus2_sp=None,
       NbBikesTMinus3_sp=None, RainTMinus2_parametric=None,
       TempTMinus12AndHumidityTMinus12_sp=N..._2_sp=None,
       TimeOfDay_3_by=None, TimeOfDay_3_sp=None, features=None,
       formula_str=None),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'TimeOfDay_2_sp': [0.4], 'TimeOfDay_3_by': ['Holiday'], 'NbBikesTMinus18_sp': [10.0], 'TimeOfDay_1_by': ['Weekday'], 'NbBikesTMinus12_sp': [7.0], 'TempTMinus12AndHumidityTMinus12_sp': [10.0], 'TimeOfDay_3_sp': [2.0], 'TimeOfDay_2_by': ['Weekend'], 'TimeOfDay_1_sp': [35.0], 'features': [['TempTMinus12AndHumidityTMinus12', 'TimeOfDay_1', 'TimeOfDay_2', 'TimeOfDay_3', 'NbBikesTMinus12', 'NbBikesTMinus18']]}],
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [41]:
# Show the parameter combination with the best mean cross-validation score.
print 'Best parameters set found on dev set:'
print clf.best_params_

Best parameters set found on dev set:
{'TimeOfDay_2_sp': 0.4, 'NbBikesTMinus12_sp': 7.0, 'TimeOfDay_3_sp': 2.0, 'TimeOfDay_1_sp': 35.0, 'TimeOfDay_3_by': 'Holiday', 'NbBikesTMinus18_sp': 10.0, 'TimeOfDay_1_by': 'Weekday', 'TempTMinus12AndHumidityTMinus12_sp': 10.0, 'TimeOfDay_2_by': 'Weekend', 'features': ['TempTMinus12AndHumidityTMinus12', 'TimeOfDay_1', 'TimeOfDay_2', 'TimeOfDay_3', 'NbBikesTMinus12', 'NbBikesTMinus18']}


## Comparison

In [39]:
# Mid-term formula with explicit smoothing parameters, one term per line.
gam_formula_regularized = (
    "NbBikes ~ s(TempTMinus12, HumidityTMinus12, bs='tp', sp=10.0) "
    "+ s(TimeOfDay, by=Weekday, bs='tp', sp=35.0) "
    "+ s(TimeOfDay, by=Weekend, bs='tp', sp=0.4) "
    "+ s(TimeOfDay, by=Holiday, bs='tp', sp=2.0) "
    "+ s(NbBikesTMinus12, bs='tp', sp=7.0) "
    "+ s(NbBikesTMinus18, bs='tp', sp=10.0) "
)

In [40]:
# Same mid-term terms without any sp argument.
gam_formula_unregularized = (
    "NbBikes ~ s(TempTMinus12, HumidityTMinus12, bs='tp') "
    "+ s(TimeOfDay, by=Weekday, bs='tp') "
    "+ s(TimeOfDay, by=Weekend, bs='tp') "
    "+ s(TimeOfDay, by=Holiday, bs='tp') "
    "+ s(NbBikesTMinus12, bs='tp') "
    "+ s(NbBikesTMinus18, bs='tp') "
)

In [41]:
# Fit both mid-term formulas for every selected station. The result is
# wrapped in a list because convert_results_to_df expects a list of experiments.
mid_comparison = [model(readings_mid, stations_to_use, gam_formula_regularized, gam_formula_unregularized, 'NbBikes')]

Fitting BikePoints_161
Fitting BikePoints_286
Fitting BikePoints_595
Fitting BikePoints_159
Fitting BikePoints_723
Fitting BikePoints_798
Fitting BikePoints_305
Fitting BikePoints_741
Fitting BikePoints_139
Fitting BikePoints_223
Fitting BikePoints_716
Fitting BikePoints_642
Fitting BikePoints_268
Fitting BikePoints_661
Fitting BikePoints_547
Fitting BikePoints_188
Fitting BikePoints_521
Fitting BikePoints_468
Fitting BikePoints_311
Fitting BikePoints_519
Fitting BikePoints_536
Fitting BikePoints_367
Fitting BikePoints_137
Fitting BikePoints_387
Fitting BikePoints_615
Fitting BikePoints_756
Fitting BikePoints_121
Fitting BikePoints_516
Fitting BikePoints_705
Fitting BikePoints_666
Fitting BikePoints_105
Fitting BikePoints_770
Fitting BikePoints_665
Fitting BikePoints_782
Fitting BikePoints_584
Fitting BikePoints_425
Fitting BikePoints_276
Fitting BikePoints_520
Fitting BikePoints_386
Fitting BikePoints_151
Fitting BikePoints_559
Fitting BikePoints_38
Fitting BikePoints_690
Fitting Bike

In [42]:
# Per-station validation RMSE as a frame, then the mean over stations.
mid_df = convert_results_to_df(mid_comparison, 'MID')
mid_df[['MID-EXP0-GAM-R-ERR', 'MID-EXP0-GAM-UR-ERR']].mean()

MID-EXP0-GAM-R-ERR     2.318578
MID-EXP0-GAM-UR-ERR    2.289720
dtype: float64

# Regularization Long Term

In [43]:
# choose the columns to use in the model
redistribution_cols = ['CollNbBikesCum6', 'DistNbBikesCum6']
boolean_cols_long = ['Weekday', 'Weekend', 'Holiday']
numeric_cols_long = ['TimeOfDay', 'HistAvg']
pred_col_long = 'NbBikes'

feature_cols_long = numeric_cols_long + boolean_cols_long + redistribution_cols
cols_long = [pred_col_long] + feature_cols_long

# Select the chosen columns for the chosen stations and drop incomplete rows.
# dropna() returns a new frame, so the selection is never mutated in place
# (avoids SettingWithCopyWarning and keeps the cell safe to re-run).
readings_long = readings.loc[stations_to_use][cols_long].dropna()

In [26]:
from sklearn.grid_search import GridSearchCV

# Search on one station: training and validation windows are pooled and
# GridSearchCV re-splits them into its own folds.
training, validation, test = split_datasets(readings_long, station_id)
search_dataset = pd.concat([training, validation])

# features of the model ('HistAvg' used to be listed twice; once is enough)
features = ['HistAvg', 'TimeOfDay_1', 'TimeOfDay_2', 'TimeOfDay_3',
            'CollNbBikesCum6', 'DistNbBikesCum6']

# parameters to tune via cross validation
# 2 * 2 * 2 * 3 = 24 combinations, each fitted once per fold.
parameters = [{'TimeOfDay_1_by': ['Weekday'], 'TimeOfDay_2_by': ['Weekend'], 'TimeOfDay_3_by': ['Holiday'],
               'CollNbBikesCum6_parametric': [True], 'DistNbBikesCum6_parametric': [True],
               'TimeOfDay_1_sp': [30.0, 35.0],
               'TimeOfDay_2_sp': [0.5, 0.7],
               'TimeOfDay_3_sp': [3.0, 4.0],
               'HistAvg_sp': [0.4, 0.5, 0.6],
               'HistAvg_k': [8],
               'features': [features]}]

# tuning hyper parameters
# No scoring argument is given, so candidates are ranked by
# GAMRegressor.score (negative RMSE).
clf = GridSearchCV(GAMRegressor(), parameters, cv=2)
clf.fit(search_dataset)

GridSearchCV(cv=2, error_score='raise',
       estimator=GAMRegressor(CollNbBikesCum6_parametric=None, CollNbBikes_parametric=None,
       DistNbBikesCum6_parametric=None, DistNbBikes_parametric=None,
       FogTMinus2_parametric=None, HistAvg_k=None, HistAvg_sp=None,
       NbBikesTMinus12_sp=None, NbBikesTMinus18_sp=None,
       NbBikesTMinus2..._2_sp=None,
       TimeOfDay_3_by=None, TimeOfDay_3_sp=None, features=None,
       formula_str=None),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'TimeOfDay_2_sp': [0.5, 0.7], 'HistAvg_sp': [0.4, 0.5, 0.6], 'features': [['HistAvg', 'TimeOfDay_1', 'TimeOfDay_2', 'TimeOfDay_3', 'CollNbBikesCum6', 'DistNbBikesCum6']], 'TimeOfDay_3_sp': [3.0, 4.0], 'CollNbBikesCum6_parametric': [True], 'TimeOfDay_3_by': ['Holiday'], 'TimeOfDay_1_by': ['Weekday'], 'DistNbBikesCum6_parametric': [True], 'TimeOfDay_1_sp': [30.0, 35.0], 'HistAvg_k': [8], 'TimeOfDay_2_by': ['Weekend']}],
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbos

In [27]:
# Show the parameter combination with the best mean cross-validation score.
print 'Best parameters set found on dev set:'
print clf.best_params_

Best parameters set found on dev set:
{'DistNbBikesCum6_parametric': True, 'TimeOfDay_2_sp': 0.7, 'HistAvg_sp': 0.4, 'features': ['HistAvg', 'TimeOfDay_1', 'TimeOfDay_2', 'TimeOfDay_3', 'CollNbBikesCum6', 'DistNbBikesCum6'], 'TimeOfDay_3_sp': 3.0, 'TimeOfDay_1_sp': 35.0, 'CollNbBikesCum6_parametric': True, 'TimeOfDay_3_by': 'Holiday', 'TimeOfDay_1_by': 'Weekday', 'HistAvg_k': 8, 'TimeOfDay_2_by': 'Weekend'}


## Comparison

In [44]:
# Long-term formula with explicit smoothing parameters, one term per line.
# NOTE(review): the search output above reports HistAvg_sp = 0.4 and
# HistAvg_k = 8, while this formula uses sp=8.0 and no k for HistAvg.
# Confirm whether "sp=0.4, k=8" was intended.
gam_formula_regularized = (
    "NbBikes ~ s(TimeOfDay, by=Weekday, bs='tp', sp=35.0) "
    "+ s(TimeOfDay, by=Weekend, bs='tp', sp=0.7) "
    "+ s(TimeOfDay, by=Holiday, bs='tp', sp=3.0) "
    "+ s(HistAvg, bs='tp', sp=8.0) "
    "+ CollNbBikesCum6 + DistNbBikesCum6 "
)

In [45]:
# Same long-term terms without any sp argument.
gam_formula_unregularized = (
    "NbBikes ~ s(TimeOfDay, by=Weekday, bs='tp') "
    "+ s(TimeOfDay, by=Weekend, bs='tp') "
    "+ s(TimeOfDay, by=Holiday, bs='tp') "
    "+ s(HistAvg, bs='tp') "
    "+ CollNbBikesCum6 + DistNbBikesCum6 "
)

In [46]:
# Fit both long-term formulas for every selected station. The result is
# wrapped in a list because convert_results_to_df expects a list of experiments.
long_comparison = [model(readings_long, stations_to_use, gam_formula_regularized, gam_formula_unregularized, 'NbBikes')]

Fitting BikePoints_161
Fitting BikePoints_286
Fitting BikePoints_595
Fitting BikePoints_159
Fitting BikePoints_723
Fitting BikePoints_798
Fitting BikePoints_305
Fitting BikePoints_741
Fitting BikePoints_139
Fitting BikePoints_223
Fitting BikePoints_716
Fitting BikePoints_642
Fitting BikePoints_268
Fitting BikePoints_661
Fitting BikePoints_547
Fitting BikePoints_188
Fitting BikePoints_521
Fitting BikePoints_468
Fitting BikePoints_311
Fitting BikePoints_519
Fitting BikePoints_536
Fitting BikePoints_367
Fitting BikePoints_137
Fitting BikePoints_387
Fitting BikePoints_615
Fitting BikePoints_756
Fitting BikePoints_121
Fitting BikePoints_516
Fitting BikePoints_705
Fitting BikePoints_666
Fitting BikePoints_105
Fitting BikePoints_770
Fitting BikePoints_665
Fitting BikePoints_782
Fitting BikePoints_584
Fitting BikePoints_425
Fitting BikePoints_276
Fitting BikePoints_520
Fitting BikePoints_386
Fitting BikePoints_151
Fitting BikePoints_559
Fitting BikePoints_38
Fitting BikePoints_690
Fitting Bike

In [47]:
# Per-station validation RMSE as a frame, then the mean over stations.
long_df = convert_results_to_df(long_comparison, 'LONG')
long_df[['LONG-EXP0-GAM-R-ERR', 'LONG-EXP0-GAM-UR-ERR']].mean()

LONG-EXP0-GAM-R-ERR     6.049616
LONG-EXP0-GAM-UR-ERR    6.055086
dtype: float64

In [59]:
# Join the three per-station result frames on the station id, keeping only
# stations present in all of them.
regularization_results = short_df.merge(mid_df, how='inner', right_index=True, left_index=True)
regularization_results = regularization_results.merge(long_df, how='inner', right_index=True, left_index=True)
# The rename is positional: it relies on each frame holding the regularized
# column before the unregularized one. Despite the AVGRMSE suffix, every row
# is a single station's RMSE; averaging happens in the next cell.
regularization_results.columns = ['SHORT-REG-AVGRMSE', 'SHORT-UNREG-AVGRMSE', 'MID-REG-AVGRMSE', 'MID-UNREG-AVGRMSE', 'LONG-REG-AVGRMSE', 'LONG-UNREG-AVGRMSE']
regularization_results

Unnamed: 0_level_0,SHORT-REG-AVGRMSE,SHORT-UNREG-AVGRMSE,MID-REG-AVGRMSE,MID-UNREG-AVGRMSE,LONG-REG-AVGRMSE,LONG-UNREG-AVGRMSE
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
BikePoints_161,0.704999,0.705702,1.754529,1.793120,4.193249,4.144824
BikePoints_286,1.057147,1.069277,2.854525,2.703613,7.301575,7.298075
BikePoints_595,1.010854,0.999752,2.148043,2.065519,5.386592,5.387421
BikePoints_159,1.497187,1.470612,4.394147,3.932403,6.892096,6.917133
BikePoints_723,0.925820,0.910695,2.644532,2.432828,14.236401,14.195873
BikePoints_798,1.036535,1.037253,2.552038,2.677156,6.770984,6.894147
BikePoints_305,0.697572,0.707457,1.599696,1.584586,2.869171,2.866404
BikePoints_741,0.421402,0.420813,1.033420,1.106439,4.606381,4.625938
BikePoints_139,0.782370,0.799615,1.889166,1.945946,4.357932,4.353034
BikePoints_223,1.077825,1.067652,2.789578,2.463322,3.499362,3.501063


In [99]:
# For every column take the mean and std over stations and format them as
# "mean (std)" in a one-column frame named AVG_RMSE.
mean_results = regularization_results.describe().loc[['mean', 'std']].unstack().groupby(level=0).aggregate(lambda x: tuple(x)).apply(lambda x: '%.6f (%.3f)' % (x[0], x[1])).rename('AVG_RMSE').to_frame()
# a: regularized rows, b: unregularized rows, both in short / mid / long order.
a = mean_results.loc[['SHORT-REG-AVGRMSE', 'MID-REG-AVGRMSE', 'LONG-REG-AVGRMSE']]
b = mean_results.loc[['SHORT-UNREG-AVGRMSE', 'MID-UNREG-AVGRMSE', 'LONG-UNREG-AVGRMSE']]

In [100]:
# Side-by-side table of "mean (std)" RMSE per forecasting horizon.
mean_results = pd.concat([b.AVG_RMSE.rename('Un-regularized').reset_index(drop=True), a.AVG_RMSE.rename('Regularized').reset_index(drop=True)], axis=1)
mean_results.index = ['Short-Term','Mid-Term', 'Long-Term']

# Difference = un-regularized mean minus regularized mean, computed from the
# data instead of being typed in by hand so that it cannot go stale. Both
# means are rounded to the six decimals shown in the table before subtracting.
horizon_means = regularization_results.mean()
mean_results['Difference'] = [
    round(round(float(horizon_means[horizon + '-UNREG-AVGRMSE']), 6)
          - round(float(horizon_means[horizon + '-REG-AVGRMSE']), 6), 6)
    for horizon in ('SHORT', 'MID', 'LONG')
]
mean_results

Unnamed: 0,Un-regularized,Regularized,Difference
Short-Term,0.929181 (0.371),0.928878 (0.370),0.000303
Mid-Term,2.289720 (0.873),2.318578 (0.893),-0.028858
Long-Term,6.055086 (2.575),6.049616 (2.579),0.00547


In [101]:
# Emit the summary table as LaTeX source.
print mean_results.to_latex()

\begin{tabular}{lllr}
\toprule
{} &    Un-regularized &       Regularized &  Difference \\
\midrule
Short-Term &  0.929181 (0.371) &  0.928878 (0.370) &    0.000303 \\
Mid-Term   &  2.289720 (0.873) &  2.318578 (0.893) &   -0.028858 \\
Long-Term  &  6.055086 (2.575) &  6.049616 (2.579) &    0.005470 \\
\bottomrule
\end{tabular}

