In [1]:
import pandas
import numpy as np
import matplotlib.pyplot as plt
import datetime
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_extraction import DictVectorizer
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.cross_validation import train_test_split
from ggplot import *

datetime - hourly date + timestamp  

season -  1 = spring, 2 = summer, 3 = fall, 4 = winter 

holiday - whether the day is considered a holiday

workingday - whether the day is neither a weekend nor holiday

weather - 1: Clear, Few clouds, Partly cloudy, Partly cloudy 

2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist 

3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds 

4: Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog 

temp - temperature in Celsius

atemp - "feels like" temperature in Celsius

humidity - relative humidity

windspeed - wind speed

casual - number of non-registered user rentals initiated

registered - number of registered user rentals initiated

count - number of total rentals

In [2]:
PATH= './'

# Loading training data
x_train= pandas.read_csv(PATH+'train.csv', header=0)  

# Loading test data
x_test = pandas.read_csv(PATH+'test.csv', header=0)


In [3]:
# Isolate target variables in stand-alone vectores
y = x_train["count"]
registered = x_train["registered"]
casual = x_train["casual"]

# Evict the target variables from the explicative matrix
x_train = x_train.drop(["count","casual","registered"],axis=1)

In [5]:
x_train["datetime"] = pandas.to_datetime(x_train["datetime"])

In [62]:
x_test["datetime"] = pandas.to_datetime(x_test["datetime"])

### Days and hours extraction

In [6]:
x_train["hour"] = x_train["datetime"].dt.hour
x_train["day"] =  x_train["datetime"].dt.day
x_train["month"] =  x_train["datetime"].dt.month
x_train["year"] =  x_train["datetime"].dt.year

In [67]:
x_test["hour"] = x_test["datetime"].dt.hour
x_test["day"] =  x_test["datetime"].dt.day
x_test["month"] =  x_test["datetime"].dt.month
x_test["year"] =  x_test["datetime"].dt.year

In [8]:
x_train.corr()

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,hour,day,month,year
season,1.0,0.029368,-0.008126,0.008879,0.258689,0.264744,0.19061,-0.147121,-0.007108,0.002147,0.970592,-0.004299
holiday,0.029368,1.0,-0.250491,-0.007074,0.000295,-0.005215,0.001929,0.008409,-0.000313,-0.018885,0.001795,0.01208
workingday,-0.008126,-0.250491,1.0,0.033772,0.029966,0.02466,-0.01088,0.013373,0.002767,0.008691,-0.004279,-0.001945
weather,0.008879,-0.007074,0.033772,1.0,-0.055035,-0.055376,0.406244,0.007261,-0.029836,-0.011182,0.011928,-0.012301
temp,0.258689,0.000295,0.029966,-0.055035,1.0,0.984948,-0.064949,-0.017852,0.103132,0.015689,0.256888,0.061578
atemp,0.264744,-0.005215,0.02466,-0.055376,0.984948,1.0,-0.043536,-0.057473,0.101361,0.012653,0.263558,0.058821
humidity,0.19061,0.001929,-0.01088,0.406244,-0.064949,-0.043536,1.0,-0.318607,-0.195082,-0.007336,0.204476,-0.078799
windspeed,-0.147121,0.008409,0.013373,0.007261,-0.017852,-0.057473,-0.318607,1.0,0.07839,0.032981,-0.150864,-0.01464
hour,-0.007108,-0.000313,0.002767,-0.029836,0.103132,0.101361,-0.195082,0.07839,1.0,0.012666,-0.008003,-0.004807
day,0.002147,-0.018885,0.008691,-0.011182,0.015689,0.012653,-0.007336,0.032981,0.012666,1.0,0.000949,0.000341


In [10]:
x_train['season']=x_train['season'].astype('category')
x_train['weather']=x_train['weather'].astype('category')

In [64]:
x_test['season']=x_test['season'].astype('category')
x_test['weather']=x_test['weather'].astype('category')

In [11]:
print("** Nom des variables : \n ")
print(x_train.columns)

print("\n ** Stats descriptives sur les variables numériques : \n ")
print(x_train.describe())

print("\n")
print("** Nombre de valeurs manquantes dans les données de training : \n ")
print(x_train.isnull().sum(axis=0))

** Nom des variables : 
 
Index(['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp',
       'atemp', 'humidity', 'windspeed', 'hour', 'day', 'month', 'year'],
      dtype='object')

 ** Stats descriptives sur les variables numériques : 
 
            holiday    workingday         temp         atemp      humidity  \
count  10886.000000  10886.000000  10886.00000  10886.000000  10886.000000   
mean       0.028569      0.680875     20.23086     23.655084     61.886460   
std        0.166599      0.466159      7.79159      8.474601     19.245033   
min        0.000000      0.000000      0.82000      0.760000      0.000000   
25%        0.000000      0.000000     13.94000     16.665000     47.000000   
50%        0.000000      1.000000     20.50000     24.240000     62.000000   
75%        0.000000      1.000000     26.24000     31.060000     77.000000   
max        1.000000      1.000000     41.00000     45.455000    100.000000   

          windspeed          hour           

Eclatement des variables catégorielles en plusieus variables dichotomiques

In [65]:
def dim_feature(x, feature_name):
    dv =DictVectorizer()
    z=dv.fit_transform(pandas.DataFrame(x[feature_name].apply(str)).to_dict(orient='records'))
    feature_dimmed = pandas.DataFrame(z.toarray(),columns=dv.get_feature_names())
    feature_dimmed = feature_dimmed.drop(feature_dimmed.columns[-1],axis=1)
    return feature_dimmed

In [68]:
seasons_dimmed = dim_feature(x_train, 'season')
weather_dimmed = dim_feature(x_train, 'weather')
hour_dimmed = dim_feature(x_train, 'hour')
day_dimmed = dim_feature(x_train, 'day')
month_dimmed =dim_feature(x_train, 'month')
#year_dimmed =dim_feature(x_train,'year')

seasons_dimmed_test = dim_feature(x_test, 'season')
weather_dimmed_test = dim_feature(x_test, 'weather')
hour_dimmed_test = dim_feature(x_test, 'hour')
day_dimmed_test = dim_feature(x_test, 'day')
month_dimmed_test =dim_feature(x_test, 'month')
#year_dimmed_test =dim_feature(x_test,'year')

In [79]:
x_train_final = pandas.concat([x_train,seasons_dimmed,weather_dimmed,hour_dimmed,month_dimmed,day_dimmed],axis=1)

x_train_final = x_train_final.drop('weather',axis=1)
x_train_final = x_train_final.drop('season',axis=1)

In [114]:
x_test_final = pandas.concat([x_test,seasons_dimmed_test,weather_dimmed_test,hour_dimmed_test,month_dimmed_test,day_dimmed_test],axis=1)

x_test_final = x_test_final.drop('weather',axis=1)
x_test_final = x_test_final.drop('season',axis=1)

In [81]:
x_train_final.columns

Index(['datetime', 'holiday', 'workingday', 'temp', 'atemp', 'humidity',
       'windspeed', 'hour', 'day', 'month', 'year', 'season=1', 'season=2',
       'season=3', 'weather=1', 'weather=2', 'weather=3', 'hour=0', 'hour=1',
       'hour=10', 'hour=11', 'hour=12', 'hour=13', 'hour=14', 'hour=15',
       'hour=16', 'hour=17', 'hour=18', 'hour=19', 'hour=2', 'hour=20',
       'hour=21', 'hour=22', 'hour=23', 'hour=3', 'hour=4', 'hour=5', 'hour=6',
       'hour=7', 'hour=8', 'month=1', 'month=10', 'month=11', 'month=12',
       'month=2', 'month=3', 'month=4', 'month=5', 'month=6', 'month=7',
       'month=8', 'day=20.0', 'day=21.0', 'day=22.0', 'day=23.0', 'day=24.0',
       'day=25.0', 'day=26.0', 'day=27.0', 'day=28.0', 'day=29.0', 'day=30.0',
       'day=31.0'],
      dtype='object')

In [104]:
np.unique(x_train[x_train['day'].notnull()]['day'])

array([ 20.,  21.,  22.,  23.,  24.,  25.,  26.,  27.,  28.,  29.,  30.,
        31.])

In [105]:
np.unique(x_test[x_test['day'].notnull()]['day'])

array([20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31], dtype=int64)

In [83]:
x_train_no_date = x_train_final.drop(["datetime","year","hour","day","month"], axis=1)

In [115]:
x_test_no_date = x_test_final.drop(["year","hour","day","month"], axis=1)

In [107]:
x_train_no_date = x_train_no_date.drop('day=31.0',axis=1)

In [85]:
x_test_no_date.columns

Index(['holiday', 'workingday', 'temp', 'atemp', 'humidity', 'windspeed',
       'season=1', 'season=2', 'season=3', 'weather=1', 'weather=2',
       'weather=3', 'hour=0', 'hour=1', 'hour=10', 'hour=11', 'hour=12',
       'hour=13', 'hour=14', 'hour=15', 'hour=16', 'hour=17', 'hour=18',
       'hour=19', 'hour=2', 'hour=20', 'hour=21', 'hour=22', 'hour=23',
       'hour=3', 'hour=4', 'hour=5', 'hour=6', 'hour=7', 'hour=8', 'month=1',
       'month=10', 'month=11', 'month=12', 'month=2', 'month=3', 'month=4',
       'month=5', 'month=6', 'month=7', 'month=8', 'day=20', 'day=21',
       'day=22', 'day=23', 'day=24', 'day=25', 'day=26', 'day=27', 'day=28',
       'day=29', 'day=30'],
      dtype='object')

## Setting up the model 

In [108]:
X_train, X_test, count_train, count_test = train_test_split(x_train_no_date, y, test_size=0.33, random_state=1234)

In [55]:
X_train_re, X_test_re, registered_train, registered_test = train_test_split(x_train_no_date, registered, test_size=0.33, random_state=1234)

In [56]:
X_train_ca, X_test_ca, casual_train, casual_test = train_test_split(x_train_no_date, casual, test_size=0.33, random_state=1234)

Mean squared error of log($\hat{y} +1 $) and log(y + 1)

In [180]:
def compute_kaggle_score(y_hat,y):
    return mean_squared_error(pandas.Series(np.log(y_hat+1)),np.log(y+1))

### 1) Estimating count directly

In [109]:
reg = linear_model.RidgeCV(alphas=[0.1, 1.0, 10.0], cv=5, fit_intercept=True,normalize=True, scoring = mean_squared_error)
reg.fit(X_train,count_train)       

RidgeCV(alphas=[0.1, 1.0, 10.0], cv=5, fit_intercept=True, gcv_mode=None,
    normalize=True,
    scoring=<function mean_squared_error at 0x00000000097892F0>,
    store_cv_values=False)

In [181]:
y_hat_train = reg.predict(X_train)
y_hat_test = reg.predict(X_test)

# Training error
print(compute_kaggle_score(abs(y_hat_train),count_train))

# Test error
print(compute_kaggle_score(abs(y_hat_test),count_test))

1.0874114636
1.18799122752


### 2) Estimating registered and casual one by one , and sum 

In [89]:
reg = linear_model.RidgeCV(alphas=[0.1, 1.0, 10.0], cv=5, fit_intercept=True,normalize=True, scoring = mean_squared_error)
reg.fit(X_train_re,registered_train)   

RidgeCV(alphas=[0.1, 1.0, 10.0], cv=5, fit_intercept=True, gcv_mode=None,
    normalize=True,
    scoring=<function mean_squared_error at 0x00000000097892F0>,
    store_cv_values=False)

In [46]:
re_hat_train = reg.predict(X_train)
re_hat_test = reg.predict(X_test)

# Training error
print(mean_squared_error(re_hat_train,registered_train))

# Test error
print(mean_squared_error(re_hat_test,registered_test))

9503.9540852
9589.18491322


In [47]:
reg = linear_model.RidgeCV(alphas=[0.1, 1.0, 10.0], cv=5, fit_intercept=True,normalize=True, scoring = mean_squared_error)
reg.fit(X_train_ca,casual_train)   

RidgeCV(alphas=[0.1, 1.0, 10.0], cv=5, fit_intercept=True, gcv_mode=None,
    normalize=True,
    scoring=<function mean_squared_error at 0x00000000097892F0>,
    store_cv_values=False)

In [48]:
ca_hat_train = reg.predict(X_train)
ca_hat_test = reg.predict(X_test)

# Training error
print(mean_squared_error(ca_hat_train,casual_train))

# Test error
print(mean_squared_error(ca_hat_test,casual_test))

1040.96482729
1025.62780726


Sum of casual and registered

In [61]:
# Training error
print(mean_squared_error(ca_hat_train + re_hat_train,count_train)/max((ca_hat_train+re_hat_train - count_train))

# Test error
print(mean_squared_error(ca_hat_test + re_hat_test,count_test))

40.623940704
12967.033654


In [137]:
reg.score(X_train,abs(count_train))

0.65530951512236424

In [112]:
reg.score(X_test,count_test)

0.63440006552013695

In [147]:
y_hat_test = reg.predict(x_test_no_date.drop("datetime",axis=1))

In [148]:
y_hat_test = pandas.concat([x_test_no_date["datetime"],pandas.DataFrame(y_hat_test)],axis=1)

In [149]:
y_hat_test.to_csv('results.csv',index=False)

In [145]:
y_hat_test.columns

Index(['datetime', 0], dtype='object')