In [1060]:
import pandas as pd

### Import data

In [1061]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


### Convert datetime col to datetime format and add columns for day, month, dayofweek, year, dayofyear 

In [1062]:
df['datetime'] = pd.to_datetime(df['datetime']) # convert column

df['month'] = df['datetime'].dt.month
df['dayofweek'] = df['datetime'].dt.dayofweek
df['hour'] = df['datetime'].dt.hour
df['dayofyear'] = df['datetime'].dt.dayofyear

df.head(2)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,month,dayofweek,hour,dayofyear
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16,1,5,0,1
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40,1,5,1,1


### Train-test split



In [1063]:
X = df.drop('count', axis=1)
type(X) # feature matrix

y = pd.to_numeric(df['count'])
type(y) # series 

pandas.core.series.Series

In [1064]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0) # stratify=y

X_train.shape, y_train.shape



((8708, 15), (8708,))

In [1065]:
X_test.shape, y_train.shape

((2178, 15), (8708,))

### Feature Engineering Function

In [1066]:
import datetime as dt
def feature_engineer(df): # take any dataframe, no matter if test or train
        # select relevant features
        df_sub = df[['hour', 'atemp', 'temp', 'humidity', 'season', 'workingday', 'weather']] # , 'windspeed', 'weather', 'month',
        df_sub2 = df[['humidity', 'atemp', 'workingday', 'temp']]
        # one hot-encoding of season
        season_binary_df = pd.get_dummies(df_sub['season'], prefix='season') # , prefix='Pclass'
        season_binary_df = season_binary_df.drop('season_1', axis=1)
        
        # remove rows with weather cat 4 
        cond = df_sub[ df_sub['weather'] == 4].index
        # Delete these row indexes from dataFrame
        df_sub.drop(cond , inplace=True)
        
        # one hot encoding of weather cat
        weat_binary_df = pd.get_dummies(df_sub['weather'], prefix='weather_cat')
        weat_binary_df = weat_binary_df.drop('weather_cat_1', axis=1)
        # one hot encoding of hour
        hour_binary_df = pd.get_dummies(df_sub['hour'], prefix='hour')
        hour_binary_df = hour_binary_df.drop('hour_0', axis=1)
        # join with the sub_df
        df_fe = pd.DataFrame(df_sub2.join([weat_binary_df, season_binary_df, hour_binary_df], how='left'))
        
        # interaction term humidity and temperature
        df_fe['temp_hum_interact'] = df_fe['temp'] * df_fe['humidity']
        
        # interaction term working day and roushhours (6-9, 16-19)
        df_fe['workday_hour_7_interact'] = df_fe['workingday'] * hour_binary_df['hour_7'] 
        df_fe['workday_hour_8_interact'] = df_fe['workingday'] * hour_binary_df['hour_8'] 
        df_fe['workday_hour_9_interact'] = df_fe['workingday'] * hour_binary_df['hour_9']
        df_fe['workday_hour_17_interact'] = df_fe['workingday'] * hour_binary_df['hour_17']  
        df_fe['workday_hour_18_interact'] = df_fe['workingday'] * hour_binary_df['hour_18']
        df_fe['workday_hour_19_interact'] = df_fe['workingday'] * hour_binary_df['hour_19']
        
        # interaction term non-working day and hour of day 
        # create non-working day column
        df_sub['non_workingday'] = df_sub['workingday'].replace({0:1, 1:0})
        df_fe['non_workingday'] = df_sub['non_workingday'] * hour_binary_df['hour_11'] 
        df_fe['non_workingday'] = df_sub['non_workingday'] * hour_binary_df['hour_12'] 
        df_fe['non_workingday'] = df_sub['non_workingday'] * hour_binary_df['hour_13']
        df_fe['non_workingday'] = df_sub['non_workingday'] * hour_binary_df['hour_14']

        
        
        # remove one-hot encoded hours!!! (they confuse the model?)

        
        return df_fe

In [1067]:
X_train_fe = feature_engineer(X_train)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [1068]:
X_train_fe.head()

Unnamed: 0,humidity,atemp,workingday,temp,weather_cat_2,weather_cat_3,season_2,season_3,season_4,hour_1,...,hour_22,hour_23,temp_hum_interact,workday_hour_7_interact,workday_hour_8_interact,workday_hour_9_interact,workday_hour_17_interact,workday_hour_18_interact,workday_hour_19_interact,non_workingday
3662,61,32.575,0,28.7,0.0,0.0,0,1,0,0.0,...,0.0,0.0,1750.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8648,52,38.635,1,33.62,0.0,0.0,0,1,0,0.0,...,0.0,0.0,1748.24,0.0,0.0,0.0,1.0,0.0,0.0,0.0
7053,65,14.395,1,12.3,0.0,0.0,1,0,0,0.0,...,0.0,0.0,799.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2685,62,34.09,0,29.52,0.0,0.0,1,0,0,0.0,...,0.0,0.0,1830.24,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5927,60,12.88,1,10.66,0.0,0.0,0,0,0,0.0,...,0.0,0.0,639.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [1069]:
X_train_fe.shape

(8708, 40)

### Polynomial Features 

In [1070]:

from sklearn.preprocessing import PolynomialFeatures
pt = PolynomialFeatures()
p_features = pt.fit_transform(X_train_fe[['temp']])
#p_features[:,2]
p_features.shape

(8708, 3)

In [1071]:
polynomial_temp_df = pd.DataFrame.from_records(p_features)


polynomial_temp_df.columns = ['t', 't2', 'temp_pol2']
polynomial_temp_df.head()

Unnamed: 0,t,t2,temp_pol2
0,1.0,28.7,823.69
1,1.0,33.62,1130.3044
2,1.0,12.3,151.29
3,1.0,29.52,871.4304
4,1.0,10.66,113.6356


In [1072]:
polynomial_temp_df.shape

(8708, 3)

In [1073]:
X_train_fe = X_train_fe.join(polynomial_temp_df['temp_pol2'], how='left')
X_train_fe.head()

Unnamed: 0,humidity,atemp,workingday,temp,weather_cat_2,weather_cat_3,season_2,season_3,season_4,hour_1,...,hour_23,temp_hum_interact,workday_hour_7_interact,workday_hour_8_interact,workday_hour_9_interact,workday_hour_17_interact,workday_hour_18_interact,workday_hour_19_interact,non_workingday,temp_pol2
3662,61,32.575,0,28.7,0.0,0.0,0,1,0,0.0,...,0.0,1750.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,732.2436
8648,52,38.635,1,33.62,0.0,0.0,0,1,0,0.0,...,0.0,1748.24,0.0,0.0,0.0,1.0,0.0,0.0,0.0,527.1616
7053,65,14.395,1,12.3,0.0,0.0,1,0,0,0.0,...,0.0,799.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,296.5284
2685,62,34.09,0,29.52,0.0,0.0,1,0,0,0.0,...,0.0,1830.24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,96.8256
5927,60,12.88,1,10.66,0.0,0.0,0,0,0,0.0,...,0.0,639.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,454.5424


In [1074]:
X_train_fe.shape

(8708, 41)

In [1075]:
##X_train_fe = X_train_fe.reset_index()

In [1076]:
X_train_fe = X_train_fe.fillna(0)

In [1077]:
##X_train_fe.info()

### Scaling

In [1078]:
# # scaling with min max
# from sklearn.preprocessing import MinMaxScaler
# scaler = MinMaxScaler()
# scaler.fit(X_train_fe) # memorizes the min and max for each column, no y 
# X_train_fe = scaler.transform(X_train_fe) # does the actual scaling; still no y
# X_train_fe  # numpy array

In [1079]:
# scaling with standard scaler
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_fe = scaler.fit_transform(X_train_fe)
X_train_fe

array([[-0.04056395,  1.06479797, -1.45718425, ..., -0.17050261,
        -0.11820738,  1.05037042],
       [-0.50931622,  1.78105416,  0.68625502, ..., -0.17050261,
        -0.11820738,  0.45175514],
       [ 0.16777039, -1.08397058,  0.68625502, ..., -0.17050261,
        -0.11820738, -0.22144172],
       ...,
       [-1.18640283,  0.88573393, -1.45718425, ..., -0.17050261,
         8.45970843, -1.08698054],
       [ 1.31360928, -0.72584249, -1.45718425, ..., -0.17050261,
        -0.11820738, -1.08698054],
       [-1.65515511,  1.42292606, -1.45718425, ..., -0.17050261,
        -0.11820738, -0.30191132]])

In [1080]:
X_train_fe.shape

(8708, 41)

### Train model: Linear regression with scikit-learn

In [1081]:
from sklearn.linear_model import LinearRegression

### Fit the model

In [1082]:
# Create the model 
m = LinearRegression(normalize=True)

In [1083]:
m.fit(X_train_fe, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)

In [1084]:
m.coef_

array([ 22.81427673,  23.87516084, -23.88896946,  70.93666582,
        -4.74847763, -18.92142507,  15.88984851,   8.34657627,
        23.05321844,  -3.55944379,  -6.28669468,  -7.87973527,
        -8.06143358,  -4.82137906,   6.20922913,  -6.3746966 ,
         5.24046063,  16.68524336,  21.12921677,  25.32856058,
        32.93354785,  32.22033022,  19.01053649,  28.8579116 ,
        41.49940443,  41.66389305,  35.52508618,  26.87042826,
        31.39339404,  20.99050626,  13.51473585,   6.49506878,
       -59.51069814,  48.4339927 ,  68.43328073,  19.23353305,
        40.73899926,  40.77523166,  24.11443658,  16.39788183,
       -16.04559556])

In [1085]:
# Coefficients
w_0 = m.intercept_
w_1 = m.coef_[0]

In [1086]:
# Interpretation of w_0
w_0, w_1

(190.85553514010107, 22.8142767280316)

Evaluate/Optimize the model

    What kind of evaluation metrics can we use?
        MSE
        RMSLE (root mean squared logarithmic error)
        R-squared (coefficient of determination)
    You should do cross-validation (on your own)



In [1087]:
# Look at the training score
# Return the coefficient of determination R^2 of the prediction.b
m.score(X_train_fe, y_train) # 
# R-squared 0.6357555992294879
# incl weather cat one hot encoded 0.6389685222446879
# incl interaction working day and hour_18  0.6498956851149124
# incl interaction working day and hour_17  0.6615055908463631
# incl interaction working day and hour_19  0.6649612525052413
# incl interaction working day and hour_7   0.6850735381594927
# incl interaction working day and hour_8   0.7295004828181124
# incl interaction working day and hour_9   0.7333398004517597
# incl interaction non working day and hour_11   0.7366122513219515
# incl interaction non working day and hour_12   0.7376443860278101
# incl interaction non working day and hour_13   0.7378722116507113
# incl interaction non working day and hour_14   0.7387041310756237
# incl temperature polynomial, degrees of freedom = 2    0.7457480384640465

0.7457480384640465

### Optimize/ Cross Validation

In [1049]:
from sklearn.model_selection import cross_val_score

In [1050]:
cross_val_result_m_negMSR = cross_val_score(m, X_train_fe, y_train, cv=10, scoring='neg_mean_squared_error')
cross_val_result_m_negMSR

array([-8534.04642425, -8393.23306587, -7516.15858187, -9137.31412966,
       -8266.92338299, -8778.02321695, -8774.92128324, -8589.33846627,
       -8280.91310887, -7714.31615861])

In [1051]:
cross_val_result_m_r2 = cross_val_score(m, X_train_fe, y_train, cv=10, scoring='r2')
cross_val_result_m_r2

array([0.72159652, 0.73480687, 0.78264763, 0.73717205, 0.73005189,
       0.72969701, 0.7441481 , 0.7452568 , 0.72687394, 0.77229079])

In [1052]:
print('cross_val_result_m_negMSR mean',
cross_val_result_m_negMSR.mean(), 
      '\ncross_val_result_m_r2 mean',
      cross_val_result_m_r2.mean())   # no overfitting !

cross_val_result_m_negMSR mean -8398.518781857196 
cross_val_result_m_r2 mean 0.7424541598339234


### Predictions

In [1053]:
# Make predictions for the training data
y_pred_train = m.predict(X_train_fe)
y_pred_train

array([303.98265314, 596.53091049,  56.8683156 , ..., 466.47592676,
       188.3943459 , 428.59404305])

### Calculate Test Score

In [1054]:
X_test_fe = feature_engineer(X_test)
pt = PolynomialFeatures(degree=2)
p_features = pt.fit_transform(X_test_fe[['temp']])
polynomial_temp_df = pd.DataFrame.from_records(p_features)


polynomial_temp_df.columns = ['t', 't2', 'temp_pol2']
X_test_fe = X_test_fe.join(polynomial_temp_df['temp_pol2'], how='left')
X_test_fe = X_test_fe.fillna(0)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [1055]:
# scaling with standard scaler
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_test_fe = scaler.fit_transform(X_test_fe)
X_test_fe

array([[-0.32537252,  0.38316806,  0.67807487, ..., -0.17539085,
        -0.10775762, -0.38786434],
       [ 1.37442573,  0.6496345 ,  0.67807487, ..., -0.17539085,
        -0.10775762, -0.38786434],
       [-0.37688156, -0.15093869,  0.67807487, ..., -0.17539085,
        -0.10775762, -0.38786434],
       ...,
       [-1.6646075 ,  1.53883419,  0.67807487, ..., -0.17539085,
        -0.10775762, -0.38786434],
       [-0.42839059, -0.32877863,  0.67807487, ..., -0.17539085,
        -0.10775762, -0.38786434],
       [ 0.6532992 , -0.95092495, -1.47476341, ..., -0.17539085,
        -0.10775762, -0.38786434]])

In [1056]:
X_test_fe.shape

(2178, 41)

In [1057]:
m.score(X_test_fe, y_test)

0.746034543025276

### Kaggle submission

Kaggle evaluates the results of all submissions based on the Root Mean Squared Log Error (RMSLE).

The purpose of this metric is to treat the error in relation to the bike count. If the amount of bikes is 100, an error of 10 bikes does not matter that much, but if the predicted value is only 10, the same error is a lot. The logarithm fixes that.

To optimize your model against the RMSLE, you should take the logarithm of the target colum (y). Because 0 is a valid target value, use the log of y+1
instead:

In [1059]:
import numpy as np
ylog = np.log1p(y_test)
ylog

6638     5.501258
7975     5.480639
5915     5.438079
8050     6.148468
5894     5.817111
           ...   
10833    5.831882
5519     1.609438
8217     6.442540
5916     6.111467
4630     4.330733
Name: count, Length: 2178, dtype: float64

In [407]:
from sklearn.metrics import mean_squared_log_error

np.sqrt(mean_squared_log_error(y_pred))

TypeError: mean_squared_log_error() missing 1 required positional argument: 'y_pred'