In [11]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso


In [12]:
trips_hourly_df = pd.read_pickle("../00_data/trips_hourly_engineered.pkl")
trips_hourly_df.columns


Index(['starting_trips', 'ongoing_trips_prev', 'in_service_bikes_rolling',
       'available_bikes', 'max_temp', 'min_temp', 'precip', 'hour',
       'day_of_week', 'month', 'day_of_year', 'week', 'is_weekday',
       'is_holiday'],
      dtype='object')

In [13]:
trips_hourly_df.corr()


Unnamed: 0,starting_trips,ongoing_trips_prev,in_service_bikes_rolling,available_bikes,max_temp,min_temp,precip,hour,day_of_week,month,day_of_year,week,is_weekday,is_holiday
starting_trips,1.0,0.56877,0.150725,0.132862,0.045441,0.042555,-0.066599,0.390721,-0.125066,0.130403,0.131367,0.132594,0.173193,-0.064962
ongoing_trips_prev,0.56877,1.0,0.108004,0.076131,0.045124,0.044232,-0.053256,0.267799,0.077412,0.008203,0.007004,0.007268,-0.067919,-0.023646
in_service_bikes_rolling,0.150725,0.108004,1.0,0.999488,0.588799,0.58754,-0.120945,-0.010833,0.002583,0.363219,0.360245,0.380523,-0.0015,-0.062543
available_bikes,0.132862,0.076131,0.999488,1.0,0.589092,0.587857,-0.119589,-0.019486,9.8e-05,0.364032,0.361087,0.381417,0.000682,-0.061968
max_temp,0.045441,0.045124,0.588799,0.589092,1.0,0.999187,-0.159825,0.174904,0.018038,0.40263,0.402897,0.413392,-0.028779,-0.050052
min_temp,0.042555,0.044232,0.58754,0.587857,0.999187,1.0,-0.164073,0.172621,0.020035,0.402623,0.402973,0.413222,-0.031591,-0.049782
precip,-0.066599,-0.053256,-0.120945,-0.119589,-0.159825,-0.164073,1.0,0.033648,-0.011873,-0.106644,-0.109076,-0.105436,0.00825,0.079465
hour,0.390721,0.267799,-0.010833,-0.019486,0.174904,0.172621,0.033648,1.0,-0.00116,-0.009615,-0.009668,-0.009321,-5.6e-05,0.005331
day_of_week,-0.125066,0.077412,0.002583,9.8e-05,0.018038,0.020035,-0.011873,-0.00116,1.0,-0.001312,-0.000959,0.004062,-0.79016,-0.080557
month,0.130403,0.008203,0.363219,0.364032,0.40263,0.402623,-0.106644,-0.009615,-0.001312,1.0,0.996479,0.965992,-0.007277,0.014837


In [14]:
for id_1, variable in enumerate(trips_hourly_df.columns):
    for id_2, comp_variable in enumerate(trips_hourly_df.columns):
        if id_1 < id_2:
            corrcoeff = np.corrcoef(
                [
                    trips_hourly_df[variable].astype("float64"),
                    trips_hourly_df[comp_variable].astype("float64"),
                ]
            )[0][1]
            if corrcoeff > 0.7:
                print(variable, comp_variable, corrcoeff)


in_service_bikes_rolling available_bikes 0.9994877210528004
max_temp min_temp 0.9991874178519857
month day_of_year 0.9964794091054356
month week 0.9659922422048107
day_of_year week 0.9668681338578687


In [15]:
trips_hourly_df = trips_hourly_df.drop(
    columns=["in_service_bikes_rolling", "max_temp", "day_of_year", "week"]
)
trips_hourly_df.head(2)


Unnamed: 0_level_0,starting_trips,ongoing_trips_prev,available_bikes,min_temp,precip,hour,day_of_week,month,is_weekday,is_holiday
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2019-01-01 01:00:00,8.0,2.0,871.0,15.6,0.0,1,1,1,True,True
2019-01-01 02:00:00,11.0,4.0,869.0,15.0,0.0,2,1,1,True,True


In [16]:
input_features = trips_hourly_df.reset_index().drop(columns=["starting_trips", "index"])
predicted_varible = trips_hourly_df["starting_trips"]


In [20]:
scaler = StandardScaler()
input_features_scaled = scaler.fit_transform(input_features.values)


In [21]:
model = Lasso(alpha=1)
model.fit(input_features_scaled, predicted_varible)
print("Coefficients ", model.coef_)


Coefficients  [ 9.97596999  0.7510709  -0.35298015 -0.          4.71250765 -0.
  1.62708272  3.35667478 -0.21490481]


The results of Lasso regression yield 0 coefficient for features precipitation, is_weekday and is_holiday. Therefore we will drop these for the predictive analytics section.

In [None]:
trips_hourly_df = trips_hourly_df.drop(columns={"precip", "day_of_week"})


In [None]:
trips_hourly_df.to_pickle("../00_data/trips_hourly_selected.pkl")
