In [1]:
# imports
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import mean_absolute_error, mean_squared_error,r2_score
from math import sqrt

In [2]:
df_0 = pd.read_csv('../Resources/Bike_weather_merge.csv')
df_0.columns

Index(['Date', 'counts_trips', 'counts_Bike', 'trip_duration',
       'counts_member_annual', 'counts_member_casual', 'W_Temp_Max',
       'W_Temp_Avg', 'W_Temp_Min', 'W_Max_wind', 'W_Avg_wind', 'W_Min_wind',
       'W_Wind_Gust', 'W_Air_Pressure', 'W_Snow_Depth', 'W_Max_humid',
       'W_Avg_humid', 'W_Min_humid', 'W_Max_Dp', 'W_Avg_dp', 'W_Min_Dp'],
      dtype='object')

In [3]:
# double check if any nan values
df_0.isnull().values.any()

False

## Approach 1: 
y = counts_trips  
X = except counts_trips

In [4]:
# split features X and output y (counts_trips)
X = df_0.drop(columns=['counts_trips','Date'])
y = df_0['counts_trips']
print(f'X.shape: {X.shape}.')
print(f'y.shape: {y.shape}.')

X.shape: (1096, 19).
y.shape: (1096,).


In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)
print(f'X_train: {X_train.shape}.')
print(f'X_test: {X_test.shape}.')
print(f'y_train: {y_train.shape}.')
print(f'y_test: {y_test.shape}.')

X_train: (822, 19).
X_test: (274, 19).
y_train: (822,).
y_test: (274,).


In [6]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

In [7]:
# Linear Regression model
model_lr = LinearRegression()
model_lr.fit(X_train_scaled, y_train)
y_pred = model_lr.predict(X_train_scaled)

In [8]:
# evaluation - linear regression
mae = mean_absolute_error(y_train, y_pred)
mse = mean_squared_error(y_train, y_pred)
rmse = sqrt(mse)
r2 = r2_score(y_train, y_pred)
overall = model_lr.score(X_test_scaled, y_test)
print("model: Linear Regression")
print("-----------------------------")
print(f'Mean absolute error: {mae}')
print(f'Mean squared error: {mse}')
print(f'Root mean squared error:{rmse}')
print(f'R2 score:{r2}')
print(f'Overall model score: {overall}')

model: Linear Regression
-----------------------------
Mean absolute error: 1.2590748125091285e-11
Mean squared error: 2.2875493674201035e-22
Root mean squared error:1.512464666503024e-11
R2 score:1.0
Overall model score: 0.9990508116437439


In [None]:
## ? LR: how to get the model function to display the features' weight?

In [9]:
# Random forest model
model_rf = RandomForestRegressor(random_state=78)
model_rf.fit(X_train_scaled, y_train)
y_pred = model_rf.predict(X_train_scaled)

In [10]:
# evaluation : Random Forest
mae = mean_absolute_error(y_train, y_pred)
mse = mean_squared_error(y_train, y_pred)
rmse = sqrt(mse)
r2 = r2_score(y_train, y_pred)
overall = model_rf.score(X_test_scaled, y_test)
print("model: Random Forest")
print("-----------------------------")
print(f'Mean absolute error: {mae}')
print(f'Mean squared error: {mse}')
print(f'Root mean squared error:{rmse}')
print(f'R2 score:{r2}')
print(f'Overall model score: {overall}')
print ("feature_importances:",model_rf.feature_importances_)
print ("n_features:",model_rf.n_features_)
print ("n_outputs:",model_rf.n_outputs_) 


model: Random Forest
-----------------------------
Mean absolute error: 115.39380778588807
Mean squared error: 32370.338569708052
Root mean squared error:179.9175882722644
R2 score:0.9989721111553218
Overall model score: 0.9930775295904674
feature_importances: [5.90177755e-01 3.58809921e-01 1.56380136e-02 2.86878262e-02
 1.20506176e-03 1.06582562e-03 6.69166457e-04 3.94533907e-04
 4.22385059e-04 1.78078389e-04 4.24713083e-04 4.23978869e-04
 1.17643138e-05 2.14067986e-04 3.41503259e-04 3.29013061e-04
 3.10686332e-04 4.47199396e-04 2.48507100e-04]
n_features: 19
n_outputs: 1


In [11]:
features = model_rf.feature_importances_
print(features)

[5.90177755e-01 3.58809921e-01 1.56380136e-02 2.86878262e-02
 1.20506176e-03 1.06582562e-03 6.69166457e-04 3.94533907e-04
 4.22385059e-04 1.78078389e-04 4.24713083e-04 4.23978869e-04
 1.17643138e-05 2.14067986e-04 3.41503259e-04 3.29013061e-04
 3.10686332e-04 4.47199396e-04 2.48507100e-04]


### Result :  
y=trip counts, most weighted features are bike counts, durations,no much based on the weather conditions.  
Approach 1 : X 


## Approach 2
Approach 2: y = duration, X= avg_temp, avg_wind, wind_gust, air_pressure, snow, avg_humid, avg_dp

In [12]:
y2 = df_0['trip_duration']
X2 = df_0.drop(columns=['trip_duration','Date','counts_trips','counts_Bike','counts_member_annual','counts_member_casual', 'W_Temp_Max','W_Temp_Min', 'W_Max_wind','W_Min_wind', 'W_Max_humid','W_Min_humid', 'W_Max_Dp','W_Min_Dp'])
print(f'X2.shape: {X2.shape}.')
print(f'y2.shape: {y2.shape}.')

X2.shape: (1096, 7).
y2.shape: (1096,).


In [13]:
X_train, X_test, y_train, y_test = train_test_split(X2, y2, random_state=78)
print(f'X_train: {X_train.shape}.')
print(f'X_test: {X_test.shape}.')
print(f'y_train: {y_train.shape}.')
print(f'y_test: {y_test.shape}.')

X_train: (822, 7).
X_test: (274, 7).
y_train: (822,).
y_test: (274,).


In [14]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

In [15]:
# Linear Regression
model_lr = LinearRegression()
model_lr.fit(X_train_scaled, y_train)
y_pred = model_lr.predict(X_train_scaled)

In [16]:
# evaluation - linear regression
mae = mean_absolute_error(y_train, y_pred)
mse = mean_squared_error(y_train, y_pred)
rmse = sqrt(mse)
r2 = r2_score(y_train, y_pred)
overall = model_lr.score(X_test_scaled, y_test)
print("model: Linear Regression")
print("-----------------------------")
print(f'Mean absolute error: {mae}')
print(f'Mean squared error: {mse}')
print(f'Root mean squared error:{rmse}')
print(f'R2 score:{r2}')
print(f'Overall model score: {overall}')

model: Linear Regression
-----------------------------
Mean absolute error: 2898617.805196142
Mean squared error: 19112705117582.133
Root mean squared error:4371807.991847553
R2 score:0.6616061601424754
Overall model score: 0.6097767360018245


In [17]:
# Random Forest
model_rf = RandomForestRegressor(random_state=78)
model_rf.fit(X_train_scaled, y_train)
y_pred = model_rf.predict(X_train_scaled)

In [18]:
# evaluation : Random Forest
mae = mean_absolute_error(y_train, y_pred)
mse = mean_squared_error(y_train, y_pred)
rmse = sqrt(mse)
r2 = r2_score(y_train, y_pred)
overall = model_rf.score(X_test_scaled, y_test)
print("model: Random Forest")
print("-----------------------------")
print(f'Mean absolute error: {mae}')
print(f'Mean squared error: {mse}')
print(f'Root mean squared error:{rmse}')
print(f'R2 score:{r2}')
print(f'Overall model score: {overall}')
print ("feature_importances:",model_rf.feature_importances_)
print ("n_features:",model_rf.n_features_)
print ("n_outputs:",model_rf.n_outputs_) 


model: Random Forest
-----------------------------
Mean absolute error: 1048141.4951094891
Mean squared error: 2810649367460.835
Root mean squared error:1676499.1403101985
R2 score:0.95023695358156
Overall model score: 0.6284541501908161
feature_importances: [0.69310303 0.04746964 0.04085829 0.06743581 0.00131287 0.10377995
 0.04604042]
n_features: 7
n_outputs: 1


### Results:
Approach 2: rmse too large: variance too far away from true value. 
rf > lr

## Appraoch 3 : 
y = counts_bike:
X = avg_temp, avg_wind, avg_humid, wind_gust, avg_dp, snow, air_pressure

In [19]:
y3 = df_0['counts_Bike']
X3 = df_0.drop(columns=['Date', 'counts_trips','trip_duration','counts_Bike',
       'counts_member_annual', 'counts_member_casual', 'W_Temp_Max','W_Temp_Min', 'W_Max_wind','W_Min_wind','W_Max_humid','W_Min_humid', 'W_Max_Dp','W_Min_Dp' ])
print(f'X3.shape: {X3.shape}.')
print(f'y3.shape: {y3.shape}.')
X3.columns

X3.shape: (1096, 7).
y3.shape: (1096,).


Index(['W_Temp_Avg', 'W_Avg_wind', 'W_Wind_Gust', 'W_Air_Pressure',
       'W_Snow_Depth', 'W_Avg_humid', 'W_Avg_dp'],
      dtype='object')

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X3, y3, random_state=78)
print(f'X_train: {X_train.shape}.')
print(f'X_test: {X_test.shape}.')
print(f'y_train: {y_train.shape}.')
print(f'y_test: {y_test.shape}.')

X_train: (822, 7).
X_test: (274, 7).
y_train: (822,).
y_test: (274,).


In [21]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

In [22]:
# Linear Regression
model_lr = LinearRegression()
model_lr.fit(X_train_scaled, y_train)
y_pred = model_lr.predict(X_train_scaled)

In [23]:
# evaluation - linear regression
mae = mean_absolute_error(y_train, y_pred)
mse = mean_squared_error(y_train, y_pred)
rmse = sqrt(mse)
r2 = r2_score(y_train, y_pred)
overall = model_lr.score(X_test_scaled, y_test)
print("model: Linear Regression")
print("-----------------------------")
print(f'Mean absolute error: {mae}')
print(f'Mean squared error: {mse}')
print(f'Root mean squared error:{rmse}')
print(f'R2 score:{r2}')
print(f'Overall model score: {overall}')

model: Linear Regression
-----------------------------
Mean absolute error: 433.92907921008015
Mean squared error: 284447.2566283849
Root mean squared error:533.3359697492613
R2 score:0.758401916574934
Overall model score: 0.7501880888859891


In [25]:
# Random Forest
model_rf = RandomForestRegressor(random_state=78)
model_rf.fit(X_train_scaled, y_train)
y_pred = model_rf.predict(X_train_scaled)

In [26]:
# evaluation
mae = mean_absolute_error(y_train, y_pred)
mse = mean_squared_error(y_train, y_pred)
rmse = sqrt(mse)
r2 = r2_score(y_train, y_pred)
overall = model_rf.score(X_test_scaled, y_test)
print("model: Random Forest")
print("-----------------------------")
print(f'Mean absolute error: {mae}')
print(f'Mean squared error: {mse}')
print(f'Root mean squared error:{rmse}')
print(f'R2 score:{r2}')
print(f'Overall model score: {overall}')
print ("feature_importances:",model_rf.feature_importances_)
print ("n_features:",model_rf.n_features_)
print ("n_outputs:",model_rf.n_outputs_) 
     

model: Random Forest
-----------------------------
Mean absolute error: 160.38246958637473
Mean squared error: 42654.64119282239
Root mean squared error:206.53000070891005
R2 score:0.963770859724504
Overall model score: 0.784806384906906
feature_importances: [0.72344709 0.04846094 0.03794404 0.05178519 0.01088794 0.08389985
 0.04357494]
n_features: 7
n_outputs: 1


### Results: 
rf > lr

## Approach 4: 
y = counts_trips  
X = X = avg_temp, avg_wind, avg_humid, wind_gust, avg_dp, snow, air_pressure

In [27]:
y4 = df_0['counts_trips']
X4 = df_0.drop(columns=['trip_duration','Date','counts_trips','counts_Bike','counts_member_annual','counts_member_casual', 'W_Temp_Max','W_Temp_Min', 'W_Max_wind','W_Min_wind', 'W_Max_humid','W_Min_humid', 'W_Max_Dp','W_Min_Dp'])
print(f'X4.shape: {X4.shape}.')
print(f'y4.shape: {y4.shape}.')
X4.columns

X4.shape: (1096, 7).
y4.shape: (1096,).


Index(['W_Temp_Avg', 'W_Avg_wind', 'W_Wind_Gust', 'W_Air_Pressure',
       'W_Snow_Depth', 'W_Avg_humid', 'W_Avg_dp'],
      dtype='object')

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X2, y2, random_state=78)
print(f'X_train: {X_train.shape}.')
print(f'X_test: {X_test.shape}.')
print(f'y_train: {y_train.shape}.')
print(f'y_test: {y_test.shape}.')

X_train: (822, 7).
X_test: (274, 7).
y_train: (822,).
y_test: (274,).


In [29]:
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

In [30]:
# Linear Regression
model_lr = LinearRegression()
model_lr.fit(X_train_scaled, y_train)
y_pred = model_lr.predict(X_train_scaled)

In [31]:
# evaluation - linear regression
mae = mean_absolute_error(y_train, y_pred)
mse = mean_squared_error(y_train, y_pred)
rmse = sqrt(mse)
r2 = r2_score(y_train, y_pred)
overall = model_lr.score(X_test_scaled, y_test)
print("model: Linear Regression")
print("-----------------------------")
print(f'Mean absolute error: {mae}')
print(f'Mean squared error: {mse}')
print(f'Root mean squared error:{rmse}')
print(f'R2 score:{r2}')
print(f'Overall model score: {overall}')

model: Linear Regression
-----------------------------
Mean absolute error: 2898617.805196142
Mean squared error: 19112705117582.133
Root mean squared error:4371807.991847553
R2 score:0.6616061601424754
Overall model score: 0.6097767360018245


In [32]:
# Random Forest
model_rf = RandomForestRegressor(random_state=78)
model_rf.fit(X_train_scaled, y_train)
y_pred = model_rf.predict(X_train_scaled)

In [33]:
# evaluation
mae = mean_absolute_error(y_train, y_pred)
mse = mean_squared_error(y_train, y_pred)
rmse = sqrt(mse)
r2 = r2_score(y_train, y_pred)
overall = model_rf.score(X_test_scaled, y_test)
print("model: Random Forest")
print("-----------------------------")
print(f'Mean absolute error: {mae}')
print(f'Mean squared error: {mse}')
print(f'Root mean squared error:{rmse}')
print(f'R2 score:{r2}')
print(f'Overall model score: {overall}')
print ("feature_importances:",model_rf.feature_importances_)
print ("n_features:",model_rf.n_features_)
print ("n_outputs:",model_rf.n_outputs_) 

model: Random Forest
-----------------------------
Mean absolute error: 1048141.4951094891
Mean squared error: 2810649367460.835
Root mean squared error:1676499.1403101985
R2 score:0.95023695358156
Overall model score: 0.6284541501908161
feature_importances: [0.69310303 0.04746964 0.04085829 0.06743581 0.00131287 0.10377995
 0.04604042]
n_features: 7
n_outputs: 1


## Results:
Approach 1: y = counts_trips, X = all: both LR and RF overfit, output mostly weighted on bike data.
Approach 2: y = trip_duration, X = 7 weather avg variables: both models have huge mae and mse, overall model score around 60%, R2 score RF better than LR.
Approach 3: y = counts_Bike, X = 7 weather avg variables: both models have significantly smaller mae and mse compared to Approach 2, model scores and R2 scores are better than Approach 2, RF better than LR
Approach 4: y = counts_trips, X = 7 weather avg variables: both models have huge mae and mse, overall model scores are around 60%, R2 score RF better than LR.
Scaled or no-scaled training and testing sets have similar outcomes.
In summary: Random Forest model performs better than Linear Regression model on this dataset.
Future: further improvement on model accuracy is needed.