In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.kernel_ridge import KernelRidge
from sklearn.gaussian_process.kernels import RBF
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler


## Linear Regression
In this notebook we create a linear prediction model.

In [3]:
trips_df = pd.read_pickle('../00_data/trips_hourly_selected.pkl')


First we create a dataframe encompassing our selected paramenters.

In [4]:
MulReg = pd.DataFrame()
MulReg["demand"]=trips_df['starting_trips']
MulReg["min_temp"]=trips_df['min_temp']
MulReg["available_bikes"]=trips_df["available_bikes"]
MulReg["hour"]=trips_df["hour"]
MulReg["month"]=trips_df["month"]
MulReg["day_of_week"]=trips_df["day_of_week"]
MulReg["is_holiday"]=trips_df["is_holiday"]
MulReg["precip"]=trips_df["precip"]
MulReg.info()



<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 8423 entries, 2019-01-01 01:00:00 to 2019-12-31 23:00:00
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   demand           8423 non-null   float64
 1   min_temp         8423 non-null   float64
 2   available_bikes  8423 non-null   float64
 3   hour             8423 non-null   int64  
 4   month            8423 non-null   int64  
 5   day_of_week      8423 non-null   int64  
 6   is_holiday       8423 non-null   bool   
 7   precip           8423 non-null   float64
dtypes: bool(1), float64(4), int64(3)
memory usage: 534.7 KB


We split the dataframe in X and y and standardize the parameters in X. We also standardize the data thus ensuring they have the same mean (0) and standard deviation (1).

In [5]:
X = MulReg[["min_temp","available_bikes","hour","month","day_of_week","is_holiday","precip"]]
X_std = StandardScaler().fit_transform(X)
y = MulReg["demand"]

We make a train test split with a testsize of 30%.

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X_std, y, test_size=0.3, random_state=0)




## Training the model

In [7]:
lin_mod_mul =LinearRegression()
lin_mod_mul.fit(X_train, y_train)
y_pred = lin_mod_mul.predict(X_test)
y_true_2 = y_test

Here we print the coefficients for the model.

In [8]:
print("The Coefficients for our multiple linear regression model are:", "\n" "\n"
      "min_temp        =   ", lin_mod_mul.coef_[0], "\n"
      "available_bikes =  ", lin_mod_mul.coef_[1],"\n"
      "hour            =   ", lin_mod_mul.coef_[2],"\n"
      "month           =   ", lin_mod_mul.coef_[3],"\n"
      "day_of_week     =   ", lin_mod_mul.coef_[4],"\n"
      "is_holiday      =   ", lin_mod_mul.coef_[5],"\n"
      "precipitation     =   ", lin_mod_mul.coef_[6],"\n\n"
      "The Intercept is:", lin_mod_mul.intercept_)

The Coefficients for our multiple linear regression model are: 

min_temp        =    -4.815410034202011 
available_bikes =   4.634371928223787 
hour            =    9.248678509811587 
month           =    2.9486548241571824 
day_of_week     =    -2.688327101805283 
is_holiday      =    -1.7628180775078734 
precipitation     =    -1.6429164375336256 

The Intercept is: 26.78537086877164


## Evaluating the testmetrics

In [9]:


print("MAE:",mean_absolute_error(y_true_2, y_pred))
print("MSE:",mean_squared_error(y_true_2, y_pred))
print("R^2:",r2_score(y_true_2, y_pred))



MAE: 14.697835243673717
MSE: 356.211035668143
R^2: 0.2257583094042228
