In [157]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import make_friedman1
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, root_mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder,StandardScaler

## Data Wrangling
#### Load and examine data set

In [158]:
df = pd.read_csv("FloridaBikeRentals.csv")
print(df.head(10))
print(df.shape)
print(df.describe())
print(df.info())

         Date  Rented Bike Count  Hour  Temperature(C)  Humidity(%)  \
0  01/12/2017                254     0            -5.2           37   
1  01/12/2017                204     1            -5.5           38   
2  01/12/2017                173     2            -6.0           39   
3  01/12/2017                107     3            -6.2           40   
4  01/12/2017                 78     4            -6.0           36   
5  01/12/2017                100     5            -6.4           37   
6  01/12/2017                181     6            -6.6           35   
7  01/12/2017                460     7            -7.4           38   
8  01/12/2017                930     8            -7.6           37   
9  01/12/2017                490     9            -6.5           27   

   Wind speed (m/s)  Visibility (10m)  Dew point temperature(C)  \
0               2.2              2000                     -17.6   
1               0.8              2000                     -17.6   
2               1

#### Print out information on fields data type and ranges.

In [159]:
for col in df.columns:
    data_type = df[col].dtype
    if data_type == object:
        data_values = df[col].unique()
        print(f'Feature Name: {col} Data Type: {data_type} Values: {data_values[:20]}')
    else:
        max_data = df[col].min()
        min_data = df[col].max()
        print(f'Feature Name: {col} Data Type: {data_type} Min: {min_data} Max: {max_data}')


Feature Name: Date Data Type: object Values: ['01/12/2017' '02/12/2017' '03/12/2017' '04/12/2017' '05/12/2017'
 '06/12/2017' '07/12/2017' '08/12/2017' '09/12/2017' '10/12/2017'
 '11/12/2017' '12/12/2017' '13/12/2017' '14/12/2017' '15/12/2017'
 '16/12/2017' '17/12/2017' '18/12/2017' '19/12/2017' '20/12/2017']
Feature Name: Rented Bike Count Data Type: int64 Min: 3556 Max: 0
Feature Name: Hour Data Type: int64 Min: 23 Max: 0
Feature Name: Temperature(C) Data Type: float64 Min: 39.4 Max: -17.8
Feature Name: Humidity(%) Data Type: int64 Min: 98 Max: 0
Feature Name: Wind speed (m/s) Data Type: float64 Min: 7.4 Max: 0.0
Feature Name: Visibility (10m) Data Type: int64 Min: 2000 Max: 27
Feature Name: Dew point temperature(C) Data Type: float64 Min: 27.2 Max: -30.6
Feature Name: Solar Radiation (MJ/m2) Data Type: float64 Min: 3.52 Max: 0.0
Feature Name: Rainfall(mm) Data Type: float64 Min: 35.0 Max: 0.0
Feature Name: Snowfall (cm) Data Type: float64 Min: 8.8 Max: 0.0
Feature Name: Seasons Data 

 #### Update Numerical Data Types

In [160]:
for col in df.columns:
    if df[col].dtype == "int64":
        if (df[col].max() <= np.iinfo(np.int8).max) and (df[col].min() >= np.iinfo(np.int8).min):
            df[col] = df[col].astype("int8")
        elif (df[col].max() <= np.iinfo(np.int16).max) and (df[col].min() >= np.iinfo(np.int16).min):
            df[col] = df[col].astype("int16")
    elif df[col].dtype == "float64":
        if (df[col].max() <= np.finfo(np.float32).max) and (df[col].min() >= np.finfo(np.float32).min):
            df[col] = df[col].astype("float32")
    
print(df.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8760 entries, 0 to 8759
Data columns (total 14 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Date                      8760 non-null   object 
 1   Rented Bike Count         8760 non-null   int16  
 2   Hour                      8760 non-null   int8   
 3   Temperature(C)            8760 non-null   float32
 4   Humidity(%)               8760 non-null   int8   
 5   Wind speed (m/s)          8760 non-null   float32
 6   Visibility (10m)          8760 non-null   int16  
 7   Dew point temperature(C)  8760 non-null   float32
 8   Solar Radiation (MJ/m2)   8760 non-null   float32
 9   Rainfall(mm)              8760 non-null   float32
 10  Snowfall (cm)             8760 non-null   float32
 11  Seasons                   8760 non-null   object 
 12  Holiday                   8760 non-null   object 
 13  Functioning Day           8760 non-null   object 
dtypes: float

#### Create Date Time Field

In [161]:
df["DateTime"] = df["Date"] + " " + df["Hour"].astype("str").str.zfill(2)
df["DateTime"] = pd.to_datetime(df["DateTime"],format='%d/%m/%Y %H')
df.drop(columns=["Date","Hour"],inplace=True)
df["Year"] = df["DateTime"].dt.year.astype("int16")
df["Month"] = df["DateTime"].dt.month.astype("int16")
df["Day"] = df["DateTime"].dt.day.astype("int16")
df["Hour"] = df["DateTime"].dt.hour.astype("int16")


print(df.head())

   Rented Bike Count  Temperature(C)  Humidity(%)  Wind speed (m/s)  \
0                254            -5.2           37               2.2   
1                204            -5.5           38               0.8   
2                173            -6.0           39               1.0   
3                107            -6.2           40               0.9   
4                 78            -6.0           36               2.3   

   Visibility (10m)  Dew point temperature(C)  Solar Radiation (MJ/m2)  \
0              2000                -17.600000                      0.0   
1              2000                -17.600000                      0.0   
2              2000                -17.700001                      0.0   
3              2000                -17.600000                      0.0   
4              2000                -18.600000                      0.0   

   Rainfall(mm)  Snowfall (cm) Seasons     Holiday Functioning Day  \
0           0.0            0.0  Winter  No Holiday        

#### Encode Nominal data

In [162]:

encoder = OneHotEncoder(sparse_output=False,handle_unknown="ignore").set_output(transform='pandas')
encode_city = encoder.fit_transform(df[['Seasons']])
df = pd.concat([df, encode_city],axis=1).drop(columns=["Seasons"])
print(df.head())

   Rented Bike Count  Temperature(C)  Humidity(%)  Wind speed (m/s)  \
0                254            -5.2           37               2.2   
1                204            -5.5           38               0.8   
2                173            -6.0           39               1.0   
3                107            -6.2           40               0.9   
4                 78            -6.0           36               2.3   

   Visibility (10m)  Dew point temperature(C)  Solar Radiation (MJ/m2)  \
0              2000                -17.600000                      0.0   
1              2000                -17.600000                      0.0   
2              2000                -17.700001                      0.0   
3              2000                -17.600000                      0.0   
4              2000                -18.600000                      0.0   

   Rainfall(mm)  Snowfall (cm)     Holiday Functioning Day  \
0           0.0            0.0  No Holiday             Yes   
1   

#### Mapping Binary fields to boolean

In [163]:
holiday_map = {"No Holiday":False,"Holiday":True}
function_map = {"Yes":True,"No":False}
df["Holiday"] = df["Holiday"].map(holiday_map).astype("bool")
df["Functioning Day"] = df["Functioning Day"].map(function_map).astype("bool")
print(df.head())
print(df.info())

   Rented Bike Count  Temperature(C)  Humidity(%)  Wind speed (m/s)  \
0                254            -5.2           37               2.2   
1                204            -5.5           38               0.8   
2                173            -6.0           39               1.0   
3                107            -6.2           40               0.9   
4                 78            -6.0           36               2.3   

   Visibility (10m)  Dew point temperature(C)  Solar Radiation (MJ/m2)  \
0              2000                -17.600000                      0.0   
1              2000                -17.600000                      0.0   
2              2000                -17.700001                      0.0   
3              2000                -17.600000                      0.0   
4              2000                -18.600000                      0.0   

   Rainfall(mm)  Snowfall (cm)  Holiday  Functioning Day            DateTime  \
0           0.0            0.0    False         

In [164]:
drop_columns = ["Rented Bike Count","Holiday","Functioning Day","DateTime","Seasons_Autumn","Seasons_Spring","Seasons_Summer","Seasons_Winter"]

scale_columns = [item for item in df.columns if item not in drop_columns]

In [165]:
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df.drop(columns=drop_columns))
scaled_data = pd.DataFrame(scaled_data,columns=scale_columns)
print(scaled_data)

      Temperature(C)  Humidity(%)  Wind speed (m/s)  Visibility (10m)  \
0          -1.513957    -1.042483          0.458476          0.925871   
1          -1.539074    -0.993370         -0.892561          0.925871   
2          -1.580936    -0.944257         -0.699556          0.925871   
3          -1.597680    -0.895144         -0.796059          0.925871   
4          -1.580936    -1.091596          0.554978          0.925871   
...              ...          ...               ...               ...   
8755       -0.726961    -1.189822          0.844486          0.751605   
8756       -0.793939    -1.042483          0.554978          0.925871   
8757       -0.860918    -0.944257         -1.375075          0.873263   
8758       -0.902779    -0.846031         -0.699556          0.694064   
8759       -0.919524    -0.747805         -0.410048          0.776265   

      Dew point temperature(C)  Solar Radiation (MJ/m2)  Rainfall(mm)  \
0                    -1.659605                -0.6

In [166]:
for col in scale_columns:
    df[col] = scaled_data[col]

print(df.head())

   Rented Bike Count  Temperature(C)  Humidity(%)  Wind speed (m/s)  \
0                254       -1.513957    -1.042483          0.458476   
1                204       -1.539074    -0.993370         -0.892561   
2                173       -1.580936    -0.944257         -0.699556   
3                107       -1.597680    -0.895144         -0.796059   
4                 78       -1.580936    -1.091596          0.554978   

   Visibility (10m)  Dew point temperature(C)  Solar Radiation (MJ/m2)  \
0          0.925871                 -1.659605                -0.655132   
1          0.925871                 -1.659605                -0.655132   
2          0.925871                 -1.667262                -0.655132   
3          0.925871                 -1.659605                -0.655132   
4          0.925871                 -1.736177                -0.655132   

   Rainfall(mm)  Snowfall (cm)  Holiday  Functioning Day            DateTime  \
0       -0.1318      -0.171891    False         

In [167]:
df.to_csv("bike_rental_features.csv",index=False)

## Model Training

Data Split

In [168]:
train_data, test_data = train_test_split(df,train_size=0.8,random_state=315)

train_data.info

<bound method DataFrame.info of       Rented Bike Count  Temperature(C)  Humidity(%)  Wind speed (m/s)  \
4085                160       -0.065549    -0.551354         -0.120540   
5639               1157        1.433094    -0.011112         -0.120540   
6062                658        1.860081    -0.698693         -0.120540   
7880               2070       -0.417185     0.872920         -0.506551   
1997                 75       -1.279533     0.038001         -0.603053   
...                 ...             ...          ...               ...   
5011               2487        1.006106     0.578243          0.072465   
6915                302        0.729820     1.069372         -0.506551   
1770                475       -1.446979    -0.600467          1.713010   
2242                239       -0.258112     1.560501         -0.892561   
1591                227       -2.049785    -0.698693         -0.120540   

      Visibility (10m)  Dew point temperature(C)  Solar Radiation (MJ/m2)  \
40

#### Linear Testing

In [169]:
linearModel = LinearRegression()

linearModel.fit(train_data.drop(columns=["Rented Bike Count","DateTime"]),train_data["Rented Bike Count"])

linear_train_pred = linearModel.predict(train_data.drop(columns=["Rented Bike Count","DateTime"]))
linear_test_pred = linearModel.predict(test_data.drop(columns=["Rented Bike Count","DateTime"]))

train_rmse_lin = root_mean_squared_error(train_data["Rented Bike Count"], linear_train_pred)
test_rmse_lin = root_mean_squared_error(test_data["Rented Bike Count"], linear_test_pred)

print(f"Training RMSE: {train_rmse_lin:.4f}")
print(f"Testing RMSE:  {test_rmse_lin:.4f}")

Training RMSE: 425.9319
Testing RMSE:  449.7987


Ridge Model

In [170]:
ridgeModel = Ridge()

ridgeModel.fit(train_data.drop(columns=["Rented Bike Count","DateTime"]),train_data["Rented Bike Count"])

ridge_train_pred = ridgeModel.predict(train_data.drop(columns=["Rented Bike Count","DateTime"]))
ridge_test_pred = ridgeModel.predict(test_data.drop(columns=["Rented Bike Count","DateTime"]))

train_rmse_ridg = root_mean_squared_error(train_data["Rented Bike Count"], ridge_train_pred)
test_rmse_ridg = root_mean_squared_error(test_data["Rented Bike Count"], ridge_test_pred)

print(f"Training RMSE: {train_rmse_ridg:.4f}")
print(f"Testing RMSE:  {test_rmse_ridg:.4f}")

Training RMSE: 425.9336
Testing RMSE:  449.7915


Lasso Model

In [171]:
lassoModel = Lasso()

lassoModel.fit(train_data.drop(columns=["Rented Bike Count","DateTime"]),train_data["Rented Bike Count"])

lasso_train_pred = lassoModel.predict(train_data.drop(columns=["Rented Bike Count","DateTime"]))
lasso_test_pred = lassoModel.predict(test_data.drop(columns=["Rented Bike Count","DateTime"]))

train_rmse_lasso = root_mean_squared_error(train_data["Rented Bike Count"], lasso_train_pred)
test_rmse_lasso = root_mean_squared_error(test_data["Rented Bike Count"], lasso_test_pred)

print(f"Training RMSE: {train_rmse_lasso:.4f}")
print(f"Testing RMSE:  {test_rmse_lasso:.4f}")

Training RMSE: 426.7903
Testing RMSE:  450.0218


Elastic Net Regression

In [172]:
elasticModel = ElasticNet()

elasticModel.fit(train_data.drop(columns=["Rented Bike Count","DateTime"]),train_data["Rented Bike Count"])

elastic_train_pred = elasticModel.predict(train_data.drop(columns=["Rented Bike Count","DateTime"]))
elastic_test_pred = elasticModel.predict(test_data.drop(columns=["Rented Bike Count","DateTime"]))

train_rmse_elast = root_mean_squared_error(train_data["Rented Bike Count"], elastic_train_pred)
test_rmse_elast = root_mean_squared_error(test_data["Rented Bike Count"], elastic_test_pred)

print(f"Training RMSE: {train_rmse_elast:.4f}")
print(f"Testing RMSE:  {test_rmse_elast:.4f}")

Training RMSE: 474.1210
Testing RMSE:  496.8896
