In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
df = pd.read_csv('./bike_rentals.csv')
df.head()

Unnamed: 0,instant,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
0,1,1.0,0.0,1.0,0.0,6.0,0.0,2,0.344167,0.363625,0.805833,0.160446,985
1,2,1.0,0.0,1.0,0.0,0.0,0.0,2,0.363478,0.353739,0.696087,0.248539,801
2,3,1.0,0.0,1.0,0.0,1.0,1.0,1,0.196364,0.189405,0.437273,0.248309,1349
3,4,1.0,0.0,1.0,0.0,2.0,1.0,1,0.2,0.212122,0.590435,0.160296,1562
4,5,1.0,0.0,1.0,0.0,3.0,1.0,1,0.226957,0.22927,0.436957,0.1869,1600


In [5]:
df.drop(columns='instant',axis=1,inplace=True)

In [6]:
df.nunique()

season          4
yr              2
mnth           12
holiday         2
weekday         7
workingday      2
weathersit      3
temp          499
atemp         690
hum           595
windspeed     646
cnt           696
dtype: int64

In [7]:
df_one_hot = pd.get_dummies(df,columns=['season','yr','mnth','holiday','weekday','workingday','weathersit'])

In [8]:
X = df_one_hot.drop('cnt',axis=1)
y = df_one_hot['cnt']

In [9]:
from sklearn.model_selection import train_test_split



# random state = 0 with Linear Regression

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=0)

In [11]:
from sklearn.linear_model import LinearRegression

In [12]:
model = LinearRegression()

In [13]:
model.fit(X_train,y_train)

In [14]:
y_ = model.predict(X_test)

In [15]:
from sklearn.metrics import mean_squared_error

In [16]:
MSE = mean_squared_error(y_test,y_)

In [17]:
RMSE = MSE**(1/2)

In [18]:
print(f'RMSE = {RMSE}')

RMSE = 773.1778226608322


# random state = 42 Linear regression

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=42)

In [20]:
model = LinearRegression()
model.fit(X_train,y_train)
y_ = model.predict(X_test)
MSE = mean_squared_error(y_test,y_)
RMSE = MSE**(1/2)
print(f'RMSE = {RMSE}')

RMSE = 775.7867627294396


# random state = 35 in linear regression

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=35)

In [22]:
model = LinearRegression()
model.fit(X_train,y_train)
y_ = model.predict(X_test)
MSE = mean_squared_error(y_test,y_)
RMSE = MSE**(1/2)
print(f'RMSE = {RMSE}')

RMSE = 720238162597017.9


# random state = 9 in linear regression

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=9)

In [24]:
model = LinearRegression()
model.fit(X_train,y_train)
y_ = model.predict(X_test)
MSE = mean_squared_error(y_test,y_)
RMSE = MSE**(1/2)
print(f'RMSE = {RMSE}')

RMSE = 307007189651331.94


In [25]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=0)

In [26]:
model = LinearRegression()
model.fit(X_train,y_train)
y_ = model.predict(X_test)
MSE = mean_squared_error(y_test,y_)
RMSE = MSE**(1/2)
print(f'RMSE = {RMSE}')

RMSE = 773.1778226608322


# In Decision Tree

In [27]:
from sklearn.ensemble import RandomForestRegressor 

In [28]:
dtr = RandomForestRegressor()

# random state 42

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=42)

In [30]:
dtr.fit(X_train,y_train)

In [31]:
y_ = dtr.predict(X_test)
MSE = mean_squared_error(y_test,y_)
RMSE = MSE**(1/2)
print(f'RMSE = {RMSE}')

RMSE = 651.7755171118071


# random state 0

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=0)

In [33]:
dtr.fit(X_train,y_train)
y_ = dtr.predict(X_test)
MSE = mean_squared_error(y_test,y_)
RMSE = MSE**(1/2)
print(f'RMSE = {RMSE}')

RMSE = 679.910211394256


# random state 9

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=9)

In [35]:
dtr.fit(X_train,y_train)
y_ = dtr.predict(X_test)
MSE = mean_squared_error(y_test,y_)
RMSE = MSE**(1/2)
print(f'RMSE = {RMSE}')

RMSE = 656.9337434945918


# random state 35

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=35)

In [37]:
dtr.fit(X_train,y_train)
y_ = dtr.predict(X_test)
MSE = mean_squared_error(y_test,y_)
RMSE = MSE**(1/2)
print(f'RMSE = {RMSE}')

RMSE = 681.0946415580518


# hyper parameter tuning

In [38]:
from sklearn.model_selection import GridSearchCV

In [39]:
param_grid = {
    'max_depth': [None,5,10,15],
    'min_samples_split':[2,5,10],
    'max_features':['auto','squrt','log2',None]
}

In [40]:
rfr = RandomForestRegressor()

In [46]:
grid_search = GridSearchCV(rfr,param_grid,cv=5,scoring='neg_root_mean_squared_error')
grid_search.fit(X_train,y_train)

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
60 fits failed out of a total of 240.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "/home/applify/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 6

# Print best hyperparameters

In [48]:
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

Best Hyperparameters: {'max_depth': None, 'max_features': None, 'min_samples_split': 5}


In [49]:
best_decision_tree = grid_search.best_estimator_
y_best = best_decision_tree.predict(X_test)

In [50]:
MSE = mean_squared_error(y_test,y_best)
RMSE = MSE**(1/2)
print(f'RMSE = {RMSE}')

RMSE = 684.9194148434478


With a range of 22 to 8714, a mean of 4504.35 and a standard deviation of 1937.21, an RMSE of 684.91 is very good. But it is not the best!