In [None]:
#IMPORT LIBRARIES
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
import matplotlib.pyplot as plt
import seaborn as sns
#SUPRESS WARNINGS
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the daily bike-sharing records; the path is relative to the notebook's working directory.
bike = pd.read_csv("day.csv")
# Preview the first rows to confirm the file parsed as expected.
bike.head()

In [None]:
# Column dtypes and non-null counts of the raw frame.
bike.info()

In [None]:
# (rows, columns) of the raw frame.
bike.shape

In [None]:
# Summary statistics of the raw frame.
bike.describe()

In [None]:
# Count missing values per column (the author's note: no nulls were found in the data).
bike.isnull().sum()

In [None]:
# Give the abbreviated columns readable names.
readable_names = {'yr': 'year', 'mnth': 'month', 'hum': 'humidity'}
bike = bike.rename(columns=readable_names)

In [None]:
# Confirm the renamed columns.
bike.head()

In [None]:
# Work on a copy so the duplicate check in the next cell cannot alter `bike`.
bike_copy = bike.copy()
bike_copy.shape

In [None]:
# Remove exact duplicate rows from the copy; an unchanged shape compared with
# the previous cell means the data contains no duplicates.
bike_copy = bike_copy.drop_duplicates()
bike_copy.shape

In [None]:
# Remove columns that are not used as predictors.
# NOTE(review): presumably 'instant' is a row id and 'registered' is a component of cnt — confirm.
bike = bike.drop(columns=['registered', 'instant'])

In [None]:
# 'casual' is not used as a predictor either.
bike = bike.drop(columns='casual')

In [None]:
# Drop the raw date column, then preview the remaining columns.
bike = bike.drop(columns='dteday')
bike.head()

In [None]:
# Replace the numeric season codes with readable labels.
# Gotcha: re-running this cell maps the already-converted labels to NaN.
season_labels = {1: 'spring', 2: 'summer', 3: 'fall', 4: 'winter'}
bike['season'] = bike['season'].map(season_labels)

In [None]:
# Replace month numbers with three-letter labels; these labels later become
# the dummy-column names (e.g. 'jul', 'sep').
# Gotcha: re-running this cell maps the already-converted labels to NaN.
month_labels = {
    1: 'jan', 2: 'feb', 3: 'mar', 4: 'apr', 5: 'may', 6: 'jun',
    7: 'jul', 8: 'aug', 9: 'sep', 10: 'oct', 11: 'nov', 12: 'dec',
}
bike['month'] = bike['month'].map(month_labels)

In [None]:
# Replace weekday codes (0-6) with three-letter labels, 0 being 'sun'.
# Gotcha: re-running this cell maps the already-converted labels to NaN.
weekday_labels = {0: 'sun', 1: 'mon', 2: 'tue', 3: 'wed', 4: 'thu', 5: 'fri', 6: 'sat'}
bike['weekday'] = bike['weekday'].map(weekday_labels)

In [None]:
# Replace weather-situation codes with descriptive labels.
# Gotcha: re-running this cell maps the already-converted labels to NaN.
weather_labels = {1: 'Clear', 2: 'Misty', 3: 'Light_snowrain', 4: 'Heavy_snowrain'}
bike['weathersit'] = bike['weathersit'].map(weather_labels)

In [None]:
# Check that the categorical codes were replaced by labels.
bike.head()

In [None]:
# Re-check dtypes and non-null counts after the label mapping
# (codes missing from a mapping would show up as nulls here).
bike.info()

In [None]:
# Analyse and visualise the categorical columns to see how each predictor variable relates to the target variable (cnt)

In [None]:
# Box plots of demand (cnt) against each categorical predictor on a 2x4 grid;
# seven panels are drawn, so the last grid slot stays empty.
categorical_cols = ['season', 'month', 'weekday', 'weathersit',
                    'holiday', 'year', 'workingday']
plt.figure(figsize=(20,12))
for panel, cat_col in enumerate(categorical_cols, start=1):
    plt.subplot(2,4,panel)
    sns.boxplot(x=cat_col,y='cnt',data=bike)
# Render the figure explicitly instead of leaving an Axes repr as cell output.
plt.show()

In [None]:
def plot_cat_columns(column):
    """Draw two side-by-side bar plots of ``cnt`` against a categorical column.

    The left panel shows ``cnt`` per category of ``column``; the right panel
    shows the same bars split by ``year`` (hue). Reads the notebook-level
    ``bike`` DataFrame.

    Parameters
    ----------
    column : str
        Name of the column in ``bike`` to put on the x-axis.
    """
    fig, (ax_overall, ax_by_year) = plt.subplots(1, 2, figsize=(12,6))
    sns.barplot(x=column,y='cnt',data=bike,ax=ax_overall)
    sns.barplot(x=column,y='cnt',data=bike,hue='year',palette='Set1',ax=ax_by_year)
    # Relabel the hue legend using the handles seaborn registered for the hue
    # levels. Passing only `labels` to plt.legend pairs them with whatever
    # artists come first on the axes (the error-bar lines), not the bars.
    # NOTE(review): assumes the hue levels sort as 2018 then 2019 — confirm.
    handles, _ = ax_by_year.get_legend_handles_labels()
    if handles:
        ax_by_year.legend(handles=handles[:2],labels=['2018','2019'])
    plt.show()
    

In [None]:
# Demand by month: overall (left) and split by year (right).
plot_cat_columns('month')

In [None]:
# Demand on holidays vs. non-holidays: overall (left) and split by year (right).
plot_cat_columns('holiday')

In [None]:
# Demand by season: overall (left) and split by year (right).
plot_cat_columns('season')

In [None]:
# Demand on working vs. non-working days: overall (left) and split by year (right).
plot_cat_columns('workingday')

In [None]:
# Demand by year; here the x-axis column and the hue column are the same.
plot_cat_columns('year')

In [None]:
# Demand by day of week: overall (left) and split by year (right).
plot_cat_columns('weekday')

In [None]:
### Step 2: Visualising the data
# Pairwise scatter plots of the continuous columns and the target (cnt).
sns.pairplot(bike,vars=['temp','atemp','humidity','windspeed','cnt'])
plt.show()

In [None]:
# Correlation heatmap of the continuous columns, lower triangle only.
numeric_cols = ['temp', 'atemp', 'humidity', 'windspeed', 'cnt']
numeric_corr = bike[numeric_cols].corr()
# np.triu keeps the upper triangle (diagonal included); its non-zero entries
# act as the mask, so those cells are hidden.
matrix = np.triu(numeric_corr)
plt.figure(figsize=(6, 6))
sns.heatmap(numeric_corr, annot=True, cmap="RdYlGn", mask=matrix)
plt.title("Relation between Numerical varables")
plt.show()


In [None]:
### Step 3: Data preparation
# Dummy-variable creation. drop_first=True omits the first level of each
# categorical column, which then serves as the baseline category.
# dtype=int forces 0/1 integer columns: recent pandas versions return boolean
# dummies, which a numeric-only dtype filter (as used in calculateVIF) discards
# and which turn a mixed bool/float design matrix into object dtype for statsmodels.
months_df=pd.get_dummies(bike.month,drop_first=True,dtype=int)
weekdays_df=pd.get_dummies(bike.weekday,drop_first=True,dtype=int)
weathersit_df=pd.get_dummies(bike.weathersit,drop_first=True,dtype=int)
seasons_df=pd.get_dummies(bike.season,drop_first=True,dtype=int)

In [None]:
# The source frame is unchanged; the dummies live in separate frames for now.
bike.head()

In [None]:
# Append the dummy columns to the original frame column-wise.
dummy_frames = [months_df, weekdays_df, weathersit_df, seasons_df]
bike_new = pd.concat([bike, *dummy_frames], axis=1)
bike_new.head()

In [None]:
# Inspect the combined frame (original columns plus dummies).
bike_new.info()

In [None]:
# The label columns are now represented by their dummies, so drop them.
encoded_cols = ['season', 'month', 'weekday', 'weathersit']
bike_new = bike_new.drop(columns=encoded_cols)


In [None]:
# Confirm that only model-ready columns remain.
bike_new.info()

In [None]:
# (rows, columns) of the modelling frame.
bike_new.shape

In [None]:
## Step 4: Splitting the data into training and testing sets
# Seed NumPy's global random generator.
# NOTE(review): the split in the next cell passes its own random_state, so this
# seed looks redundant for the split — confirm nothing else relies on it.
np.random.seed(0)

In [None]:
# 70/30 train/test split; random_state fixes the shuffle so the split is reproducible.
df_train, df_test = train_test_split(bike_new, train_size=0.7, random_state=100)
for split in (df_train, df_test):
    print(split.shape)

In [None]:
# One scaler instance for the whole notebook: fitted on the training split,
# then reused to transform the test split.
scaler = MinMaxScaler()

In [None]:
# Training rows before scaling.
df_train.head()

In [None]:
# Min-max scale the continuous columns, fitting the scaler on the training split only.
# The target (cnt) is scaled too, so the models below work on the scaled target;
# the 0/1 columns are left untouched.
num_vars = [ 'temp', 'atemp', 'humidity', 'windspeed','cnt']
df_train[num_vars]= scaler.fit_transform(df_train[num_vars])


In [None]:
# Training rows after scaling.
df_train.head()

In [None]:
# Summary statistics after scaling (the scaled columns' min/max show the new range).
df_train.describe()

In [None]:
# Correlation heatmap of every training column, lower triangle only.
train_corr = df_train.corr()
# Non-zero entries of the upper triangle (diagonal included) form the mask.
matrix = np.triu(train_corr)
plt.figure(figsize=(16, 10))
sns.heatmap(train_corr, annot=True, cmap="RdYlGn", mask=matrix)
plt.show()

In [None]:
# Scatter of scaled temperature against scaled demand on the training split.
plt.figure(figsize=[6, 6])
plt.scatter(df_train['temp'], df_train['cnt'])
plt.show()

In [None]:
# Separate the target from the predictors. pop() removes 'cnt' from df_train
# in place, and X_train is the same object as df_train (not a copy), so
# re-running this cell raises KeyError because 'cnt' is already gone.
y_train = df_train.pop('cnt')
X_train = df_train


In [None]:
# Recursive feature elimination (RFE).
# Keep the 15 strongest predictors, matching the "top 15" the following cells
# refer to. Without n_features_to_select, RFE keeps only half of the input columns.
lm=LinearRegression()
# RFE fits its own clones of the estimator, so this standalone fit is not
# needed by RFE; it is kept so `lm` stays a fitted model as before.
lm.fit(X_train,y_train)
rfe=RFE(lm,n_features_to_select=15)
rfe=rfe.fit(X_train,y_train)

In [None]:
# Pair every candidate column with its RFE verdict: support_ is True for the
# selected features, and ranking_ is 1 for selected features (larger ranks
# were eliminated earlier).

list(zip(X_train.columns,rfe.support_,rfe.ranking_))

In [None]:
# Names of the columns RFE selected (rfe.support_ is the boolean mask).

col = X_train.columns[rfe.support_]
print(col)

In [None]:
# Complement of `col`: the columns RFE eliminated.
col1=X_train.columns[~rfe.support_]

In [None]:
# calculating VIF for the variables
def calculateVIF(bike):
    """Return the variance inflation factor (VIF) of each usable column.

    Parameters
    ----------
    bike : pandas.DataFrame
        Feature matrix. Numeric and boolean columns are used (booleans are
        cast to 0.0/1.0); columns of any other dtype are ignored. Missing
        values are replaced with 0 before the calculation. The parameter
        shadows the notebook-level ``bike`` frame; the name is kept so
        existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        Columns ``Features`` and ``VIF`` (rounded to 2 decimals), sorted by
        VIF in descending order.
    """
    # Include boolean columns as well: dummy variables can arrive as bool, and
    # a numeric-only filter would silently leave them out of the VIF table.
    features = bike.select_dtypes(include=['number', 'bool'])
    features = features.fillna(0).astype(float)  # Impute missing values with 0 (adjust as needed)
    vif = pd.DataFrame()
    vif['Features'] = features.columns
    vif['VIF'] = [variance_inflation_factor(features.values, i)
                  for i in range(features.shape[1])]
    vif['VIF'] = vif['VIF'].round(2)
    return vif.sort_values(by="VIF", ascending=False)

In [None]:
# NOTE(review): col1 holds the columns RFE eliminated, yet the frame is named
# X_train_rfe and is rebuilt from `col` (the selected columns) before modelling —
# confirm this VIF check was really meant for the eliminated columns.
X_train_rfe = X_train[col1]

In [None]:
# Check the column dtypes before computing VIF.
X_train_rfe.info()

In [None]:
# VIF table for the frame built above, highest VIF first.
calculateVIF(X_train_rfe)

In [None]:
## Step 5: Building a linear model
# Start from the features RFE selected.
X_train_rfe = X_train[col]

In [None]:

# Model 1: OLS on all RFE-selected features. sm.OLS does not add an intercept
# by itself, hence the explicit constant column.
X_train_lm_1 = sm.add_constant(X_train_rfe)
lr_1 = sm.OLS(y_train,X_train_lm_1).fit()
print(lr_1.summary())

In [None]:
# Candidate set for model 2: the RFE features without 'humidity'.
# NOTE(review): the reason for removing it is not recorded here — confirm
# against the model-1 summary / VIF output.
X_train_new = X_train_rfe.drop(columns='humidity')

# Recompute VIF on the reduced feature set.
calculateVIF(X_train_new)

In [None]:
# Model 2: refit OLS (with intercept) on the reduced feature set.
X_train_lm_2 = sm.add_constant(X_train_new)
lr_2 = sm.OLS(y_train,X_train_lm_2).fit()
print(lr_2.summary())

In [None]:
# Candidate set for model 3: remove the 'nov' dummy.
# NOTE(review): the reason for removing it is not recorded here — confirm
# against the model-2 summary / VIF output.
X_train_new = X_train_new.drop(columns='nov')

# Recompute VIF on the reduced feature set.
calculateVIF(X_train_new)

In [None]:
# Model 3: refit OLS (with intercept) on the reduced feature set.
X_train_lm_3 = sm.add_constant(X_train_new)
lr_3 = sm.OLS(y_train,X_train_lm_3).fit()
print(lr_3.summary())

In [None]:
# The 'dec' dummy has a high p-value in the previous model, so remove it.
X_train_new = X_train_new.drop(columns='dec')

# Recompute VIF on the reduced feature set.
calculateVIF(X_train_new)

In [None]:

# Model 4: refit OLS (with intercept) on the reduced feature set.
X_train_lm_4 = sm.add_constant(X_train_new)
lr_4 = sm.OLS(y_train,X_train_lm_4).fit()
print(lr_4.summary())

In [None]:
# The 'jan' dummy has a high p-value in the previous model, so remove it.
X_train_new = X_train_new.drop(columns='jan')

# Recompute VIF on the reduced feature set.
calculateVIF(X_train_new)

In [None]:
# Building 5th linear regression model: OLS (with intercept) on the current feature set.

X_train_lm_5 = sm.add_constant(X_train_new)
lr_5 = sm.OLS(y_train,X_train_lm_5).fit()
print(lr_5.summary())

In [None]:
# We can drop the July variable as it has high p-value.
# The month mapping earlier in this notebook labels July as 'jul', so that is
# the dummy column's name; 'july' does not exist and dropping it raises KeyError.
X_train_new = X_train_new.drop(['jul'], axis = 1)

# Run the function to calculate VIF for the new model
calculateVIF(X_train_new)

In [None]:
# Building 6th linear regression model: this is the model used for the residual
# analysis and the test-set evaluation in the rest of the notebook.

X_train_lm_6 = sm.add_constant(X_train_new)
lr_6 = sm.OLS(y_train,X_train_lm_6).fit()
print(lr_6.summary())

In [None]:
# Checking the parameters (intercept and feature coefficients) of the final model
lr_6.params

In [None]:
# In-sample predictions of the final model, used for the residual analysis.
# (The stray bare `X_train_lm_6` expression was removed: it was not the cell's
# last line, so it displayed nothing and had no effect.)
y_train_pred = lr_6.predict(X_train_lm_6)

In [None]:
# Plot the histogram of the error terms (training residuals) with a KDE overlay.
# sns.distplot is deprecated; histplot with kde=True and stat="density" draws
# the same density-scaled histogram plus KDE curve.

fig = plt.figure()
sns.histplot((y_train - y_train_pred), bins = 20, kde = True, stat = "density")
fig.suptitle('Error Terms', fontsize = 20)
plt.xlabel('Errors', fontsize = 18)
plt.show()

In [None]:
# VIF of the final feature set, highest first.
calculateVIF(X_train_new)

In [None]:
# Correlation heatmap of the final feature set (full matrix, no mask).
final_corr = X_train_new.corr()
plt.figure(figsize=(15, 8))
sns.heatmap(final_corr, annot=True, cmap="RdYlGn")
plt.show()

In [None]:
# Linearity check with CCPR (component and component-plus-residual) plots,
# one figure per inspected feature of the final model.
for ccpr_feature in ('temp', 'sep', 'windspeed'):
    sm.graphics.plot_ccpr(lr_6, ccpr_feature)
    plt.show()

In [None]:
# Residuals against the actual (scaled) demand on the training split.
y_train_pred = lr_6.predict(X_train_lm_6)
residual = y_train - y_train_pred
# x and y must be passed by keyword: current seaborn rejects positional
# x/y vectors in scatterplot with a TypeError.
sns.scatterplot(x=y_train, y=residual)
# Red reference line at residual = 0.
plt.plot(y_train,(y_train - y_train), '-r')
plt.xlabel('Count')
plt.ylabel('Residual')
plt.show()

In [None]:
# Applying scaling on the test dataset: transform only, reusing the scaler
# fitted on the training split (no refit on test data).

num_vars = ['temp', 'atemp', 'humidity', 'windspeed','cnt']
df_test[num_vars] = scaler.transform(df_test[num_vars])
df_test.head()

In [None]:
# Summary statistics of the scaled test split.
df_test.describe()

In [None]:
# Separate the test target from the predictors. pop() removes 'cnt' from
# df_test in place, and X_test initially refers to that same object.
y_test = df_test.pop('cnt')
X_test = df_test

In [None]:
# Restrict the test predictors to the final model's features, in the same order.
# Note: this rebinds col1, which earlier held the RFE-eliminated columns.
col1 = X_train_new.columns

X_test = X_test[col1]

# Adding constant variable to test dataframe
X_test_lm_6 = sm.add_constant(X_test)

In [None]:
# Predict the (scaled) demand for the test split with the final model.
y_pred = lr_6.predict(X_test_lm_6)

In [None]:
# R^2 on the test split, displayed rounded to 4 decimals.
r2 = r2_score(y_test, y_pred)
round(r2,4)

In [None]:
# Final model coefficients, rounded to 4 decimals.
round(lr_6.params,4)

In [None]:
# Adjusted R^2 on the test split: 1 - (1 - R^2) * (n - 1) / (n - p - 1),
# with n test rows and p predictors (the constant column is not counted).

n_obs, n_features = X_test.shape
adjusted_r2 = round(1-(1-r2)*(n_obs-1)/(n_obs-n_features-1),4)
print(adjusted_r2)

In [None]:
# Visualizing the fit on the test data
# plotting a Regression plot of predicted against actual (scaled) demand;
# ci=68 sets the size of the confidence band drawn around the fitted line.

plt.figure()
sns.regplot(x=y_test, y=y_pred, ci=68, fit_reg=True,scatter_kws={"color": "blue"}, line_kws={"color": "red"})
plt.title('y_test vs y_pred', fontsize=20)
plt.xlabel('y_test', fontsize=18)
plt.ylabel('y_pred', fontsize=16)
plt.show()

# Comparison between training and testing datasets:
    - Train dataset R^2          : 0.833
    - Test dataset R^2           : 0.8038
    - Train dataset Adjusted R^2 : 0.829    
    - Test dataset Adjusted R^2  : 0.7944

#### Demand for bikes depends on year, holiday, temp, windspeed, sep, Light_snowrain, Misty, spring, summer and winter.
