In [12]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import sklearn
import statsmodels.api as sm
import warnings


# import RFE and Linear Regression for model building
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

# for splitting the data between train and test data set
from sklearn.model_selection import train_test_split

# for rescaling the features
from sklearn.preprocessing import MinMaxScaler

# check for VIF values of the feature variables
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Silence all warnings from this point on so cell outputs stay uncluttered.
# NOTE(review): this blanket filter also hides potentially useful warnings
# (e.g. deprecations); consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

### Step 1. Reading, Understanding and Visualizing the data

#### Step 1.a - Reading the data

In [13]:
# Read the raw data from 'day.csv' (path is relative to the working directory)
# into the DataFrame `day_df`, which the cells below inspect and transform.
day_df = pd.read_csv('day.csv')

#### Step 1.b - Understanding the data

In [14]:
# Preview the first rows to see the available columns and sample values
day_df.head()

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,01-01-2018,1,0,1,0,6,0,2,14.110847,18.18125,80.5833,10.749882,331,654,985
1,2,02-01-2018,1,0,1,0,0,0,2,14.902598,17.68695,69.6087,16.652113,131,670,801
2,3,03-01-2018,1,0,1,0,1,1,1,8.050924,9.47025,43.7273,16.636703,120,1229,1349
3,4,04-01-2018,1,0,1,0,2,1,1,8.2,10.6061,59.0435,10.739832,108,1454,1562
4,5,05-01-2018,1,0,1,0,3,1,1,9.305237,11.4635,43.6957,12.5223,82,1518,1600


In [17]:
# Dimensions of the raw data as (number of records, number of features)
day_df.shape

(730, 16)

In [18]:
# Show each column's dtype and non-null count to check for missing values
day_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 730 entries, 0 to 729
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   instant     730 non-null    int64  
 1   dteday      730 non-null    object 
 2   season      730 non-null    int64  
 3   yr          730 non-null    int64  
 4   mnth        730 non-null    int64  
 5   holiday     730 non-null    int64  
 6   weekday     730 non-null    int64  
 7   workingday  730 non-null    int64  
 8   weathersit  730 non-null    int64  
 9   temp        730 non-null    float64
 10  atemp       730 non-null    float64
 11  hum         730 non-null    float64
 12  windspeed   730 non-null    float64
 13  casual      730 non-null    int64  
 14  registered  730 non-null    int64  
 15  cnt         730 non-null    int64  
dtypes: float64(4), int64(11), object(1)
memory usage: 91.4+ KB


##### Some observations
   - There are no null values
   - dteday is of object type

In [19]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
day_df.describe()

Unnamed: 0,instant,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
count,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0
mean,365.5,2.49863,0.5,6.526027,0.028767,2.99726,0.683562,1.394521,20.319259,23.726322,62.765175,12.76362,849.249315,3658.757534,4508.006849
std,210.877136,1.110184,0.500343,3.450215,0.167266,2.006161,0.465405,0.544807,7.506729,8.150308,14.237589,5.195841,686.479875,1559.758728,1936.011647
min,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,2.424346,3.95348,0.0,1.500244,2.0,20.0,22.0
25%,183.25,2.0,0.0,4.0,0.0,1.0,0.0,1.0,13.811885,16.889713,52.0,9.04165,316.25,2502.25,3169.75
50%,365.5,3.0,0.5,7.0,0.0,3.0,1.0,1.0,20.465826,24.368225,62.625,12.125325,717.0,3664.5,4548.5
75%,547.75,3.0,1.0,10.0,0.0,5.0,1.0,2.0,26.880615,30.445775,72.989575,15.625589,1096.5,4783.25,5966.0
max,730.0,4.0,1.0,12.0,1.0,6.0,1.0,3.0,35.328347,42.0448,97.25,34.000021,3410.0,6946.0,8714.0


##### Some observations
 - Not all features are on the same scale. We will use MinMaxScaler to rescale them.

### EDA - Let's drop features which will not help in prediction, for the reasons below
 - instant - It's just an index column
 - dteday - We could use this to derive the year or month, but we already have them as separate columns (yr, mnth)
 - casual and registered - they would cause data leakage, as they will not be available at the time of prediction (in the rows shown above, casual + registered equals cnt)
    -  Also, the error in predicting casual and registered separately will be larger than the error in predicting the count (cnt) directly.

In [20]:
# Columns excluded from modelling (reasons are listed in the notes above)
features_to_be_dropped = ['instant', 'dteday', 'casual', 'registered']

# Rebind instead of dropping in place, and ignore columns that are already gone,
# so this cell can be re-run without raising a KeyError.
day_df = day_df.drop(columns=features_to_be_dropped, errors='ignore')

In [21]:
# Verify that the dropped columns no longer appear
day_df.head()

Unnamed: 0,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
0,1,0,1,0,6,0,2,14.110847,18.18125,80.5833,10.749882,985
1,1,0,1,0,0,0,2,14.902598,17.68695,69.6087,16.652113,801
2,1,0,1,0,1,1,1,8.050924,9.47025,43.7273,16.636703,1349
3,1,0,1,0,2,1,1,8.2,10.6061,59.0435,10.739832,1562
4,1,0,1,0,3,1,1,9.305237,11.4635,43.6957,12.5223,1600


In [22]:
# Column count should now be 12 (16 original columns minus the 4 dropped)
day_df.shape

(730, 12)

In [23]:
# Some features are actually categorical but are stored as integer codes, so
# they look continuous. Categorical features are better visualized with box
# plots, so give them readable labels: season, mnth, weekday, weathersit.
category_labels = {
    'season': {1: 'spring', 2: 'summer', 3: 'fall', 4: 'winter'},
    'mnth': {1: 'jan', 2: 'feb', 3: 'mar', 4: 'apr', 5: 'may', 6: 'june',
             7: 'july', 8: 'aug', 9: 'sep', 10: 'oct', 11: 'nov', 12: 'dec'},
    'weekday': {0: 'sun', 1: 'mon', 2: 'tue', 3: 'wed',
                4: 'thu', 5: 'fri', 6: 'sat'},
    'weathersit': {1: 'Clear', 2: 'Misty',
                   3: 'Light_Snow',
                   4: 'Heavy_Rain'},
}

for column, labels in category_labels.items():
    # Only map columns that still hold the integer codes. Re-running this cell
    # on already-labelled (string) data would otherwise send every value
    # through the integer-keyed dict and turn the whole column into NaN.
    if pd.api.types.is_numeric_dtype(day_df[column]):
        day_df[column] = day_df[column].map(labels)

In [24]:
# Check that the categorical columns now show labels instead of integer codes
day_df.head()

Unnamed: 0,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
0,spring,0,jan,0,sat,0,Misty,14.110847,18.18125,80.5833,10.749882,985
1,spring,0,jan,0,sun,0,Misty,14.902598,17.68695,69.6087,16.652113,801
2,spring,0,jan,0,mon,1,Clear,8.050924,9.47025,43.7273,16.636703,1349
3,spring,0,jan,0,tue,1,Clear,8.2,10.6061,59.0435,10.739832,1562
4,spring,0,jan,0,wed,1,Clear,9.305237,11.4635,43.6957,12.5223,1600
