import numpy as np
import pandas as pd
import statsmodels
import statsmodels.api as sm
import statsmodels.stats.multicomp
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [3]:
df = pd.read_csv('day.csv')

# Create dummies for season

In [4]:
season_dummy = pd.get_dummies(df['season'],drop_first = True)

In [5]:
season_dummy.rename(columns={2:'summer',3:'fall',4:'winter'},inplace = True)

In [6]:
season_dummy.columns

Index(['summer', 'fall', 'winter'], dtype='object')

# Create dummy for year

In [7]:
year_dummy = pd.get_dummies(df['yr'],drop_first = True)

In [8]:
year_dummy.rename(columns={1:'2019'},inplace = True)

# Create dummy for months

In [9]:
month_dummy = pd.get_dummies(df['mnth'],drop_first = True)

In [10]:
month_dummy.rename(columns={2:'feb',3:'march',4:'april',5:'may',6:'june',7:'july',8:'aug',
                            9:'sept',10:'oct',11:'nov',12:'dec'}
                   ,inplace = True)

In [11]:
month_dummy.columns

Index(['feb', 'march', 'april', 'may', 'june', 'july', 'aug', 'sept', 'oct',
       'nov', 'dec'],
      dtype='object')

# Create dummy for weatherSit

In [12]:
weat_dummy = pd.get_dummies(df['weathersit'],drop_first = True)

In [13]:
weat_dummy.rename(columns={2:'mist',3:'snow'},inplace = True)

In [14]:
weat_dummy.head()

Unnamed: 0,mist,snow
0,1,0
1,1,0
2,0,0
3,0,0
4,0,0


# Concat the dummy with original data

In [15]:
df.columns

Index(['instant', 'dteday', 'season', 'yr', 'mnth', 'holiday', 'weekday',
       'workingday', 'weathersit', 'temp', 'atemp', 'hum', 'windspeed',
       'casual', 'registered', 'cnt'],
      dtype='object')

In [16]:
df = df[['temp','atemp','hum','windspeed','cnt']]

In [17]:
df = pd.concat([df,season_dummy,year_dummy,month_dummy,weat_dummy],axis = 1)

In [18]:
df.columns

Index(['temp', 'atemp', 'hum', 'windspeed', 'cnt', 'summer', 'fall', 'winter',
       '2019', 'feb', 'march', 'april', 'may', 'june', 'july', 'aug', 'sept',
       'oct', 'nov', 'dec', 'mist', 'snow'],
      dtype='object')

# Split into training and testing set

In [19]:
x_train , x_test , y_train , y_test = train_test_split(df.drop('cnt',axis = 1)
                                                       ,df['cnt'],test_size=0.25)

# Add the constant

In [20]:
x_train = sm.add_constant(x_train,prepend = False)

In [21]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [22]:
x_train.columns

Index(['temp', 'atemp', 'hum', 'windspeed', 'summer', 'fall', 'winter', '2019',
       'feb', 'march', 'april', 'may', 'june', 'july', 'aug', 'sept', 'oct',
       'nov', 'dec', 'mist', 'snow', 'const'],
      dtype='object')

# Calculate the VIF 

In [23]:
variance_inflation_factor(x_train.values,0)  # temp

67.6039088757249

In [24]:
variance_inflation_factor(x_train.values,1)   #atemp

57.88912541919824

In [25]:
variance_inflation_factor(x_train.values,2)     # hum

2.0220813059566756

In [26]:
variance_inflation_factor(x_train.values,8)    # windspeed

1.9145279435689975

# Drop the column temp

In [27]:
x_train = x_train.drop('temp',axis = 1)

In [28]:
x_train = sm.add_constant(x_train,prepend = False)

In [29]:
x_train.columns

Index(['atemp', 'hum', 'windspeed', 'summer', 'fall', 'winter', '2019', 'feb',
       'march', 'april', 'may', 'june', 'july', 'aug', 'sept', 'oct', 'nov',
       'dec', 'mist', 'snow', 'const'],
      dtype='object')

In [30]:
variance_inflation_factor(x_train.values,0)  # atemp

6.024085108055772

In [31]:
variance_inflation_factor(x_train.values,2)  # hum

1.2214003766674681

In [32]:
variance_inflation_factor(x_train.values,3)    # windspeed

7.534243936158367

# Create a model

In [33]:
mod1 = sm.OLS(y_train,x_train).fit()

In [34]:
print(mod1.summary())

                            OLS Regression Results                            
Dep. Variable:                    cnt   R-squared:                       0.849
Model:                            OLS   Adj. R-squared:                  0.843
Method:                 Least Squares   F-statistic:                     147.3
Date:                Fri, 23 Jun 2023   Prob (F-statistic):          1.23e-200
Time:                        14:55:05   Log-Likelihood:                -4397.6
No. Observations:                 547   AIC:                             8837.
Df Residuals:                     526   BIC:                             8928.
Df Model:                          20                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
atemp         78.4520      9.778      8.023      0.0

In [38]:
for i in range(len(x_train.columns)):
    print(x_train.columns[i]," ",variance_inflation_factor(x_train.values,i))

atemp   6.024085108055772
hum   2.018393132889318
windspeed   1.2214003766674681
summer   7.534243936158367
fall   11.426339289924138
winter   7.929655353309492
2019   1.0384345731999856
feb   1.913701460075495
march   2.646869175796154
april   5.2908546276239345
may   6.824070442554543
june   6.57207897543355
july   9.547015015662286
aug   7.886164685603212
sept   6.811911427238677
oct   5.519805562946378
nov   4.923609781748123
dec   3.6022811426838475
mist   1.5653143017136961
snow   1.289277975231605
const   63.641290022410566
