# Multiple Linear Regression

<img src='q.png' />


<img src='w.png' />



<img src='e.png' />

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split



In [2]:
# Load the IPL auction dataset from the CSV sitting next to the notebook.
df = pd.read_csv(filepath_or_buffer='IPL IMB381IPL2013.csv')

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,Sl.NO.,PLAYER NAME,AGE,COUNTRY,TEAM,PLAYING ROLE,T-RUNS,T-WKTS,ODI-RUNS-S,ODI-SR-B,...,SR-B,SIXERS,RUNS-C,WKTS,AVE-BL,ECON,SR-BL,AUCTION YEAR,BASE PRICE,SOLD PRICE
0,1,"Abdulla, YA",2,SA,KXIP,Allrounder,0,0,0,0.0,...,0.0,0,307,15,20.47,8.9,13.93,2009,50000,50000
1,2,Abdur Razzak,2,BAN,RCB,Bowler,214,18,657,71.41,...,0.0,0,29,0,0.0,14.5,0.0,2008,50000,50000
2,3,"Agarkar, AB",2,IND,KKR,Bowler,571,58,1269,80.62,...,121.01,5,1059,29,36.52,8.81,24.9,2008,200000,350000
3,4,"Ashwin, R",1,IND,CSK,Bowler,284,31,241,84.56,...,76.32,0,1125,49,22.96,6.23,22.14,2011,100000,850000
4,5,"Badrinath, S",2,IND,CSK,Batsman,63,0,79,45.93,...,120.71,28,0,0,0.0,0.0,0.0,2011,100000,800000


In [3]:
# Summarise the frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130 entries, 0 to 129
Data columns (total 26 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Sl.NO.         130 non-null    int64  
 1   PLAYER NAME    130 non-null    object 
 2   AGE            130 non-null    int64  
 3   COUNTRY        130 non-null    object 
 4   TEAM           130 non-null    object 
 5   PLAYING ROLE   130 non-null    object 
 6   T-RUNS         130 non-null    int64  
 7   T-WKTS         130 non-null    int64  
 8   ODI-RUNS-S     130 non-null    int64  
 9   ODI-SR-B       130 non-null    float64
 10  ODI-WKTS       130 non-null    int64  
 11  ODI-SR-BL      130 non-null    float64
 12  CAPTAINCY EXP  130 non-null    int64  
 13  RUNS-S         130 non-null    int64  
 14  HS             130 non-null    int64  
 15  AVE            130 non-null    float64
 16  SR-B           130 non-null    float64
 17  SIXERS         130 non-null    int64  
 18  RUNS-C    

In [4]:
# Start from every column label in the frame; xfeatures is reassigned to a
# hand-picked predictor subset further down.
xfeatures = df.columns

xfeatures

Index(['Sl.NO.', 'PLAYER NAME', 'AGE', 'COUNTRY', 'TEAM', 'PLAYING ROLE',
       'T-RUNS', 'T-WKTS', 'ODI-RUNS-S', 'ODI-SR-B', 'ODI-WKTS', 'ODI-SR-BL',
       'CAPTAINCY EXP', 'RUNS-S', 'HS', 'AVE', 'SR-B', 'SIXERS', 'RUNS-C',
       'WKTS', 'AVE-BL', 'ECON', 'SR-BL', 'AUCTION YEAR', 'BASE PRICE',
       'SOLD PRICE'],
      dtype='object')

In [5]:
# Candidate predictors. Relative to the full column list this leaves out
# 'Sl.NO.', 'PLAYER NAME', 'TEAM', 'AUCTION YEAR', 'BASE PRICE' and
# 'SOLD PRICE' (the last one is used as the regression target).
xfeatures = [
    # columns that are one-hot encoded later (plus 'CAPTAINCY EXP' below)
    'AGE', 'COUNTRY', 'PLAYING ROLE',
    # T-* columns
    'T-RUNS', 'T-WKTS',
    # ODI-* columns
    'ODI-RUNS-S', 'ODI-SR-B', 'ODI-WKTS', 'ODI-SR-BL',
    'CAPTAINCY EXP',
    # remaining numeric columns
    'RUNS-S', 'HS', 'AVE', 'SR-B', 'SIXERS',
    'RUNS-C', 'WKTS', 'AVE-BL', 'ECON', 'SR-BL',
]

## Encoding categorical features

In [6]:
# List the distinct playing roles before one-hot encoding them.
df['PLAYING ROLE'].unique()

array(['Allrounder', 'Bowler', 'Batsman', 'W. Keeper'], dtype=object)

In [7]:
# Preview the one-hot encoding of the playing role: one indicator column per
# role, first five rows only.
role_dummies = pd.get_dummies(df['PLAYING ROLE'])
role_dummies.head()

Unnamed: 0,Allrounder,Batsman,Bowler,W. Keeper
0,1,0,0,0
1,0,0,1,0
2,0,0,1,0
3,0,0,1,0
4,0,1,0,0


In [8]:
# Columns to one-hot encode. AGE and CAPTAINCY EXP are stored as integers but
# are treated here as category codes, so they are dummy-encoded together with
# the string-valued columns.
categorical_features = ['AGE', 'COUNTRY', 'PLAYING ROLE', 'CAPTAINCY EXP']

# drop_first=True drops one level per feature so the dummies are not perfectly
# collinear with the intercept added later.
# dtype=int forces 0/1 integer indicators: pandas >= 2.0 defaults to bool, and
# a frame mixing bool and numeric columns converts to an object array, which
# statsmodels' OLS refuses to fit.
encoded_df = pd.get_dummies(df[xfeatures],
                            columns=categorical_features,
                            drop_first=True,
                            dtype=int)

encoded_df.columns

Index(['T-RUNS', 'T-WKTS', 'ODI-RUNS-S', 'ODI-SR-B', 'ODI-WKTS', 'ODI-SR-BL',
       'RUNS-S', 'HS', 'AVE', 'SR-B', 'SIXERS', 'RUNS-C', 'WKTS', 'AVE-BL',
       'ECON', 'SR-BL', 'AGE_2', 'AGE_3', 'COUNTRY_BAN', 'COUNTRY_ENG',
       'COUNTRY_IND', 'COUNTRY_NZ', 'COUNTRY_PAK', 'COUNTRY_SA', 'COUNTRY_SL',
       'COUNTRY_WI', 'COUNTRY_ZIM', 'PLAYING ROLE_Batsman',
       'PLAYING ROLE_Bowler', 'PLAYING ROLE_W. Keeper', 'CAPTAINCY EXP_1'],
      dtype='object')

In [9]:
# From here on the feature list is the set of encoded column labels.
xfeatures = encoded_df.columns

# Show the first five encoded rows.
encoded_df.iloc[:5]

Unnamed: 0,T-RUNS,T-WKTS,ODI-RUNS-S,ODI-SR-B,ODI-WKTS,ODI-SR-BL,RUNS-S,HS,AVE,SR-B,...,COUNTRY_NZ,COUNTRY_PAK,COUNTRY_SA,COUNTRY_SL,COUNTRY_WI,COUNTRY_ZIM,PLAYING ROLE_Batsman,PLAYING ROLE_Bowler,PLAYING ROLE_W. Keeper,CAPTAINCY EXP_1
0,0,0,0,0.0,0,0.0,0,0,0.0,0.0,...,0,0,1,0,0,0,0,0,0,0
1,214,18,657,71.41,185,37.6,0,0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
2,571,58,1269,80.62,288,32.9,167,39,18.56,121.01,...,0,0,0,0,0,0,0,1,0,0
3,284,31,241,84.56,51,36.8,58,11,5.8,76.32,...,0,0,0,0,0,0,0,1,0,0
4,63,0,79,45.93,0,0.0,1317,71,32.93,120.71,...,0,0,0,0,0,0,1,0,0,0


In [10]:
# Design matrix: the encoded predictors plus the intercept column ('const')
# that statsmodels does not add on its own.
x = sm.add_constant(encoded_df)
# Target: the price each player was sold for.
y = df['SOLD PRICE']

# Hold out 20% of the rows for testing; random_state is fixed so the split
# (and therefore the fitted coefficients) is reproducible across runs.
xtrain, xtest, ytrain, ytest = train_test_split(
    x, y, test_size=0.2, random_state=100
)

In [11]:
# Specify the ordinary least squares model on the training split, then fit it.
ols_spec = sm.OLS(endog=ytrain, exog=xtrain)
model1 = ols_spec.fit()

# Coefficient table and fit diagnostics.
model1.summary2()

0,1,2,3
Model:,OLS,Adj. R-squared:,0.366
Dependent Variable:,SOLD PRICE,AIC:,2963.5554
Date:,2024-03-13 22:24,BIC:,3048.176
No. Observations:,104,Log-Likelihood:,-1449.8
Df Model:,31,F-statistic:,2.921
Df Residuals:,72,Prob (F-statistic):,9.52e-05
R-squared:,0.557,Scale:,108520000000.0

0,1,2,3,4,5,6
,Coef.,Std.Err.,t,P>|t|,[0.025,0.975]
const,379305.9641,227526.1170,1.6671,0.0998,-74259.0605,832870.9888
T-RUNS,-36.5141,32.1190,-1.1368,0.2594,-100.5422,27.5139
T-WKTS,-461.0001,607.6529,-0.7587,0.4505,-1672.3340,750.3339
ODI-RUNS-S,31.6404,36.2714,0.8723,0.3859,-40.6653,103.9460
ODI-SR-B,432.0709,1676.0907,0.2578,0.7973,-2909.1549,3773.2967
ODI-WKTS,1429.4361,822.9541,1.7370,0.0867,-211.0928,3069.9651
ODI-SR-BL,-918.2433,1542.6974,-0.5952,0.5536,-3993.5544,2157.0678
RUNS-S,112.0031,163.7830,0.6839,0.4963,-214.4923,438.4985
HS,-4358.4779,2725.4224,-1.5992,0.1142,-9791.5082,1074.5524

0,1,2,3
Omnibus:,5.237,Durbin-Watson:,1.792
Prob(Omnibus):,0.073,Jarque-Bera (JB):,4.625
Skew:,0.487,Prob(JB):,0.099
Kurtosis:,3.344,Condition No.:,76726.0


# Multicollinearity

<img src='ee.png' />


# VIF (Variance Inflation Factor)

<img src='rr.png' />