# Importing all major libraries

In [103]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [104]:
# Importing the data
data=pd.read_csv('ipl.csv', parse_dates=True)
data.head()

Unnamed: 0,mid,date,venue,bat_team,bowl_team,batsman,bowler,runs,wickets,overs,runs_last_5,wickets_last_5,striker,non-striker,total
0,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,SC Ganguly,P Kumar,1,0,0.1,1,0,0,0,222
1,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,1,0,0.2,1,0,0,0,222
2,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.2,2,0,0,0,222
3,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.3,2,0,0,0,222
4,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.4,2,0,0,0,222


### Basic data checks

In [105]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76014 entries, 0 to 76013
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   mid             76014 non-null  int64  
 1   date            76014 non-null  object 
 2   venue           76014 non-null  object 
 3   bat_team        76014 non-null  object 
 4   bowl_team       76014 non-null  object 
 5   batsman         76014 non-null  object 
 6   bowler          76014 non-null  object 
 7   runs            76014 non-null  int64  
 8   wickets         76014 non-null  int64  
 9   overs           76014 non-null  float64
 10  runs_last_5     76014 non-null  int64  
 11  wickets_last_5  76014 non-null  int64  
 12  striker         76014 non-null  int64  
 13  non-striker     76014 non-null  int64  
 14  total           76014 non-null  int64  
dtypes: float64(1), int64(8), object(6)
memory usage: 8.7+ MB


#### As you can see above and below there are no null values.

In [106]:
data.isnull().sum()

mid               0
date              0
venue             0
bat_team          0
bowl_team         0
batsman           0
bowler            0
runs              0
wickets           0
overs             0
runs_last_5       0
wickets_last_5    0
striker           0
non-striker       0
total             0
dtype: int64

In [107]:
# shape of the data
data.shape

(76014, 15)

In [108]:
# checking unique values in the columns
data.nunique()

mid               617
date              442
venue              35
bat_team           14
bowl_team          14
batsman           411
bowler            329
runs              252
wickets            11
overs             140
runs_last_5       102
wickets_last_5      8
striker           155
non-striker        88
total             138
dtype: int64

In [109]:
# adding a column of mean venue score which might be important to predict scores

data['mean_venue_score']=data.groupby('venue')['total'].transform('mean').astype(int)

data_venue_meanscore=data[['venue', 'mean_venue_score']].drop_duplicates(keep='first')


In [110]:
data.head(1)

Unnamed: 0,mid,date,venue,bat_team,bowl_team,batsman,bowler,runs,wickets,overs,runs_last_5,wickets_last_5,striker,non-striker,total,mean_venue_score
0,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,SC Ganguly,P Kumar,1,0,0.1,1,0,0,0,222,169


#### As can be seen above, mean_venue_score has been added which will surely be beneficial in our predictions

In [111]:
# remove unwanted columns now:
cols_to_remove=['mid', 'venue','batsman', 'bowler', 'striker', 'non-striker']

In [112]:
data.drop(columns=cols_to_remove, inplace=True)

In [113]:
data.head(1)

Unnamed: 0,date,bat_team,bowl_team,runs,wickets,overs,runs_last_5,wickets_last_5,total,mean_venue_score
0,2008-04-18,Kolkata Knight Riders,Royal Challengers Bangalore,1,0,0.1,1,0,222,169


In [114]:
# changing date column to pandas datetime object
data['date']=pd.to_datetime(data['date'], format="%Y-%m-%d")

In [115]:
# date column changed to datetime object
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76014 entries, 0 to 76013
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   date              76014 non-null  datetime64[ns]
 1   bat_team          76014 non-null  object        
 2   bowl_team         76014 non-null  object        
 3   runs              76014 non-null  int64         
 4   wickets           76014 non-null  int64         
 5   overs             76014 non-null  float64       
 6   runs_last_5       76014 non-null  int64         
 7   wickets_last_5    76014 non-null  int64         
 8   total             76014 non-null  int64         
 9   mean_venue_score  76014 non-null  int32         
dtypes: datetime64[ns](1), float64(1), int32(1), int64(5), object(2)
memory usage: 5.5+ MB


In [116]:
### we need to focus only on the teams that are currently playing

In [117]:

current_playing_teams=['Kolkata Knight Riders', 'Chennai Super Kings', 'Rajasthan Royals',
                    'Mumbai Indians', 'Kings XI Punjab', 'Royal Challengers Bangalore',
                    'Delhi Daredevils', 'Sunrisers Hyderabad']

#### filtering the data by keeping only current playing teams in the dataset as predictions will be made only for them

In [118]:
data=data[(data['bat_team'].isin(current_playing_teams)) & (data['bowl_team'].isin(current_playing_teams))]

In [119]:
# Removing data less than 5 overs as prediction will require minimum 5 overs data
data=data[data['overs']>5.0]

In [120]:
#checking the shape of the dataset again
data.shape

(40088, 10)

In [121]:
data.head()

Unnamed: 0,date,bat_team,bowl_team,runs,wickets,overs,runs_last_5,wickets_last_5,total,mean_venue_score
32,2008-04-18,Kolkata Knight Riders,Royal Challengers Bangalore,61,0,5.1,59,0,222,169
33,2008-04-18,Kolkata Knight Riders,Royal Challengers Bangalore,61,1,5.2,59,1,222,169
34,2008-04-18,Kolkata Knight Riders,Royal Challengers Bangalore,61,1,5.3,59,1,222,169
35,2008-04-18,Kolkata Knight Riders,Royal Challengers Bangalore,61,1,5.4,59,1,222,169
36,2008-04-18,Kolkata Knight Riders,Royal Challengers Bangalore,61,1,5.5,58,1,222,169


### Changing categorical columns to numerical ones using pd.get_dummies

In [122]:
data=pd.get_dummies(data, columns=['bat_team', 'bowl_team'])

In [123]:
new_columns=data.columns
new_columns=['date','bat_team_Mumbai Indians','bat_team_Kolkata Knight Riders', 'bat_team_Chennai Super Kings',
             'bat_team_Rajasthan Royals','bat_team_Kings XI Punjab', 'bat_team_Royal Challengers Bangalore', 
             'bat_team_Delhi Daredevils', 'bat_team_Sunrisers Hyderabad', 
             'bowl_team_Mumbai Indians','bowl_team_Kolkata Knight Riders', 'bowl_team_Chennai Super Kings',
             'bowl_team_Rajasthan Royals','bowl_team_Kings XI Punjab', 'bowl_team_Royal Challengers Bangalore', 
             'bowl_team_Delhi Daredevils', 'bowl_team_Sunrisers Hyderabad', 
             'overs', 'runs', 'wickets', 'runs_last_5', 'wickets_last_5',
       'total', 'mean_venue_score']

In [124]:
data_new=data[new_columns]

### Since this is a time series data, we cannot simply divide the dataset into train and test set using train_test_set.Instead we need to divide the data using year 

In [125]:
X_train=data_new[data_new['date'].dt.year<=2016].drop(columns=['total'])
X_test=data_new[data_new['date'].dt.year>=2017].drop(columns=['total'])

y_train=data_new.loc[data_new['date'].dt.year<=2016,'total']
y_test=data_new.loc[data_new['date'].dt.year>=2017, 'total']

In [126]:
# Removing date column from X_train and X_test
X_train.drop(columns=['date'], inplace=True)
X_test.drop(columns=['date'], inplace=True)

### Linear Regression

In [128]:


from sklearn.linear_model import LinearRegression

reg_linear=LinearRegression()

reg_linear.fit(X_train, y_train)
y_pred=reg_linear.predict(X_test)



In [132]:
from sklearn.metrics import r2_score, mean_squared_error

In [134]:
score=r2_score(y_test, y_pred)
rmse=np.sqrt(mean_squared_error(y_test, y_pred))

In [135]:
print("R2 score for Linear Regression is ", score)
print("Root Mean Square error for Linear Regression is", rmse)

R2 score for Linear Regression is  0.7395312109767288
Root Mean Square error for Linear Regression is 16.24525138306303


## Lasso Regression

In [153]:
from sklearn.linear_model import Lasso
reg_lasso=Lasso(alpha=0.1)

reg_lasso.fit(X_train, y_train)
y_pred_lasso=reg_lasso.predict(X_test)


In [154]:
score_lasso=r2_score(y_test, y_pred_lasso)
rmse_lasso=np.sqrt(mean_squared_error(y_test, y_pred_lasso))
print("R2 score for Lasso Regression is ", score_lasso)
print("Root Mean Square error for Lasso Regression is", rmse_lasso)

R2 score for Linear Regression is  0.7365241796880464
Root Mean Square error for Linear Regression is 16.338755479923154


## RandomForest Regressor

In [157]:
from sklearn.ensemble import RandomForestRegressor

reg_rf=RandomForestRegressor(n_estimators=1000)
reg_rf.fit(X_train, y_train)
y_pred_rf=reg_rf.predict(X_test)

In [159]:
score_rf=r2_score(y_test, y_pred_rf)
rmse_rf=np.sqrt(mean_squared_error(y_test, y_pred_rf))
print("R2 score for Random Forest Regression is ", score_rf)
print("Root Mean Square error for Random Forest Regression is", rmse_rf)

R2 score for Random Forest Regression is  0.6583202835867659
Root Mean Square error for Random Forest Regression is 18.60622162331439


## XGB Regressor

In [161]:
from xgboost import XGBRegressor

In [162]:
reg_xgb=XGBRegressor()
reg_xgb.fit(X_train, y_train)
y_pred_xgb=reg_rf.predict(X_test)

In [163]:
score_xgb=r2_score(y_test, y_pred_xgb)
rmse_xgb=np.sqrt(mean_squared_error(y_test, y_pred_xgb))
print("R2 score for Random Forest Regression is ", score_xgb)
print("Root Mean Square error for Random Forest Regression is", rmse_xgb)

R2 score for Random Forest Regression is  0.6583202835867659
Root Mean Square error for Random Forest Regression is 18.60622162331439


In [164]:
import pickle

In [165]:
filename='ipl_score_predictor.pkl'

pickle.dump(reg_linear, open(filename, 'wb'))


In [166]:
X_train.columns

Index(['bat_team_Mumbai Indians', 'bat_team_Kolkata Knight Riders',
       'bat_team_Chennai Super Kings', 'bat_team_Rajasthan Royals',
       'bat_team_Kings XI Punjab', 'bat_team_Royal Challengers Bangalore',
       'bat_team_Delhi Daredevils', 'bat_team_Sunrisers Hyderabad',
       'bowl_team_Mumbai Indians', 'bowl_team_Kolkata Knight Riders',
       'bowl_team_Chennai Super Kings', 'bowl_team_Rajasthan Royals',
       'bowl_team_Kings XI Punjab', 'bowl_team_Royal Challengers Bangalore',
       'bowl_team_Delhi Daredevils', 'bowl_team_Sunrisers Hyderabad', 'overs',
       'runs', 'wickets', 'runs_last_5', 'wickets_last_5', 'mean_venue_score'],
      dtype='object')