## Kaggle Setup

In [None]:
!pip install -q kaggle

In [None]:
import os

# Move the uploaded API key to the correct directory
kaggle_dir = "/root/.kaggle"
kaggle_key = os.path.join(kaggle_dir, "kaggle.json")
os.makedirs(kaggle_dir, exist_ok=True)

# Only move the upload while it is still in the working directory, so that
# re-running this cell after a successful move does not fail on the rename.
if os.path.exists("kaggle (1).json"):
    os.rename("kaggle (1).json", kaggle_key)

# Set file permissions for the API key (owner read/write only).
# The mode has to be an octal literal: decimal 600 equals 0o1130, which is
# not rw-------.
os.chmod(kaggle_key, 0o600)

In [None]:
# Download the IPL ball-by-ball dataset archive with the Kaggle CLI.
!kaggle datasets download -d dgsports/ipl-ball-by-ball-2008-to-2022

Downloading ipl-ball-by-ball-2008-to-2022.zip to /content
  0% 0.00/3.57M [00:00<?, ?B/s]
100% 3.57M/3.57M [00:00<00:00, 110MB/s]


In [None]:
# Extract the downloaded archive quietly into the current working directory.
!unzip -q ipl-ball-by-ball-2008-to-2022.zip

## Feature Extraction

In [None]:
# importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the ball-by-ball CSV from the working directory, which is where the
# unzip step extracts the archive (the absolute /content path only resolves
# when the notebook happens to run from /content).
df = pd.read_csv('IPL_ball_by_ball_updated.csv')
df.head(4)

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,...,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed
0,335982,2008,2008-04-18,M Chinnaswamy Stadium,1,0.1,Kolkata Knight Riders,Royal Challengers Bangalore,SC Ganguly,BB McCullum,...,1,,,,1.0,,,,,
1,335982,2008,2008-04-18,M Chinnaswamy Stadium,1,0.2,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,...,0,,,,,,,,,
2,335982,2008,2008-04-18,M Chinnaswamy Stadium,1,0.3,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,...,1,1.0,,,,,,,,
3,335982,2008,2008-04-18,M Chinnaswamy Stadium,1,0.4,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,...,0,,,,,,,,,


In [None]:
# List the column names of the raw frame.
df.columns

Index(['match_id', 'season', 'start_date', 'venue', 'innings', 'ball',
       'batting_team', 'bowling_team', 'striker', 'non_striker', 'bowler',
       'runs_off_bat', 'extras', 'wides', 'noballs', 'byes', 'legbyes',
       'penalty', 'wicket_type', 'player_dismissed', 'other_wicket_type',
       'other_player_dismissed'],
      dtype='object')

In [None]:
# Keep only the columns used further down. The explicit .copy() makes `df` an
# independent frame, so the in-place edits that follow (.loc assignments, drop,
# fillna, new columns) do not act on a slice of the raw frame and raise
# SettingWithCopyWarning.
needed_columns = ['match_id', 'venue', 'innings', 'ball', 'batting_team', 'bowling_team', 'runs_off_bat', 'extras', 'wides', 'noballs', 'player_dismissed']
df = df[needed_columns].copy()

In [None]:
# Preview the first rows after the column selection.
df.head()

Unnamed: 0,match_id,venue,innings,ball,batting_team,bowling_team,runs_off_bat,extras,wides,noballs,player_dismissed
0,335982,M Chinnaswamy Stadium,1,0.1,Kolkata Knight Riders,Royal Challengers Bangalore,0,1,,,
1,335982,M Chinnaswamy Stadium,1,0.2,Kolkata Knight Riders,Royal Challengers Bangalore,0,0,,,
2,335982,M Chinnaswamy Stadium,1,0.3,Kolkata Knight Riders,Royal Challengers Bangalore,0,1,1.0,,
3,335982,M Chinnaswamy Stadium,1,0.4,Kolkata Knight Riders,Royal Challengers Bangalore,0,0,,,
4,335982,M Chinnaswamy Stadium,1,0.5,Kolkata Knight Riders,Royal Challengers Bangalore,0,0,,,


In [None]:
# Distinct batting-team names, used to spot teams that appear under several spellings.
df['batting_team'].unique()

array(['Kolkata Knight Riders', 'Royal Challengers Bangalore',
       'Chennai Super Kings', 'Kings XI Punjab', 'Rajasthan Royals',
       'Delhi Daredevils', 'Mumbai Indians', 'Deccan Chargers',
       'Kochi Tuskers Kerala', 'Pune Warriors', 'Sunrisers Hyderabad',
       'Rising Pune Supergiants', 'Gujarat Lions',
       'Rising Pune Supergiant', 'Delhi Capitals', 'Punjab Kings',
       'Lucknow Super Giants', 'Gujarat Titans'], dtype=object)

## Data Preprocessing

In [None]:
# Use a single name for every team that appears under more than one spelling.
team_renames = {
    'Kings XI Punjab': 'Punjab Kings',
    'Delhi Daredevils': 'Delhi Capitals',
    'Rising Pune Supergiant': 'Rising Pune Supergiants',
}

# The same renames apply whether the team is batting or bowling.
for team_column in ('batting_team', 'bowling_team'):
    for old_name, new_name in team_renames.items():
        df.loc[df[team_column] == old_name, team_column] = new_name

In [None]:
# Drop every delivery whose batting side is one of the teams below
# (described by the author as teams that played only a couple of seasons).
teams_to_remove = ['Gujarat Lions', 'Pune Warriors', 'Deccan Chargers', 'Kochi Tuskers Kerala', 'Rising Pune Supergiants']
batting_mask = df['batting_team'].isin(teams_to_remove)
index1 = df.loc[batting_mask].index
df.drop(index=index1, inplace=True)


In [None]:
# Same removal as for the batting side, now for rows where one of these teams is bowling.
teams_to_remove = ['Gujarat Lions', 'Pune Warriors', 'Deccan Chargers', 'Kochi Tuskers Kerala', 'Rising Pune Supergiants']
bowling_mask = df['bowling_team'].isin(teams_to_remove)
index2 = df.loc[bowling_mask].index
df.drop(index=index2, inplace=True)

In [None]:
# Lookup from venue name (as spelled in the dataset) to host city.
# Several venues appear under more than one spelling, so each variant has its
# own entry pointing at the same city.
stadium_data = {
    'M Chinnaswamy Stadium': 'Bengaluru',
    'Punjab Cricket Association Stadium, Mohali': 'Mohali',
    'Feroz Shah Kotla': 'Delhi',
    'Wankhede Stadium': 'Mumbai',
    'Eden Gardens': 'Kolkata',
    'Sawai Mansingh Stadium': 'Jaipur',
    'Rajiv Gandhi International Stadium, Uppal': 'Hyderabad',
    'MA Chidambaram Stadium, Chepauk': 'Chennai',
    'Dr DY Patil Sports Academy': 'Mumbai',
    'Newlands': 'Cape Town',  # Add Newlands with its city
    "St George's Park": 'Port Elizabeth',
    'Kingsmead': 'Durban',
    'SuperSport Park': 'Centurion',
    'Buffalo Park': 'East London',
    'New Wanderers Stadium': 'Johannesburg',
    'De Beers Diamond Oval': 'Kimberley',
    'OUTsurance Oval': 'Bloemfontein',
    'Brabourne Stadium': 'Mumbai',
    'Sardar Patel Stadium, Motera': 'Ahmedabad',
    'Barabati Stadium': 'Cuttack',
    'Brabourne Stadium, Mumbai': 'Mumbai',
    'Saurashtra Cricket Association Stadium':'Rajkot',
    'Vidarbha Cricket Association Stadium, Jamtha': 'Nagpur',
    'Himachal Pradesh Cricket Association Stadium': 'Dharamsala',
    'Nehru Stadium': 'Pune',
    'Holkar Cricket Stadium': 'Indore',
    'Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium': 'Visakhapatnam',
    'Subrata Roy Sahara Stadium': 'Pune',
    'Shaheed Veer Narayan Singh International Stadium': 'Raipur',
    'JSCA International Stadium Complex': 'Ranchi',
    'Sheikh Zayed Stadium': 'Abu Dhabi',
    'Sharjah Cricket Stadium': 'Sharjah',
    'Dubai International Cricket Stadium': 'Dubai',
    'Maharashtra Cricket Association Stadium': 'Pune',
    'Punjab Cricket Association IS Bindra Stadium, Mohali': 'Mohali',
    "Punjab Cricket Association IS Bindra Stadium": 'Mohali',
    'M.Chinnaswamy Stadium': 'Bengaluru',
    'Rajiv Gandhi International Stadium': 'Hyderabad',
    'MA Chidambaram Stadium': 'Chennai',
    'Arun Jaitley Stadium': 'Delhi',
    'MA Chidambaram Stadium, Chepauk, Chennai': 'Chennai',
    'Wankhede Stadium, Mumbai': 'Mumbai',
    'Narendra Modi Stadium, Ahmedabad': 'Ahmedabad',
    'Arun Jaitley Stadium, Delhi': 'Delhi',
    'Zayed Cricket Stadium, Abu Dhabi': 'Abu Dhabi',
    'Dr DY Patil Sports Academy, Mumbai': 'Mumbai',
    'Maharashtra Cricket Association Stadium, Pune': 'Pune',
    'Eden Gardens, Kolkata': 'Kolkata',
    'Punjab Cricket Association IS Bindra Stadium, Mohali, Chandigarh': 'Mohali',
    'Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium, Lucknow': 'Lucknow',
    'Rajiv Gandhi International Stadium, Uppal, Hyderabad': 'Hyderabad',
    'M Chinnaswamy Stadium, Bengaluru': 'Bengaluru',
    'Barsapara Cricket Stadium, Guwahati': 'Guwahati',
    'Sawai Mansingh Stadium, Jaipur': 'Jaipur',
    'Green Park':'Kanpur',
    'Himachal Pradesh Cricket Association Stadium, Dharamsala': 'Dharamsala'
}

# Derive the city column from the venue name. A venue that is missing from the
# mapping gets NaN in `city`.
# NOTE(review): confirm every venue present in the data has an entry above.
df['city'] = df['venue'].map(stadium_data)

In [None]:
# Total runs scored off each delivery: runs off the bat plus extras.
df['runs'] = df['runs_off_bat'].add(df['extras'])

In [None]:
# Fill only the columns where a missing value means "nothing happened":
# no wide, no no-ball, nobody dismissed. A blanket fillna(0) would also write
# 0 into text columns such as `city`, disguising venues that failed to map as
# a city called 0; left as NaN, such rows are removed by the later dropna step.
df.fillna({'wides': 0, 'noballs': 0, 'player_dismissed': 0}, inplace=True)

# Number of wides plus no-balls recorded on the delivery.
df['extra_ball'] = df['wides'] + df['noballs']

In [None]:
# Check which batting teams remain after the renaming and removal steps.
df['batting_team'].unique()

array(['Kolkata Knight Riders', 'Royal Challengers Bangalore',
       'Chennai Super Kings', 'Punjab Kings', 'Rajasthan Royals',
       'Delhi Capitals', 'Mumbai Indians', 'Sunrisers Hyderabad',
       'Lucknow Super Giants', 'Gujarat Titans'], dtype=object)

## 1st innings

In [None]:
# Keep first-innings deliveries only. The .copy() gives an independent frame,
# so adding columns to `new_df` afterwards does not raise
# SettingWithCopyWarning.
new_df = df[df['innings'] == 1].copy()

In [None]:
# Preview the first-innings rows.
new_df.head()

Unnamed: 0,match_id,venue,innings,ball,batting_team,bowling_team,runs_off_bat,extras,wides,noballs,player_dismissed,city,runs,extra_ball
0,335982,M Chinnaswamy Stadium,1,0.1,Kolkata Knight Riders,Royal Challengers Bangalore,0,1,0.0,0.0,0,Bengaluru,1,0.0
1,335982,M Chinnaswamy Stadium,1,0.2,Kolkata Knight Riders,Royal Challengers Bangalore,0,0,0.0,0.0,0,Bengaluru,0,0.0
2,335982,M Chinnaswamy Stadium,1,0.3,Kolkata Knight Riders,Royal Challengers Bangalore,0,1,1.0,0.0,0,Bengaluru,1,1.0
3,335982,M Chinnaswamy Stadium,1,0.4,Kolkata Knight Riders,Royal Challengers Bangalore,0,0,0.0,0.0,0,Bengaluru,0,0.0
4,335982,M Chinnaswamy Stadium,1,0.5,Kolkata Knight Riders,Royal Challengers Bangalore,0,0,0.0,0.0,0,Bengaluru,0,0.0


In [None]:
# Running total of runs within each match. Selecting 'runs' before cumsum
# accumulates only that column; running cumsum over the whole grouped frame
# also tries to accumulate the text columns, which is wasteful and is
# rejected by newer pandas releases.
new_df['current_score'] = new_df.groupby('match_id')['runs'].cumsum()

  new_df['current_score'] = new_df.groupby('match_id').cumsum()['runs']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['current_score'] = new_df.groupby('match_id').cumsum()['runs']


In [None]:
# Narrow the frame to the columns needed for the remaining features; .copy()
# keeps the result independent so new columns can be assigned without
# SettingWithCopyWarning.
score_columns = ['match_id', 'ball', 'batting_team', 'bowling_team', 'city', 'runs', 'extra_ball', 'current_score', 'player_dismissed']
new_df = new_df[score_columns].copy()

In [None]:
# `ball` is written as <over>.<ball in over>; split its text form on the dot
# to get the two parts (both are still strings at this point).
ball_text = new_df['ball'].map(str)
new_df['overs'] = ball_text.map(lambda text: text.split('.')[0])
new_df['bowls'] = ball_text.map(lambda text: text.split('.')[1])

In [None]:
# Deliveries so far: six per over number plus the ball number within the over.
over_number = new_df['overs'].astype('int')
ball_in_over = new_df['bowls'].astype('int')
new_df['balls_bowled'] = over_number * 6 + ball_in_over

In [None]:
# Balls remaining out of 120, floored at zero so a count above 120 never
# produces a negative value.
new_df['balls_left'] = (120 - new_df['balls_bowled']).clip(lower=0)

In [None]:
# Drop the helper columns (ball, overs, bowls, extra_ball); .copy() keeps the
# result independent so later column assignments do not raise
# SettingWithCopyWarning.
feature_columns = ['match_id', 'batting_team', 'bowling_team', 'city', 'runs', 'current_score', 'balls_bowled', 'balls_left', 'player_dismissed']
new_df = new_df[feature_columns].copy()

In [None]:
# Turn the per-ball `player_dismissed` value into a 0/1 wicket flag: a ball
# counts as a wicket when the field holds anything other than the 0
# placeholder (a missing value is treated as "no wicket" as well).
dismissed = new_df['player_dismissed']
new_df['player_dismissed'] = (dismissed.notna() & dismissed.ne(0)).astype('int')

# Running wicket count per match. The column is selected before cumsum so only
# it is accumulated; cumsum over the whole grouped frame would also try to
# accumulate the text columns.
new_df['player_dismissed'] = new_df.groupby('match_id')['player_dismissed'].cumsum()
new_df['wickets_left'] = 10 - new_df['player_dismissed']

  new_df['player_dismissed'] = new_df.groupby('match_id').cumsum()['player_dismissed']


In [None]:
# Current run rate: runs per six balls bowled so far.
new_df['crr'] = new_df['current_score'].mul(6).div(new_df['balls_bowled'])

In [None]:
# Display the engineered first-innings frame (pandas shows a truncated view).
new_df

Unnamed: 0,match_id,batting_team,bowling_team,city,runs,current_score,balls_bowled,balls_left,player_dismissed,wickets_left,crr
0,335982,Kolkata Knight Riders,Royal Challengers Bangalore,Bengaluru,1,1,1,119,0,10,6.000000
1,335982,Kolkata Knight Riders,Royal Challengers Bangalore,Bengaluru,0,1,2,118,0,10,3.000000
2,335982,Kolkata Knight Riders,Royal Challengers Bangalore,Bengaluru,1,2,3,117,0,10,4.000000
3,335982,Kolkata Knight Riders,Royal Challengers Bangalore,Bengaluru,0,2,4,116,0,10,3.000000
4,335982,Kolkata Knight Riders,Royal Challengers Bangalore,Bengaluru,0,2,5,115,0,10,2.400000
...,...,...,...,...,...,...,...,...,...,...,...
243716,1370353,Gujarat Titans,Chennai Super Kings,Ahmedabad,6,212,116,4,2,8,10.965517
243717,1370353,Gujarat Titans,Chennai Super Kings,Ahmedabad,0,212,117,3,3,7,10.871795
243718,1370353,Gujarat Titans,Chennai Super Kings,Ahmedabad,1,213,118,2,3,7,10.830508
243719,1370353,Gujarat Titans,Chennai Super Kings,Ahmedabad,1,214,119,1,3,7,10.789916


In [None]:
# Total first-innings runs per match. Only the 'runs' column is summed;
# summing the whole grouped frame would also aggregate every other column,
# including the text ones, just to throw the result away.
match_totals = new_df.groupby('match_id')['runs'].sum().reset_index()

# Attach the match total to every delivery of that match. Both frames have a
# 'runs' column, so the merge names them runs_x (match total) and runs_y
# (runs off the individual ball).
final_df = match_totals.merge(new_df, on='match_id')

  final_df = new_df.groupby('match_id').sum()['runs'].reset_index().merge(new_df,on='match_id')


In [None]:
# Display the merged frame: runs_x is the match total, runs_y the runs off each ball.
final_df

Unnamed: 0,match_id,runs_x,batting_team,bowling_team,city,runs_y,current_score,balls_bowled,balls_left,player_dismissed,wickets_left,crr
0,335982,222,Kolkata Knight Riders,Royal Challengers Bangalore,Bengaluru,1,1,1,119,0,10,6.000000
1,335982,222,Kolkata Knight Riders,Royal Challengers Bangalore,Bengaluru,0,1,2,118,0,10,3.000000
2,335982,222,Kolkata Knight Riders,Royal Challengers Bangalore,Bengaluru,1,2,3,117,0,10,4.000000
3,335982,222,Kolkata Knight Riders,Royal Challengers Bangalore,Bengaluru,0,2,4,116,0,10,3.000000
4,335982,222,Kolkata Knight Riders,Royal Challengers Bangalore,Bengaluru,0,2,5,115,0,10,2.400000
...,...,...,...,...,...,...,...,...,...,...,...,...
103395,1370353,214,Gujarat Titans,Chennai Super Kings,Ahmedabad,6,212,116,4,2,8,10.965517
103396,1370353,214,Gujarat Titans,Chennai Super Kings,Ahmedabad,0,212,117,3,3,7,10.871795
103397,1370353,214,Gujarat Titans,Chennai Super Kings,Ahmedabad,1,213,118,2,3,7,10.830508
103398,1370353,214,Gujarat Titans,Chennai Super Kings,Ahmedabad,1,214,119,1,3,7,10.789916


In [None]:
# Keep the target (runs_x) and the model features. The .copy() makes this an
# independent frame, so cleaning it afterwards does not raise
# SettingWithCopyWarning.
model_columns = ['batting_team', 'bowling_team', 'city', 'runs_x', 'current_score', 'balls_left', 'wickets_left', 'crr']
final_df = final_df[model_columns].copy()

In [None]:
# Drop rows with any missing value. Reassigning instead of using inplace=True
# avoids mutating a frame that may be a slice of another one, which is what
# triggers SettingWithCopyWarning here.
final_df = final_df.dropna()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df.dropna(inplace=True)


In [None]:
# Give the target column a descriptive name. Reassigning instead of using
# inplace=True avoids SettingWithCopyWarning when `final_df` is a slice.
final_df = final_df.rename(columns={'runs_x': "total_runs"})

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df.rename(columns={'runs_x':"total_runs"}, inplace=True)


In [None]:
# Shuffle all rows. A fixed random_state makes the shuffle, and therefore the
# saved CSV, identical on every run.
final_df = final_df.sample(frac=1, random_state=42)

In [None]:
# Display the shuffled modelling frame.
final_df

Unnamed: 0,batting_team,bowling_team,city,total_runs,current_score,balls_left,wickets_left,crr
6534,Mumbai Indians,Kolkata Knight Riders,Port Elizabeth,187,10,113,10,8.571429
62726,Royal Challengers Bangalore,Chennai Super Kings,Chennai,70,38,76,7,5.181818
10615,Punjab Kings,Delhi Capitals,Mohali,142,126,12,2,7.000000
63709,Mumbai Indians,Punjab Kings,Mohali,176,79,69,8,9.294118
10682,Royal Challengers Bangalore,Kolkata Knight Riders,Kolkata,135,49,67,6,5.547170
...,...,...,...,...,...,...,...,...
15380,Punjab Kings,Chennai Super Kings,Dharamsala,192,2,118,10,6.000000
74173,Royal Challengers Bangalore,Chennai Super Kings,Dubai,169,57,63,9,6.000000
49137,Kolkata Knight Riders,Mumbai Indians,Mumbai,174,162,8,5,8.678571
18255,Chennai Super Kings,Kolkata Knight Riders,Kolkata,114,32,64,8,3.428571


In [None]:
# Save the cleaned data to the working directory; index=False leaves the
# shuffled row labels out of the file.
final_df.to_csv('clean_full_ipl_data.csv', index=False)

## Model Building

In [None]:
import pandas as pd

# Read the cleaned data back using the same relative path it was written to,
# so the notebook does not depend on the working directory being /content.
df = pd.read_csv('clean_full_ipl_data.csv')

In [None]:
# Preview the data as reloaded from the CSV.
df.head()

Unnamed: 0,batting_team,bowling_team,city,total_runs,current_score,balls_left,wickets_left,crr
0,Mumbai Indians,Kolkata Knight Riders,Port Elizabeth,187,10,113,10,8.571429
1,Royal Challengers Bangalore,Chennai Super Kings,Chennai,70,38,76,7,5.181818
2,Punjab Kings,Delhi Capitals,Mohali,142,126,12,2,7.0
3,Mumbai Indians,Punjab Kings,Mohali,176,79,69,8,9.294118
4,Royal Challengers Bangalore,Kolkata Knight Riders,Kolkata,135,49,67,6,5.54717


In [None]:
# Separate the target (total_runs) from the predictor columns.
target_column = 'total_runs'
y = df[target_column]
X = df.drop(columns=[target_column])

In [None]:
from sklearn.model_selection import train_test_split
# Random 80/20 row-level split with a fixed seed.
# NOTE(review): each row is a single delivery while the target is the innings
# total of the whole match, so deliveries from one match can land in both the
# train and the test set. That may inflate the test scores; consider a split
# grouped by match (this would need match_id to be kept in the saved data).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=1)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from sklearn.metrics import r2_score,mean_absolute_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor


In [None]:
# One-hot encode the three categorical columns (dropping the first level of
# each) and pass the numeric columns through unchanged.
# `sparse_output` replaces the `sparse` keyword of OneHotEncoder, which was
# deprecated in scikit-learn 1.2 and removed in 1.4.
categorical_columns = ['batting_team', 'bowling_team', 'city']
trf = ColumnTransformer(
    [('trf', OneHotEncoder(sparse_output=False, drop='first'), categorical_columns)],
    remainder='passthrough',
)

In [None]:
# Final estimator of the pipeline.
regressor = XGBRegressor(
    n_estimators=2000,
    learning_rate=0.02,
    max_depth=14,
    random_state=42,
)

# Encode the categorical columns, scale every feature, then fit the regressor.
pipe = Pipeline(steps=[
    ('step1', trf),
    ('step2', StandardScaler()),
    ('step3', regressor),
])
pipe.fit(X_train, y_train)

# Score the fitted pipeline on the held-out rows.
y_pred = pipe.predict(X_test)
for metric_label, metric_fn in (("R2 Score", r2_score), ("MAE", mean_absolute_error)):
    print(f"{metric_label} - {metric_fn(y_test, y_pred)}")



R2 Score - 0.9010203872317795
MAE - 4.415067323725275


In [None]:
# A single hand-written match situation, used to try out the fitted pipeline.
sample_situation = {
    "batting_team": "Royal Challengers Bangalore",
    "bowling_team": "Mumbai Indians",
    "city": "Mumbai",
    "current_score": 56,
    "balls_left": 90,
    "wickets_left": 8,
    "crr": 9,
}

# One record becomes a one-row frame with the columns in the order given above.
input_df = pd.DataFrame([sample_situation])

In [None]:
# Predict the total for the hand-written sample row.
pipe.predict(input_df)

array([201.32329], dtype=float32)

In [None]:
import pickle

In [None]:
# Serialise the fitted pipeline. The context manager closes the file handle
# (and flushes the data to disk) even if pickling raises.
with open('ipl_score_prediction_pipe_without_last_five.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)

In [None]:
import xgboost

In [None]:
# Show the installed xgboost version (presumably to record which version the
# pickled pipeline was built with).
xgboost.__version__

'2.0.3'