In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [2]:
# Historical IPL tables (2008-2022): one row per delivery and one row per match.
ball_by_ball = pd.read_csv('./Data/IPL_Ball_by_Ball_2008_2022.csv')
matches_result = pd.read_csv('./Data/IPL_Matches_Result_2008_2022.csv')

# IPL 2023 reference tables; their 'Teams' / 'Venue' columns are renamed to lower case.
ipl_2023_teams = pd.read_csv('./Data/Ipl_2023 _cricketers - Team name.csv')
ipl_2023_teams = ipl_2023_teams.rename(columns={'Teams': 'team'})

ipl_2023_venues = pd.read_csv('./Data/Ipl_2023 _cricketers - Venue.csv')
ipl_2023_venues = ipl_2023_venues.rename(columns={'Venue': 'venue'})

In [3]:
def log(*args):
    """Print ``args`` space-separated, prefixed with a '👉' marker."""
    marked = ('👉',) + args
    print(*marked)
    

In [4]:
def to_kebab_case(string):
    """Return ``string`` in lower-case kebab form.

    Commas and periods are removed outright (not turned into separators),
    the remainder is split on whitespace, and the pieces are joined with '-'.
    """
    without_punctuation = string.translate(str.maketrans('', '', ',.'))
    words = without_punctuation.split()
    return '-'.join(words).lower()

In [5]:
# Seed NumPy's global random number generator so that code drawing from it repeats across runs.
np.random.seed(2)

# Preparing training dataset

- ## Change column names, drop unnecessary columns [in ball_by_ball, matches_result]

In [6]:
# Keep a reference to the raw frame so all of its original columns stay inspectable.
ball_by_ball_orig = ball_by_ball

# Raw column name -> snake_case name.
bbb_column_renames = {
    'ID': 'match_id',
    'ballnumber': 'ball_number',
    'non-striker': 'non_striker',
    'BattingTeam': 'batting_team',
}

# Columns retained for the rest of the notebook (everything else is dropped).
bbb_kept_columns = [
    'match_id',
    'innings',
    'batting_team',
    'overs',
    'ball_number',
    'batter',
    'bowler',
    'total_run',
]

ball_by_ball = ball_by_ball.rename(columns=bbb_column_renames).loc[:, bbb_kept_columns]

In [7]:
# Keep a reference to the raw frame so all of its original columns stay inspectable.
matches_result_orig = matches_result

# Raw column name -> snake_case name.
matches_column_renames = {
    'ID': 'match_id',
    'Team1': 'team_1',
    'Team2': 'team_2',
    'Venue': 'venue',
}

# Only the match key, the two sides and the venue are retained.
matches_kept_columns = ['match_id', 'team_1', 'team_2', 'venue']

matches_result = matches_result.rename(columns=matches_column_renames).loc[:, matches_kept_columns]

In [8]:
# Shape and first rows of the raw ball-by-ball frame (all original columns).
print(ball_by_ball_orig.shape)
ball_by_ball_orig.head()

(225954, 17)


Unnamed: 0,ID,innings,overs,ballnumber,batter,bowler,non-striker,extra_type,batsman_run,extras_run,total_run,non_boundary,isWicketDelivery,player_out,kind,fielders_involved,BattingTeam
0,1312200,1,0,1,YBK Jaiswal,Mohammed Shami,JC Buttler,,0,0,0,0,0,,,,Rajasthan Royals
1,1312200,1,0,2,YBK Jaiswal,Mohammed Shami,JC Buttler,legbyes,0,1,1,0,0,,,,Rajasthan Royals
2,1312200,1,0,3,JC Buttler,Mohammed Shami,YBK Jaiswal,,1,0,1,0,0,,,,Rajasthan Royals
3,1312200,1,0,4,YBK Jaiswal,Mohammed Shami,JC Buttler,,0,0,0,0,0,,,,Rajasthan Royals
4,1312200,1,0,5,YBK Jaiswal,Mohammed Shami,JC Buttler,,0,0,0,0,0,,,,Rajasthan Royals


In [9]:
# Shape and first rows of the raw match-results frame (all original columns).
print(matches_result_orig.shape)
matches_result_orig.head()

(950, 20)


Unnamed: 0,ID,City,Date,Season,MatchNumber,Team1,Team2,Venue,TossWinner,TossDecision,SuperOver,WinningTeam,WonBy,Margin,method,Player_of_Match,Team1Players,Team2Players,Umpire1,Umpire2
0,1312200,Ahmedabad,2022-05-29,2022,Final,Rajasthan Royals,Gujarat Titans,"Narendra Modi Stadium, Ahmedabad",Rajasthan Royals,bat,N,Gujarat Titans,Wickets,7.0,,HH Pandya,"['YBK Jaiswal', 'JC Buttler', 'SV Samson', 'D ...","['WP Saha', 'Shubman Gill', 'MS Wade', 'HH Pan...",CB Gaffaney,Nitin Menon
1,1312199,Ahmedabad,2022-05-27,2022,Qualifier 2,Royal Challengers Bangalore,Rajasthan Royals,"Narendra Modi Stadium, Ahmedabad",Rajasthan Royals,field,N,Rajasthan Royals,Wickets,7.0,,JC Buttler,"['V Kohli', 'F du Plessis', 'RM Patidar', 'GJ ...","['YBK Jaiswal', 'JC Buttler', 'SV Samson', 'D ...",CB Gaffaney,Nitin Menon
2,1312198,Kolkata,2022-05-25,2022,Eliminator,Royal Challengers Bangalore,Lucknow Super Giants,"Eden Gardens, Kolkata",Lucknow Super Giants,field,N,Royal Challengers Bangalore,Runs,14.0,,RM Patidar,"['V Kohli', 'F du Plessis', 'RM Patidar', 'GJ ...","['Q de Kock', 'KL Rahul', 'M Vohra', 'DJ Hooda...",J Madanagopal,MA Gough
3,1312197,Kolkata,2022-05-24,2022,Qualifier 1,Rajasthan Royals,Gujarat Titans,"Eden Gardens, Kolkata",Gujarat Titans,field,N,Gujarat Titans,Wickets,7.0,,DA Miller,"['YBK Jaiswal', 'JC Buttler', 'SV Samson', 'D ...","['WP Saha', 'Shubman Gill', 'MS Wade', 'HH Pan...",BNJ Oxenford,VK Sharma
4,1304116,Mumbai,2022-05-22,2022,70,Sunrisers Hyderabad,Punjab Kings,"Wankhede Stadium, Mumbai",Sunrisers Hyderabad,bat,N,Punjab Kings,Wickets,5.0,,Harpreet Brar,"['PK Garg', 'Abhishek Sharma', 'RA Tripathi', ...","['JM Bairstow', 'S Dhawan', 'M Shahrukh Khan',...",AK Chaudhary,NA Patwardhan


In [10]:
# Shape and first rows of the trimmed ball-by-ball frame (renamed, selected columns only).
print(ball_by_ball.shape)
ball_by_ball.head()

(225954, 8)


Unnamed: 0,match_id,innings,batting_team,overs,ball_number,batter,bowler,total_run
0,1312200,1,Rajasthan Royals,0,1,YBK Jaiswal,Mohammed Shami,0
1,1312200,1,Rajasthan Royals,0,2,YBK Jaiswal,Mohammed Shami,1
2,1312200,1,Rajasthan Royals,0,3,JC Buttler,Mohammed Shami,1
3,1312200,1,Rajasthan Royals,0,4,YBK Jaiswal,Mohammed Shami,0
4,1312200,1,Rajasthan Royals,0,5,YBK Jaiswal,Mohammed Shami,0


In [11]:
# Shape and first rows of the trimmed match-results frame (renamed, selected columns only).
print(matches_result.shape)
matches_result.head()

(950, 4)


Unnamed: 0,match_id,team_1,team_2,venue
0,1312200,Rajasthan Royals,Gujarat Titans,"Narendra Modi Stadium, Ahmedabad"
1,1312199,Royal Challengers Bangalore,Rajasthan Royals,"Narendra Modi Stadium, Ahmedabad"
2,1312198,Royal Challengers Bangalore,Lucknow Super Giants,"Eden Gardens, Kolkata"
3,1312197,Rajasthan Royals,Gujarat Titans,"Eden Gardens, Kolkata"
4,1304116,Sunrisers Hyderabad,Punjab Kings,"Wankhede Stadium, Mumbai"


- ## Some stats

In [12]:
# Cardinality checks on the trimmed ball-by-ball table.
# Every name that appears as a batter or as a bowler, de-duplicated.
bbb_players = np.union1d(ball_by_ball.batter.unique(), ball_by_ball.bowler.unique())

log('ball_by_ball match_id.nunique:', ball_by_ball.match_id.nunique())
log('ball_by_ball batting_team.nunique:', ball_by_ball.batting_team.nunique())
log('ball_by_ball union1d(batter, bowler).shape:', bbb_players.shape)
log('ball_by_ball innings.unique:', ball_by_ball.innings.unique())
log('ball_by_ball overs.unique:', ball_by_ball.overs.unique())

👉 ball_by_ball match_id.nunique: 950
👉 ball_by_ball batting_team.nunique: 18
👉 ball_by_ball union1d(batter, bowler).shape: (652,)
👉 ball_by_ball innings.unique: [1 2 3 4 5 6]
👉 ball_by_ball overs.unique: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]


In [13]:
# Cardinality checks on the trimmed match-results table.
# Every name that appears as team_1 or as team_2, de-duplicated.
matches_teams = np.union1d(matches_result.team_1.unique(), matches_result.team_2.unique())

log('matches_result match_id.nunique:', matches_result.match_id.nunique())
log('matches_result venue.nunique:', matches_result.venue.nunique())
log('matches_result union1d(team_1, team_2).shape:', matches_teams.shape)

👉 matches_result match_id.nunique: 950
👉 matches_result venue.nunique: 49
👉 matches_result union1d(team_1, team_2).shape: (18,)


- ## Get Venues Mapping

In [14]:
matches_result_orig.groupby(['City', 'Venue'], dropna=False)['Venue'].describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,count,unique,top,freq
City,Venue,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Abu Dhabi,Sheikh Zayed Stadium,29,1,Sheikh Zayed Stadium,29
Abu Dhabi,"Zayed Cricket Stadium, Abu Dhabi",8,1,"Zayed Cricket Stadium, Abu Dhabi",8
Ahmedabad,"Narendra Modi Stadium, Ahmedabad",7,1,"Narendra Modi Stadium, Ahmedabad",7
Ahmedabad,"Sardar Patel Stadium, Motera",12,1,"Sardar Patel Stadium, Motera",12
Bangalore,M Chinnaswamy Stadium,65,1,M Chinnaswamy Stadium,65
Bengaluru,M.Chinnaswamy Stadium,15,1,M.Chinnaswamy Stadium,15
Bloemfontein,OUTsurance Oval,2,1,OUTsurance Oval,2
Cape Town,Newlands,7,1,Newlands,7
Centurion,SuperSport Park,12,1,SuperSport Park,12
Chandigarh,Punjab Cricket Association IS Bindra Stadium,10,1,Punjab Cricket Association IS Bindra Stadium,10


Reference for the venue names used in the mappings below 👇: https://www.iplt20.com/matches/schedule/men

In [15]:
# Canonical venue name -> every raw spelling that should be folded into it.
_venue_spellings = {
    "Arun Jaitley Stadium": [
        "Arun Jaitley Stadium",
        "Arun Jaitley Stadium, Delhi",
        "Feroz Shah Kotla",
    ],
    "Barsapara Cricket Stadium": [
        "Barsapara Cricket Stadium",
        "Barsapara Cricket Stadium, Guwahati",
    ],
    "Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium": [
        "Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium",
        "Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium, Lucknow",
    ],
    "Eden Gardens": [
        "Eden Gardens",
        "Eden Gardens, Kolkata",
    ],
    "Himachal Pradesh Cricket Association Stadium": [
        "Himachal Pradesh Cricket Association Stadium",
        "Himachal Pradesh Cricket Association Stadium, Dharamsala",
    ],
    "M Chinnaswamy Stadium": [
        "M Chinnaswamy Stadium",
        "M Chinnaswamy Stadium, Bengaluru",
        "M Chinnaswamy Stadium, Bangalore",
        "M.Chinnaswamy Stadium",
        "M.Chinnaswamy Stadium, Bengaluru",
        "M.Chinnaswamy Stadium, Bangalore",
    ],
    "MA Chidambaram Stadium": [
        "MA Chidambaram Stadium",
        "MA Chidambaram Stadium, Chennai",
        "MA Chidambaram Stadium, Chepauk",
        "MA Chidambaram Stadium, Chepauk, Chennai",
    ],
    "Narendra Modi Stadium": [
        "Narendra Modi Stadium",
        "Narendra Modi Stadium, Ahmedabad",
    ],
    "Punjab Cricket Association IS Bindra Stadium": [
        "Punjab Cricket Association IS Bindra Stadium",
        "Punjab Cricket Association IS Bindra Stadium, Mohali",
        "Punjab Cricket Association Stadium, Mohali",
    ],
    "Rajiv Gandhi International Stadium": [
        "Rajiv Gandhi International Stadium",
        "Rajiv Gandhi International Stadium, Hyderabad",
        "Rajiv Gandhi International Stadium, Uppal",
    ],
    "Sawai Mansingh Stadium": [
        "Sawai Mansingh Stadium",
        "Sawai Mansingh Stadium, Jaipur",
    ],
    "Wankhede Stadium": [
        "Wankhede Stadium",
        "Wankhede Stadium, Mumbai",
    ],
}

# Flattened lookup: raw venue spelling -> canonical venue name.
venue_mapping_normal = {
    spelling: canonical
    for canonical, spellings in _venue_spellings.items()
    for spelling in spellings
}

In [16]:
# Canonical venue name -> every kebab-case spelling that should be folded into it.
_venue_kebab_spellings = {
    "Arun Jaitley Stadium": [
        "arun-jaitley-stadium",
        "arun-jaitley-stadium-delhi",
        "feroz-shah-kotla",
    ],
    "Barsapara Cricket Stadium": [
        "barsapara-cricket-stadium",
        "barsapara-cricket-stadium-guwahati",
    ],
    "Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium": [
        "bharat-ratna-shri-atal-bihari-vajpayee-ekana-cricket-stadium",
        "bharat-ratna-shri-atal-bihari-vajpayee-ekana-cricket-stadium-lucknow",
    ],
    "Eden Gardens": [
        "eden-gardens",
        "eden-gardens-kolkata",
    ],
    "Himachal Pradesh Cricket Association Stadium": [
        "himachal-pradesh-cricket-association-stadium",
        "himachal-pradesh-cricket-association-stadium-dharamsala",
    ],
    "M Chinnaswamy Stadium": [
        "m-chinnaswamy-stadium",
        "m-chinnaswamy-stadium-bengaluru",
        "m-chinnaswamy-stadium-bangalore",
        "mchinnaswamy-stadium",
        "mchinnaswamy-stadium-bengaluru",
        "mchinnaswamy-stadium-bangalore",
    ],
    "MA Chidambaram Stadium": [
        "ma-chidambaram-stadium",
        "ma-chidambaram-stadium-chennai",
        "ma-chidambaram-stadium-chepauk",
        "ma-chidambaram-stadium-chepauk-chennai",
    ],
    "Narendra Modi Stadium": [
        "narendra-modi-stadium",
        "narendra-modi-stadium-ahmedabad",
    ],
    "Punjab Cricket Association IS Bindra Stadium": [
        "punjab-cricket-association-is-bindra-stadium",
        "punjab-cricket-association-is-bindra-stadium-mohali",
        "punjab-cricket-association-stadium-mohali",
    ],
    "Rajiv Gandhi International Stadium": [
        "rajiv-gandhi-international-stadium",
        "rajiv-gandhi-international-stadium-hyderabad",
        "rajiv-gandhi-international-stadium-uppal",
    ],
    "Sawai Mansingh Stadium": [
        "sawai-mansingh-stadium",
        "sawai-mansingh-stadium-jaipur",
    ],
    "Wankhede Stadium": [
        "wankhede-stadium",
        "wankhede-stadium-mumbai",
    ],
}

# Flattened lookup: kebab-case venue spelling -> canonical venue name.
venue_mapping_kebab = {
    spelling: canonical
    for canonical, spellings in _venue_kebab_spellings.items()
    for spelling in spellings
}

In [17]:
# Lower-case keyword ("tag") -> canonical venue name.
# Each venue is reachable through its city and through distinctive parts of its name.
# NOTE(review): how the tags are matched against input text is not visible in this
# part of the notebook — presumably as substrings of a lower-cased venue string.
venue_mapping_tags = {
  "delhi": "Arun Jaitley Stadium",
  "arun jaitley": "Arun Jaitley Stadium",
  "guwahati": "Barsapara Cricket Stadium",
  "barsapara": "Barsapara Cricket Stadium",
  "bhupen hazarika": "Barsapara Cricket Stadium",
  "assam cricket association stadium": "Barsapara Cricket Stadium",
  "lucknow": "Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium",
  "ekana": "Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium",
  "atal bihari": "Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium",
  "kolkata": "Eden Gardens",
  "eden gardens": "Eden Gardens",
  "dharamsala": "Himachal Pradesh Cricket Association Stadium",
  "himachal pradesh": "Himachal Pradesh Cricket Association Stadium",
  "bengaluru": "M Chinnaswamy Stadium",
  # "bangalore" is the spelling that occurs in the data (City == "Bangalore"); the
  # original table only had the misspelt "bengalore", so that city never matched.
  "bangalore": "M Chinnaswamy Stadium",
  # Misspelt key kept so any lookup that already relies on it keeps working.
  "bengalore": "M Chinnaswamy Stadium",
  "chinnaswamy": "M Chinnaswamy Stadium",
  "chennai": "MA Chidambaram Stadium",
  "chepauk": "MA Chidambaram Stadium",
  "chidambaram": "MA Chidambaram Stadium",
  "ahmedabad": "Narendra Modi Stadium",
  "narendra modi": "Narendra Modi Stadium",
  "mohali": "Punjab Cricket Association IS Bindra Stadium",
  "punjab cricket association": "Punjab Cricket Association IS Bindra Stadium",
  "is bindra": "Punjab Cricket Association IS Bindra Stadium",
  "hyderabad": "Rajiv Gandhi International Stadium",
  "rajiv gandhi": "Rajiv Gandhi International Stadium",
  "jaipur": "Sawai Mansingh Stadium",
  "sawai mansingh": "Sawai Mansingh Stadium",
  "mumbai": "Wankhede Stadium",
  "wankhede": "Wankhede Stadium"
}

In [18]:
np.setdiff1d(matches_result.venue.unique(), list(venue_mapping_normal.keys()))

array(['Barabati Stadium', 'Brabourne Stadium',
       'Brabourne Stadium, Mumbai', 'Buffalo Park',
       'De Beers Diamond Oval', 'Dr DY Patil Sports Academy',
       'Dr DY Patil Sports Academy, Mumbai',
       'Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium',
       'Dubai International Cricket Stadium', 'Green Park',
       'Holkar Cricket Stadium', 'JSCA International Stadium Complex',
       'Kingsmead', 'Maharashtra Cricket Association Stadium',
       'Maharashtra Cricket Association Stadium, Pune', 'Nehru Stadium',
       'New Wanderers Stadium', 'Newlands', 'OUTsurance Oval',
       'Sardar Patel Stadium, Motera',
       'Saurashtra Cricket Association Stadium',
       'Shaheed Veer Narayan Singh International Stadium',
       'Sharjah Cricket Stadium', 'Sheikh Zayed Stadium',
       "St George's Park", 'Subrata Roy Sahara Stadium',
       'SuperSport Park', 'Vidarbha Cricket Association Stadium, Jamtha',
       'Zayed Cricket Stadium, Abu Dhabi'], dtype=object)

- ## Get Teams Mapping

In [19]:
set(matches_result['team_1'].unique()) == set(matches_result['team_2'].unique()) == set(ball_by_ball['batting_team'].unique())

True

In [20]:
# Franchises covered by the mapping (former name in brackets):
#   Rajasthan Royals, Gujarat Titans, Royal Challengers Bangalore,
#   Lucknow Super Giants, Sunrisers Hyderabad, Mumbai Indians,
#   Chennai Super Kings, Kolkata Knight Riders,
#   Punjab Kings [Kings XI Punjab], Delhi Capitals [Delhi Daredevils]

# Teams whose name maps to itself.
_unchanged_teams = [
    'Rajasthan Royals',
    'Gujarat Titans',
    'Royal Challengers Bangalore',
    'Lucknow Super Giants',
    'Sunrisers Hyderabad',
    'Mumbai Indians',
    'Chennai Super Kings',
    'Kolkata Knight Riders',
]

# Raw team name -> current team name (10 distinct teams on the right-hand side).
team_mapping = {team: team for team in _unchanged_teams}
team_mapping.update({
    # Renamed franchises: old and new spellings both resolve to the new name.
    'Kings XI Punjab': 'Punjab Kings',
    'Punjab Kings': 'Punjab Kings',
    'Delhi Daredevils': 'Delhi Capitals',
    'Delhi Capitals': 'Delhi Capitals',
})

In [21]:
# Coverage check between the mapping keys and the team names seen in team_1.
mapped_team_names = list(team_mapping.keys())
team_1_names = matches_result['team_1'].unique()

# Mapping keys that never occur as team_1.
print(np.setdiff1d(mapped_team_names, team_1_names))

# team_1 names that have no entry in the mapping.
print(np.setdiff1d(team_1_names, mapped_team_names))

[]
['Deccan Chargers' 'Gujarat Lions' 'Kochi Tuskers Kerala' 'Pune Warriors'
 'Rising Pune Supergiant' 'Rising Pune Supergiants']


- ## Apply Venues/Teams Mapping [in matches_result, ball_by_ball]

In [22]:
# Replace raw names by their canonical form; anything without a mapping entry becomes 'Other'.
matches_result['venue'] = matches_result['venue'].map(venue_mapping_normal).fillna('Other')

for team_column in ('team_1', 'team_2'):
    matches_result[team_column] = matches_result[team_column].map(team_mapping).fillna('Other')

ball_by_ball['batting_team'] = ball_by_ball['batting_team'].map(team_mapping).fillna('Other')

In [23]:
matches_result.venue[matches_result.venue == 'Other'].shape

(359,)

In [24]:
# Number of matches whose team_1 / team_2 was mapped to 'Other'.
for team_column in ('team_1', 'team_2'):
    is_other = matches_result[team_column] == 'Other'
    print(matches_result.loc[is_other, team_column].shape)

(99,)
(96,)


In [25]:
ball_by_ball.batting_team[ball_by_ball.batting_team == 'Other'].shape

(23105,)

In [26]:
# Row/column counts after the mappings were applied.
for frame in (matches_result, ball_by_ball):
    print(frame.shape)

(950, 4)
(225954, 8)


- ## Remove NA Teams [in ball_by_ball] and Venues [in matches_result] (currently disabled: unmapped teams and venues are kept as 'Other' instead)

In [27]:
# matches_result = matches_result.dropna(subset=['team_1', 'team_2', 'venue'])
# print(matches_result.shape)

# ball_by_ball = ball_by_ball.dropna(subset=['batting_team'])
# print(ball_by_ball.shape)

- ## Select first 6 overs, Select innings 1 & 2, Map innings (1,2) to (0,1) [in ball_by_ball]

In [28]:
ball_by_ball.innings.unique()

array([1, 2, 3, 4, 5, 6], dtype=int64)

In [29]:
ball_by_ball.overs.unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19], dtype=int64)

In [30]:
# Overs are numbered from 0, so overs 0..5 are the first six overs of an innings.
POWERPLAY_LAST_OVER = 5
# Innings numbers above 2 also occur in the data (3..6); presumably super overs — they are excluded.
LAST_REGULAR_INNINGS = 2

in_powerplay = ball_by_ball['overs'] <= POWERPLAY_LAST_OVER
in_regular_innings = ball_by_ball['innings'] <= LAST_REGULAR_INNINGS

# .copy() makes the filtered frame independent of the frame it was sliced from, so the
# column assignment below does not raise pandas' SettingWithCopyWarning.
ball_by_ball = ball_by_ball.loc[in_powerplay & in_regular_innings].copy()

# Re-code innings (1, 2) -> (0, 1).
# NOTE: this cell is not idempotent — running it a second time would map the new 1s to 0.
ball_by_ball['innings'] = ball_by_ball['innings'].replace({1: 0, 2: 1})
ball_by_ball.shape

(70921, 8)

In [31]:
ball_by_ball.innings.unique()

array([0, 1], dtype=int64)

In [32]:
ball_by_ball.overs.unique()

array([0, 1, 2, 3, 4, 5], dtype=int64)

- ## Grouping 

In [33]:
# One group per (match, innings, batting side) over the retained deliveries.
ball_by_ball_gb = ball_by_ball.groupby(['match_id', 'innings', 'batting_team'])

In [34]:
# Per-group aggregates: sum of total_run, and the distinct batter / bowler names
# that appear in the group (each as an array of unique values).
total_runs = ball_by_ball_gb['total_run'].sum()
batsmen = ball_by_ball_gb['batter'].unique()
bowlers = ball_by_ball_gb['bowler'].unique()

In [35]:
# Turn each grouped Series into a flat DataFrame: the group keys become ordinary
# columns and the aggregated values get an explicit column name.
total_runs = total_runs.reset_index(name='total_runs')
batsmen = batsmen.reset_index(name='batsmen')
bowlers = bowlers.reset_index(name='bowlers')

In [36]:
# Columns that identify one innings in the aggregated frames.
innings_keys = ['match_id', 'innings', 'batting_team']

# Combine the per-innings aggregates, then attach team_1 / team_2 / venue of the match.
data = (
    total_runs
    .merge(batsmen, how='right', on=innings_keys)
    .merge(bowlers, how='right', on=innings_keys)
    .merge(matches_result, on=['match_id'])
)

In [37]:
# The bowling side is whichever of team_1 / team_2 is not batting:
# team_2 where the batting side equals team_1, otherwise team_1.
mask = data['batting_team'] == data['team_1']
data['bowling_team'] = data['team_2'].where(mask, data['team_1'])

In [38]:
data.query('match_id == 829763')

Unnamed: 0,match_id,innings,batting_team,total_runs,batsmen,bowlers,team_1,team_2,venue,bowling_team
971,829763,0,Royal Challengers Bangalore,52,"[CH Gayle, AB de Villiers, V Kohli, Mandeep Si...","[TG Southee, DS Kulkarni, JP Faulkner, SR Watson]",Royal Challengers Bangalore,Rajasthan Royals,M Chinnaswamy Stadium,Rajasthan Royals


In [39]:
data.query('match_id == 829813')

Unnamed: 0,match_id,innings,batting_team,total_runs,batsmen,bowlers,team_1,team_2,venue,bowling_team
1020,829813,0,Delhi Capitals,54,"[Q de Kock, SS Iyer]","[MA Starc, AB Dinda, HV Patel, D Wiese]",Royal Challengers Bangalore,Delhi Capitals,M Chinnaswamy Stadium,Royal Challengers Bangalore
1021,829813,1,Royal Challengers Bangalore,2,"[V Kohli, CH Gayle]","[J Yadav, Z Khan]",Royal Challengers Bangalore,Delhi Capitals,M Chinnaswamy Stadium,Delhi Capitals


In [40]:
# Matches removed from the training data:
#   829763 - data for one innings is missing
#   829813 - total_runs for one innings is 2 (probably a mistake in data entry)
excluded_match_ids = [829763, 829813]
rows_to_drop = data.index[data['match_id'].isin(excluded_match_ids)]
data = data.drop(index=rows_to_drop)

In [41]:
# get count of batsmen & bowlers for each innings 
data['count_batsmen'] = [len(x) for x in data['batsmen']]
data['count_bowlers'] = [len(x) for x in data['bowlers']]

In [42]:
# Keep only the model inputs, followed by the target (total_runs) as the last column.
feature_columns = ['venue', 'innings', 'batting_team', 'bowling_team', 'count_batsmen', 'count_bowlers']
data = data[feature_columns + ['total_runs']]

# Final training dataset

In [43]:
data

Unnamed: 0,venue,innings,batting_team,bowling_team,count_batsmen,count_bowlers,total_runs
0,M Chinnaswamy Stadium,0,Kolkata Knight Riders,Royal Challengers Bangalore,3,3,61
1,M Chinnaswamy Stadium,1,Royal Challengers Bangalore,Kolkata Knight Riders,6,3,26
2,Punjab Cricket Association IS Bindra Stadium,0,Chennai Super Kings,Punjab Kings,3,3,53
3,Punjab Cricket Association IS Bindra Stadium,1,Punjab Kings,Chennai Super Kings,2,2,63
4,Arun Jaitley Stadium,0,Rajasthan Royals,Delhi Capitals,4,3,40
...,...,...,...,...,...,...,...
1893,Eden Gardens,1,Lucknow Super Giants,Royal Challengers Bangalore,4,3,62
1894,Narendra Modi Stadium,0,Royal Challengers Bangalore,Rajasthan Royals,3,2,46
1895,Narendra Modi Stadium,1,Rajasthan Royals,Royal Challengers Bangalore,3,4,67
1896,Narendra Modi Stadium,0,Rajasthan Royals,Gujarat Titans,3,4,44


In [44]:
# Venues from the hard-coded list below that do not occur in the training data's
# `venue` column (setdiff1d returns the elements of the first argument missing from the second).
np.setdiff1d(
    ['Arun Jaitley Stadium', 'Barsapara Cricket Stadium',
       'Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium',
       'Eden Gardens', 'Himachal Pradesh Cricket Association Stadium',
       'M Chinnaswamy Stadium', 'MA Chidambaram Stadium',
       'Narendra Modi Stadium',
       'Punjab Cricket Association IS Bindra Stadium',
       'Rajiv Gandhi International Stadium', 'Sawai Mansingh Stadium',
       'Wankhede Stadium'], data.venue.unique()
)

array(['Barsapara Cricket Stadium',
       'Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium'],
      dtype='<U60')

In [45]:
data.groupby(['venue']).total_runs.describe()[['count', 'mean', '75%']].sort_values(by='mean')

Unnamed: 0_level_0,count,mean,75%
venue,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Himachal Pradesh Cricket Association Stadium,18.0,40.555556,48.0
Sawai Mansingh Stadium,94.0,45.042553,55.0
Other,718.0,45.362117,53.0
Wankhede Stadium,208.0,45.480769,53.25
Rajiv Gandhi International Stadium,128.0,45.585938,54.25
M Chinnaswamy Stadium,156.0,46.025641,54.25
Narendra Modi Stadium,14.0,46.071429,48.25
MA Chidambaram Stadium,134.0,46.425373,53.75
Eden Gardens,158.0,46.56962,52.0
Arun Jaitley Stadium,155.0,47.832258,55.0


In [46]:
data.groupby(['batting_team']).total_runs.describe()[['count', 'mean', '75%']].sort_values(by='mean')

Unnamed: 0_level_0,count,mean,75%
batting_team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Lucknow Super Giants,15.0,44.666667,56.0
Royal Challengers Bangalore,224.0,44.852679,52.25
Rajasthan Royals,191.0,45.172775,53.0
Chennai Super Kings,208.0,45.221154,53.0
Mumbai Indians,231.0,45.480519,53.0
Kolkata Knight Riders,223.0,46.076233,53.0
Other,194.0,46.226804,55.0
Gujarat Titans,16.0,46.25,53.0
Delhi Capitals,223.0,46.609865,55.0
Sunrisers Hyderabad,152.0,47.118421,56.0


In [47]:
data.groupby(['count_batsmen']).total_runs.describe()[['count', 'mean', '75%']].sort_values(by='mean')

Unnamed: 0_level_0,count,mean,75%
count_batsmen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
7,9.0,29.888889,32.0
6,59.0,34.847458,39.0
5,190.0,37.542105,44.75
4,499.0,42.679359,49.5
8,2.0,45.5,53.75
3,684.0,47.545322,54.25
2,452.0,52.442478,59.0


In [48]:
data.groupby(['count_bowlers']).total_runs.describe()[['count', 'mean', '75%']].sort_values(by='mean')

Unnamed: 0_level_0,count,mean,75%
count_bowlers,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,95.0,39.484211,47.0
3,767.0,43.615385,51.0
4,903.0,47.496124,55.0
5,124.0,53.451613,60.0
6,6.0,58.333333,60.0


In [49]:
# total_runs summary (count, mean, 75th percentile) per (batting_team, venue), sorted by mean,
# then narrowed to the rows for Gujarat Titans.
tmp = data.groupby(['batting_team', 'venue']).total_runs.describe()[['count', 'mean', '75%']].sort_values(by='mean').reset_index()
tmp[tmp.batting_team == 'Gujarat Titans']

Unnamed: 0,batting_team,venue,count,mean,75%
0,Gujarat Titans,Narendra Modi Stadium,1.0,31.0,31.0
37,Gujarat Titans,Other,10.0,45.1,50.0
71,Gujarat Titans,Wankhede Stadium,4.0,48.5,54.5
98,Gujarat Titans,Eden Gardens,1.0,64.0,64.0


- ## Encoding of categorical inputs and feature scaling

In [50]:
data

Unnamed: 0,venue,innings,batting_team,bowling_team,count_batsmen,count_bowlers,total_runs
0,M Chinnaswamy Stadium,0,Kolkata Knight Riders,Royal Challengers Bangalore,3,3,61
1,M Chinnaswamy Stadium,1,Royal Challengers Bangalore,Kolkata Knight Riders,6,3,26
2,Punjab Cricket Association IS Bindra Stadium,0,Chennai Super Kings,Punjab Kings,3,3,53
3,Punjab Cricket Association IS Bindra Stadium,1,Punjab Kings,Chennai Super Kings,2,2,63
4,Arun Jaitley Stadium,0,Rajasthan Royals,Delhi Capitals,4,3,40
...,...,...,...,...,...,...,...
1893,Eden Gardens,1,Lucknow Super Giants,Royal Challengers Bangalore,4,3,62
1894,Narendra Modi Stadium,0,Royal Challengers Bangalore,Rajasthan Royals,3,2,46
1895,Narendra Modi Stadium,1,Rajasthan Royals,Royal Challengers Bangalore,3,4,67
1896,Narendra Modi Stadium,0,Rajasthan Royals,Gujarat Titans,3,4,44


In [51]:
data.nunique()

venue            11
innings           2
batting_team     11
bowling_team     11
count_batsmen     7
count_bowlers     5
total_runs       75
dtype: int64

In [52]:
pd.get_dummies(data)

Unnamed: 0,innings,count_batsmen,count_bowlers,total_runs,venue_Arun Jaitley Stadium,venue_Eden Gardens,venue_Himachal Pradesh Cricket Association Stadium,venue_M Chinnaswamy Stadium,venue_MA Chidambaram Stadium,venue_Narendra Modi Stadium,...,bowling_team_Delhi Capitals,bowling_team_Gujarat Titans,bowling_team_Kolkata Knight Riders,bowling_team_Lucknow Super Giants,bowling_team_Mumbai Indians,bowling_team_Other,bowling_team_Punjab Kings,bowling_team_Rajasthan Royals,bowling_team_Royal Challengers Bangalore,bowling_team_Sunrisers Hyderabad
0,0,3,3,61,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
1,1,6,3,26,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
2,0,3,3,53,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,1,2,2,63,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,4,3,40,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1893,1,4,3,62,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1894,0,3,2,46,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
1895,1,3,4,67,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
1896,0,3,4,44,0,0,0,0,0,1,...,0,1,0,0,0,0,0,0,0,0


In [53]:
# Separate model inputs from the target by column NAME rather than by position, so the
# split stays correct even if the column order of `data` changes
# (the original positional slice relied on total_runs being the last column).
TARGET_COLUMN = "total_runs"

X = data.drop(columns=[TARGET_COLUMN])
y = data[TARGET_COLUMN]

Normalization scales the data to a range of 0 to 1, while standardization scales the data to have a mean of 0 and a standard deviation of 1.

In [54]:
# Feature preprocessing:
#   - one-hot encode the categorical columns (dense output);
#   - standardise the two count columns;
#   - pass every remaining column (here: innings) through unchanged.
# handle_unknown='ignore' encodes a category that was not present at fit time as all
# zeros instead of raising at transform time. This matters because some venues in the
# canonical venue list have no rows in the training data. The encoding of the training
# data itself is unaffected.
preprocessor = ColumnTransformer([
    (
        "onehot",
        OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
        ["venue", "batting_team", "bowling_team"],
    ),
    ("scaler", StandardScaler(), ["count_batsmen", "count_bowlers"]),
], remainder='passthrough')

In [55]:
# Fit the encoder/scaler on the full feature table and transform it in one step.
# NOTE(review): this happens before the train/test split, so the fitted categories and
# scaling statistics also reflect rows that later land in the test set — consider
# fitting on the training split only.
X_preprocessed = preprocessor.fit_transform(X)

In [56]:
X_preprocessed.shape

(1895, 36)

In [57]:
X_preprocessed[0]

array([ 0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  1.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  1.        ,  0.        , -0.31740491, -0.80500065,
        0.        ])

- ## Train-test split

In [58]:
# Hold out 20% of the rows for testing.
# NOTE(review): no random_state is passed, so the shuffle relies on NumPy's global RNG
# (seeded near the top of the notebook); the split is only repeatable when the cells
# are executed in the same order — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size = 0.2)

In [59]:
y_test.shape

(379,)

In [60]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def evaluate(regressor, X_test, y_test):
    """Score a fitted regressor on a hold-out set and return actual vs. predicted values.

    Predictions are rounded to the nearest integer before scoring. MAE, RMSE,
    R-squared and the summed absolute error are printed, in that order.

    Parameters
    ----------
    regressor : object exposing ``predict(X_test)``.
    X_test : feature matrix passed to ``regressor.predict``.
    y_test : true target values, aligned positionally with ``X_test``.

    Returns
    -------
    pandas.DataFrame
        Columns 'Actual' and 'Predicted', one row per test sample.
    """
    predicted = np.round(regressor.predict(X_test)).astype(int)

    # (label, thunk) pairs: each metric is computed and printed before the next one runs.
    report = (
        ('MAE:', lambda: mean_absolute_error(y_test, predicted)),
        ('RMSE:', lambda: np.sqrt(mean_squared_error(y_test, predicted))),
        ('R-squared:', lambda: r2_score(y_test, predicted)),
        ('Sum(|y_test - y_pred|):', lambda: np.abs(y_test - predicted).sum()),
    )
    for label, compute in report:
        print(label, compute())

    actual_vs_predicted = list(zip(y_test, predicted))
    return pd.DataFrame(actual_vs_predicted, columns=['Actual', 'Predicted'])

- ## Models

In [61]:
# Registry of the regressors tried below, keyed by a name string.
models = {}

In [62]:
from sklearn.ensemble import AdaBoostRegressor

# AdaBoost with 100 estimators and exponential loss: register, fit on the training
# split, then score on the hold-out split (the returned table is the cell output).
regressor = AdaBoostRegressor(
    n_estimators=100,
    learning_rate=1,
    loss='exponential',
)
models['AdaBoostRegressor'] = regressor
regressor.fit(X_train, y_train)
evaluate(regressor, X_test, y_test)

MAE: 9.717678100263852
RMSE: 12.106497524961533
R-squared: -0.09487522323355546
Sum(|y_test - y_pred|): 3683


Unnamed: 0,Actual,Predicted
0,41,56
1,28,42
2,49,56
3,42,50
4,32,53
...,...,...
374,44,40
375,41,55
376,55,54
377,50,53


In [63]:
from sklearn.linear_model import LinearRegression

# Linear regression with default settings: register, fit, then score on the hold-out split.
regressor = LinearRegression()
models['LinearRegression'] = regressor
regressor.fit(X_train, y_train)
evaluate(regressor, X_test, y_test)

MAE: 8.313984168865435
RMSE: 10.285758784508173
R-squared: 0.20968492995380883
Sum(|y_test - y_pred|): 3151


Unnamed: 0,Actual,Predicted
0,41,50
1,28,39
2,49,44
3,42,48
4,32,47
...,...,...
374,44,33
375,41,51
376,55,50
377,50,45


In [64]:
from sklearn.tree import DecisionTreeRegressor

# Decision tree with default settings: register, fit, then score on the hold-out split.
regressor = DecisionTreeRegressor()
models['DecisionTreeRegressor'] = regressor
regressor.fit(X_train, y_train)
evaluate(regressor, X_test, y_test)

MAE: 11.49868073878628
RMSE: 14.663688032069114
R-squared: -0.6062532438422901
Sum(|y_test - y_pred|): 4358


Unnamed: 0,Actual,Predicted
0,41,38
1,28,38
2,49,38
3,42,47
4,32,51
...,...,...
374,44,37
375,41,46
376,55,50
377,50,52


In [65]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with default settings: register, fit, then score on the hold-out split.
regressor = RandomForestRegressor()
models['RandomForestRegressor'] = regressor
regressor.fit(X_train, y_train)
evaluate(regressor, X_test, y_test)

MAE: 8.62532981530343
RMSE: 10.993281709483448
R-squared: 0.0972192145715216
Sum(|y_test - y_pred|): 3269


Unnamed: 0,Actual,Predicted
0,41,43
1,28,35
2,49,45
3,42,45
4,32,49
...,...,...
374,44,31
375,41,49
376,55,50
377,50,49


In [66]:
from sklearn.neighbors import KNeighborsRegressor

# k-nearest-neighbours regressor with default settings: register, fit, then score on the hold-out split.
regressor = KNeighborsRegressor()
models['KNeighborsRegressor'] = regressor
regressor.fit(X_train, y_train)
evaluate(regressor, X_test, y_test)

MAE: 8.831134564643799
RMSE: 10.969975815522103
R-squared: 0.10104297005420015
Sum(|y_test - y_pred|): 3347


Unnamed: 0,Actual,Predicted
0,41,40
1,28,40
2,49,42
3,42,46
4,32,41
...,...,...
374,44,37
375,41,50
376,55,56
377,50,47


In [67]:
from sklearn.svm import SVR

# Support vector regressor with default settings: register, fit, then score on the hold-out split.
regressor = SVR()
models['SVR'] = regressor
regressor.fit(X_train, y_train)
evaluate(regressor, X_test, y_test)

MAE: 8.131926121372032
RMSE: 10.141999719549673
R-squared: 0.2316222487796913
Sum(|y_test - y_pred|): 3082


Unnamed: 0,Actual,Predicted
0,41,48
1,28,38
2,49,47
3,42,47
4,32,49
...,...,...
374,44,35
375,41,49
376,55,50
377,50,46


In [68]:
import xgboost as xgb

# XGBoost regressor with default settings: register, fit, then score on the hold-out split.
regressor = xgb.XGBRegressor()
models['XGBRegressor'] = regressor
regressor.fit(X_train, y_train)
evaluate(regressor, X_test, y_test)

MAE: 9.100263852242744
RMSE: 11.651831251093443
R-squared: -0.014182156501153953
Sum(|y_test - y_pred|): 3449


Unnamed: 0,Actual,Predicted
0,41,39
1,28,37
2,49,42
3,42,47
4,32,48
...,...,...
374,44,33
375,41,59
376,55,53
377,50,47


- # Evaluation [using IPL-2023 dataset]

In [69]:
import os
import re

# Directory holding the per-match IPL-2023 feature/label CSVs.
FILES_DIR = './FilesUsed'
# Matches numbered below this are skipped when building the evaluation set.
MIN_MATCH_NO = 20

# Feature files are named `test_file_matchid_<no>.csv`. Label files
# (`test_file_labels_matchid_<no>.csv`) do not contain this pattern and are
# located from the parsed match number instead. Parsing the digits with a
# regex (rather than slicing fixed character positions) keeps this correct for
# 1-, 2- and 3-digit match numbers.
feature_file_pattern = re.compile(r'test_file_matchid_(\d+)\.csv$')

# os.listdir() returns entries in arbitrary order; sort so that the row order
# of X_IPL23 / y_IPL23 is the same on every machine and every run.
files = sorted(os.listdir(FILES_DIR))

selected_files = []
for file in files:
    found = feature_file_pattern.search(file)
    if found is None:
        continue
    if int(found.group(1)) < MIN_MATCH_NO:
        continue
    # Keep the match number both as int (sort key) and as the original text
    # (needed verbatim to rebuild the label file name, e.g. zero padding).
    selected_files.append((int(found.group(1)), found.group(1), file))

# Numeric match order, so match 100 comes after match 99, not before match 20.
selected_files.sort()

all_X = []
all_y = []
for _, match_no, file in selected_files:
    X_file_name = FILES_DIR + '/' + file
    y_file_name = FILES_DIR + '/' + 'test_file_labels_matchid_' + match_no + '.csv'

    # The feature CSV carries a saved index column ('Unnamed: 0') that is not a feature.
    X = pd.read_csv(X_file_name).drop(columns=['Unnamed: 0'])
    y = pd.read_csv(y_file_name)['actual_runs']

    # Features and labels are appended in lock-step so row i of the
    # concatenated X lines up with row i of the concatenated y.
    all_X.append(X)
    all_y.append(y)

    print(match_no, X_file_name, y_file_name)

X_IPL23 = pd.concat(all_X, axis=0, ignore_index=True)
y_IPL23 = pd.concat(all_y, axis=0, ignore_index=True)

20 ./FilesUsed/test_file_matchid_20.csv ./FilesUsed/test_file_labels_matchid_20.csv
21 ./FilesUsed/test_file_matchid_21.csv ./FilesUsed/test_file_labels_matchid_21.csv
22 ./FilesUsed/test_file_matchid_22.csv ./FilesUsed/test_file_labels_matchid_22.csv
23 ./FilesUsed/test_file_matchid_23.csv ./FilesUsed/test_file_labels_matchid_23.csv
24 ./FilesUsed/test_file_matchid_24.csv ./FilesUsed/test_file_labels_matchid_24.csv
25 ./FilesUsed/test_file_matchid_25.csv ./FilesUsed/test_file_labels_matchid_25.csv
26 ./FilesUsed/test_file_matchid_26.csv ./FilesUsed/test_file_labels_matchid_26.csv
27 ./FilesUsed/test_file_matchid_27.csv ./FilesUsed/test_file_labels_matchid_27.csv
28 ./FilesUsed/test_file_matchid_28.csv ./FilesUsed/test_file_labels_matchid_28.csv
29 ./FilesUsed/test_file_matchid_29.csv ./FilesUsed/test_file_labels_matchid_29.csv
30 ./FilesUsed/test_file_matchid_30.csv ./FilesUsed/test_file_labels_matchid_30.csv
31 ./FilesUsed/test_file_matchid_31.csv ./FilesUsed/test_file_labels_matchid

In [70]:
# Sanity check: number of per-match feature frames collected in `all_X`.
len(all_X)

14

In [71]:
# Build the model-ready frame in a single non-mutating expression.
# The previous version edited X_IPL23 in place step by step before rebinding
# it, so re-running the cell first remapped the already-remapped innings
# values (1 -> 0) and only then failed on the dropped 'batsmen' column,
# leaving a silently corrupted frame behind. With .assign() every step works
# on a copy: if anything raises, X_IPL23 is left exactly as it was.
X_IPL23 = X_IPL23.assign(
    # Re-code innings from 1/2 to 0/1.
    innings=lambda df: df['innings'].replace({1: 0, 2: 1}),
    # 'batsmen' / 'bowlers' hold comma-separated names; keep only how many
    # names each innings lists.
    count_batsmen=lambda df: df['batsmen'].map(lambda names: len(names.split(","))),
    count_bowlers=lambda df: df['bowlers'].map(lambda names: len(names.split(","))),
)[
    # Selecting the final columns also discards the raw 'batsmen'/'bowlers' text.
    ['venue', 'innings', 'batting_team', 'bowling_team', 'count_batsmen', 'count_bowlers']
]

In [72]:
# Venue spellings present in the IPL-2023 data that have no exact entry in
# venue_mapping_normal.
known_venue_names = list(venue_mapping_normal.keys())
ambiguous_venues = np.setdiff1d(X_IPL23.venue.unique(), known_venue_names)

ambiguous_venues_mapping = {}
for raw_venue in ambiguous_venues:
    # First attempt: look the name up by its kebab-case form.
    kebab_name = to_kebab_case(raw_venue)
    if kebab_name in venue_mapping_kebab:
        ambiguous_venues_mapping[raw_venue] = venue_mapping_kebab[kebab_name]
        continue

    # Fallback: substring match of each tag against the lower-cased name.
    # Every matching tag overwrites the previous one, so the last matching
    # tag in iteration order wins; a venue with no matching tag stays unmapped.
    lowered_name = raw_venue.lower()
    for tag in venue_mapping_tags:
        if tag in lowered_name:
            ambiguous_venues_mapping[raw_venue] = venue_mapping_tags[tag]

# Exact-name entries first, then the resolved ambiguous spellings on top.
venue_mapping_final = dict(venue_mapping_normal)
venue_mapping_final.update(ambiguous_venues_mapping)

# Displayed result: venues that still have no mapping (empty array = all resolved).
np.setdiff1d(X_IPL23.venue.unique(), list(venue_mapping_final.keys()))

array([], dtype=object)

In [73]:
# Venues that are explicitly folded into the catch-all 'Other' bucket even
# though they have a mapping entry.
venues_forced_to_other = {
    'Barsapara Cricket Stadium': 'Other',
    'Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium': 'Other'
}

# Translate raw venue names via venue_mapping_final; names without an entry
# become NaN and are then labelled 'Other'.
mapped_venues = X_IPL23['venue'].map(venue_mapping_final)
mapped_venues = mapped_venues.fillna('Other')
X_IPL23['venue'] = mapped_venues.replace(venues_forced_to_other)

In [74]:
# Display the IPL-2023 feature frame to eyeball the venue/innings/count columns.
X_IPL23

Unnamed: 0,venue,innings,batting_team,bowling_team,count_batsmen,count_bowlers
0,M Chinnaswamy Stadium,0,Royal Challengers Bangalore,Delhi Capitals,3,5
1,M Chinnaswamy Stadium,1,Delhi Capitals,Royal Challengers Bangalore,3,2
2,Other,0,Lucknow Super Giants,Punjab Kings,2,4
3,Other,1,Punjab Kings,Lucknow Super Giants,4,4
4,Wankhede Stadium,0,Kolkata Knight Riders,Mumbai Indians,4,4
5,Wankhede Stadium,1,Mumbai Indians,Kolkata Knight Riders,3,4
6,Narendra Modi Stadium,0,Gujarat Titans,Rajasthan Royals,4,4
7,Narendra Modi Stadium,1,Rajasthan Royals,Gujarat Titans,4,2
8,M Chinnaswamy Stadium,0,Chennai Super Kings,Royal Challengers Bangalore,3,3
9,M Chinnaswamy Stadium,1,Royal Challengers Bangalore,Chennai Super Kings,4,3


In [75]:
# Run the IPL-2023 features through `preprocessor`. Only `.transform` is
# called (not `fit_transform`), so nothing is re-fit on this data here.
# NOTE(review): presumably `preprocessor` was fitted on the training data in an earlier cell — confirm.
X_IPL23_preprocessed = preprocessor.transform(X_IPL23)

In [76]:
# Sanity check: shape of the transformed IPL-2023 matrix.
X_IPL23_preprocessed.shape

(28, 36)

In [77]:
# Peek at the first transformed row.
X_IPL23_preprocessed[0]

array([ 0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        1.        ,  0.        ,  0.        ,  1.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        , -0.31740491,  2.03573722,
        0.        ])

In [78]:
# Score the LinearRegression model stored in `models` on the IPL-2023 data.
evaluate(models['LinearRegression'], X_IPL23_preprocessed, y_IPL23)

MAE: 7.928571428571429
RMSE: 10.579630023236703
R-squared: 0.113857836751593
Sum(|y_test - y_pred|): 222.0


Unnamed: 0,Actual,Predicted
0,47.0,52
1,32.0,44
2,49.0,52
3,45.0,44
4,57.0,41
5,72.0,49
6,42.0,44
7,26.0,39
8,53.0,44
9,75.0,46


In [79]:
class ConstantRegressor:
    """Baseline "model" that ignores its input and always predicts one value.

    Only ``predict`` is provided; there is no ``fit`` step because nothing is
    learned.
    """

    def __init__(self, n):
        """Store ``n``, the value returned for every row by ``predict``."""
        self.n = n

    def predict(self, X):
        """Return a 1-D array holding ``self.n`` once per row of ``X``.

        ``X`` only needs a ``shape`` attribute; its first entry is taken as
        the number of rows. The contents of ``X`` are never read.
        """
        row_count = X.shape[0]
        return np.repeat(self.n, repeats=row_count)

In [80]:
# Baseline for comparison: score a regressor that outputs 40 for every row.
evaluate(ConstantRegressor(40), X_IPL23_preprocessed, y_IPL23)

MAE: 12.178571428571429
RMSE: 14.972594011345242
R-squared: -0.7748290870166721
Sum(|y_test - y_pred|): 341.0


Unnamed: 0,Actual,Predicted
0,47.0,40
1,32.0,40
2,49.0,40
3,45.0,40
4,57.0,40
5,72.0,40
6,42.0,40
7,26.0,40
8,53.0,40
9,75.0,40


In [81]:
# Second baseline: score a regressor that outputs 46 for every row.
evaluate(ConstantRegressor(46), X_IPL23_preprocessed, y_IPL23)

MAE: 9.464285714285714
RMSE: 11.893875975235563
R-squared: -0.11997737990649004
Sum(|y_test - y_pred|): 265.0


Unnamed: 0,Actual,Predicted
0,47.0,46
1,32.0,46
2,49.0,46
3,45.0,46
4,57.0,46
5,72.0,46
6,42.0,46
7,26.0,46
8,53.0,46
9,75.0,46
