In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [2]:
ball_by_ball = pd.read_csv('./Data/IPL_Ball_by_Ball_2008_2022.csv')
matches_result = pd.read_csv('./Data/IPL_Matches_Result_2008_2022.csv')
ipl_2023_teams = pd.read_csv('./Data/Ipl_2023 _cricketers - Team name.csv').rename(columns={
    'Teams': 'team'
})
ipl_2023_venues = pd.read_csv('./Data/Ipl_2023 _cricketers - Venue.csv').rename(columns={
    'Venue': 'venue'
})

In [3]:
def log(*args):
    print('👉', *args)
    

In [4]:
def to_kebab_case(string):
    return '-'.join(
        string.replace(",", "").replace(".", "").split()
    ).lower()

# Preprocessing 

- ## Change column names, drop unnecessary columns [in ball_by_ball, matches_result]

In [5]:
ball_by_ball_orig = ball_by_ball

ball_by_ball = ball_by_ball.rename(columns={
    'ID': 'match_id',
    'ballnumber': 'ball_number',
    'non-striker': 'non_striker',
    'BattingTeam': 'batting_team',
}).loc[:, [
    'match_id',
    'innings',
    'batting_team',
    'overs',
    'ball_number',
    'batter',
    'bowler',
    'total_run',
]]

In [6]:
matches_result_orig = matches_result

matches_result = matches_result.rename(columns={
    'ID': 'match_id',
    'Team1': 'team_1',
    'Team2': 'team_2',
    'Venue': 'venue',
}).loc[:, [
    'match_id',
    'team_1',
    'team_2',
    'venue',
]]

In [7]:
print(ball_by_ball_orig.shape)
ball_by_ball_orig.head()

(225954, 17)


Unnamed: 0,ID,innings,overs,ballnumber,batter,bowler,non-striker,extra_type,batsman_run,extras_run,total_run,non_boundary,isWicketDelivery,player_out,kind,fielders_involved,BattingTeam
0,1312200,1,0,1,YBK Jaiswal,Mohammed Shami,JC Buttler,,0,0,0,0,0,,,,Rajasthan Royals
1,1312200,1,0,2,YBK Jaiswal,Mohammed Shami,JC Buttler,legbyes,0,1,1,0,0,,,,Rajasthan Royals
2,1312200,1,0,3,JC Buttler,Mohammed Shami,YBK Jaiswal,,1,0,1,0,0,,,,Rajasthan Royals
3,1312200,1,0,4,YBK Jaiswal,Mohammed Shami,JC Buttler,,0,0,0,0,0,,,,Rajasthan Royals
4,1312200,1,0,5,YBK Jaiswal,Mohammed Shami,JC Buttler,,0,0,0,0,0,,,,Rajasthan Royals


In [8]:
print(matches_result_orig.shape)
matches_result_orig.head()

(950, 20)


Unnamed: 0,ID,City,Date,Season,MatchNumber,Team1,Team2,Venue,TossWinner,TossDecision,SuperOver,WinningTeam,WonBy,Margin,method,Player_of_Match,Team1Players,Team2Players,Umpire1,Umpire2
0,1312200,Ahmedabad,2022-05-29,2022,Final,Rajasthan Royals,Gujarat Titans,"Narendra Modi Stadium, Ahmedabad",Rajasthan Royals,bat,N,Gujarat Titans,Wickets,7.0,,HH Pandya,"['YBK Jaiswal', 'JC Buttler', 'SV Samson', 'D ...","['WP Saha', 'Shubman Gill', 'MS Wade', 'HH Pan...",CB Gaffaney,Nitin Menon
1,1312199,Ahmedabad,2022-05-27,2022,Qualifier 2,Royal Challengers Bangalore,Rajasthan Royals,"Narendra Modi Stadium, Ahmedabad",Rajasthan Royals,field,N,Rajasthan Royals,Wickets,7.0,,JC Buttler,"['V Kohli', 'F du Plessis', 'RM Patidar', 'GJ ...","['YBK Jaiswal', 'JC Buttler', 'SV Samson', 'D ...",CB Gaffaney,Nitin Menon
2,1312198,Kolkata,2022-05-25,2022,Eliminator,Royal Challengers Bangalore,Lucknow Super Giants,"Eden Gardens, Kolkata",Lucknow Super Giants,field,N,Royal Challengers Bangalore,Runs,14.0,,RM Patidar,"['V Kohli', 'F du Plessis', 'RM Patidar', 'GJ ...","['Q de Kock', 'KL Rahul', 'M Vohra', 'DJ Hooda...",J Madanagopal,MA Gough
3,1312197,Kolkata,2022-05-24,2022,Qualifier 1,Rajasthan Royals,Gujarat Titans,"Eden Gardens, Kolkata",Gujarat Titans,field,N,Gujarat Titans,Wickets,7.0,,DA Miller,"['YBK Jaiswal', 'JC Buttler', 'SV Samson', 'D ...","['WP Saha', 'Shubman Gill', 'MS Wade', 'HH Pan...",BNJ Oxenford,VK Sharma
4,1304116,Mumbai,2022-05-22,2022,70,Sunrisers Hyderabad,Punjab Kings,"Wankhede Stadium, Mumbai",Sunrisers Hyderabad,bat,N,Punjab Kings,Wickets,5.0,,Harpreet Brar,"['PK Garg', 'Abhishek Sharma', 'RA Tripathi', ...","['JM Bairstow', 'S Dhawan', 'M Shahrukh Khan',...",AK Chaudhary,NA Patwardhan


In [9]:
print(ball_by_ball.shape)
ball_by_ball.head()

(225954, 8)


Unnamed: 0,match_id,innings,batting_team,overs,ball_number,batter,bowler,total_run
0,1312200,1,Rajasthan Royals,0,1,YBK Jaiswal,Mohammed Shami,0
1,1312200,1,Rajasthan Royals,0,2,YBK Jaiswal,Mohammed Shami,1
2,1312200,1,Rajasthan Royals,0,3,JC Buttler,Mohammed Shami,1
3,1312200,1,Rajasthan Royals,0,4,YBK Jaiswal,Mohammed Shami,0
4,1312200,1,Rajasthan Royals,0,5,YBK Jaiswal,Mohammed Shami,0


In [10]:
print(matches_result.shape)
matches_result.head()

(950, 4)


Unnamed: 0,match_id,team_1,team_2,venue
0,1312200,Rajasthan Royals,Gujarat Titans,"Narendra Modi Stadium, Ahmedabad"
1,1312199,Royal Challengers Bangalore,Rajasthan Royals,"Narendra Modi Stadium, Ahmedabad"
2,1312198,Royal Challengers Bangalore,Lucknow Super Giants,"Eden Gardens, Kolkata"
3,1312197,Rajasthan Royals,Gujarat Titans,"Eden Gardens, Kolkata"
4,1304116,Sunrisers Hyderabad,Punjab Kings,"Wankhede Stadium, Mumbai"


In [11]:
log('match_id.nunique:', ball_by_ball.match_id.nunique())
log('batting_team.nunique:', ball_by_ball.batting_team.nunique())
log('union1d(batter, bowler).shape:', np.union1d(
    ball_by_ball.batter.unique(), ball_by_ball.bowler.unique()
).shape)
log('innings.unique:', ball_by_ball.innings.unique())
log('overs.unique:', ball_by_ball.overs.unique())

👉 match_id.nunique: 950
👉 batting_team.nunique: 18
👉 union1d(batter, bowler).shape: (652,)
👉 innings.unique: [1 2 3 4 5 6]
👉 overs.unique: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]


In [12]:
log('match_id.nunique:', matches_result.match_id.nunique())
log('venue.nunique:', matches_result.venue.nunique())
log('union1d(team_1, team_2).shape:', np.union1d(
    matches_result.team_1.unique(), matches_result.team_2.unique()
).shape)

👉 match_id.nunique: 950
👉 venue.nunique: 49
👉 union1d(team_1, team_2).shape: (18,)


- ## Get Venues Mapping

In [13]:
matches_result_orig.groupby(['City', 'Venue'], dropna=False)['Venue'].describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,count,unique,top,freq
City,Venue,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Abu Dhabi,Sheikh Zayed Stadium,29,1,Sheikh Zayed Stadium,29
Abu Dhabi,"Zayed Cricket Stadium, Abu Dhabi",8,1,"Zayed Cricket Stadium, Abu Dhabi",8
Ahmedabad,"Narendra Modi Stadium, Ahmedabad",7,1,"Narendra Modi Stadium, Ahmedabad",7
Ahmedabad,"Sardar Patel Stadium, Motera",12,1,"Sardar Patel Stadium, Motera",12
Bangalore,M Chinnaswamy Stadium,65,1,M Chinnaswamy Stadium,65
Bengaluru,M.Chinnaswamy Stadium,15,1,M.Chinnaswamy Stadium,15
Bloemfontein,OUTsurance Oval,2,1,OUTsurance Oval,2
Cape Town,Newlands,7,1,Newlands,7
Centurion,SuperSport Park,12,1,SuperSport Park,12
Chandigarh,Punjab Cricket Association IS Bindra Stadium,10,1,Punjab Cricket Association IS Bindra Stadium,10


👇: https://www.iplt20.com/matches/schedule/men

In [14]:
venue_mapping_normal = {
  "Arun Jaitley Stadium": "Arun Jaitley Stadium",
  "Arun Jaitley Stadium, Delhi": "Arun Jaitley Stadium",
  "Feroz Shah Kotla": "Arun Jaitley Stadium",
  "Barsapara Cricket Stadium": "Barsapara Cricket Stadium",
  "Barsapara Cricket Stadium, Guwahati": "Barsapara Cricket Stadium",
  "Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium": "Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium",
  "Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium, Lucknow": "Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium",
  "Eden Gardens": "Eden Gardens",
  "Eden Gardens, Kolkata": "Eden Gardens",
  "Himachal Pradesh Cricket Association Stadium": "Himachal Pradesh Cricket Association Stadium",
  "Himachal Pradesh Cricket Association Stadium, Dharamsala": "Himachal Pradesh Cricket Association Stadium",
  "M Chinnaswamy Stadium": "M Chinnaswamy Stadium",
  "M Chinnaswamy Stadium, Bengaluru": "M Chinnaswamy Stadium",
  "M Chinnaswamy Stadium, Bangalore": "M Chinnaswamy Stadium",
  "M.Chinnaswamy Stadium": "M Chinnaswamy Stadium",
  "M.Chinnaswamy Stadium, Bengaluru": "M Chinnaswamy Stadium",
  "M.Chinnaswamy Stadium, Bangalore": "M Chinnaswamy Stadium",
  "MA Chidambaram Stadium": "MA Chidambaram Stadium",
  "MA Chidambaram Stadium, Chennai": "MA Chidambaram Stadium",
  "MA Chidambaram Stadium, Chepauk": "MA Chidambaram Stadium",
  "MA Chidambaram Stadium, Chepauk, Chennai": "MA Chidambaram Stadium",
  "Narendra Modi Stadium": "Narendra Modi Stadium",
  "Narendra Modi Stadium, Ahmedabad": "Narendra Modi Stadium",
  "Punjab Cricket Association IS Bindra Stadium": "Punjab Cricket Association IS Bindra Stadium",
  "Punjab Cricket Association IS Bindra Stadium, Mohali": "Punjab Cricket Association IS Bindra Stadium",
  "Punjab Cricket Association Stadium, Mohali": "Punjab Cricket Association IS Bindra Stadium",
  "Rajiv Gandhi International Stadium": "Rajiv Gandhi International Stadium",
  "Rajiv Gandhi International Stadium, Hyderabad": "Rajiv Gandhi International Stadium",
  "Rajiv Gandhi International Stadium, Uppal": "Rajiv Gandhi International Stadium",
  "Sawai Mansingh Stadium": "Sawai Mansingh Stadium",
  "Sawai Mansingh Stadium, Jaipur": "Sawai Mansingh Stadium",
  "Wankhede Stadium": "Wankhede Stadium",
  "Wankhede Stadium, Mumbai": "Wankhede Stadium"
}

In [15]:
venue_mapping_kebab = {
  "arun-jaitley-stadium": "Arun Jaitley Stadium",
  "arun-jaitley-stadium-delhi": "Arun Jaitley Stadium",
  "feroz-shah-kotla": "Arun Jaitley Stadium",
  "barsapara-cricket-stadium": "Barsapara Cricket Stadium",
  "barsapara-cricket-stadium-guwahati": "Barsapara Cricket Stadium",
  "bharat-ratna-shri-atal-bihari-vajpayee-ekana-cricket-stadium": "Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium",
  "bharat-ratna-shri-atal-bihari-vajpayee-ekana-cricket-stadium-lucknow": "Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium",
  "eden-gardens": "Eden Gardens",
  "eden-gardens-kolkata": "Eden Gardens",
  "himachal-pradesh-cricket-association-stadium": "Himachal Pradesh Cricket Association Stadium",
  "himachal-pradesh-cricket-association-stadium-dharamsala": "Himachal Pradesh Cricket Association Stadium",
  "m-chinnaswamy-stadium": "M Chinnaswamy Stadium",
  "m-chinnaswamy-stadium-bengaluru": "M Chinnaswamy Stadium",
  "m-chinnaswamy-stadium-bangalore": "M Chinnaswamy Stadium",
  "mchinnaswamy-stadium": "M Chinnaswamy Stadium",
  "mchinnaswamy-stadium-bengaluru": "M Chinnaswamy Stadium",
  "mchinnaswamy-stadium-bangalore": "M Chinnaswamy Stadium",
  "ma-chidambaram-stadium": "MA Chidambaram Stadium",
  "ma-chidambaram-stadium-chennai": "MA Chidambaram Stadium",
  "ma-chidambaram-stadium-chepauk": "MA Chidambaram Stadium",
  "ma-chidambaram-stadium-chepauk-chennai": "MA Chidambaram Stadium",
  "narendra-modi-stadium": "Narendra Modi Stadium",
  "narendra-modi-stadium-ahmedabad": "Narendra Modi Stadium",
  "punjab-cricket-association-is-bindra-stadium": "Punjab Cricket Association IS Bindra Stadium",
  "punjab-cricket-association-is-bindra-stadium-mohali": "Punjab Cricket Association IS Bindra Stadium",
  "punjab-cricket-association-stadium-mohali": "Punjab Cricket Association IS Bindra Stadium",
  "rajiv-gandhi-international-stadium": "Rajiv Gandhi International Stadium",
  "rajiv-gandhi-international-stadium-hyderabad": "Rajiv Gandhi International Stadium",
  "rajiv-gandhi-international-stadium-uppal": "Rajiv Gandhi International Stadium",
  "sawai-mansingh-stadium": "Sawai Mansingh Stadium",
  "sawai-mansingh-stadium-jaipur": "Sawai Mansingh Stadium",
  "wankhede-stadium": "Wankhede Stadium",
  "wankhede-stadium-mumbai": "Wankhede Stadium"
}

In [16]:
np.setdiff1d(matches_result.venue.unique(), list(venue_mapping_normal.keys()))

array(['Barabati Stadium', 'Brabourne Stadium',
       'Brabourne Stadium, Mumbai', 'Buffalo Park',
       'De Beers Diamond Oval', 'Dr DY Patil Sports Academy',
       'Dr DY Patil Sports Academy, Mumbai',
       'Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium',
       'Dubai International Cricket Stadium', 'Green Park',
       'Holkar Cricket Stadium', 'JSCA International Stadium Complex',
       'Kingsmead', 'Maharashtra Cricket Association Stadium',
       'Maharashtra Cricket Association Stadium, Pune', 'Nehru Stadium',
       'New Wanderers Stadium', 'Newlands', 'OUTsurance Oval',
       'Sardar Patel Stadium, Motera',
       'Saurashtra Cricket Association Stadium',
       'Shaheed Veer Narayan Singh International Stadium',
       'Sharjah Cricket Stadium', 'Sheikh Zayed Stadium',
       "St George's Park", 'Subrata Roy Sahara Stadium',
       'SuperSport Park', 'Vidarbha Cricket Association Stadium, Jamtha',
       'Zayed Cricket Stadium, Abu Dhabi'], dtype=object)

- ## Get Teams Mapping

In [17]:
set(matches_result['team_1'].unique()) == set(matches_result['team_2'].unique()) == set(ball_by_ball['batting_team'].unique())

True

In [18]:
# Rajasthan Royals
# Gujarat Titans
# Royal Challengers Bangalore
# Lucknow Super Giants
# Sunrisers Hyderabad
# Punjab Kings [Kings XI Punjab]
# Delhi Capitals [Delhi Daredevils]
# Mumbai Indians
# Chennai Super Kings
# Kolkata Knight Riders

team_mapping = { # 10 teams
 'Rajasthan Royals': 'Rajasthan Royals',
 'Gujarat Titans': 'Gujarat Titans',
 'Royal Challengers Bangalore': 'Royal Challengers Bangalore',
 'Lucknow Super Giants': 'Lucknow Super Giants',
 'Sunrisers Hyderabad': 'Sunrisers Hyderabad',
 'Mumbai Indians': 'Mumbai Indians',
 'Chennai Super Kings': 'Chennai Super Kings',
 'Kolkata Knight Riders': 'Kolkata Knight Riders',
    
 'Kings XI Punjab': 'Punjab Kings',
 'Punjab Kings': 'Punjab Kings',
    
 'Delhi Daredevils': 'Delhi Capitals',
 'Delhi Capitals': 'Delhi Capitals',
}

In [19]:
print(np.setdiff1d(
   list(team_mapping.keys()), matches_result['team_1'].unique()
))

print(np.setdiff1d(
    matches_result['team_1'].unique(), list(team_mapping.keys())
))

[]
['Deccan Chargers' 'Gujarat Lions' 'Kochi Tuskers Kerala' 'Pune Warriors'
 'Rising Pune Supergiant' 'Rising Pune Supergiants']


- ## Apply Venues/Teams Mapping [in matches_result, ball_by_ball]

In [20]:
matches_result.venue = matches_result.venue.map(venue_mapping_normal).fillna('Other')

matches_result.team_1 = matches_result.team_1.map(team_mapping).fillna('Other')
matches_result.team_2 = matches_result.team_2.map(team_mapping).fillna('Other')

ball_by_ball.batting_team = ball_by_ball.batting_team.map(team_mapping).fillna('Other')

In [21]:
matches_result.venue[matches_result.venue == 'Other'].shape

(359,)

In [22]:
print(matches_result.team_1[matches_result.team_1 == 'Other'].shape)
print(matches_result.team_2[matches_result.team_2 == 'Other'].shape)

(99,)
(96,)


In [23]:
ball_by_ball.batting_team[ball_by_ball.batting_team == 'Other'].shape

(23105,)

In [24]:
print(matches_result.shape)
print(ball_by_ball.shape)

(950, 4)
(225954, 8)


- ## Remove NA Teams [in ball_by_ball] and Venues [in matches_result]

In [25]:
# matches_result = matches_result.dropna(subset=['team_1', 'team_2', 'venue'])
# matches_result.shape

# ball_by_ball = ball_by_ball.dropna(subset=['batting_team'])
# print(ball_by_ball.shape)

- ## Select first 6 overs, Select innings 1 & 2, Map innings (1,2) to (0,1) [in ball_by_ball]

In [26]:
ball_by_ball.innings.unique()

array([1, 2, 3, 4, 5, 6], dtype=int64)

In [27]:
ball_by_ball.overs.unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19], dtype=int64)

In [28]:
ball_by_ball = ball_by_ball.loc[(ball_by_ball.overs <= 5) & (ball_by_ball.innings <= 2)]
ball_by_ball.innings = ball_by_ball.innings.replace({1: 0, 2: 1})
ball_by_ball.shape

(70921, 8)

In [29]:
ball_by_ball.innings.unique()

array([0, 1], dtype=int64)

In [30]:
ball_by_ball.overs.unique()

array([0, 1, 2, 3, 4, 5], dtype=int64)

- ## Grouping 

In [31]:
ball_by_ball_gb = ball_by_ball.groupby(['match_id', 'innings', 'batting_team'])

In [32]:
total_runs = ball_by_ball_gb['total_run'].sum()
batsmen = ball_by_ball_gb['batter'].unique()
bowlers = ball_by_ball_gb['bowler'].unique()

In [33]:
total_runs = total_runs.to_frame(name = 'total_runs').reset_index()
batsmen = batsmen.to_frame(name = 'batsmen').reset_index()
bowlers = bowlers.to_frame(name = 'bowlers').reset_index()

In [34]:
data = total_runs.merge(batsmen, how='right', on=['match_id','innings','batting_team'])
data = data.merge(bowlers, how='right', on=['match_id','innings','batting_team'])
data = data.merge(matches_result, on=['match_id'])

In [35]:
mask = data['batting_team'] == data['team_1']
data.loc[mask, 'bowling_team'] = data['team_2']
data.loc[~mask, 'bowling_team'] = data['team_1']

In [36]:
# match_id == 829763, data for one innings is missing
# match_id == 829813, total_runs for one innings is 2 (probably a mistake in data entry)
data = data.drop(data[(data['match_id'] == 829763) | (data['match_id'] == 829813)].index)

In [37]:
data['count_batsmen'] = [len(x) for x in data['batsmen']]
data['count_bowlers'] = [len(x) for x in data['bowlers']]

In [95]:
data = data[
    ['venue', 'innings', 'batting_team', 'bowling_team', 'count_batsmen', 'count_bowlers', 'total_runs']
].reset_index()

# EDA

In [39]:
data

Unnamed: 0,venue,innings,batting_team,bowling_team,count_batsmen,count_bowlers,total_runs
0,M Chinnaswamy Stadium,0,Kolkata Knight Riders,Royal Challengers Bangalore,3,3,61
1,M Chinnaswamy Stadium,1,Royal Challengers Bangalore,Kolkata Knight Riders,6,3,26
2,Punjab Cricket Association IS Bindra Stadium,0,Chennai Super Kings,Punjab Kings,3,3,53
3,Punjab Cricket Association IS Bindra Stadium,1,Punjab Kings,Chennai Super Kings,2,2,63
4,Arun Jaitley Stadium,0,Rajasthan Royals,Delhi Capitals,4,3,40
...,...,...,...,...,...,...,...
1893,Eden Gardens,1,Lucknow Super Giants,Royal Challengers Bangalore,4,3,62
1894,Narendra Modi Stadium,0,Royal Challengers Bangalore,Rajasthan Royals,3,2,46
1895,Narendra Modi Stadium,1,Rajasthan Royals,Royal Challengers Bangalore,3,4,67
1896,Narendra Modi Stadium,0,Rajasthan Royals,Gujarat Titans,3,4,44


In [40]:
data.groupby(['venue']).total_runs.describe()[['count', 'mean', '75%']].sort_values(by='mean')

Unnamed: 0_level_0,count,mean,75%
venue,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Himachal Pradesh Cricket Association Stadium,18.0,40.555556,48.0
Sawai Mansingh Stadium,94.0,45.042553,55.0
Other,718.0,45.362117,53.0
Wankhede Stadium,208.0,45.480769,53.25
Rajiv Gandhi International Stadium,128.0,45.585938,54.25
M Chinnaswamy Stadium,156.0,46.025641,54.25
Narendra Modi Stadium,14.0,46.071429,48.25
MA Chidambaram Stadium,134.0,46.425373,53.75
Eden Gardens,158.0,46.56962,52.0
Arun Jaitley Stadium,155.0,47.832258,55.0


In [41]:
data.groupby(['batting_team']).total_runs.describe()[['count', 'mean', '75%']].sort_values(by='mean')

Unnamed: 0_level_0,count,mean,75%
batting_team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Lucknow Super Giants,15.0,44.666667,56.0
Royal Challengers Bangalore,224.0,44.852679,52.25
Rajasthan Royals,191.0,45.172775,53.0
Chennai Super Kings,208.0,45.221154,53.0
Mumbai Indians,231.0,45.480519,53.0
Kolkata Knight Riders,223.0,46.076233,53.0
Other,194.0,46.226804,55.0
Gujarat Titans,16.0,46.25,53.0
Delhi Capitals,223.0,46.609865,55.0
Sunrisers Hyderabad,152.0,47.118421,56.0


In [42]:
data.groupby(['count_batsmen']).total_runs.describe()[['count', 'mean', '75%']].sort_values(by='mean')

Unnamed: 0_level_0,count,mean,75%
count_batsmen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
7,9.0,29.888889,32.0
6,59.0,34.847458,39.0
5,190.0,37.542105,44.75
4,499.0,42.679359,49.5
8,2.0,45.5,53.75
3,684.0,47.545322,54.25
2,452.0,52.442478,59.0


In [43]:
data.groupby(['count_bowlers']).total_runs.describe()[['count', 'mean', '75%']].sort_values(by='mean')

Unnamed: 0_level_0,count,mean,75%
count_bowlers,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,95.0,39.484211,47.0
3,767.0,43.615385,51.0
4,903.0,47.496124,55.0
5,124.0,53.451613,60.0
6,6.0,58.333333,60.0


In [44]:
tmp = data.groupby(['batting_team', 'venue']).total_runs.describe()[['count', 'mean', '75%']].sort_values(by='mean').reset_index()

tmp[tmp.batting_team == 'Gujarat Titans']
tmp[tmp.batting_team == 'Mumbai Indians']
tmp[tmp.batting_team == 'Chennai Super Kings']

Unnamed: 0,batting_team,venue,count,mean,75%
3,Chennai Super Kings,Himachal Pradesh Cricket Association Stadium,2.0,37.0,43.0
10,Chennai Super Kings,Eden Gardens,11.0,39.818182,47.5
15,Chennai Super Kings,Sawai Mansingh Stadium,6.0,40.833333,50.75
16,Chennai Super Kings,M Chinnaswamy Stadium,9.0,41.111111,55.0
27,Chennai Super Kings,Wankhede Stadium,23.0,43.652174,49.0
29,Chennai Super Kings,Punjab Cricket Association IS Bindra Stadium,6.0,44.0,51.5
35,Chennai Super Kings,Rajiv Gandhi International Stadium,6.0,44.833333,52.25
50,Chennai Super Kings,MA Chidambaram Stadium,56.0,46.107143,53.25
54,Chennai Super Kings,Other,79.0,46.392405,53.5
79,Chennai Super Kings,Arun Jaitley Stadium,10.0,49.5,54.5


- ## Encoding of categorical inputs and feature scaling

In [97]:
X = data.iloc[:, :-1]
y = data["total_runs"]

In [104]:
ct = ColumnTransformer(transformers = [
    ('ohe', OneHotEncoder(categories = "auto", drop='first', sparse_output=False), ['venue', 'batting_team', 'bowling_team'])
], remainder = 'passthrough')

scaler = StandardScaler()

X_ohe = pd.DataFrame(ct.fit_transform(X))
X_std = scaler.fit_transform(X_ohe)

In [107]:
X_std[0]

array([-0.30159812, -0.09792738,  3.33877761, -0.27584983, -0.08627195,
       -0.78104128, -0.25063016, -0.26914524, -0.22845837, -0.351135  ,
       -0.36520297, -0.09227767,  2.7382034 , -0.0893237 , -0.3725884 ,
       -0.33771372, -0.36054686, -0.33479725, -0.3661304 , -0.29530656,
       -0.36427429, -0.09227767, -0.36520297, -0.0893237 , -0.3725884 ,
       -0.33868257, -0.36054686, -0.33479725,  2.73126737, -0.29530656,
       -1.73113704, -0.99947243, -0.31740491, -0.80500065])

- ## Train-test split

In [65]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_std, y, test_size = 0.2)

In [66]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def evaluate(regressor):
    regressor.fit(X_train, y_train)
    y_pred = regressor.predict(X_test)
    
    # Calculate the mean absolute error (MAE)
    mae = mean_absolute_error(y_test, y_pred)
    print('MAE:', mae)

    # Calculate the root mean squared error (RMSE)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print('RMSE:', rmse)

    # Calculate the R-squared score
    r2 = r2_score(y_test, y_pred)
    print('R-squared:', r2)

- ## Models

In [91]:
from sklearn.ensemble import AdaBoostRegressor
regressor = AdaBoostRegressor(
    learning_rate=1, loss='exponential', n_estimators=100, random_state=42
)
evaluate(regressor)

MAE: 9.39286858094682
RMSE: 11.579727425802616
R-squared: -0.02445611361176292


In [68]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
evaluate(regressor)

MAE: 8.162311078501537
RMSE: 10.200637321399416
R-squared: 0.20502897733178838


In [69]:
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor()
evaluate(regressor)

MAE: 11.436235708003517
RMSE: 14.671138480349322
R-squared: -0.6444638009448485


In [70]:
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor()
evaluate(regressor)

MAE: 8.708739445910291
RMSE: 11.297779262992545
R-squared: 0.02482433415093066


In [71]:
from sklearn.neighbors import KNeighborsRegressor
regressor = KNeighborsRegressor()
evaluate(regressor)

MAE: 9.398944591029023
RMSE: 11.907351224878358
R-squared: -0.08324579807685017


In [72]:
from sklearn.svm import SVR
regressor = SVR()
evaluate(regressor)

MAE: 8.290175147306135
RMSE: 10.448696458621347
R-squared: 0.16589464233478857


In [55]:
import xgboost as xgb
regressor = xgb.XGBRegressor()
evaluate(regressor)

MAE: 9.369646681330135
RMSE: 12.137098051333277
R-squared: 0.054318648159237815


In [76]:
import tensorflow as tf
from tensorflow.keras import layers, models

# Define the model architecture
model = models.Sequential([
    layers.Dense(256, activation='relu', input_shape=(X_train.shape[1],)),
    layers.Dense(128, activation='relu'),
    layers.Dense(1)
])

# Compile the model
model.compile(optimizer='adam', loss='mean_absolute_error', metrics=['mae'])

# Fit the model to the training data
history = model.fit(X_train, y_train, epochs=200, batch_size=128, verbose=False)

# Evaluate the model on the test set
test_loss = model.evaluate(X_test, y_test)

# Print the test loss
print('Test loss:', test_loss)

Test loss: [9.338618278503418, 9.338618278503418]


In [57]:
# import tensorflow as tf
# from tensorflow.keras import layers, models

# # Define a matrix of hyperparameters to test
# params = {
#     'batch_size': [16, 32],
#     'epochs': [50, 100],
#     'learning_rate': [0.001, 0.01]
# }

# # Define the model architecture
# def build_model(learning_rate=0.001):
#     model = models.Sequential([
#         layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
#         layers.Dense(32, activation='relu'),
#         layers.Dense(1)
#     ])
#     optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
#     model.compile(optimizer=optimizer, loss='mse', metrics=['mae'])
#     return model

# # Loop through the hyperparameter matrix and fit the model for each combination
# for batch_size in params['batch_size']:
#     for epochs in params['epochs']:
#         for learning_rate in params['learning_rate']:
#             print(f"Fitting model with batch_size={batch_size}, epochs={epochs}, learning_rate={learning_rate}")
#             model = build_model(learning_rate=learning_rate)
#             history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, y_test), verbose=0)
#             test_loss, test_mae = model.evaluate(X_test, y_test)
#             print(f"Test loss: {test_loss}, Test MAE: {test_mae}")
