# IPL EDA & Deep learning Transformers

## Step 1: Data Loading

In [140]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [141]:
# Define the file path
balls_file_path = "/content/drive/MyDrive/IPL/ball_by_ball.csv"
match_file_path="/content/drive/MyDrive/IPL/matches.csv"

# Load the CSV file
ball_by_ball = pd.read_csv(balls_file_path)
matches = pd.read_csv(match_file_path)

In [142]:
ball_by_ball.head()

Unnamed: 0,ID,Innings,Overs,BallNumber,Batter,Bowler,NonStriker,ExtraType,BatsmanRun,ExtrasRun,TotalRun,NonBoundary,IsWicketDelivery,PlayerOut,Kind,FieldersInvolved,BattingTeam
0,1082591,1,0,1,DA Warner,TS Mills,S Dhawan,,0,0,0,0,0,,,,Sunrisers Hyderabad
1,1082591,1,0,2,DA Warner,TS Mills,S Dhawan,,0,0,0,0,0,,,,Sunrisers Hyderabad
2,1082591,1,0,3,DA Warner,TS Mills,S Dhawan,,4,0,4,0,0,,,,Sunrisers Hyderabad
3,1082591,1,0,4,DA Warner,TS Mills,S Dhawan,,0,0,0,0,0,,,,Sunrisers Hyderabad
4,1082591,1,0,5,DA Warner,TS Mills,S Dhawan,wides,0,2,2,0,0,,,,Sunrisers Hyderabad


In [143]:
matches.head()

Unnamed: 0,ID,City,Date,Season,MatchNumber,Team1,Team2,Venue,TossWinner,TossDecision,SuperOver,WinningTeam,WonBy,Margin,Player_of_Match,Team1Players,Team2Players,Umpire1,Umpire2
0,1082591,Hyderabad,2017-04-05,2017,1,Sunrisers Hyderabad,Royal Challengers Bangalore,"Rajiv Gandhi International Stadium, Uppal",Royal Challengers Bangalore,field,N,Sunrisers Hyderabad,Runs,35,Yuvraj Singh,"['DA Warner', 'S Dhawan', 'MC Henriques', 'Yuv...","['CH Gayle', 'Mandeep Singh', 'TM Head', 'KM J...",AY Dandekar,NJ Llong
1,1082592,Pune,2017-04-06,2017,2,Rising Pune Supergiant,Mumbai Indians,Maharashtra Cricket Association Stadium,Rising Pune Supergiant,field,N,Rising Pune Supergiant,Wickets,7,SPD Smith,"['AM Rahane', 'MA Agarwal', 'SPD Smith', 'BA S...","['PA Patel', 'JC Buttler', 'RG Sharma', 'N Ran...",A Nand Kishore,S Ravi
2,1082593,Rajkot,2017-04-07,2017,3,Gujarat Lions,Kolkata Knight Riders,Saurashtra Cricket Association Stadium,Kolkata Knight Riders,field,N,Kolkata Knight Riders,Wickets,10,CA Lynn,"['JJ Roy', 'BB McCullum', 'SK Raina', 'AJ Finc...","['G Gambhir', 'CA Lynn', 'RV Uthappa', 'MK Pan...",Nitin Menon,CK Nandan
3,1082594,Indore,2017-04-08,2017,4,Kings XI Punjab,Rising Pune Supergiant,Holkar Cricket Stadium,Kings XI Punjab,field,N,Kings XI Punjab,Wickets,6,GJ Maxwell,"['HM Amla', 'M Vohra', 'WP Saha', 'AR Patel', ...","['AM Rahane', 'MA Agarwal', 'SPD Smith', 'BA S...",AK Chaudhary,C Shamshuddin
4,1082595,Bengaluru,2017-04-08,2017,5,Royal Challengers Bangalore,Delhi Daredevils,M.Chinnaswamy Stadium,Royal Challengers Bangalore,bat,N,Royal Challengers Bangalore,Runs,15,KM Jadhav,"['CH Gayle', 'SR Watson', 'Mandeep Singh', 'KM...","['AP Tare', 'SW Billings', 'KK Nair', 'SV Sams...",S Ravi,VK Sharma


## Step 2: Data Preprocessing

Sorting matches dataset and ball_by_ball dataset by season

In [144]:
matches = matches.sort_values(by=['Season', 'ID']).reset_index(drop=True)
ball_by_ball = ball_by_ball.sort_values(by=['ID', 'Innings', 'Overs', 'BallNumber']).reset_index(drop=True)

Handle Missing Values

In [145]:
print(matches.isnull().sum())
print(ball_by_ball.isnull().sum())

ID                 0
City               0
Date               0
Season             0
MatchNumber        0
Team1              0
Team2              0
Venue              0
TossWinner         0
TossDecision       0
SuperOver          0
WinningTeam        0
WonBy              0
Margin             0
Player_of_Match    5
Team1Players       0
Team2Players       0
Umpire1            0
Umpire2            0
dtype: int64
ID                       0
Innings                  0
Overs                    0
BallNumber               0
Batter                   0
Bowler                   0
NonStriker               0
ExtraType           246795
BatsmanRun               0
ExtrasRun                0
TotalRun                 0
NonBoundary              0
IsWicketDelivery         0
PlayerOut           247970
Kind                247970
FieldersInvolved    251566
BattingTeam              0
dtype: int64


In [146]:
matches.update(matches[['Player_of_Match']].fillna('Unknown'))
ball_by_ball.update(ball_by_ball[['ExtraType']].fillna('NoExtra'))
ball_by_ball.update(ball_by_ball[['PlayerOut']].fillna('NotOut'))
ball_by_ball.update(ball_by_ball[['Kind']].fillna('None'))
ball_by_ball.update(ball_by_ball[['FieldersInvolved']].fillna('None'))

In [147]:
print(matches.isnull().sum())
print(ball_by_ball.isnull().sum())

ID                 0
City               0
Date               0
Season             0
MatchNumber        0
Team1              0
Team2              0
Venue              0
TossWinner         0
TossDecision       0
SuperOver          0
WinningTeam        0
WonBy              0
Margin             0
Player_of_Match    0
Team1Players       0
Team2Players       0
Umpire1            0
Umpire2            0
dtype: int64
ID                  0
Innings             0
Overs               0
BallNumber          0
Batter              0
Bowler              0
NonStriker          0
ExtraType           0
BatsmanRun          0
ExtrasRun           0
TotalRun            0
NonBoundary         0
IsWicketDelivery    0
PlayerOut           0
Kind                0
FieldersInvolved    0
BattingTeam         0
dtype: int64


Create a mapping of seasons to sequential numbers and replace Season column with mapped values

In [148]:
season_mapping = {
    '2007/08': 1, '2009': 2, '2009/10': 3, '2011': 4, '2012': 5, '2013': 6,
    '2014': 7, '2015': 8, '2016': 9, '2017': 10, '2018': 11, '2019': 12,
    '2020/21': 13, '2021': 14, '2022': 15, '2023': 16, '2024': 17
}
matches['Season'] = matches['Season'].map(season_mapping)
print(matches['Season'].unique())

[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17]


Remove duplicates if any

In [149]:
matches.drop_duplicates(inplace=True)
ball_by_ball.drop_duplicates(inplace=True)

Convert the Date column to datetime format

In [150]:
matches['Date'] = pd.to_datetime(matches['Date'])

Define a dictionary for team name replacements

In [151]:
team_name_mapping = {
    'Delhi Daredevils': 'Delhi Capitals',
    'Deccan Chargers': 'Sunrisers Hyderabad',
    'Rising Pune Supergiant': 'Rising Pune Supergiants'
}

# Apply replacements
matches['Team1'] = matches['Team1'].replace(team_name_mapping)
matches['Team2'] = matches['Team2'].replace(team_name_mapping)
matches['WinningTeam'] = matches['WinningTeam'].replace(team_name_mapping)
matches['TossWinner'] = matches['TossWinner'].replace(team_name_mapping)
ball_by_ball['BattingTeam'] = ball_by_ball['BattingTeam'].replace(team_name_mapping)

Merge ball-by-ball data with match data on 'ID'

In [152]:
merged_data = ball_by_ball.merge(matches, on='ID', how='inner')

In [153]:
print(merged_data.columns)
print(f"Total Rows in merged_data: {len(merged_data)}")
pd.set_option('display.max_columns', None)
merged_data.head()

Index(['ID', 'Innings', 'Overs', 'BallNumber', 'Batter', 'Bowler',
       'NonStriker', 'ExtraType', 'BatsmanRun', 'ExtrasRun', 'TotalRun',
       'NonBoundary', 'IsWicketDelivery', 'PlayerOut', 'Kind',
       'FieldersInvolved', 'BattingTeam', 'City', 'Date', 'Season',
       'MatchNumber', 'Team1', 'Team2', 'Venue', 'TossWinner', 'TossDecision',
       'SuperOver', 'WinningTeam', 'WonBy', 'Margin', 'Player_of_Match',
       'Team1Players', 'Team2Players', 'Umpire1', 'Umpire2'],
      dtype='object')
Total Rows in merged_data: 260920


Unnamed: 0,ID,Innings,Overs,BallNumber,Batter,Bowler,NonStriker,ExtraType,BatsmanRun,ExtrasRun,TotalRun,NonBoundary,IsWicketDelivery,PlayerOut,Kind,FieldersInvolved,BattingTeam,City,Date,Season,MatchNumber,Team1,Team2,Venue,TossWinner,TossDecision,SuperOver,WinningTeam,WonBy,Margin,Player_of_Match,Team1Players,Team2Players,Umpire1,Umpire2
0,335982,1,0,1,SC Ganguly,P Kumar,BB McCullum,legbyes,0,1,1,0,0,NotOut,,,Kolkata Knight Riders,Bangalore,2008-04-18,1,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",Asad Rauf,RE Koertzen
1,335982,1,0,2,BB McCullum,P Kumar,SC Ganguly,NoExtra,0,0,0,0,0,NotOut,,,Kolkata Knight Riders,Bangalore,2008-04-18,1,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",Asad Rauf,RE Koertzen
2,335982,1,0,3,BB McCullum,P Kumar,SC Ganguly,wides,0,1,1,0,0,NotOut,,,Kolkata Knight Riders,Bangalore,2008-04-18,1,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",Asad Rauf,RE Koertzen
3,335982,1,0,4,BB McCullum,P Kumar,SC Ganguly,NoExtra,0,0,0,0,0,NotOut,,,Kolkata Knight Riders,Bangalore,2008-04-18,1,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",Asad Rauf,RE Koertzen
4,335982,1,0,5,BB McCullum,P Kumar,SC Ganguly,NoExtra,0,0,0,0,0,NotOut,,,Kolkata Knight Riders,Bangalore,2008-04-18,1,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",Asad Rauf,RE Koertzen


## Feature Engineering for IPL Prediction Model

### Match Context Feature Engineering

In [154]:
columns_to_drop = ["City", "Date", "MatchNumber", "Umpire1", "Umpire2", "ExtraType"]
merged_data.drop(columns=columns_to_drop, inplace=True)
# Display updated columns
print("\nUpdated Columns after dropping unnecessary ones:")
print(merged_data.columns)
merged_data.head()


Updated Columns after dropping unnecessary ones:
Index(['ID', 'Innings', 'Overs', 'BallNumber', 'Batter', 'Bowler',
       'NonStriker', 'BatsmanRun', 'ExtrasRun', 'TotalRun', 'NonBoundary',
       'IsWicketDelivery', 'PlayerOut', 'Kind', 'FieldersInvolved',
       'BattingTeam', 'Season', 'Team1', 'Team2', 'Venue', 'TossWinner',
       'TossDecision', 'SuperOver', 'WinningTeam', 'WonBy', 'Margin',
       'Player_of_Match', 'Team1Players', 'Team2Players'],
      dtype='object')


Unnamed: 0,ID,Innings,Overs,BallNumber,Batter,Bowler,NonStriker,BatsmanRun,ExtrasRun,TotalRun,NonBoundary,IsWicketDelivery,PlayerOut,Kind,FieldersInvolved,BattingTeam,Season,Team1,Team2,Venue,TossWinner,TossDecision,SuperOver,WinningTeam,WonBy,Margin,Player_of_Match,Team1Players,Team2Players
0,335982,1,0,1,SC Ganguly,P Kumar,BB McCullum,0,1,1,0,0,NotOut,,,Kolkata Knight Riders,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D..."
1,335982,1,0,2,BB McCullum,P Kumar,SC Ganguly,0,0,0,0,0,NotOut,,,Kolkata Knight Riders,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D..."
2,335982,1,0,3,BB McCullum,P Kumar,SC Ganguly,0,1,1,0,0,NotOut,,,Kolkata Knight Riders,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D..."
3,335982,1,0,4,BB McCullum,P Kumar,SC Ganguly,0,0,0,0,0,NotOut,,,Kolkata Knight Riders,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D..."
4,335982,1,0,5,BB McCullum,P Kumar,SC Ganguly,0,0,0,0,0,NotOut,,,Kolkata Knight Riders,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D..."


In [155]:
match_total_runs = merged_data.groupby(['ID', 'BattingTeam'])['BatsmanRun'].sum().to_frame(name="TeamTotalRuns")
merged_data = merged_data.merge(match_total_runs, on=['ID', 'BattingTeam'], how='left')
print(merged_data.columns)
venue_avg_runs = merged_data.groupby('Venue')['TeamTotalRuns'].mean().rename("VenueAvgRuns")
print(venue_avg_runs)
merged_data.head()

Index(['ID', 'Innings', 'Overs', 'BallNumber', 'Batter', 'Bowler',
       'NonStriker', 'BatsmanRun', 'ExtrasRun', 'TotalRun', 'NonBoundary',
       'IsWicketDelivery', 'PlayerOut', 'Kind', 'FieldersInvolved',
       'BattingTeam', 'Season', 'Team1', 'Team2', 'Venue', 'TossWinner',
       'TossDecision', 'SuperOver', 'WinningTeam', 'WonBy', 'Margin',
       'Player_of_Match', 'Team1Players', 'Team2Players', 'TeamTotalRuns'],
      dtype='object')
Venue
Arun Jaitley Stadium                                                     155.768474
Arun Jaitley Stadium, Delhi                                              181.742367
Barabati Stadium                                                         157.056637
Barsapara Cricket Stadium, Guwahati                                      163.633288
Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium, Lucknow    151.274444
Brabourne Stadium                                                        163.487647
Brabourne Stadium, Mumbai              

Unnamed: 0,ID,Innings,Overs,BallNumber,Batter,Bowler,NonStriker,BatsmanRun,ExtrasRun,TotalRun,NonBoundary,IsWicketDelivery,PlayerOut,Kind,FieldersInvolved,BattingTeam,Season,Team1,Team2,Venue,TossWinner,TossDecision,SuperOver,WinningTeam,WonBy,Margin,Player_of_Match,Team1Players,Team2Players,TeamTotalRuns
0,335982,1,0,1,SC Ganguly,P Kumar,BB McCullum,0,1,1,0,0,NotOut,,,Kolkata Knight Riders,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",205
1,335982,1,0,2,BB McCullum,P Kumar,SC Ganguly,0,0,0,0,0,NotOut,,,Kolkata Knight Riders,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",205
2,335982,1,0,3,BB McCullum,P Kumar,SC Ganguly,0,1,1,0,0,NotOut,,,Kolkata Knight Riders,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",205
3,335982,1,0,4,BB McCullum,P Kumar,SC Ganguly,0,0,0,0,0,NotOut,,,Kolkata Knight Riders,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",205
4,335982,1,0,5,BB McCullum,P Kumar,SC Ganguly,0,0,0,0,0,NotOut,,,Kolkata Knight Riders,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",205


In [156]:
merged_data.head()

Unnamed: 0,ID,Innings,Overs,BallNumber,Batter,Bowler,NonStriker,BatsmanRun,ExtrasRun,TotalRun,NonBoundary,IsWicketDelivery,PlayerOut,Kind,FieldersInvolved,BattingTeam,Season,Team1,Team2,Venue,TossWinner,TossDecision,SuperOver,WinningTeam,WonBy,Margin,Player_of_Match,Team1Players,Team2Players,TeamTotalRuns
0,335982,1,0,1,SC Ganguly,P Kumar,BB McCullum,0,1,1,0,0,NotOut,,,Kolkata Knight Riders,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",205
1,335982,1,0,2,BB McCullum,P Kumar,SC Ganguly,0,0,0,0,0,NotOut,,,Kolkata Knight Riders,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",205
2,335982,1,0,3,BB McCullum,P Kumar,SC Ganguly,0,1,1,0,0,NotOut,,,Kolkata Knight Riders,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",205
3,335982,1,0,4,BB McCullum,P Kumar,SC Ganguly,0,0,0,0,0,NotOut,,,Kolkata Knight Riders,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",205
4,335982,1,0,5,BB McCullum,P Kumar,SC Ganguly,0,0,0,0,0,NotOut,,,Kolkata Knight Riders,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",205


In [157]:
def calculate_win_percentage(df):
    total_wins = df['WinningTeam'].value_counts()
    total_played = df['Team1'].value_counts().add(df['Team2'].value_counts(), fill_value=0)
    win_percentage = (total_wins / total_played).fillna(0) * 100
    return win_percentage.rename("WinPercentage")

def calculate_toss_impact(df):
    df['TossImpact'] = (df['TossWinner'] == df['WinningTeam']).astype(int)
    return df

win_percentage = calculate_win_percentage(merged_data)

merged_data = merged_data.merge(win_percentage.rename("Team1WinPercentage"), left_on='Team1', right_index=True, how='left')
merged_data = merged_data.merge(win_percentage.rename("Team2WinPercentage"), left_on='Team2', right_index=True, how='left')

merged_data = merged_data.merge(venue_avg_runs, on='Venue', how='left')

merged_data = calculate_toss_impact(merged_data)

merged_data['IsSuperOver'] = (merged_data['SuperOver'] == 'Y').astype(int)

merged_data.head()

Unnamed: 0,ID,Innings,Overs,BallNumber,Batter,Bowler,NonStriker,BatsmanRun,ExtrasRun,TotalRun,NonBoundary,IsWicketDelivery,PlayerOut,Kind,FieldersInvolved,BattingTeam,Season,Team1,Team2,Venue,TossWinner,TossDecision,SuperOver,WinningTeam,WonBy,Margin,Player_of_Match,Team1Players,Team2Players,TeamTotalRuns,Team1WinPercentage,Team2WinPercentage,VenueAvgRuns,TossImpact,IsSuperOver
0,335982,1,0,1,SC Ganguly,P Kumar,BB McCullum,0,1,1,0,0,NotOut,,,Kolkata Knight Riders,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",205,47.382565,51.656894,152.349366,0,0
1,335982,1,0,2,BB McCullum,P Kumar,SC Ganguly,0,0,0,0,0,NotOut,,,Kolkata Knight Riders,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",205,47.382565,51.656894,152.349366,0,0
2,335982,1,0,3,BB McCullum,P Kumar,SC Ganguly,0,1,1,0,0,NotOut,,,Kolkata Knight Riders,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",205,47.382565,51.656894,152.349366,0,0
3,335982,1,0,4,BB McCullum,P Kumar,SC Ganguly,0,0,0,0,0,NotOut,,,Kolkata Knight Riders,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",205,47.382565,51.656894,152.349366,0,0
4,335982,1,0,5,BB McCullum,P Kumar,SC Ganguly,0,0,0,0,0,NotOut,,,Kolkata Knight Riders,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",205,47.382565,51.656894,152.349366,0,0


### Ball-by-Ball Feature Engineering

#### Batters

In [158]:
merged_data = merged_data.sort_values(by=['ID', 'Innings', 'Overs', 'BallNumber'])

merged_data['BatsmanCumulativeRuns'] = merged_data.groupby(['ID', 'Batter'])['BatsmanRun'].cumsum()

In [159]:
def calculate_batsman_strike_rate(df):
    df = df.sort_values(by=['ID', 'Batter', 'Overs', 'BallNumber'])
    df['BatsmanBallsFaced'] = df.groupby(['ID', 'Batter']).cumcount() + 1
    df['BatsmanStrikeRate'] = (df['BatsmanCumulativeRuns'] * 100 / df['BatsmanBallsFaced']).fillna(0)
    return df

merged_data = calculate_batsman_strike_rate(merged_data)
merged_data.head(5)

Unnamed: 0,ID,Innings,Overs,BallNumber,Batter,Bowler,NonStriker,BatsmanRun,ExtrasRun,TotalRun,NonBoundary,IsWicketDelivery,PlayerOut,Kind,FieldersInvolved,BattingTeam,Season,Team1,Team2,Venue,TossWinner,TossDecision,SuperOver,WinningTeam,WonBy,Margin,Player_of_Match,Team1Players,Team2Players,TeamTotalRuns,Team1WinPercentage,Team2WinPercentage,VenueAvgRuns,TossImpact,IsSuperOver,BatsmanCumulativeRuns,BatsmanBallsFaced,BatsmanStrikeRate
178,335982,2,8,3,AA Noffke,AB Agarkar,CL White,0,0,0,0,0,NotOut,,,Royal Challengers Bangalore,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",63,47.382565,51.656894,152.349366,0,0,0,1,0.0
179,335982,2,8,4,AA Noffke,AB Agarkar,CL White,0,1,1,0,0,NotOut,,,Royal Challengers Bangalore,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",63,47.382565,51.656894,152.349366,0,0,0,2,0.0
180,335982,2,8,5,AA Noffke,AB Agarkar,CL White,0,1,1,0,0,NotOut,,,Royal Challengers Bangalore,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",63,47.382565,51.656894,152.349366,0,0,0,3,0.0
181,335982,2,8,6,AA Noffke,AB Agarkar,CL White,1,0,1,0,0,NotOut,,,Royal Challengers Bangalore,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",63,47.382565,51.656894,152.349366,0,0,1,4,25.0
184,335982,2,9,1,AA Noffke,SC Ganguly,P Kumar,1,0,1,0,0,NotOut,,,Royal Challengers Bangalore,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",63,47.382565,51.656894,152.349366,0,0,2,5,40.0


In [160]:
# Calculate Batsman Boundaries Count (Cumulative 4s and 6s)
merged_data['BatsmanBoundaries'] = merged_data['BatsmanRun'].apply(lambda x: 1 if x in [4, 6] else 0)
merged_data['BatsmanCumulativeBoundaries'] = merged_data.groupby(['ID', 'Batter'])['BatsmanBoundaries'].cumsum()
merged_data.drop(columns=['BatsmanBoundaries'], inplace=True)

merged_data.head(10)

Unnamed: 0,ID,Innings,Overs,BallNumber,Batter,Bowler,NonStriker,BatsmanRun,ExtrasRun,TotalRun,NonBoundary,IsWicketDelivery,PlayerOut,Kind,FieldersInvolved,BattingTeam,Season,Team1,Team2,Venue,TossWinner,TossDecision,SuperOver,WinningTeam,WonBy,Margin,Player_of_Match,Team1Players,Team2Players,TeamTotalRuns,Team1WinPercentage,Team2WinPercentage,VenueAvgRuns,TossImpact,IsSuperOver,BatsmanCumulativeRuns,BatsmanBallsFaced,BatsmanStrikeRate,BatsmanCumulativeBoundaries
178,335982,2,8,3,AA Noffke,AB Agarkar,CL White,0,0,0,0,0,NotOut,,,Royal Challengers Bangalore,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",63,47.382565,51.656894,152.349366,0,0,0,1,0.0,0
179,335982,2,8,4,AA Noffke,AB Agarkar,CL White,0,1,1,0,0,NotOut,,,Royal Challengers Bangalore,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",63,47.382565,51.656894,152.349366,0,0,0,2,0.0,0
180,335982,2,8,5,AA Noffke,AB Agarkar,CL White,0,1,1,0,0,NotOut,,,Royal Challengers Bangalore,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",63,47.382565,51.656894,152.349366,0,0,0,3,0.0,0
181,335982,2,8,6,AA Noffke,AB Agarkar,CL White,1,0,1,0,0,NotOut,,,Royal Challengers Bangalore,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",63,47.382565,51.656894,152.349366,0,0,1,4,25.0,0
184,335982,2,9,1,AA Noffke,SC Ganguly,P Kumar,1,0,1,0,0,NotOut,,,Royal Challengers Bangalore,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",63,47.382565,51.656894,152.349366,0,0,2,5,40.0,0
189,335982,2,9,6,AA Noffke,SC Ganguly,P Kumar,4,0,4,0,0,NotOut,,,Royal Challengers Bangalore,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",63,47.382565,51.656894,152.349366,0,0,6,6,100.0,1
190,335982,2,9,7,AA Noffke,SC Ganguly,P Kumar,1,0,1,0,0,NotOut,,,Royal Challengers Bangalore,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",63,47.382565,51.656894,152.349366,0,0,7,7,100.0,1
191,335982,2,10,1,AA Noffke,AB Agarkar,P Kumar,0,0,0,0,0,NotOut,,,Royal Challengers Bangalore,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",63,47.382565,51.656894,152.349366,0,0,7,8,87.5,1
192,335982,2,10,2,AA Noffke,AB Agarkar,P Kumar,0,0,0,0,0,NotOut,,,Royal Challengers Bangalore,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",63,47.382565,51.656894,152.349366,0,0,7,9,77.777778,1
193,335982,2,10,3,AA Noffke,AB Agarkar,P Kumar,0,0,0,0,0,NotOut,,,Royal Challengers Bangalore,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",63,47.382565,51.656894,152.349366,0,0,7,10,70.0,1


In [161]:
import pandas as pd

X = 5

# Store original order
merged_data = merged_data.reset_index()

# Convert 'BatsmanCumulativeRuns' to numeric
merged_data['BatsmanCumulativeRuns'] = pd.to_numeric(merged_data['BatsmanCumulativeRuns'], errors='coerce')

# Compute total runs per match for each batsman
batsman_runs_per_match = merged_data.groupby(['Batter', 'ID'], as_index=False)['BatsmanCumulativeRuns'].max()

# Sort before rolling operation
batsman_runs_per_match = batsman_runs_per_match.sort_values(by=['Batter', 'ID'])

# Compute rolling sum of past X innings
batsman_runs_per_match['BatsmanCurrentForm'] = (
    batsman_runs_per_match.groupby('Batter')['BatsmanCumulativeRuns']
    .apply(lambda x: x.shift(1).rolling(window=X, min_periods=1).sum())
    .reset_index(drop=True)  # Prevent index misalignment
)

# Merge back with original dataset
merged_data = merged_data.merge(batsman_runs_per_match[['Batter', 'ID', 'BatsmanCurrentForm']],
                                on=['Batter', 'ID'], how='left')

# Fill missing values with 0
merged_data['BatsmanCurrentForm'].fillna(0, inplace=True)

# Restore original order
merged_data = merged_data.sort_values(by=['index']).drop(columns=['index'])

# Display first 10 rows
merged_data.head(5)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_data['BatsmanCurrentForm'].fillna(0, inplace=True)


Unnamed: 0,ID,Innings,Overs,BallNumber,Batter,Bowler,NonStriker,BatsmanRun,ExtrasRun,TotalRun,NonBoundary,IsWicketDelivery,PlayerOut,Kind,FieldersInvolved,BattingTeam,Season,Team1,Team2,Venue,TossWinner,TossDecision,SuperOver,WinningTeam,WonBy,Margin,Player_of_Match,Team1Players,Team2Players,TeamTotalRuns,Team1WinPercentage,Team2WinPercentage,VenueAvgRuns,TossImpact,IsSuperOver,BatsmanCumulativeRuns,BatsmanBallsFaced,BatsmanStrikeRate,BatsmanCumulativeBoundaries,BatsmanCurrentForm
182,335982,1,0,1,SC Ganguly,P Kumar,BB McCullum,0,1,1,0,0,NotOut,,,Kolkata Knight Riders,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",205,47.382565,51.656894,152.349366,0,0,0,1,0.0,0,0.0
14,335982,1,0,2,BB McCullum,P Kumar,SC Ganguly,0,0,0,0,0,NotOut,,,Kolkata Knight Riders,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",205,47.382565,51.656894,152.349366,0,0,0,1,0.0,0,0.0
15,335982,1,0,3,BB McCullum,P Kumar,SC Ganguly,0,1,1,0,0,NotOut,,,Kolkata Knight Riders,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",205,47.382565,51.656894,152.349366,0,0,0,2,0.0,0,0.0
16,335982,1,0,4,BB McCullum,P Kumar,SC Ganguly,0,0,0,0,0,NotOut,,,Kolkata Knight Riders,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",205,47.382565,51.656894,152.349366,0,0,0,3,0.0,0,0.0
17,335982,1,0,5,BB McCullum,P Kumar,SC Ganguly,0,0,0,0,0,NotOut,,,Kolkata Knight Riders,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",205,47.382565,51.656894,152.349366,0,0,0,4,0.0,0,0.0


In [162]:
merged_data.tail(5)

Unnamed: 0,ID,Innings,Overs,BallNumber,Batter,Bowler,NonStriker,BatsmanRun,ExtrasRun,TotalRun,NonBoundary,IsWicketDelivery,PlayerOut,Kind,FieldersInvolved,BattingTeam,Season,Team1,Team2,Venue,TossWinner,TossDecision,SuperOver,WinningTeam,WonBy,Margin,Player_of_Match,Team1Players,Team2Players,TeamTotalRuns,Team1WinPercentage,Team2WinPercentage,VenueAvgRuns,TossImpact,IsSuperOver,BatsmanCumulativeRuns,BatsmanBallsFaced,BatsmanStrikeRate,BatsmanCumulativeBoundaries,BatsmanCurrentForm
260883,1426312,2,9,5,SS Iyer,AK Markram,VR Iyer,1,0,1,0,0,NotOut,,,Kolkata Knight Riders,17,Sunrisers Hyderabad,Kolkata Knight Riders,"MA Chidambaram Stadium, Chepauk, Chennai",Sunrisers Hyderabad,bat,N,Kolkata Knight Riders,Wickets,8,MA Starc,"['Abhishek Sharma', 'TM Head', 'RA Tripathi', ...","['Rahmanullah Gurbaz', 'SP Narine', 'VR Iyer',...",103,44.940211,51.656894,150.295481,0,0,5,2,250.0,1,127.0
260917,1426312,2,9,6,VR Iyer,AK Markram,SS Iyer,1,0,1,0,0,NotOut,,,Kolkata Knight Riders,17,Sunrisers Hyderabad,Kolkata Knight Riders,"MA Chidambaram Stadium, Chepauk, Chennai",Sunrisers Hyderabad,bat,N,Kolkata Knight Riders,Wickets,8,MA Starc,"['Abhishek Sharma', 'TM Head', 'RA Tripathi', ...","['Rahmanullah Gurbaz', 'SP Narine', 'VR Iyer',...",103,44.940211,51.656894,150.295481,0,0,50,25,200.0,7,190.0
260918,1426312,2,10,1,VR Iyer,Shahbaz Ahmed,SS Iyer,1,0,1,0,0,NotOut,,,Kolkata Knight Riders,17,Sunrisers Hyderabad,Kolkata Knight Riders,"MA Chidambaram Stadium, Chepauk, Chennai",Sunrisers Hyderabad,bat,N,Kolkata Knight Riders,Wickets,8,MA Starc,"['Abhishek Sharma', 'TM Head', 'RA Tripathi', ...","['Rahmanullah Gurbaz', 'SP Narine', 'VR Iyer',...",103,44.940211,51.656894,150.295481,0,0,51,26,196.153846,7,190.0
260884,1426312,2,10,2,SS Iyer,Shahbaz Ahmed,VR Iyer,1,0,1,0,0,NotOut,,,Kolkata Knight Riders,17,Sunrisers Hyderabad,Kolkata Knight Riders,"MA Chidambaram Stadium, Chepauk, Chennai",Sunrisers Hyderabad,bat,N,Kolkata Knight Riders,Wickets,8,MA Starc,"['Abhishek Sharma', 'TM Head', 'RA Tripathi', ...","['Rahmanullah Gurbaz', 'SP Narine', 'VR Iyer',...",103,44.940211,51.656894,150.295481,0,0,6,3,200.0,1,127.0
260919,1426312,2,10,3,VR Iyer,Shahbaz Ahmed,SS Iyer,1,0,1,0,0,NotOut,,,Kolkata Knight Riders,17,Sunrisers Hyderabad,Kolkata Knight Riders,"MA Chidambaram Stadium, Chepauk, Chennai",Sunrisers Hyderabad,bat,N,Kolkata Knight Riders,Wickets,8,MA Starc,"['Abhishek Sharma', 'TM Head', 'RA Tripathi', ...","['Rahmanullah Gurbaz', 'SP Narine', 'VR Iyer',...",103,44.940211,51.656894,150.295481,0,0,52,27,192.592593,7,190.0


#### Ballers

In [165]:
import numpy as np

# Calculate runs conceded per bowler per match
bowler_runs = merged_data.groupby(['Bowler', 'ID'], as_index=False)['TotalRun'].sum()

# Count valid deliveries bowled per bowler per match
bowler_balls = merged_data.groupby(['Bowler', 'ID'], as_index=False)['BallNumber'].count()
bowler_balls.rename(columns={'BallNumber': 'BallsBowled'}, inplace=True)

# Merge runs and balls data
bowler_stats = bowler_runs.merge(bowler_balls, on=['Bowler', 'ID'])

# Convert balls to overs
bowler_stats['Overs'] = bowler_stats['BallsBowled'] / 6

# Calculate Economy Rate
bowler_stats['BowlerEconomyRate'] = np.where(
    bowler_stats['Overs'] > 0, bowler_stats['TotalRun'] / bowler_stats['Overs'], np.nan
)

# Merge back into the main dataset
merged_data = merged_data.merge(bowler_stats[['Bowler', 'ID', 'BowlerEconomyRate']], on=['Bowler', 'ID'], how='left')

# Fill missing values with 0 (for bowlers who haven’t bowled yet)
merged_data['BowlerEconomyRate'].fillna(0, inplace=True)

# Display first few rows
merged_data.head(10)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_data['BowlerEconomyRate'].fillna(0, inplace=True)


Unnamed: 0,ID,Innings,Overs,BallNumber,Batter,Bowler,NonStriker,BatsmanRun,ExtrasRun,TotalRun,NonBoundary,IsWicketDelivery,PlayerOut,Kind,FieldersInvolved,BattingTeam,Season,Team1,Team2,Venue,TossWinner,TossDecision,SuperOver,WinningTeam,WonBy,Margin,Player_of_Match,Team1Players,Team2Players,TeamTotalRuns,Team1WinPercentage,Team2WinPercentage,VenueAvgRuns,TossImpact,IsSuperOver,BatsmanCumulativeRuns,BatsmanBallsFaced,BatsmanStrikeRate,BatsmanCumulativeBoundaries,BatsmanCurrentForm,BowlerEconomyRate
0,335982,1,0,1,SC Ganguly,P Kumar,BB McCullum,0,1,1,0,0,NotOut,,,Kolkata Knight Riders,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",205,47.382565,51.656894,152.349366,0,0,0,1,0.0,0,0.0,9.84
1,335982,1,0,2,BB McCullum,P Kumar,SC Ganguly,0,0,0,0,0,NotOut,,,Kolkata Knight Riders,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",205,47.382565,51.656894,152.349366,0,0,0,1,0.0,0,0.0,9.84
2,335982,1,0,3,BB McCullum,P Kumar,SC Ganguly,0,1,1,0,0,NotOut,,,Kolkata Knight Riders,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",205,47.382565,51.656894,152.349366,0,0,0,2,0.0,0,0.0,9.84
3,335982,1,0,4,BB McCullum,P Kumar,SC Ganguly,0,0,0,0,0,NotOut,,,Kolkata Knight Riders,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",205,47.382565,51.656894,152.349366,0,0,0,3,0.0,0,0.0,9.84
4,335982,1,0,5,BB McCullum,P Kumar,SC Ganguly,0,0,0,0,0,NotOut,,,Kolkata Knight Riders,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",205,47.382565,51.656894,152.349366,0,0,0,4,0.0,0,0.0,9.84
5,335982,1,0,6,BB McCullum,P Kumar,SC Ganguly,0,0,0,0,0,NotOut,,,Kolkata Knight Riders,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",205,47.382565,51.656894,152.349366,0,0,0,5,0.0,0,0.0,9.84
6,335982,1,0,7,BB McCullum,P Kumar,SC Ganguly,0,1,1,0,0,NotOut,,,Kolkata Knight Riders,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",205,47.382565,51.656894,152.349366,0,0,0,6,0.0,0,0.0,9.84
7,335982,1,1,1,BB McCullum,Z Khan,SC Ganguly,0,0,0,0,0,NotOut,,,Kolkata Knight Riders,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",205,47.382565,51.656894,152.349366,0,0,0,7,0.0,0,0.0,9.5
8,335982,1,1,2,BB McCullum,Z Khan,SC Ganguly,4,0,4,0,0,NotOut,,,Kolkata Knight Riders,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",205,47.382565,51.656894,152.349366,0,0,4,8,50.0,1,0.0,9.5
9,335982,1,1,3,BB McCullum,Z Khan,SC Ganguly,4,0,4,0,0,NotOut,,,Kolkata Knight Riders,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",205,47.382565,51.656894,152.349366,0,0,8,9,88.888889,2,0.0,9.5


In [167]:
# Count wickets taken by each bowler per match
bowler_wickets = (
    merged_data[merged_data['IsWicketDelivery'] == 1]  # Filter only wicket deliveries
    .groupby(['Bowler', 'ID'], as_index=False)['IsWicketDelivery']
    .sum()
)

# Rename column for clarity
bowler_wickets.rename(columns={'IsWicketDelivery': 'BowlerWickets'}, inplace=True)

# Merge back into the main dataset
merged_data = merged_data.merge(bowler_wickets, on=['Bowler', 'ID'], how='left')

# Fill missing values with 0 (for bowlers who didn't take any wickets)
merged_data['BowlerWickets'].fillna(0, inplace=True)

# Display first few rows
merged_data.head(5)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_data['BowlerWickets'].fillna(0, inplace=True)


Unnamed: 0,ID,Innings,Overs,BallNumber,Batter,Bowler,NonStriker,BatsmanRun,ExtrasRun,TotalRun,NonBoundary,IsWicketDelivery,PlayerOut,Kind,FieldersInvolved,BattingTeam,Season,Team1,Team2,Venue,TossWinner,TossDecision,SuperOver,WinningTeam,WonBy,Margin,Player_of_Match,Team1Players,Team2Players,TeamTotalRuns,Team1WinPercentage,Team2WinPercentage,VenueAvgRuns,TossImpact,IsSuperOver,BatsmanCumulativeRuns,BatsmanBallsFaced,BatsmanStrikeRate,BatsmanCumulativeBoundaries,BatsmanCurrentForm,BowlerEconomyRate,BowlerWickets
0,335982,1,0,1,SC Ganguly,P Kumar,BB McCullum,0,1,1,0,0,NotOut,,,Kolkata Knight Riders,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",205,47.382565,51.656894,152.349366,0,0,0,1,0.0,0,0.0,9.84,0.0
1,335982,1,0,2,BB McCullum,P Kumar,SC Ganguly,0,0,0,0,0,NotOut,,,Kolkata Knight Riders,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",205,47.382565,51.656894,152.349366,0,0,0,1,0.0,0,0.0,9.84,0.0
2,335982,1,0,3,BB McCullum,P Kumar,SC Ganguly,0,1,1,0,0,NotOut,,,Kolkata Knight Riders,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",205,47.382565,51.656894,152.349366,0,0,0,2,0.0,0,0.0,9.84,0.0
3,335982,1,0,4,BB McCullum,P Kumar,SC Ganguly,0,0,0,0,0,NotOut,,,Kolkata Knight Riders,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",205,47.382565,51.656894,152.349366,0,0,0,3,0.0,0,0.0,9.84,0.0
4,335982,1,0,5,BB McCullum,P Kumar,SC Ganguly,0,0,0,0,0,NotOut,,,Kolkata Knight Riders,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",205,47.382565,51.656894,152.349366,0,0,0,4,0.0,0,0.0,9.84,0.0


In [169]:
# Count total balls bowled per bowler per match
bowler_total_balls = (
    merged_data.groupby(['Bowler', 'ID'], as_index=False)['BallNumber'].count()
    .rename(columns={'BallNumber': 'BowlerTotalBalls'})
)

# Count total dot balls per bowler per match (TotalRun == 0 means dot ball)
bowler_dot_balls = (
    merged_data[merged_data['TotalRun'] == 0]  # Filter only dot balls
    .groupby(['Bowler', 'ID'], as_index=False)['TotalRun']
    .count()
    .rename(columns={'TotalRun': 'BowlerDotBalls'})
)

# Merge total balls and dot balls into the main dataset
merged_data = merged_data.merge(bowler_total_balls, on=['Bowler', 'ID'], how='left')
merged_data = merged_data.merge(bowler_dot_balls, on=['Bowler', 'ID'], how='left')

# Fill missing values (if a bowler has no dot balls, set it to 0)
merged_data['BowlerDotBalls'].fillna(0, inplace=True)

# Compute dot ball percentage
merged_data['BowlerDotBallPercentage'] = (merged_data['BowlerDotBalls'] / merged_data['BowlerTotalBalls']) * 100

# Fill NaN values with 0 (for cases where a bowler didn't bowl any balls)
merged_data['BowlerDotBallPercentage'].fillna(0, inplace=True)

# Display first few rows
merged_data.head(5)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_data['BowlerDotBalls'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_data['BowlerDotBallPercentage'].fillna(0, inplace=True)


Unnamed: 0,ID,Innings,Overs,BallNumber,Batter,Bowler,NonStriker,BatsmanRun,ExtrasRun,TotalRun,NonBoundary,IsWicketDelivery,PlayerOut,Kind,FieldersInvolved,BattingTeam,Season,Team1,Team2,Venue,TossWinner,TossDecision,SuperOver,WinningTeam,WonBy,Margin,Player_of_Match,Team1Players,Team2Players,TeamTotalRuns,Team1WinPercentage,Team2WinPercentage,VenueAvgRuns,TossImpact,IsSuperOver,BatsmanCumulativeRuns,BatsmanBallsFaced,BatsmanStrikeRate,BatsmanCumulativeBoundaries,BatsmanCurrentForm,BowlerEconomyRate,BowlerWickets,BowlerTotalBalls,BowlerDotBalls,BowlerDotBallPercentage
0,335982,1,0,1,SC Ganguly,P Kumar,BB McCullum,0,1,1,0,0,NotOut,,,Kolkata Knight Riders,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",205,47.382565,51.656894,152.349366,0,0,0,1,0.0,0,0.0,9.84,0.0,25,10.0,40.0
1,335982,1,0,2,BB McCullum,P Kumar,SC Ganguly,0,0,0,0,0,NotOut,,,Kolkata Knight Riders,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",205,47.382565,51.656894,152.349366,0,0,0,1,0.0,0,0.0,9.84,0.0,25,10.0,40.0
2,335982,1,0,3,BB McCullum,P Kumar,SC Ganguly,0,1,1,0,0,NotOut,,,Kolkata Knight Riders,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",205,47.382565,51.656894,152.349366,0,0,0,2,0.0,0,0.0,9.84,0.0,25,10.0,40.0
3,335982,1,0,4,BB McCullum,P Kumar,SC Ganguly,0,0,0,0,0,NotOut,,,Kolkata Knight Riders,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",205,47.382565,51.656894,152.349366,0,0,0,3,0.0,0,0.0,9.84,0.0,25,10.0,40.0
4,335982,1,0,5,BB McCullum,P Kumar,SC Ganguly,0,0,0,0,0,NotOut,,,Kolkata Knight Riders,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",205,47.382565,51.656894,152.349366,0,0,0,4,0.0,0,0.0,9.84,0.0,25,10.0,40.0


In [173]:
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260920 entries, 0 to 260919
Data columns (total 45 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   ID                           260920 non-null  int64  
 1   Innings                      260920 non-null  int64  
 2   Overs                        260920 non-null  int64  
 3   BallNumber                   260920 non-null  int64  
 4   Batter                       260920 non-null  object 
 5   Bowler                       260920 non-null  object 
 6   NonStriker                   260920 non-null  object 
 7   BatsmanRun                   260920 non-null  int64  
 8   ExtrasRun                    260920 non-null  int64  
 9   TotalRun                     260920 non-null  int64  
 10  NonBoundary                  260920 non-null  int64  
 11  IsWicketDelivery             260920 non-null  int64  
 12  PlayerOut                    260920 non-null  object 
 13 

#### Match Context Features

In [174]:
# Calculate total runs scored so far in the innings
merged_data['CumulativeRuns'] = merged_data.groupby(['ID', 'Innings'])['TotalRun'].cumsum()

# Calculate current run rate
merged_data['CurrentRunRate'] = merged_data['CumulativeRuns'] / (merged_data['Overs'] + (merged_data['BallNumber'] / 6))

merged_data.head(5)

Unnamed: 0,ID,Innings,Overs,BallNumber,Batter,Bowler,NonStriker,BatsmanRun,ExtrasRun,TotalRun,NonBoundary,IsWicketDelivery,PlayerOut,Kind,FieldersInvolved,BattingTeam,Season,Team1,Team2,Venue,TossWinner,TossDecision,SuperOver,WinningTeam,WonBy,Margin,Player_of_Match,Team1Players,Team2Players,TeamTotalRuns,Team1WinPercentage,Team2WinPercentage,VenueAvgRuns,TossImpact,IsSuperOver,BatsmanCumulativeRuns,BatsmanBallsFaced,BatsmanStrikeRate,BatsmanCumulativeBoundaries,BatsmanCurrentForm,BowlerEconomyRate,BowlerWickets,BowlerTotalBalls,BowlerDotBalls,BowlerDotBallPercentage,CumulativeRuns,CurrentRunRate
0,335982,1,0,1,SC Ganguly,P Kumar,BB McCullum,0,1,1,0,0,NotOut,,,Kolkata Knight Riders,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",205,47.382565,51.656894,152.349366,0,0,0,1,0.0,0,0.0,9.84,0.0,25,10.0,40.0,1,6.0
1,335982,1,0,2,BB McCullum,P Kumar,SC Ganguly,0,0,0,0,0,NotOut,,,Kolkata Knight Riders,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",205,47.382565,51.656894,152.349366,0,0,0,1,0.0,0,0.0,9.84,0.0,25,10.0,40.0,1,3.0
2,335982,1,0,3,BB McCullum,P Kumar,SC Ganguly,0,1,1,0,0,NotOut,,,Kolkata Knight Riders,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",205,47.382565,51.656894,152.349366,0,0,0,2,0.0,0,0.0,9.84,0.0,25,10.0,40.0,2,4.0
3,335982,1,0,4,BB McCullum,P Kumar,SC Ganguly,0,0,0,0,0,NotOut,,,Kolkata Knight Riders,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",205,47.382565,51.656894,152.349366,0,0,0,3,0.0,0,0.0,9.84,0.0,25,10.0,40.0,2,3.0
4,335982,1,0,5,BB McCullum,P Kumar,SC Ganguly,0,0,0,0,0,NotOut,,,Kolkata Knight Riders,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",205,47.382565,51.656894,152.349366,0,0,0,4,0.0,0,0.0,9.84,0.0,25,10.0,40.0,2,2.4


In [175]:
# Get target runs (1st innings total + 1)
target_runs = merged_data.groupby(['ID'])['TeamTotalRuns'].transform('max') + 1

# Compute overs remaining (for 2nd innings only)
merged_data['OversRemaining'] = 20 - (merged_data['Overs'] + (merged_data['BallNumber'] / 6))

# Required Run Rate calculation (for 2nd innings only)
merged_data['RequiredRunRate'] = (target_runs - merged_data['CumulativeRuns']) / merged_data['OversRemaining']

# Fill NaN values with 0 (only meaningful for 2nd innings)
merged_data.loc[merged_data['Innings'] == 1, 'RequiredRunRate'] = 0
merged_data['RequiredRunRate'].fillna(0, inplace=True)
merged_data.head(5)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_data['RequiredRunRate'].fillna(0, inplace=True)


Unnamed: 0,ID,Innings,Overs,BallNumber,Batter,Bowler,NonStriker,BatsmanRun,ExtrasRun,TotalRun,NonBoundary,IsWicketDelivery,PlayerOut,Kind,FieldersInvolved,BattingTeam,Season,Team1,Team2,Venue,TossWinner,TossDecision,SuperOver,WinningTeam,WonBy,Margin,Player_of_Match,Team1Players,Team2Players,TeamTotalRuns,Team1WinPercentage,Team2WinPercentage,VenueAvgRuns,TossImpact,IsSuperOver,BatsmanCumulativeRuns,BatsmanBallsFaced,BatsmanStrikeRate,BatsmanCumulativeBoundaries,BatsmanCurrentForm,BowlerEconomyRate,BowlerWickets,BowlerTotalBalls,BowlerDotBalls,BowlerDotBallPercentage,CumulativeRuns,CurrentRunRate,OversRemaining,RequiredRunRate
0,335982,1,0,1,SC Ganguly,P Kumar,BB McCullum,0,1,1,0,0,NotOut,,,Kolkata Knight Riders,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",205,47.382565,51.656894,152.349366,0,0,0,1,0.0,0,0.0,9.84,0.0,25,10.0,40.0,1,6.0,19.833333,0.0
1,335982,1,0,2,BB McCullum,P Kumar,SC Ganguly,0,0,0,0,0,NotOut,,,Kolkata Knight Riders,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",205,47.382565,51.656894,152.349366,0,0,0,1,0.0,0,0.0,9.84,0.0,25,10.0,40.0,1,3.0,19.666667,0.0
2,335982,1,0,3,BB McCullum,P Kumar,SC Ganguly,0,1,1,0,0,NotOut,,,Kolkata Knight Riders,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",205,47.382565,51.656894,152.349366,0,0,0,2,0.0,0,0.0,9.84,0.0,25,10.0,40.0,2,4.0,19.5,0.0
3,335982,1,0,4,BB McCullum,P Kumar,SC Ganguly,0,0,0,0,0,NotOut,,,Kolkata Knight Riders,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",205,47.382565,51.656894,152.349366,0,0,0,3,0.0,0,0.0,9.84,0.0,25,10.0,40.0,2,3.0,19.333333,0.0
4,335982,1,0,5,BB McCullum,P Kumar,SC Ganguly,0,0,0,0,0,NotOut,,,Kolkata Knight Riders,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",205,47.382565,51.656894,152.349366,0,0,0,4,0.0,0,0.0,9.84,0.0,25,10.0,40.0,2,2.4,19.166667,0.0


In [178]:
# Count total wickets fallen so far in the innings
merged_data['WicketsFallen'] = merged_data.groupby(['ID', 'Innings'])['IsWicketDelivery'].cumsum()

# Compute wickets left
merged_data['WicketsLeft'] = 10 - merged_data['WicketsFallen']

merged_data.tail(5)

Unnamed: 0,ID,Innings,Overs,BallNumber,Batter,Bowler,NonStriker,BatsmanRun,ExtrasRun,TotalRun,NonBoundary,IsWicketDelivery,PlayerOut,Kind,FieldersInvolved,BattingTeam,Season,Team1,Team2,Venue,TossWinner,TossDecision,SuperOver,WinningTeam,WonBy,Margin,Player_of_Match,Team1Players,Team2Players,TeamTotalRuns,Team1WinPercentage,Team2WinPercentage,VenueAvgRuns,TossImpact,IsSuperOver,BatsmanCumulativeRuns,BatsmanBallsFaced,BatsmanStrikeRate,BatsmanCumulativeBoundaries,BatsmanCurrentForm,BowlerEconomyRate,BowlerWickets,BowlerTotalBalls,BowlerDotBalls,BowlerDotBallPercentage,CumulativeRuns,CurrentRunRate,OversRemaining,RequiredRunRate,WicketsFallen,WicketsLeft
260915,1426312,2,9,5,SS Iyer,AK Markram,VR Iyer,1,0,1,0,0,NotOut,,,Kolkata Knight Riders,17,Sunrisers Hyderabad,Kolkata Knight Riders,"MA Chidambaram Stadium, Chepauk, Chennai",Sunrisers Hyderabad,bat,N,Kolkata Knight Riders,Wickets,8,MA Starc,"['Abhishek Sharma', 'TM Head', 'RA Tripathi', ...","['Rahmanullah Gurbaz', 'SP Narine', 'VR Iyer',...",103,44.940211,51.656894,150.295481,0,0,5,2,250.0,1,127.0,5.0,0.0,6,2.0,33.333333,110,11.186441,10.166667,-0.590164,2,8
260916,1426312,2,9,6,VR Iyer,AK Markram,SS Iyer,1,0,1,0,0,NotOut,,,Kolkata Knight Riders,17,Sunrisers Hyderabad,Kolkata Knight Riders,"MA Chidambaram Stadium, Chepauk, Chennai",Sunrisers Hyderabad,bat,N,Kolkata Knight Riders,Wickets,8,MA Starc,"['Abhishek Sharma', 'TM Head', 'RA Tripathi', ...","['Rahmanullah Gurbaz', 'SP Narine', 'VR Iyer',...",103,44.940211,51.656894,150.295481,0,0,50,25,200.0,7,190.0,5.0,0.0,6,2.0,33.333333,111,11.1,10.0,-0.7,2,8
260917,1426312,2,10,1,VR Iyer,Shahbaz Ahmed,SS Iyer,1,0,1,0,0,NotOut,,,Kolkata Knight Riders,17,Sunrisers Hyderabad,Kolkata Knight Riders,"MA Chidambaram Stadium, Chepauk, Chennai",Sunrisers Hyderabad,bat,N,Kolkata Knight Riders,Wickets,8,MA Starc,"['Abhishek Sharma', 'TM Head', 'RA Tripathi', ...","['Rahmanullah Gurbaz', 'SP Narine', 'VR Iyer',...",103,44.940211,51.656894,150.295481,0,0,51,26,196.153846,7,190.0,11.2,1.0,15,4.0,26.666667,112,11.016393,9.833333,-0.813559,2,8
260918,1426312,2,10,2,SS Iyer,Shahbaz Ahmed,VR Iyer,1,0,1,0,0,NotOut,,,Kolkata Knight Riders,17,Sunrisers Hyderabad,Kolkata Knight Riders,"MA Chidambaram Stadium, Chepauk, Chennai",Sunrisers Hyderabad,bat,N,Kolkata Knight Riders,Wickets,8,MA Starc,"['Abhishek Sharma', 'TM Head', 'RA Tripathi', ...","['Rahmanullah Gurbaz', 'SP Narine', 'VR Iyer',...",103,44.940211,51.656894,150.295481,0,0,6,3,200.0,1,127.0,11.2,1.0,15,4.0,26.666667,113,10.935484,9.666667,-0.931034,2,8
260919,1426312,2,10,3,VR Iyer,Shahbaz Ahmed,SS Iyer,1,0,1,0,0,NotOut,,,Kolkata Knight Riders,17,Sunrisers Hyderabad,Kolkata Knight Riders,"MA Chidambaram Stadium, Chepauk, Chennai",Sunrisers Hyderabad,bat,N,Kolkata Knight Riders,Wickets,8,MA Starc,"['Abhishek Sharma', 'TM Head', 'RA Tripathi', ...","['Rahmanullah Gurbaz', 'SP Narine', 'VR Iyer',...",103,44.940211,51.656894,150.295481,0,0,52,27,192.592593,7,190.0,11.2,1.0,15,4.0,26.666667,114,10.857143,9.5,-1.052632,2,8


In [179]:
# Avoid division by zero by replacing 0 current run rate with a small value
merged_data['PressureIndex'] = merged_data['RequiredRunRate'] / merged_data['CurrentRunRate']
merged_data['PressureIndex'].replace([float('inf'), -float('inf')], 0, inplace=True)
merged_data['PressureIndex'].fillna(0, inplace=True)

merged_data.tail(5)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_data['PressureIndex'].replace([float('inf'), -float('inf')], 0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_data['PressureIndex'].fillna(0, inplace=True)


Unnamed: 0,ID,Innings,Overs,BallNumber,Batter,Bowler,NonStriker,BatsmanRun,ExtrasRun,TotalRun,NonBoundary,IsWicketDelivery,PlayerOut,Kind,FieldersInvolved,BattingTeam,Season,Team1,Team2,Venue,TossWinner,TossDecision,SuperOver,WinningTeam,WonBy,Margin,Player_of_Match,Team1Players,Team2Players,TeamTotalRuns,Team1WinPercentage,Team2WinPercentage,VenueAvgRuns,TossImpact,IsSuperOver,BatsmanCumulativeRuns,BatsmanBallsFaced,BatsmanStrikeRate,BatsmanCumulativeBoundaries,BatsmanCurrentForm,BowlerEconomyRate,BowlerWickets,BowlerTotalBalls,BowlerDotBalls,BowlerDotBallPercentage,CumulativeRuns,CurrentRunRate,OversRemaining,RequiredRunRate,WicketsFallen,WicketsLeft,PressureIndex
260915,1426312,2,9,5,SS Iyer,AK Markram,VR Iyer,1,0,1,0,0,NotOut,,,Kolkata Knight Riders,17,Sunrisers Hyderabad,Kolkata Knight Riders,"MA Chidambaram Stadium, Chepauk, Chennai",Sunrisers Hyderabad,bat,N,Kolkata Knight Riders,Wickets,8,MA Starc,"['Abhishek Sharma', 'TM Head', 'RA Tripathi', ...","['Rahmanullah Gurbaz', 'SP Narine', 'VR Iyer',...",103,44.940211,51.656894,150.295481,0,0,5,2,250.0,1,127.0,5.0,0.0,6,2.0,33.333333,110,11.186441,10.166667,-0.590164,2,8,-0.052757
260916,1426312,2,9,6,VR Iyer,AK Markram,SS Iyer,1,0,1,0,0,NotOut,,,Kolkata Knight Riders,17,Sunrisers Hyderabad,Kolkata Knight Riders,"MA Chidambaram Stadium, Chepauk, Chennai",Sunrisers Hyderabad,bat,N,Kolkata Knight Riders,Wickets,8,MA Starc,"['Abhishek Sharma', 'TM Head', 'RA Tripathi', ...","['Rahmanullah Gurbaz', 'SP Narine', 'VR Iyer',...",103,44.940211,51.656894,150.295481,0,0,50,25,200.0,7,190.0,5.0,0.0,6,2.0,33.333333,111,11.1,10.0,-0.7,2,8,-0.063063
260917,1426312,2,10,1,VR Iyer,Shahbaz Ahmed,SS Iyer,1,0,1,0,0,NotOut,,,Kolkata Knight Riders,17,Sunrisers Hyderabad,Kolkata Knight Riders,"MA Chidambaram Stadium, Chepauk, Chennai",Sunrisers Hyderabad,bat,N,Kolkata Knight Riders,Wickets,8,MA Starc,"['Abhishek Sharma', 'TM Head', 'RA Tripathi', ...","['Rahmanullah Gurbaz', 'SP Narine', 'VR Iyer',...",103,44.940211,51.656894,150.295481,0,0,51,26,196.153846,7,190.0,11.2,1.0,15,4.0,26.666667,112,11.016393,9.833333,-0.813559,2,8,-0.07385
260918,1426312,2,10,2,SS Iyer,Shahbaz Ahmed,VR Iyer,1,0,1,0,0,NotOut,,,Kolkata Knight Riders,17,Sunrisers Hyderabad,Kolkata Knight Riders,"MA Chidambaram Stadium, Chepauk, Chennai",Sunrisers Hyderabad,bat,N,Kolkata Knight Riders,Wickets,8,MA Starc,"['Abhishek Sharma', 'TM Head', 'RA Tripathi', ...","['Rahmanullah Gurbaz', 'SP Narine', 'VR Iyer',...",103,44.940211,51.656894,150.295481,0,0,6,3,200.0,1,127.0,11.2,1.0,15,4.0,26.666667,113,10.935484,9.666667,-0.931034,2,8,-0.085139
260919,1426312,2,10,3,VR Iyer,Shahbaz Ahmed,SS Iyer,1,0,1,0,0,NotOut,,,Kolkata Knight Riders,17,Sunrisers Hyderabad,Kolkata Knight Riders,"MA Chidambaram Stadium, Chepauk, Chennai",Sunrisers Hyderabad,bat,N,Kolkata Knight Riders,Wickets,8,MA Starc,"['Abhishek Sharma', 'TM Head', 'RA Tripathi', ...","['Rahmanullah Gurbaz', 'SP Narine', 'VR Iyer',...",103,44.940211,51.656894,150.295481,0,0,52,27,192.592593,7,190.0,11.2,1.0,15,4.0,26.666667,114,10.857143,9.5,-1.052632,2,8,-0.096953


#### Momentum Features

In [182]:
# Compute last 5 balls runs
merged_data['Last5BallsRuns'] = (
    merged_data.groupby(['ID', 'Innings'])['TotalRun']
    .rolling(window=5, min_periods=1)
    .sum()
    .reset_index(level=[0, 1], drop=True)
)

# Compute last 5 balls wickets
merged_data['Last5BallsWickets'] = (
    merged_data.groupby(['ID', 'Innings'])['IsWicketDelivery']
    .rolling(window=5, min_periods=1)
    .sum()
    .reset_index(level=[0, 1], drop=True)
)

# Calculate total runs scored in each over
merged_data['LastOverRuns'] = merged_data.groupby(['ID', 'Innings', 'Overs'])['TotalRun'].transform('sum')
# Define the partnership key using the two batters
merged_data['Partnership'] = merged_data.apply(lambda x: '_'.join(sorted([x['Batter'], x['NonStriker']])), axis=1)

# Compute cumulative partnership runs within each match and innings
merged_data['CurrentPartnershipRuns'] = merged_data.groupby(['ID', 'Innings', 'Partnership'])['TotalRun'].cumsum()

In [185]:
# Is Powerplay (1 if over is between 1-6, else 0)
merged_data['IsPowerplay'] = merged_data['Overs'].apply(lambda x: 1 if 0 <= x < 6 else 0)

# Is Middle Over (1 if over is between 7-15, else 0)
merged_data['IsMiddleOver'] = merged_data['Overs'].apply(lambda x: 1 if 6 <= x <= 14 else 0)

# Is Death Over (1 if over is between 16-20, else 0)
merged_data['IsDeathOver'] = merged_data['Overs'].apply(lambda x: 1 if 15 <= x <= 20 else 0)
merged_data.head(5)

Unnamed: 0,ID,Innings,Overs,BallNumber,Batter,Bowler,NonStriker,BatsmanRun,ExtrasRun,TotalRun,NonBoundary,IsWicketDelivery,PlayerOut,Kind,FieldersInvolved,BattingTeam,Season,Team1,Team2,Venue,TossWinner,TossDecision,SuperOver,WinningTeam,WonBy,Margin,Player_of_Match,Team1Players,Team2Players,TeamTotalRuns,Team1WinPercentage,Team2WinPercentage,VenueAvgRuns,TossImpact,IsSuperOver,BatsmanCumulativeRuns,BatsmanBallsFaced,BatsmanStrikeRate,BatsmanCumulativeBoundaries,BatsmanCurrentForm,BowlerEconomyRate,BowlerWickets,BowlerTotalBalls,BowlerDotBalls,BowlerDotBallPercentage,CumulativeRuns,CurrentRunRate,OversRemaining,RequiredRunRate,WicketsFallen,WicketsLeft,PressureIndex,Last5BallsRuns,Last5BallsWickets,LastOverRuns,Partnership,CurrentPartnershipRuns,IsPowerplay,IsMiddleOver,IsDeathOver
0,335982,1,0,1,SC Ganguly,P Kumar,BB McCullum,0,1,1,0,0,NotOut,,,Kolkata Knight Riders,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",205,47.382565,51.656894,152.349366,0,0,0,1,0.0,0,0.0,9.84,0.0,25,10.0,40.0,1,6.0,19.833333,0.0,0,10,0.0,1.0,0.0,3,BB McCullum_SC Ganguly,1,1,0,0
1,335982,1,0,2,BB McCullum,P Kumar,SC Ganguly,0,0,0,0,0,NotOut,,,Kolkata Knight Riders,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",205,47.382565,51.656894,152.349366,0,0,0,1,0.0,0,0.0,9.84,0.0,25,10.0,40.0,1,3.0,19.666667,0.0,0,10,0.0,1.0,0.0,3,BB McCullum_SC Ganguly,1,1,0,0
2,335982,1,0,3,BB McCullum,P Kumar,SC Ganguly,0,1,1,0,0,NotOut,,,Kolkata Knight Riders,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",205,47.382565,51.656894,152.349366,0,0,0,2,0.0,0,0.0,9.84,0.0,25,10.0,40.0,2,4.0,19.5,0.0,0,10,0.0,2.0,0.0,3,BB McCullum_SC Ganguly,2,1,0,0
3,335982,1,0,4,BB McCullum,P Kumar,SC Ganguly,0,0,0,0,0,NotOut,,,Kolkata Knight Riders,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",205,47.382565,51.656894,152.349366,0,0,0,3,0.0,0,0.0,9.84,0.0,25,10.0,40.0,2,3.0,19.333333,0.0,0,10,0.0,2.0,0.0,3,BB McCullum_SC Ganguly,2,1,0,0
4,335982,1,0,5,BB McCullum,P Kumar,SC Ganguly,0,0,0,0,0,NotOut,,,Kolkata Knight Riders,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",205,47.382565,51.656894,152.349366,0,0,0,4,0.0,0,0.0,9.84,0.0,25,10.0,40.0,2,2.4,19.166667,0.0,0,10,0.0,2.0,0.0,3,BB McCullum_SC Ganguly,2,1,0,0


In [187]:
def match_phase_weight(over):
    if 1 <= over <= 6:
        return 1.2  # Powerplay impact
    elif 7 <= over <= 15:
        return 1.0  # Middle overs impact
    else:
        return 1.5  # Death overs impact

merged_data['MatchPhaseImpact'] = merged_data['Overs'].apply(match_phase_weight)

In [188]:
# Group by Batsman-Bowler pair to get historical stats
matchup_stats = merged_data.groupby(['Batter', 'Bowler']).agg(
    BatsmanTotalRuns=('BatsmanRun', 'sum'),
    BatsmanBallsFaced=('BallNumber', 'count'),
    Dismissals=('IsWicketDelivery', 'sum')
).reset_index()

# Calculate Strike Rate & Dismissal Rate
matchup_stats['BatsmanVsBowlerStrikeRate'] = (matchup_stats['BatsmanTotalRuns'] / matchup_stats['BatsmanBallsFaced']) * 100
matchup_stats['BatsmanVsBowlerDismissalRate'] = matchup_stats['Dismissals'] / matchup_stats['BatsmanBallsFaced']

# Merge back to main dataset
merged_data = merged_data.merge(matchup_stats[['Batter', 'Bowler', 'BatsmanVsBowlerStrikeRate', 'BatsmanVsBowlerDismissalRate']],
                                on=['Batter', 'Bowler'], how='left')

merged_data.tail()


Unnamed: 0,ID,Innings,Overs,BallNumber,Batter,Bowler,NonStriker,BatsmanRun,ExtrasRun,TotalRun,NonBoundary,IsWicketDelivery,PlayerOut,Kind,FieldersInvolved,BattingTeam,Season,Team1,Team2,Venue,TossWinner,TossDecision,SuperOver,WinningTeam,WonBy,Margin,Player_of_Match,Team1Players,Team2Players,TeamTotalRuns,Team1WinPercentage,Team2WinPercentage,VenueAvgRuns,TossImpact,IsSuperOver,BatsmanCumulativeRuns,BatsmanBallsFaced,BatsmanStrikeRate,BatsmanCumulativeBoundaries,BatsmanCurrentForm,BowlerEconomyRate,BowlerWickets,BowlerTotalBalls,BowlerDotBalls,BowlerDotBallPercentage,CumulativeRuns,CurrentRunRate,OversRemaining,RequiredRunRate,WicketsFallen,WicketsLeft,PressureIndex,Last5BallsRuns,Last5BallsWickets,LastOverRuns,Partnership,CurrentPartnershipRuns,IsPowerplay,IsMiddleOver,IsDeathOver,MatchPhaseImpact,BatsmanVsBowlerStrikeRate,BatsmanVsBowlerDismissalRate
260915,1426312,2,9,5,SS Iyer,AK Markram,VR Iyer,1,0,1,0,0,NotOut,,,Kolkata Knight Riders,17,Sunrisers Hyderabad,Kolkata Knight Riders,"MA Chidambaram Stadium, Chepauk, Chennai",Sunrisers Hyderabad,bat,N,Kolkata Knight Riders,Wickets,8,MA Starc,"['Abhishek Sharma', 'TM Head', 'RA Tripathi', ...","['Rahmanullah Gurbaz', 'SP Narine', 'VR Iyer',...",103,44.940211,51.656894,150.295481,0,0,5,2,250.0,1,127.0,5.0,0.0,6,2.0,33.333333,110,11.186441,10.166667,-0.590164,2,8,-0.052757,4.0,0.0,5,SS Iyer_VR Iyer,8,0,1,0,1.0,100.0,0.0
260916,1426312,2,9,6,VR Iyer,AK Markram,SS Iyer,1,0,1,0,0,NotOut,,,Kolkata Knight Riders,17,Sunrisers Hyderabad,Kolkata Knight Riders,"MA Chidambaram Stadium, Chepauk, Chennai",Sunrisers Hyderabad,bat,N,Kolkata Knight Riders,Wickets,8,MA Starc,"['Abhishek Sharma', 'TM Head', 'RA Tripathi', ...","['Rahmanullah Gurbaz', 'SP Narine', 'VR Iyer',...",103,44.940211,51.656894,150.295481,0,0,50,25,200.0,7,190.0,5.0,0.0,6,2.0,33.333333,111,11.1,10.0,-0.7,2,8,-0.063063,3.0,0.0,5,SS Iyer_VR Iyer,9,0,1,0,1.0,80.0,0.0
260917,1426312,2,10,1,VR Iyer,Shahbaz Ahmed,SS Iyer,1,0,1,0,0,NotOut,,,Kolkata Knight Riders,17,Sunrisers Hyderabad,Kolkata Knight Riders,"MA Chidambaram Stadium, Chepauk, Chennai",Sunrisers Hyderabad,bat,N,Kolkata Knight Riders,Wickets,8,MA Starc,"['Abhishek Sharma', 'TM Head', 'RA Tripathi', ...","['Rahmanullah Gurbaz', 'SP Narine', 'VR Iyer',...",103,44.940211,51.656894,150.295481,0,0,51,26,196.153846,7,190.0,11.2,1.0,15,4.0,26.666667,112,11.016393,9.833333,-0.813559,2,8,-0.07385,4.0,0.0,3,SS Iyer_VR Iyer,10,0,1,0,1.0,80.0,0.0
260918,1426312,2,10,2,SS Iyer,Shahbaz Ahmed,VR Iyer,1,0,1,0,0,NotOut,,,Kolkata Knight Riders,17,Sunrisers Hyderabad,Kolkata Knight Riders,"MA Chidambaram Stadium, Chepauk, Chennai",Sunrisers Hyderabad,bat,N,Kolkata Knight Riders,Wickets,8,MA Starc,"['Abhishek Sharma', 'TM Head', 'RA Tripathi', ...","['Rahmanullah Gurbaz', 'SP Narine', 'VR Iyer',...",103,44.940211,51.656894,150.295481,0,0,6,3,200.0,1,127.0,11.2,1.0,15,4.0,26.666667,113,10.935484,9.666667,-0.931034,2,8,-0.085139,5.0,0.0,3,SS Iyer_VR Iyer,11,0,1,0,1.0,114.285714,0.142857
260919,1426312,2,10,3,VR Iyer,Shahbaz Ahmed,SS Iyer,1,0,1,0,0,NotOut,,,Kolkata Knight Riders,17,Sunrisers Hyderabad,Kolkata Knight Riders,"MA Chidambaram Stadium, Chepauk, Chennai",Sunrisers Hyderabad,bat,N,Kolkata Knight Riders,Wickets,8,MA Starc,"['Abhishek Sharma', 'TM Head', 'RA Tripathi', ...","['Rahmanullah Gurbaz', 'SP Narine', 'VR Iyer',...",103,44.940211,51.656894,150.295481,0,0,52,27,192.592593,7,190.0,11.2,1.0,15,4.0,26.666667,114,10.857143,9.5,-1.052632,2,8,-0.096953,5.0,0.0,3,SS Iyer_VR Iyer,12,0,1,0,1.0,80.0,0.0


### Model Training

In [196]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Using merged_data directly
df = merged_data.copy()

# Step 1: Train-Test Split based on Season
train_df = df[df["Season"] < 17]  # 2008-2023 for training
test_df = df[df["Season"] == 17]  # 2024 for testing

# Identify categorical features to encode
categorical_features = [
    "Batter", "Bowler", "NonStriker", "BattingTeam", "Team1", "Team2", "Venue",
    "TossWinner", "WinningTeam", "Player_of_Match", "TossDecision", "SuperOver", "WonBy"
]

# Apply Label Encoding using pd.Categorical to handle unseen values
for col in categorical_features:
    train_df[col] = train_df[col].astype("category")
    test_df[col] = pd.Categorical(test_df[col], categories=train_df[col].cat.categories)  # Ensures same mapping

    train_df[col] = train_df[col].cat.codes
    test_df[col] = test_df[col].cat.codes  # Unseen categories will be encoded as -1

# Define features (X) and target (y) for regression
feature_cols = [col for col in train_df.columns if col not in ["TotalRun", "ID", "Season"]]
X_train, y_train = train_df[feature_cols], train_df["TotalRun"]
X_test, y_test = test_df[feature_cols], test_df["TotalRun"]

# Check final dataset shapes
X_train.shape, X_test.shape, y_train.shape, y_test.shape


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df[col] = train_df[col].astype("category")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df[col] = pd.Categorical(test_df[col], categories=train_df[col].cat.categories)  # Ensures same mapping
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df[col] = train_df[col].cat.codes
A value

((243817, 60), (17103, 60), (243817,), (17103,))

In [None]:
import torch
from transformers import AutoTokenizer

# Define the model tokenizer
MODEL_NAME = "allenai/longformer-base-4096"  # Longformer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Convert features to text format (for Transformers)
def create_text_input(row):
    return (
        f"Batter: {row['Batter']} | "
        f"Bowler: {row['Bowler']} | "
        f"Venue: {row['Venue']} | "
        f"Batting Team: {row['BattingTeam']} | "
        f"Opposition: {row['Team2']} | "
        f"Toss Winner: {row['TossWinner']} | "
        f"Previous Runs: {row['TotalRun']}"
    )

# Apply to train/test data
train_texts = train_df.apply(create_text_input, axis=1).tolist()
test_texts = test_df.apply(create_text_input, axis=1).tolist()

# Encode text using Longformer tokenizer
train_encodings = tokenizer(train_texts, padding=True, truncation=True, max_length=512, return_tensors="pt")
test_encodings = tokenizer(test_texts, padding=True, truncation=True, max_length=512, return_tensors="pt")

# Convert targets to tensor
train_labels = torch.tensor(y_train.values, dtype=torch.float32)
test_labels = torch.tensor(y_test.values, dtype=torch.float32)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]