In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
import pickle


In [2]:
matches = pd.read_csv('IPL_Matches_2008_2022.csv')
balls   = pd.read_csv('IPL_Ball_by_Ball_2008_2022.csv')

matches.head(), balls.head()


(        ID       City        Date Season  MatchNumber  \
 0  1312200  Ahmedabad  2022-05-29   2022        Final   
 1  1312199  Ahmedabad  2022-05-27   2022  Qualifier 2   
 2  1312198    Kolkata  2022-05-25   2022   Eliminator   
 3  1312197    Kolkata  2022-05-24   2022  Qualifier 1   
 4  1304116     Mumbai  2022-05-22   2022           70   
 
                          Team1                 Team2  \
 0             Rajasthan Royals        Gujarat Titans   
 1  Royal Challengers Bangalore      Rajasthan Royals   
 2  Royal Challengers Bangalore  Lucknow Super Giants   
 3             Rajasthan Royals        Gujarat Titans   
 4          Sunrisers Hyderabad          Punjab Kings   
 
                               Venue            TossWinner TossDecision  \
 0  Narendra Modi Stadium, Ahmedabad      Rajasthan Royals          bat   
 1  Narendra Modi Stadium, Ahmedabad      Rajasthan Royals        field   
 2             Eden Gardens, Kolkata  Lucknow Super Giants        field   
 3    

In [3]:
# Use only regular IPL (exclude Super Over etc)
balls = balls[balls['innings'].isin([1, 2])]

# Merge total innings score from matches/aggregated balls
innings_totals = (
    balls.groupby(['ID', 'innings'])['total_run']
    .sum()
    .reset_index()
    .rename(columns={'total_run': 'target'})
)

data = balls.merge(innings_totals, on=['ID', 'innings'], how='left')

# Sort by ball order
data = data.sort_values(['ID', 'innings', 'overs', 'ballnumber'])


In [4]:
# Cumulative runs and wickets at each ball
data['current_runs'] = data.groupby(['ID', 'innings'])['total_run'].cumsum()
data['dismissal'] = np.where(data['kind'].notna(), 1, 0)
data['current_wickets'] = data.groupby(['ID', 'innings'])['dismissal'].cumsum()

# Overs as float: e.g. 5.2 overs
data['over_float'] = data['overs'] + data['ballnumber'] / 10.0

# Runs & wickets in last 5 overs (30 balls)
def last_5(group):
    group = group.copy()
    group['runs_last_5'] = group['total_run'].rolling(window=30, min_periods=1).sum()
    group['wickets_last_5'] = group['dismissal'].rolling(window=30, min_periods=1).sum()
    return group

data = data.groupby(['ID', 'innings'], group_keys=False).apply(last_5)


  data = data.groupby(['ID', 'innings'], group_keys=False).apply(last_5)


In [5]:
# Merge team names and city from matches
data = data.merge(
    matches[['ID', 'Team1', 'Team2', 'City']],
    on='ID',
    how='left'
)

# Decide batting and bowling teams by innings
data['BattingTeam'] = np.where(data['innings'] == 1, data['Team1'], data['Team2'])
data['BowlingTeam'] = np.where(data['innings'] == 1, data['Team2'], data['Team1'])

# Keep only meaningful balls (e.g. after 5 overs)
model_df = data[
    (data['over_float'] >= 5.0)  # you can adjust this
][
    [
        'BattingTeam',
        'BowlingTeam',
        'City',
        'current_runs',
        'current_wickets',
        'over_float',
        'runs_last_5',
        'wickets_last_5',
        'target'
    ]
].dropna()


In [6]:
model_df = model_df.rename(columns={
    'current_runs':    'runs',
    'current_wickets': 'wickets',
    'over_float':      'overs'
})

model_df.head()


Unnamed: 0,BattingTeam,BowlingTeam,City,runs,wickets,overs,runs_last_5,wickets_last_5,target
32,Royal Challengers Bangalore,Kolkata Knight Riders,Bangalore,61,0,5.1,59.0,0.0,222
33,Royal Challengers Bangalore,Kolkata Knight Riders,Bangalore,61,1,5.2,59.0,1.0,222
34,Royal Challengers Bangalore,Kolkata Knight Riders,Bangalore,61,1,5.3,59.0,1.0,222
35,Royal Challengers Bangalore,Kolkata Knight Riders,Bangalore,61,1,5.4,59.0,1.0,222
36,Royal Challengers Bangalore,Kolkata Knight Riders,Bangalore,61,1,5.5,58.0,1.0,222


In [7]:
X = model_df.drop('target', axis=1)
y = model_df['target']

cat_cols = ['BattingTeam', 'BowlingTeam', 'City']
num_cols = ['runs', 'wickets', 'overs', 'runs_last_5', 'wickets_last_5']

preprocess = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
        ('num', 'passthrough', num_cols)
    ]
)

model = Pipeline(steps=[
    ('preprocess', preprocess),
    ('rf', RandomForestRegressor(
        n_estimators=200,
        random_state=42,
        n_jobs=-1
    ))
])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model.fit(X_train, y_train)

print("Train R^2:", model.score(X_train, y_train))
print("Test  R^2:", model.score(X_test, y_test))


Train R^2: 0.9885852927803555
Test  R^2: 0.9153250986912839


In [8]:
import pickle

with open('../ipl1_model.pkl', 'wb') as f:
    pickle.dump(model, f)
