In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputRegressor

from sklearn.metrics import r2_score, mean_squared_error
import pickle


In [None]:
# Make sure CSVs are in the same folder as this notebook
balls = pd.read_csv('IPL_Ball_by_Ball_2008_2022.csv')
matches = pd.read_csv('IPL_Matches_2008_2022.csv')

print("Balls shape:", balls.shape)
print("Matches shape:", matches.shape)

balls.head(), matches.head()


Balls shape: (225954, 17)
Matches shape: (950, 20)


(        ID  innings  overs  ballnumber       batter          bowler  \
 0  1312200        1      0           1  YBK Jaiswal  Mohammed Shami   
 1  1312200        1      0           2  YBK Jaiswal  Mohammed Shami   
 2  1312200        1      0           3   JC Buttler  Mohammed Shami   
 3  1312200        1      0           4  YBK Jaiswal  Mohammed Shami   
 4  1312200        1      0           5  YBK Jaiswal  Mohammed Shami   
 
    non-striker extra_type  batsman_run  extras_run  total_run  non_boundary  \
 0   JC Buttler        NaN            0           0          0             0   
 1   JC Buttler    legbyes            0           1          1             0   
 2  YBK Jaiswal        NaN            1           0          1             0   
 3   JC Buttler        NaN            0           0          0             0   
 4   JC Buttler        NaN            0           0          0             0   
 
    isWicketDelivery player_out kind fielders_involved       BattingTeam  
 0       

In [None]:
# Create BowlingTeam column because dataset only has BattingTeam

# Step 1: get batting teams per match & innings
teams = (
    balls.groupby(['ID', 'innings'])['BattingTeam']
    .unique()
    .reset_index()
)

# Step 2: build a lookup dict of batting teams in each innings
team_lookup = {}
for _, row in teams.iterrows():
    team_lookup.setdefault(row['ID'], {})[row['innings']] = list(row['BattingTeam'])

# Step 3: assign BowlingTeam = the team NOT batting
def get_bowling_team(row):
    teams_in_innings = team_lookup[row['ID']][row['innings']]
    batting_team = row['BattingTeam']
    # Return the opposite team
    for t in teams_in_innings:
        if t != batting_team:
            return t
    return batting_team  # fallback (should not happen)

balls['BowlingTeam'] = balls.apply(get_bowling_team, axis=1)

print(balls[['BattingTeam', 'BowlingTeam']].head())


        BattingTeam       BowlingTeam
0  Rajasthan Royals  Rajasthan Royals
1  Rajasthan Royals  Rajasthan Royals
2  Rajasthan Royals  Rajasthan Royals
3  Rajasthan Royals  Rajasthan Royals
4  Rajasthan Royals  Rajasthan Royals


In [None]:
# Aggregate per match (ID), batter, and BowlingTeam
batsman_stats = (
    balls
    .groupby(['ID', 'batter', 'BowlingTeam'])
    .agg(
        TotalRuns=('batsman_run', 'sum'),
        BallsFaced=('ballnumber', 'count'),
        Fours=('batsman_run', lambda x: (x == 4).sum()),
        Sixes=('batsman_run', lambda x: (x == 6).sum())
    )
    .reset_index()
)

batsman_stats.head()


Unnamed: 0,ID,batter,BowlingTeam,TotalRuns,BallsFaced,Fours,Sixes
0,335982,AA Noffke,Royal Challengers Bangalore,9,12,1,0
1,335982,B Akhil,Royal Challengers Bangalore,0,2,0,0
2,335982,BB McCullum,Kolkata Knight Riders,158,77,10,13
3,335982,CL White,Royal Challengers Bangalore,6,10,0,0
4,335982,DJ Hussey,Kolkata Knight Riders,12,12,1,0


In [None]:
# Strike rate: runs per 100 balls
batsman_stats['StrikeRate'] = np.where(
    batsman_stats['BallsFaced'] > 0,
    batsman_stats['TotalRuns'] * 100 / batsman_stats['BallsFaced'],
    0.0
)

# Avg4s and Avg6s per match (here: just the count in that match)
batsman_stats['Avg4s'] = batsman_stats['Fours'].astype(float)
batsman_stats['Avg6s'] = batsman_stats['Sixes'].astype(float)

# Simple impact score (you can tweak this later)
batsman_stats['ImpactScore'] = (
    batsman_stats['TotalRuns'] +
    0.5 * batsman_stats['Fours'] +
    1.0 * batsman_stats['Sixes']
)

batsman_stats.head()


Unnamed: 0,ID,batter,BowlingTeam,TotalRuns,BallsFaced,Fours,Sixes,StrikeRate,Avg4s,Avg6s,ImpactScore
0,335982,AA Noffke,Royal Challengers Bangalore,9,12,1,0,75.0,1.0,0.0,9.5
1,335982,B Akhil,Royal Challengers Bangalore,0,2,0,0,0.0,0.0,0.0,0.0
2,335982,BB McCullum,Kolkata Knight Riders,158,77,10,13,205.194805,10.0,13.0,176.0
3,335982,CL White,Royal Challengers Bangalore,6,10,0,0,60.0,0.0,0.0,6.0
4,335982,DJ Hussey,Kolkata Knight Riders,12,12,1,0,100.0,1.0,0.0,12.5


In [None]:
# Keep only context columns we care about
match_context = matches[['ID', 'City', 'TossDecision']]

# Merge context into batsman_stats
batsman_final = batsman_stats.merge(match_context, on='ID', how='left')

batsman_final.head()


Unnamed: 0,ID,batter,BowlingTeam,TotalRuns,BallsFaced,Fours,Sixes,StrikeRate,Avg4s,Avg6s,ImpactScore,City,TossDecision
0,335982,AA Noffke,Royal Challengers Bangalore,9,12,1,0,75.0,1.0,0.0,9.5,Bangalore,field
1,335982,B Akhil,Royal Challengers Bangalore,0,2,0,0,0.0,0.0,0.0,0.0,Bangalore,field
2,335982,BB McCullum,Kolkata Knight Riders,158,77,10,13,205.194805,10.0,13.0,176.0,Bangalore,field
3,335982,CL White,Royal Challengers Bangalore,6,10,0,0,60.0,0.0,0.0,6.0,Bangalore,field
4,335982,DJ Hussey,Kolkata Knight Riders,12,12,1,0,100.0,1.0,0.0,12.5,Bangalore,field


In [None]:
# Drop rows with any missing required values
required_cols = [
    'batter', 'BowlingTeam', 'City', 'TossDecision',
    'TotalRuns', 'StrikeRate', 'Avg4s', 'Avg6s', 'ImpactScore'
]

batsman_final = batsman_final.dropna(subset=required_cols)

# You can also filter unrealistic entries (optional)
# e.g. at least 5 balls faced
batsman_final = batsman_final[batsman_final['BallsFaced'] >= 5]

batsman_final[required_cols].describe()


Unnamed: 0,TotalRuns,StrikeRate,Avg4s,Avg6s,ImpactScore
count,10453.0,10453.0,10453.0,10453.0,10453.0
mean,24.839855,117.92692,2.276093,0.940687,26.918588
std,21.26893,51.979605,2.396443,1.420196,23.333559
min,0.0,0.0,0.0,0.0,0.0
25%,9.0,84.210526,0.0,0.0,9.5
50%,18.0,116.666667,2.0,0.0,20.0
75%,35.0,148.0,3.0,1.0,38.0
max,175.0,422.222222,19.0,17.0,198.5


In [None]:
features = ['batter', 'BowlingTeam', 'City', 'TossDecision']
target_columns = ['TotalRuns', 'StrikeRate', 'Avg4s', 'Avg6s', 'ImpactScore']

batsman_model_df = batsman_final[
    (batsman_final['BallsFaced'] >= 10) &   # faced at least 10 balls
    (batsman_final['TotalRuns'] >= 10)      # scored at least 10 runs
].dropna(subset=features + target_columns)

X = batsman_model_df[features]
y = batsman_model_df[target_columns]


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Train size:", X_train.shape, "Test size:", X_test.shape)


Train size: (5523, 4) Test size: (1381, 4)


In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        (
            'cat',
            OneHotEncoder(handle_unknown='ignore'),
            ['batter', 'BowlingTeam', 'City', 'TossDecision']
        )
    ]
)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', MultiOutputRegressor(
        RandomForestRegressor(
            n_estimators=200,
            random_state=42,
            n_jobs=-1
        )
    ))
])

pipeline


0,1,2
,steps,"[('preprocessor', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,estimator,RandomForestR...ndom_state=42)
,n_jobs,

0,1,2
,n_estimators,200
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [None]:
pipeline.fit(X_train, y_train)

# Predictions on test set
y_pred = pipeline.predict(X_test)

# Evaluate TotalRuns specifically
r2_total = r2_score(y_test['TotalRuns'], y_pred[:, 0])
mse_total = mean_squared_error(y_test['TotalRuns'], y_pred[:, 0])

print("TotalRuns - Test R²:", round(r2_total, 3))
print("TotalRuns - Test MSE:", round(mse_total, 3))

# (Optional) check one example
y_test_sample = y_test.iloc[0]
y_pred_sample = y_pred[0]

print("\nSample true:", y_test_sample.to_dict())
print("Sample pred:", {
    'TotalRuns': round(float(y_pred_sample[0]), 1),
    'StrikeRate': round(float(y_pred_sample[1]), 1),
    'Avg4s': round(float(y_pred_sample[2]), 1),
    'Avg6s': round(float(y_pred_sample[3]), 1),
    'ImpactScore': round(float(y_pred_sample[4]), 1),
})


TotalRuns - Test R²: -0.093
TotalRuns - Test MSE: 474.506

Sample true: {'TotalRuns': 13.0, 'StrikeRate': 92.85714285714286, 'Avg4s': 2.0, 'Avg6s': 0.0, 'ImpactScore': 14.0}
Sample pred: {'TotalRuns': 22.9, 'StrikeRate': 91.1, 'Avg4s': 3.3, 'Avg6s': 0.2, 'ImpactScore': 25.9}


In [None]:
# Save the trained pipeline one level up so it's next to app.py
with open('../batsmanmodel.pkl', 'wb') as f:
    pickle.dump(pipeline, f)

print("Saved batsmanmodel.pkl")


Saved batsmanmodel.pkl


In [None]:
test_df = pd.DataFrame([{
    "batter": "Virat Kohli",
    "BowlingTeam": "Chennai Super Kings",
    "City": "Bangalore",
    "TossDecision": "bat"
}])

pred = pipeline.predict(test_df)[0]

print({
    "TotalRuns":   round(float(pred[0]), 1),
    "StrikeRate":  round(float(pred[1]), 1),
    "Avg4s":       round(float(pred[2]), 1),
    "Avg6s":       round(float(pred[3]), 1),
    "ImpactScore": round(float(pred[4]), 1),
})


{'TotalRuns': 28.5, 'StrikeRate': 129.5, 'Avg4s': 2.6, 'Avg6s': 1.8, 'ImpactScore': 30.9}


In [None]:
import pickle

with open("batsmanmodel.pkl", "wb") as f:
    pickle.dump(pipeline, f)


In [None]:
with open("batsmanmodel.pkl", "rb") as f:
    test_model = pickle.load(f)

test_pred = test_model.predict(test_df)[0]
print(test_pred)


[ 28.48875    129.53971011   2.55558333   1.79952381  30.90145833]
