In [29]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-3.0.3-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.3-py3-none-win_amd64.whl (149.9 MB)
   ---------------------------------------- 0.0/149.9 MB ? eta -:--:--
   ---------------------------------------- 1.6/149.9 MB 8.3 MB/s eta 0:00:18
    --------------------------------------- 3.7/149.9 MB 9.1 MB/s eta 0:00:17
   - -------------------------------------- 5.5/149.9 MB 9.1 MB/s eta 0:00:16
   - -------------------------------------- 7.3/149.9 MB 9.1 MB/s eta 0:00:16
   -- ------------------------------------- 9.2/149.9 MB 9.0 MB/s eta 0:00:16
   --- ------------------------------------ 11.3/149.9 MB 9.1 MB/s eta 0:00:16
   --- ------------------------------------ 13.1/149.9 MB 9.1 MB/s eta 0:00:15
   --- ------------------------------------ 14.9/149.9 MB 9.1 MB/s eta 0:00:15
   ---- ----------------------------------- 17.0/149.9 MB 9.3 MB/s eta 0:00:15
   ----- ---------------------------------- 18.9/149.9 MB 9.2 MB/s eta 0:00

In [30]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
import warnings
warnings.filterwarnings('ignore')

In [32]:
deliveries = pd.read_csv('deliveries.csv')
matches = pd.read_csv('matches.csv')

In [34]:
player_match_df = deliveries.groupby(['match_id', 'batsman']).agg({
    'batsman_runs': 'sum',
    'ball': 'count',
    'bowling_team': lambda x: x.mode()[0],
    'batting_team': lambda x: x.mode()[0]
}).rename(columns={'batsman_runs': 'runs_scored', 'ball': 'balls_faced'}).reset_index()

In [35]:
player_match_df = player_match_df.sort_values(by=['batsman', 'match_id'])

player_match_df['rolling_avg_runs_3'] = (
    player_match_df.groupby('batsman')['runs_scored']
    .transform(lambda x: x.shift(1).rolling(window=3, min_periods=1).mean())
)

player_match_df['rolling_strike_rate_3'] = (
    100 * player_match_df.groupby('batsman')['runs_scored']
    .transform(lambda x: x.shift(1).rolling(3, min_periods=1).sum()) /
    player_match_df.groupby('batsman')['balls_faced']
    .transform(lambda x: x.shift(1).rolling(3, min_periods=1).sum())
)

player_match_df['matches_played'] = (
    player_match_df.groupby('batsman').cumcount()
)

In [36]:
# Keep only players with 20+ matches
counts = player_match_df['batsman'].value_counts()
frequent_players = counts[counts > 20].index
player_match_df = player_match_df[player_match_df['batsman'].isin(frequent_players)]

In [37]:
df = pd.get_dummies(player_match_df, columns=['batting_team', 'bowling_team'])

# Drop rows with NaNs from rolling stats
df = df.dropna(subset=['rolling_avg_runs_3', 'rolling_strike_rate_3'])

# Features and target
features = [col for col in df.columns if col not in ['runs_scored', 'batsman', 'match_id']]
X = df[features]
y = df['runs_scored']

In [38]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [39]:
model = XGBRegressor(n_estimators=200, learning_rate=0.1, max_depth=5, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print(f"✅ RMSE: {rmse:.2f}")
print(f"✅ R² Score: {r2:.4f}")

✅ RMSE: 8.78
✅ R² Score: 0.8403


In [40]:
import joblib
joblib.dump(model, 'ipl_predictor.pkl')

['ipl_predictor.pkl']

In [41]:
joblib.dump(X.columns.tolist(), "feature_columns.pkl")

['feature_columns.pkl']

In [42]:
joblib.dump(X, "X_data.pkl") 

['X_data.pkl']