#####  Step 1: Import Required Libraries

In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import pickle


##### Step 2: Load the Datasets

In [3]:
# Load both CSV inputs; the paths are relative to the notebook's working directory.
deliveries_df, matches_df = (
    pd.read_csv(csv_name) for csv_name in ("deliveries.csv", "matches.csv")
)

##### Step 3: Merge DataFrames

In [4]:
# Inner join (the pandas default): keep delivery rows whose `match_id` has a
# matching `id` in the matches table, and attach that match's columns to them.
merged_df = pd.merge(
    deliveries_df,
    matches_df,
    left_on='match_id',
    right_on='id',
)

#####  Step 4: Filter for 1st Innings Only

In [5]:
# Keep only the rows whose `inning` value is 1.
is_first_innings = merged_df['inning'] == 1
first_innings = merged_df.loc[is_first_innings]

#####  Step 5: Extract Data from First 5 Overs

In [6]:
# NOTE(review): `over < 5` covers exactly five overs only if `over` is 0-based
# in this dataset -- confirm against the CSV before relying on it.
in_opening_overs = first_innings['over'] < 5
# `.copy()` gives an independent frame rather than a slice of `first_innings`.
first_5_overs = first_innings.loc[in_opening_overs].copy()

##### Step 6: Group and Summarize Early Match Stats

In [7]:
# One row per match: runs and wickets summed over the filtered deliveries, plus
# the first-seen value of each descriptive column within that match.
first_5_summary = (
    first_5_overs
    .groupby('match_id')
    .agg(
        total_runs=('total_runs', 'sum'),
        is_wicket=('is_wicket', 'sum'),
        batting_team=('batting_team', 'first'),
        bowling_team=('bowling_team', 'first'),
        venue=('venue', 'first'),
        id=('id', 'first'),
    )
    .reset_index()  # turn `match_id` back into a regular column
)


#####  Step 7: Get Final Score of First Innings

In [8]:
# Regression target: sum of `total_runs` over every first-innings delivery of a match.
final_score = (
    first_innings
    .groupby('match_id')
    .agg(final_score=('total_runs', 'sum'))
    .reset_index()
)


##### Step 8: Merge for Complete Training Data

In [10]:
# Attach the target column to the early-overs summary (inner join on `match_id`).
data = first_5_summary.merge(final_score, on='match_id')

##### Step 9: Encode Categorical Features

In [None]:
le_team = LabelEncoder()
le_venue = LabelEncoder()

# Fit the shared team encoder on the union of both team columns. Fitting on
# `batting_team` alone makes `transform` raise ValueError for any team that only
# ever appears as the bowling side in `data`. LabelEncoder sorts its classes,
# so the codes are unchanged whenever the two columns hold the same teams.
all_teams = pd.concat(
    [data['batting_team'], data['bowling_team']],
    ignore_index=True,
)
le_team.fit(all_teams)

# Both team columns share one encoder, so a team gets the same integer code
# whether it is batting or bowling.
data['batting_team_enc'] = le_team.transform(data['batting_team'])
data['bowling_team_enc'] = le_team.transform(data['bowling_team'])

# Venues get their own encoder, fitted on the single `venue` column.
data['venue_enc'] = le_venue.fit_transform(data['venue'])
 

##### Step 10: Prepare Features and Target

In [12]:
# Model inputs: the two early-overs totals plus the three label-encoded columns.
feature_columns = [
    'total_runs',
    'is_wicket',
    'batting_team_enc',
    'bowling_team_enc',
    'venue_enc',
]
X = data[feature_columns]

# Target: the first-innings total for the match.
y = data['final_score']


##### Step 11: Train-Test Split

In [13]:
# Hold out 20% of the matches for evaluation; the fixed seed makes the split repeatable.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

##### Step 12: Define Ensemble Model

In [14]:
# Base learners whose predictions feed the stacking layer.
lr = LinearRegression()
rf = RandomForestRegressor(n_estimators=100, random_state=42)
base_estimators = [('lr', lr), ('rf', rf)]

# Final estimator: gradient boosting trained on the base learners' predictions.
meta_estimator = GradientBoostingRegressor(n_estimators=100, random_state=42)

stacked_model = StackingRegressor(
    estimators=base_estimators,
    final_estimator=meta_estimator,
)


#####  Step 13: Train the Model

In [15]:
# Fit the stacked ensemble on the training split.
stacked_model.fit(X=X_train, y=y_train)

##### Step 14: Predict and Evaluate

In [16]:
# Score the held-out matches.
y_pred = stacked_model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))  # RMSE = sqrt(MSE)
r2 = r2_score(y_test, y_pred)

# Report every metric rounded to two decimals.
for metric_label, metric_value in (("MAE", mae), ("RMSE", rmse), ("R^2 Score", r2)):
    print(f"{metric_label}: {metric_value:.2f}")


MAE: 21.91
RMSE: 27.63
R^2 Score: 0.19


#####  Step 15: Save the Model and Label Encoders

In [25]:
# Persist the fitted model together with the encoders used to build its features.
import os

# Create the output directory on first run; do nothing if it already exists.
os.makedirs("model", exist_ok=True)

# Target path -> fitted object, written in this order.
artifacts = {
    'model/ipl_score_predictor_model.pkl': stacked_model,
    'model/team_encoder.pkl': le_team,
    'model/venue_encoder.pkl': le_venue,
}
for artifact_path, artifact in artifacts.items():
    joblib.dump(artifact, artifact_path)

print("Model and encoders saved to 'model/' directory.")


Model and encoders saved to 'model/' directory.
