In [7]:
import pandas as pd
import numpy as np
matches = pd.read_csv('/content/matches.csv')

cols_to_drop = ['match_type', 'player_of_match', 'target_runs', 'target_overs',
                'super_over', 'umpire1', 'umpire2', 'season', 'city', 'date',
                'toss_winner', 'toss_decision', 'result_margin', 'result', 'method',
                'team1', 'team2']
matches = matches.drop(columns=cols_to_drop)
print("Before handling the missing value")
print(matches.isnull().sum())
# Standardize team names '
team_mapping = {
    "Royal Challengers Bengaluru": "Royal Challengers Bangalore",
    "Rising Pune Supergiant": "Rising Pune Supergiants",
    "Delhi Daredevils": "Delhi Capitals",
    "Kings XI Punjab": "Punjab Kings"
}
matches['winner'] = matches['winner'].replace(team_mapping)
# Replace missing winners with 'No Result'
matches['winner'].fillna('No Result', inplace=True)
print("After handling the missing value")
# Verify that there are no more null values
print(matches.isnull().sum())


Before handling the missing value
id        0
venue     0
winner    5
dtype: int64
After handling the missing value
id        0
venue     0
winner    0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  matches['winner'].fillna('No Result', inplace=True)


In [9]:
# Map venues to canonical names (customize mapping as needed)
venue_mapping = {
    "M Chinnaswamy Stadium": "M Chinnaswamy Stadium",
    "M.Chinnaswamy Stadium": "M Chinnaswamy Stadium",
    "M Chinnaswamy Stadium, Bengaluru": "M Chinnaswamy Stadium",

    "Punjab Cricket Association Stadium, Mohali": "Punjab Cricket Association Stadium, Mohali",
    "Punjab Cricket Association IS Bindra Stadium, Mohali": "Punjab Cricket Association Stadium, Mohali",
    "Punjab Cricket Association IS Bindra Stadium": "Punjab Cricket Association Stadium, Mohali",
    "Punjab Cricket Association IS Bindra Stadium, Mohali, Chandigarh": "Punjab Cricket Association Stadium, Mohali",

    "Wankhede Stadium": "Wankhede Stadium",
    "Wankhede Stadium, Mumbai": "Wankhede Stadium",

    "Eden Gardens": "Eden Gardens",
    "Eden Gardens, Kolkata": "Eden Gardens",

    "Sawai Mansingh Stadium": "Sawai Mansingh Stadium",
    "Sawai Mansingh Stadium, Jaipur": "Sawai Mansingh Stadium",

    "Rajiv Gandhi International Stadium, Uppal": "Rajiv Gandhi International Stadium, Hyderabad",
    "Rajiv Gandhi International Stadium, Uppal, Hyderabad": "Rajiv Gandhi International Stadium, Hyderabad",
    "Rajiv Gandhi International Stadium": "Rajiv Gandhi International Stadium, Hyderabad",

    "MA Chidambaram Stadium, Chepauk": "MA Chidambaram Stadium, Chepauk",
    "MA Chidambaram Stadium, Chepauk, Chennai": "MA Chidambaram Stadium, Chepauk",
    "MA Chidambaram Stadium": "MA Chidambaram Stadium, Chepauk",

    "Dr DY Patil Sports Academy": "Dr DY Patil Sports Academy",
    "Dr DY Patil Sports Academy, Mumbai": "Dr DY Patil Sports Academy",

    "Brabourne Stadium": "Brabourne Stadium",
    "Brabourne Stadium, Mumbai": "Brabourne Stadium",

    "Himachal Pradesh Cricket Association Stadium": "Himachal Pradesh Cricket Association Stadium",
    "Himachal Pradesh Cricket Association Stadium, Dharamsala": "Himachal Pradesh Cricket Association Stadium",

    "Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium": "Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium",
    "Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium, Visakhapatnam": "Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium",

    "Subrata Roy Sahara Stadium": "Subrata Roy Sahara Stadium",

    "Maharashtra Cricket Association Stadium": "Maharashtra Cricket Association Stadium",
    "Maharashtra Cricket Association Stadium, Pune": "Maharashtra Cricket Association Stadium",


    "Arun Jaitley Stadium": "Arun Jaitley Stadium, Delhi",
    "Arun Jaitley Stadium, Delhi": "Arun Jaitley Stadium, Delhi",
    "Feroz Shah Kotla":"Arun Jaitley Stadium, Delhi",

}
matches['venue_canonical'] = matches['venue'].map(venue_mapping).fillna(matches['venue'])
matches.rename(columns={'id': 'match_id'}, inplace=True)

matches = matches[['match_id', 'winner', 'venue_canonical']]


In [12]:
import pandas as pd

# Load datasets
df = pd.read_csv("/content/deliveries (1).csv")
# Display basic info
print("Dataset Information:\n")
df.info()

Dataset Information:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260920 entries, 0 to 260919
Data columns (total 17 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   match_id          260920 non-null  int64 
 1   inning            260920 non-null  int64 
 2   batting_team      260920 non-null  object
 3   bowling_team      260920 non-null  object
 4   over              260920 non-null  int64 
 5   ball              260920 non-null  int64 
 6   batter            260920 non-null  object
 7   bowler            260920 non-null  object
 8   non_striker       260920 non-null  object
 9   batsman_runs      260920 non-null  int64 
 10  extra_runs        260920 non-null  int64 
 11  total_runs        260920 non-null  int64 
 12  extras_type       14125 non-null   object
 13  is_wicket         260920 non-null  int64 
 14  player_dismissed  12950 non-null   object
 15  dismissal_kind    12950 non-null   object
 16  fielder         

In [13]:
# Define replacements for missing values
replacements = {
    "extras_type": "No Extra",
    "player_dismissed": "Not Out",
    "dismissal_kind": "Not Applicable",
    "fielder": "No Fielder Involved"
}
df.fillna(replacements, inplace=True)

In [15]:
cols_deliveries = ['match_id', 'inning', 'batting_team', 'bowling_team',
                   'over', 'ball', 'total_runs', 'is_wicket']
deliveries_subset = df[cols_deliveries].copy()

# Standardize team names in deliveries data
for col in ['batting_team', 'bowling_team']:
    deliveries_subset[col] = deliveries_subset[col].replace(team_mapping)

# Compute aggregated features from ball-by-ball data
deliveries_subset['cum_runs'] = deliveries_subset.groupby(['match_id', 'inning'])['total_runs'].cumsum()
deliveries_subset['cum_wickets'] = deliveries_subset.groupby(['match_id', 'inning'])['is_wicket'].cumsum()
deliveries_subset['overs_completed'] = deliveries_subset['over'] + (deliveries_subset['ball'] - 1) / 6
deliveries_subset['current_run_rate'] = np.where(
    deliveries_subset['overs_completed'] == 0,
    0,
    deliveries_subset['cum_runs'] / deliveries_subset['overs_completed']
)

In [16]:
# ==============================================
# 3. Compute Target & Required Run Rate (2nd Innings)
# ==============================================
# Compute first innings final score and target = score + 1
first_innings = deliveries_subset[deliveries_subset['inning'] == 1]
first_innings_final = first_innings.groupby('match_id')['cum_runs'].max().reset_index()
first_innings_final = first_innings_final.rename(columns={'cum_runs': 'first_innings_score'})
first_innings_final['target'] = first_innings_final['first_innings_score'] + 1

In [17]:
# ================================
# 4. Merge Data & Compute RRR
# ================================
# Merge deliveries with matches and target info
final_data = pd.merge(deliveries_subset, matches, on='match_id', how='left')
final_data = pd.merge(final_data, first_innings_final[['match_id', 'target']], on='match_id', how='left')

# Compute remaining overs (T20 match: 20 overs total)
remaining_overs = 20 - final_data['overs_completed']
final_data['required_run_rate'] = np.where(
    (final_data['inning'] == 2) & (remaining_overs > 0),
    (final_data['target'] - final_data['cum_runs']) / remaining_overs,
    0
)
final_data['required_run_rate'] = final_data['required_run_rate'].replace([np.inf, -np.inf], 0)

# Create target variable: win = 1 if batting_team equals winner, else 0.
final_data['win'] = (final_data['batting_team'] == final_data['winner']).astype(int)

# Filter to second innings only (since required run rate applies to 2nd innings)
final_data = final_data[final_data['inning'] == 2].copy()

In [18]:
# ====================================
# 5. Select Features for Live Prediction
# ====================================
# We need:
#   - Numeric: inning, cum_runs, cum_wickets, current_run_rate, required_run_rate, target
#   - Categorical (encoded): batting_team, bowling_team, venue_canonical
keep_cols = ['match_id', 'inning', 'cum_runs', 'cum_wickets', 'current_run_rate',
             'required_run_rate', 'target', 'batting_team', 'bowling_team', 'venue_canonical', 'win']
final_data = final_data[keep_cols]

In [19]:
final_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 125741 entries, 124 to 260919
Data columns (total 11 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   match_id           125741 non-null  int64  
 1   inning             125741 non-null  int64  
 2   cum_runs           125741 non-null  int64  
 3   cum_wickets        125741 non-null  int64  
 4   current_run_rate   125741 non-null  float64
 5   required_run_rate  125741 non-null  float64
 6   target             125741 non-null  int64  
 7   batting_team       125741 non-null  object 
 8   bowling_team       125741 non-null  object 
 9   venue_canonical    125741 non-null  object 
 10  win                125741 non-null  int64  
dtypes: float64(2), int64(6), object(3)
memory usage: 11.5+ MB


In [23]:
# ====================================
# 6. Encode Categorical Variables
# ====================================
# For team names
from sklearn.preprocessing import LabelEncoder
import joblib
le_team = LabelEncoder()
final_data['batting_team_encoded'] = le_team.fit_transform(final_data['batting_team'])
final_data['bowling_team_encoded'] = le_team.transform(final_data['bowling_team'])

# Save the team encoder
joblib.dump(le_team, 'le_team.pkl')

# For venue
le_venue = LabelEncoder()
final_data['venue_canonical_encoded'] = le_venue.fit_transform(final_data['venue_canonical'])

# Save the venue encoder
joblib.dump(le_venue, 'le_venue.pkl')

# Drop original categorical columns
final_data = final_data.drop(columns=['batting_team', 'bowling_team', 'venue_canonical'])

In [25]:
# ====================================
# 7. Final Feature Set and Train/Test Split
# ====================================

# Import necessary libraries
import numpy as np
from sklearn.model_selection import train_test_split

# Final features for training
final_features = ['inning', 'cum_runs', 'cum_wickets', 'current_run_rate',
                  'required_run_rate', 'target',
                  'batting_team_encoded', 'bowling_team_encoded', 'venue_canonical_encoded', 'win']

final_data = final_data[final_features]

# Perform a match-level split to avoid leakage
unique_matches = final_data['match_id'].unique() if 'match_id' in final_data.columns else np.unique(final_data.index)

# NOTE: Since we dropped match_id, we assume index represents different match snapshots; ideally, you'd preserve match_id
if 'match_id' in final_data.columns:
    split_ids = final_data['match_id'].unique()
    train_ids, test_ids = train_test_split(split_ids, test_size=0.2, random_state=42)
    train_data = final_data[final_data['match_id'].isin(train_ids)].copy()
    test_data = final_data[final_data['match_id'].isin(test_ids)].copy()

    # Drop match_id after splitting
    train_data = train_data.drop(columns=['match_id'])
    test_data = test_data.drop(columns=['match_id'])
else:
    # If match_id was dropped, use a random split (less ideal)
    train_data, test_data = train_test_split(final_data, test_size=0.2, random_state=42)

# Separate features and target
target_col = 'win'
X_train = train_data.drop(columns=[target_col])
y_train = train_data[target_col]
X_test = test_data.drop(columns=[target_col])
y_test = test_data[target_col]


In [29]:
# ====================================
# 8. Train Multiple Models and Evaluate
# ====================================
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier  # Corrected Import
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(objective='binary:logistic', eval_metric='logloss',
                             use_label_encoder=False, random_state=42)
}

for name, model in models.items():
    print(f"\nModel: {name}")
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    train_acc = accuracy_score(y_train, y_train_pred)
    test_acc = accuracy_score(y_test, y_test_pred)

    print("Training Accuracy: {:.2f}%".format(train_acc * 100))
    print("Testing Accuracy: {:.2f}%".format(test_acc * 100))
    print("Classification Report (Test):")
    print(classification_report(y_test, y_test_pred))
    print("Confusion Matrix (Test):")
    print(confusion_matrix(y_test, y_test_pred))
    print("-" * 60)


Model: Logistic Regression
Training Accuracy: 78.13%
Testing Accuracy: 77.78%
Classification Report (Test):
              precision    recall  f1-score   support

           0       0.78      0.75      0.76     12033
           1       0.78      0.81      0.79     13116

    accuracy                           0.78     25149
   macro avg       0.78      0.78      0.78     25149
weighted avg       0.78      0.78      0.78     25149

Confusion Matrix (Test):
[[ 8985  3048]
 [ 2539 10577]]
------------------------------------------------------------

Model: Random Forest
Training Accuracy: 99.99%
Testing Accuracy: 99.66%
Classification Report (Test):
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     12033
           1       1.00      1.00      1.00     13116

    accuracy                           1.00     25149
   macro avg       1.00      1.00      1.00     25149
weighted avg       1.00      1.00      1.00     25149

Confusion Matrix 

Parameters: { "use_label_encoder" } are not used.



Training Accuracy: 99.73%
Testing Accuracy: 99.53%
Classification Report (Test):
              precision    recall  f1-score   support

           0       1.00      0.99      1.00     12033
           1       0.99      1.00      1.00     13116

    accuracy                           1.00     25149
   macro avg       1.00      1.00      1.00     25149
weighted avg       1.00      1.00      1.00     25149

Confusion Matrix (Test):
[[11952    81]
 [   36 13080]]
------------------------------------------------------------


In [31]:
from sklearn.model_selection import RandomizedSearchCV

# ---------------------------
# Hyperparameter Tuning for Random Forest
# ---------------------------
rf = RandomForestClassifier(random_state=42)
param_dist_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rand_search_rf = RandomizedSearchCV(
    rf,
    param_distributions=param_dist_rf,
    n_iter=10,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42
)
rand_search_rf.fit(X_train, y_train)
print("\nBest parameters for Random Forest:", rand_search_rf.best_params_)
print("Best CV score for Random Forest: {:.4f}".format(rand_search_rf.best_score_))


Best parameters for Random Forest: {'n_estimators': 50, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': None}
Best CV score for Random Forest: 0.9957


In [32]:
# Assume rand_search_rf.best_estimator_ is our final tuned Random Forest model
final_rf_model = rand_search_rf.best_estimator_
final_rf_model.fit(X_train, y_train)

# Evaluate on test data
y_test_pred = final_rf_model.predict(X_test)
print("Final Model Testing Accuracy: {:.2f}%".format(accuracy_score(y_test, y_test_pred)*100))
print("Final Model Classification Report (Test):")
print(classification_report(y_test, y_test_pred))
print("Final Model Confusion Matrix (Test):")
print(confusion_matrix(y_test, y_test_pred))

# Save the final model
joblib.dump(final_rf_model, 'final_rf_model.pkl')


Final Model Testing Accuracy: 99.66%
Final Model Classification Report (Test):
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     12033
           1       1.00      1.00      1.00     13116

    accuracy                           1.00     25149
   macro avg       1.00      1.00      1.00     25149
weighted avg       1.00      1.00      1.00     25149

Final Model Confusion Matrix (Test):
[[11986    47]
 [   38 13078]]


['final_rf_model.pkl']