In [1]:
import warnings
import numpy as np
import pandas as pd
warnings.filterwarnings("ignore")

In [3]:

# Lift pandas' display limits so wide frames and long cell values are shown
# in full when rendered in this notebook.
for _display_option in ('display.max_columns', 'display.width', 'display.max_colwidth'):
    pd.set_option(_display_option, None)

In [None]:
# Load the match-level dataset from the working directory.
# low_memory=False has pandas parse the file in one pass rather than in
# chunks, which avoids mixed-dtype columns.
df_matches = pd.read_csv(
    "test_Matches_Data.csv",
    low_memory=False,
)

In [None]:
# Display basic info: column names, non-null counts and dtypes, as a first
# sanity check of the loaded frame.

df_matches.info()

In [None]:
# Display 10 random rows. No random_state is passed, so the rows shown change
# on every run.

df_matches.sample(n=10)

In [None]:
# Count the missing entries in every column of the raw matches frame.

print()
print("Missing Values in Matches Data:")
print(df_matches.isna().sum())

In [None]:
# Summary statistics. With default arguments describe() summarises only the
# numeric columns of a mixed-dtype frame.

print("\nMatches Data Summary:")
print(df_matches.describe())

In [None]:
# Count rows that are exact copies of an earlier row (all columns equal).

print(f"Duplicate Rows in Matches Data: {df_matches.duplicated().sum()}")

In [5]:
# Data cleaning
#
# Every step rebinds df_matches instead of mutating it in place, and the
# referee drop is guarded, so the cell can be re-run without raising.

# Fill missing innings data with "Not Played".
# NOTE(review): these look like numeric score columns; filling them with a
# string turns them into object dtype. Nothing later in this notebook reads
# them, but confirm before using them numerically.
innings_columns = [
    "Innings1 Team2 Runs Scored", "Innings1 Team2 Wickets Fell", "Innings1 Team2 Extras Rec",
    "Innings2 Team1 Runs Scored", "Innings2 Team1 Wickets Fell", "Innings2 Team1 Extras Rec",
    "Innings2 Team2 Runs Scored", "Innings2 Team2 Wickets Fell", "Innings2 Team2 Extras Rec",
]
df_matches[innings_columns] = df_matches[innings_columns].fillna("Not Played")

# Explicit placeholders for result and official columns.
df_matches = df_matches.fillna(
    {"Match Winner": "No Result", "MOM Player": "None", "Umpire 1": "Unknown", "Umpire 2": "Unknown"}
)

# Drop referee if too many missing values (more than 40%). The membership
# test keeps a second run from raising KeyError once the column is gone.
if "Match Referee" in df_matches.columns and df_matches["Match Referee"].isna().mean() > 0.4:
    df_matches = df_matches.drop(columns=["Match Referee"])

In [6]:
# Matches DataFrame
#
# Keep only the columns the forecasting section needs. The explicit .copy()
# makes this an independent frame, so the assignment below is not a chained
# write onto a slice of df_matches.
filtered_df_matches = df_matches[
    ['Match ID', 'Match Start Date', 'Team1 Name', 'Team2 Name', 'Match Venue (Stadium)', 'Match Winner']
].copy()

# Convert 'Match Start Date' to datetime for sorting. Plain column assignment
# replaces the column together with its dtype; writing through
# `.loc[:, col] = ...` sets values into the existing column and can leave it
# as object dtype.
filtered_df_matches["Match Start Date"] = pd.to_datetime(filtered_df_matches["Match Start Date"])

# Sort matches by date
filtered_df_matches = filtered_df_matches.sort_values(by="Match Start Date").reset_index(drop=True)

In [None]:
# Match Result Forecasting

In [None]:
# Normalize team order so "X vs Y" and "Y vs X" land in the same pairing:
# Team_A is the alphabetically smaller name, Team_B the larger one.
_first_listed = filtered_df_matches["Team1 Name"]
_second_listed = filtered_df_matches["Team2 Name"]
_needs_swap = _first_listed > _second_listed
filtered_df_matches["Team_A"] = _first_listed.where(~_needs_swap, _second_listed)
filtered_df_matches["Team_B"] = _second_listed.where(~_needs_swap, _first_listed)

# One boolean flag per outcome; summing the flags per pairing gives the counts
# without row-wise apply or lambdas that index back into the outer frame.
_outcome_flags = filtered_df_matches.assign(
    Team_A_Wins=filtered_df_matches["Match Winner"] == filtered_df_matches["Team_A"],
    Team_B_Wins=filtered_df_matches["Match Winner"] == filtered_df_matches["Team_B"],
    No_Result=filtered_df_matches["Match Winner"] == "No Result",
)

# Aggregate match stats
matchup_stats = (
    _outcome_flags.groupby(["Team_A", "Team_B"])
    .agg(
        Total_Matches=("Match ID", "count"),
        Team_A_Wins=("Team_A_Wins", "sum"),
        Team_B_Wins=("Team_B_Wins", "sum"),
        No_Result=("No_Result", "sum"),
    )
    .reset_index()
)

# Compute Losses: within a pairing, one side's losses are the other's wins.
matchup_stats["Team_A_Losses"] = matchup_stats["Team_B_Wins"]
matchup_stats["Team_B_Losses"] = matchup_stats["Team_A_Wins"]

# Compute Percentages (share of all matches in the pairing)
matchup_stats["Team_A Win %"] = (matchup_stats["Team_A_Wins"] / matchup_stats["Total_Matches"]) * 100
matchup_stats["Team_B Win %"] = (matchup_stats["Team_B_Wins"] / matchup_stats["Total_Matches"]) * 100
matchup_stats["No Result %"] = (matchup_stats["No_Result"] / matchup_stats["Total_Matches"]) * 100
matchup_stats["Team_A Loss %"] = (matchup_stats["Team_A_Losses"] / matchup_stats["Total_Matches"]) * 100
matchup_stats["Team_B Loss %"] = (matchup_stats["Team_B_Losses"] / matchup_stats["Total_Matches"]) * 100

# Rename columns for clarity. Team1/Team2 here mean the alphabetically
# ordered pair, not the Team1/Team2 listing of an individual match.
matchup_stats = matchup_stats.rename(columns={
    "Team_A": "Team1", "Team_B": "Team2",
    "Team_A_Wins": "Team1 Wins", "Team_B_Wins": "Team2 Wins",
    "Team_A_Losses": "Team1 Losses", "Team_B_Losses": "Team2 Losses",
    "Team_A Win %": "Team1 Win %", "Team_B Win %": "Team2 Win %",
    "Team_A Loss %": "Team1 Loss %", "Team_B Loss %": "Team2 Loss %",
    "No_Result": "No Result Matches",
})

# Matchup DataFrame
matchup_stats.head()

In [None]:
# Per-(team, venue) record. A team can be listed as either side of a match,
# so the tallies are computed once per side and then added together.

# Matches where the team is listed first
venue_stats = (
    filtered_df_matches.groupby(["Team1 Name", "Match Venue (Stadium)"])
    .agg(
        Total_Matches=("Match ID", "count"),
        Wins=("Match Winner", lambda won: (won == filtered_df_matches.loc[won.index, "Team1 Name"]).sum()),
        No_Result=("Match Winner", lambda won: (won == "No Result").sum()),
    )
    .reset_index()
    .rename(columns={"Team1 Name": "Team"})
)

# Matches where the team is listed second
venue_stats_team2 = (
    filtered_df_matches.groupby(["Team2 Name", "Match Venue (Stadium)"])
    .agg(
        Total_Matches=("Match ID", "count"),
        Wins=("Match Winner", lambda won: (won == filtered_df_matches.loc[won.index, "Team2 Name"]).sum()),
        No_Result=("Match Winner", lambda won: (won == "No Result").sum()),
    )
    .reset_index()
    .rename(columns={"Team2 Name": "Team"})
)

# Stack both sides and add up the counts for each team at each venue
final_venue_stats = (
    pd.concat([venue_stats, venue_stats_team2])
    .groupby(["Team", "Match Venue (Stadium)"], as_index=False)[["Total_Matches", "Wins", "No_Result"]]
    .sum()
)

# Whatever is neither a win nor a no-result counts as a loss
final_venue_stats["Losses"] = (
    final_venue_stats["Total_Matches"]
    .sub(final_venue_stats["Wins"])
    .sub(final_venue_stats["No_Result"])
)

# Shares of all matches the team played at the venue, in percent
_played = final_venue_stats["Total_Matches"]
final_venue_stats["Win %"] = final_venue_stats["Wins"].div(_played).mul(100)
final_venue_stats["No Result %"] = final_venue_stats["No_Result"].div(_played).mul(100)
final_venue_stats["Loss %"] = final_venue_stats["Losses"].div(_played).mul(100)

# Venue DataFrame
final_venue_stats.head()

In [None]:
# Visualization

import plotly.graph_objects as go

def plot_team_performance(team_name, matchup_stats):
    """Show a grouped bar chart of one team's head-to-head record per opponent.

    Parameters
    ----------
    team_name : str
        Team to report on; matched against the "Team1" and "Team2" columns.
    matchup_stats : pandas.DataFrame
        Pairing table with the columns "Team1", "Team2", "Total_Matches",
        "Team1 Wins", "Team2 Wins", "Team1 Win %", "Team2 Win %",
        "No Result Matches" and "No Result %".

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    involves_team = (matchup_stats["Team1"] == team_name) | (matchup_stats["Team2"] == team_name)
    team_data = matchup_stats[involves_team].copy()

    # True where the selected team occupies the "Team1" slot of the pairing;
    # every perspective-dependent column is chosen with this mask.
    in_first_slot = team_data["Team1"] == team_name

    team_data["Opponent"] = team_data["Team2"].where(in_first_slot, team_data["Team1"])
    team_data["Team Wins"] = team_data["Team1 Wins"].where(in_first_slot, team_data["Team2 Wins"])
    team_data["Opponent Wins"] = team_data["Team2 Wins"].where(in_first_slot, team_data["Team1 Wins"])
    team_data["Team Win %"] = team_data["Team1 Win %"].where(in_first_slot, team_data["Team2 Win %"])
    team_data["Opponent Win %"] = team_data["Team2 Win %"].where(in_first_slot, team_data["Team1 Win %"])

    # Best opponents first: highest win share, then no-result share, then the
    # lowest opponent win share, then the most wins.
    team_data = team_data.sort_values(
        by=['Team Win %', 'No Result %', 'Opponent Win %', 'Team Wins'],
        ascending=[False, False, True, False],
    )

    # (source column, legend label, bar colour) for each bar series, in order
    bar_series = (
        ("Total_Matches", "Total Matches", "DodgerBlue"),
        ("Team Wins", f"{team_name} Wins", "green"),
        ("Opponent Wins", "Opponent Wins", "red"),
        ("No Result Matches", "No Result Matches", "orange"),
    )

    fig = go.Figure()
    for source_column, legend_label, colour in bar_series:
        fig.add_trace(go.Bar(
            x=team_data["Opponent"],
            y=team_data[source_column],
            name=legend_label,
            marker_color=colour
        ))

    fig.update_layout(
        title=f"{team_name} Performance Against Opponents",
        xaxis_title="Opponent",
        yaxis_title="Matches",
        barmode="group",
        legend_title="Match Outcome"
    )

    fig.show()

# Plot Matchup Stats: head-to-head record of South Africa against each opponent
plot_team_performance("South Africa", matchup_stats)

In [None]:

def plot_top_venues_by_win_percentage(team_name, final_venue_stats, top_n=20, min_matches=5):
    """Show a horizontal grouped bar chart of a team's best venues by win share.

    Parameters
    ----------
    team_name : str
        Team to report on; matched against the "Team" column.
    final_venue_stats : pandas.DataFrame
        Per-(team, venue) table with the columns "Team",
        "Match Venue (Stadium)", "Total_Matches", "Wins", "Losses",
        "No_Result", "Win %", "No Result %" and "Loss %".
    top_n : int, default 20
        Maximum number of venues to plot. The default keeps the number of
        venues that was previously hard-coded.
    min_matches : int, default 5
        Venues where the team played fewer matches than this are excluded.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    eligible_venues = final_venue_stats[
        (final_venue_stats["Team"] == team_name)
        & (final_venue_stats["Total_Matches"] >= min_matches)
    ].copy()

    # Rank by win share; ties fall to no-result share, then the lowest loss
    # share, then the most wins. Keep the first top_n rows.
    top_venues = eligible_venues.sort_values(
        by=["Win %", "No Result %", "Loss %", "Wins"],
        ascending=[False, False, True, False],
    ).head(top_n)

    # (source column, legend label, bar colour) for each bar series, in order
    bar_series = (
        ("Total_Matches", "Total Matches", "DodgerBlue"),
        ("Wins", "Wins", "green"),
        ("Losses", "Losses", "red"),
        ("No_Result", "No Result Matches", "orange"),
    )

    fig = go.Figure()
    for source_column, legend_label, colour in bar_series:
        fig.add_trace(go.Bar(
            y=top_venues["Match Venue (Stadium)"],
            x=top_venues[source_column],
            name=legend_label,
            marker_color=colour,
            orientation="h"
        ))

    fig.update_layout(
        # Built from the actual limits so the heading cannot drift from the data
        title=(
            f"Top {top_n} Venues with Highest Win % for {team_name} "
            f"(Minimum {min_matches} matches on that venue)"
        ),
        xaxis_title="Count",
        yaxis_title="Venue",
        yaxis=dict(autorange="reversed"),  # best venue at the top
        barmode="group",  # Grouped bars
        legend_title="Match Stats",
        height=1200
    )

    fig.show()

# Plot Venue Stats: South Africa's venues ranked by win percentage
plot_top_venues_by_win_percentage("South Africa", final_venue_stats)

In [13]:
# Build the feature matrix and labels: one row per historical match.
#
# NOTE(review): matchup_stats and final_venue_stats are aggregated over every
# row of filtered_df_matches, including the match being encoded here, so each
# feature row already contains information about its own result. Accuracy
# measured on these features is therefore optimistic (target leakage).

DEFAULT_PCT = 33.33  # fallback share when a pairing or team/venue has no history
NO_HISTORY = (DEFAULT_PCT, DEFAULT_PCT, DEFAULT_PCT)

# Index both aggregate tables once, so each match costs a dictionary lookup
# instead of a boolean filter over a whole DataFrame.
matchup_lookup = {
    (pair_team1, pair_team2): (team1_pct, team2_pct, no_result_share)
    for pair_team1, pair_team2, team1_pct, team2_pct, no_result_share in zip(
        matchup_stats['Team1'], matchup_stats['Team2'],
        matchup_stats['Team1 Win %'], matchup_stats['Team2 Win %'], matchup_stats['No Result %'],
    )
}
venue_lookup = {
    (venue_team, ground): (win_share, no_result_share, loss_share)
    for venue_team, ground, win_share, no_result_share, loss_share in zip(
        final_venue_stats['Team'], final_venue_stats['Match Venue (Stadium)'],
        final_venue_stats['Win %'], final_venue_stats['No Result %'], final_venue_stats['Loss %'],
    )
}

# Initialize training data
training_data = []
target = []

for team1, team2, venue, winner in zip(
    filtered_df_matches['Team1 Name'], filtered_df_matches['Team2 Name'],
    filtered_df_matches['Match Venue (Stadium)'], filtered_df_matches['Match Winner'],
):
    # Target encoding
    if winner == team1:
        target.append(0)  # Team1 wins
    elif winner == team2:
        target.append(1)  # Team2 wins
    else:
        target.append(2)  # No Result

    # Lookup matchup stats. Each pairing is stored once, so it may be keyed
    # the other way round from how this match lists the teams; in that case
    # the two win shares are swapped to keep them in this match's perspective.
    if (team1, team2) in matchup_lookup:
        t1_win_pct, t2_win_pct, no_result_pct = matchup_lookup[(team1, team2)]
    elif (team2, team1) in matchup_lookup:
        t2_win_pct, t1_win_pct, no_result_pct = matchup_lookup[(team2, team1)]
    else:
        t1_win_pct, t2_win_pct, no_result_pct = NO_HISTORY

    # Lookup venue stats for each side (defaults when the team has no record there)
    t1_venue_win, t1_venue_no_result, t1_venue_loss = venue_lookup.get((team1, venue), NO_HISTORY)
    t2_venue_win, t2_venue_no_result, t2_venue_loss = venue_lookup.get((team2, venue), NO_HISTORY)

    # Combine features
    training_data.append([
        t1_win_pct, t2_win_pct, no_result_pct,
        t1_venue_win, t1_venue_no_result, t1_venue_loss,
        t2_venue_win, t2_venue_no_result, t2_venue_loss,
    ])

# Convert to DataFrame
training_df = pd.DataFrame(training_data, columns=[
    'T1_Win_Pct', 'T2_Win_Pct', 'No_Result_Pct',
    'T1_Venue_Win', 'T1_Venue_No_Result', 'T1_Venue_Loss',
    'T2_Venue_Win', 'T2_Venue_No_Result', 'T2_Venue_Loss'
])
target = pd.Series(target)

In [None]:

# Preview the first rows of the engineered feature matrix.
print("Training Data:")
training_df.head()

In [None]:

# Preview the encoded labels: 0 = Team1 wins, 1 = Team2 wins, 2 = No Result.
print("\nTarget:")
target.head()

In [None]:

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(training_df, target, test_size=0.2, random_state=42)

# Train Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict and evaluate Random Forest
rf_predictions = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_predictions)
print("Random Forest Accuracy:", rf_accuracy)

# Train XGBoost. The deprecated `use_label_encoder` argument is left out so
# the estimator is built the same way as in the cross-validation and
# grid-search cells.
xgb_model = XGBClassifier(eval_metric='mlogloss', random_state=42)
xgb_model.fit(X_train, y_train)

# Predict and evaluate XGBoost
xgb_predictions = xgb_model.predict(X_test)
xgb_accuracy = accuracy_score(y_test, xgb_predictions)
print("XGBoost Accuracy:", xgb_accuracy)

# Select the best model. The comparison is evaluated once; on a tie the
# Random Forest is kept.
xgb_is_better = xgb_accuracy > rf_accuracy
best_model = xgb_model if xgb_is_better else rf_model
best_accuracy = max(xgb_accuracy, rf_accuracy)
print("Best Model:", "XGBoost" if xgb_is_better else "Random Forest")
print("Best Accuracy:", best_accuracy)

In [None]:
# Cross-Validation

from sklearn.model_selection import cross_val_score, GridSearchCV

# Random Forest with default parameters
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_cv_scores = cross_val_score(rf_model, training_df, target, cv=5, scoring='accuracy')
print("Random Forest Cross-Validation Accuracy: ", rf_cv_scores.mean(), "±", rf_cv_scores.std())

# XGBoost with default parameters
try:
    xgb_model = XGBClassifier(eval_metric='mlogloss', random_state=42)
    xgb_cv_scores = cross_val_score(xgb_model, training_df, target, cv=5, scoring='accuracy')
    print("XGBoost Cross-Validation Accuracy: ", xgb_cv_scores.mean(), "±", xgb_cv_scores.std())
except AttributeError as e:
    # Only AttributeError is handled; any other failure still propagates.
    print("Error with XGBoost and cross_val_score:", e)
    print("Falling back to manual cross-validation for XGBoost...")
    # Manual cross-validation as a fallback
    from sklearn.model_selection import KFold
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    xgb_scores = []
    for train_idx, test_idx in kf.split(training_df):
        # Fold-local names: reusing X_train/X_test/y_train/y_test here would
        # overwrite the hold-out split that other cells read.
        X_fold_train, X_fold_test = training_df.iloc[train_idx], training_df.iloc[test_idx]
        y_fold_train, y_fold_test = target.iloc[train_idx], target.iloc[test_idx]
        xgb_model = XGBClassifier(eval_metric='mlogloss', random_state=42)
        xgb_model.fit(X_fold_train, y_fold_train)
        fold_preds = xgb_model.predict(X_fold_test)
        xgb_scores.append(accuracy_score(y_fold_test, fold_preds))
    xgb_cv_scores = np.array(xgb_scores)
    print("XGBoost Cross-Validation Accuracy (Manual): ", xgb_cv_scores.mean(), "±", xgb_cv_scores.std())

In [None]:
# Hyperparameter Tuning
# Each search tries every combination in its grid with 5-fold
# cross-validation, scores on accuracy and uses all available cores.

# Random Forest Grid Search
rf_param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
rf_grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=rf_param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
rf_grid_search.fit(training_df, target)
print("Best Random Forest Parameters:", rf_grid_search.best_params_)
print("Best Random Forest CV Accuracy:", rf_grid_search.best_score_)

# XGBoost Grid Search
xgb_param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.3],
    subsample=[0.6, 0.8, 1.0],
)
xgb_grid_search = GridSearchCV(
    estimator=XGBClassifier(eval_metric='mlogloss', random_state=42),
    param_grid=xgb_param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
xgb_grid_search.fit(training_df, target)
print("Best XGBoost Parameters:", xgb_grid_search.best_params_)
print("Best XGBoost CV Accuracy:", xgb_grid_search.best_score_)

In [None]:
# Investigate Class Imbalance and Feature Quality

# Check class distribution in target: share of each label
# (0 = Team1 wins, 1 = Team2 wins, 2 = No Result).
print("\nClass Distribution in Target:")
print(target.value_counts(normalize=True))

# Check feature quality: count, mean, spread and range of every feature column.
print("\nFeature Statistics:")
print(training_df.describe())

In [None]:
# Keep whichever tuned estimator achieved the higher cross-validated accuracy.
# XGBoost must be strictly better to be chosen; a tie goes to Random Forest.
xgb_beats_rf = xgb_grid_search.best_score_ > rf_grid_search.best_score_
if xgb_beats_rf:
    best_model = xgb_grid_search.best_estimator_
else:
    best_model = rf_grid_search.best_estimator_
best_accuracy = max(xgb_grid_search.best_score_, rf_grid_search.best_score_)
print("\nBest Model After Tuning:", "XGBoost" if xgb_beats_rf else "Random Forest")
print("Best Accuracy After Tuning:", best_accuracy)

# The winner is the model the prediction cells use
final_model = best_model

In [21]:

def predict_match_outcome(team1, team2, venue, model, matchup_stats, final_venue_stats):
    """Predict the result of a single fixture.

    Parameters
    ----------
    team1, team2 : str
        The two sides, in the order the fixture lists them.
    venue : str
        Stadium name, matched against "Match Venue (Stadium)".
    model : object
        Fitted classifier whose ``predict`` returns 0 (team1 wins),
        1 (team2 wins) or any other value (no result).
    matchup_stats : pandas.DataFrame
        Pairing table with "Team1", "Team2", "Team1 Win %", "Team2 Win %"
        and "No Result %". A pairing is found whichever way round it is
        stored.
    final_venue_stats : pandas.DataFrame
        Per-(team, venue) table with "Team", "Match Venue (Stadium)",
        "Win %", "No Result %" and "Loss %".

    Returns
    -------
    str
        ``team1``, ``team2`` or "No Result".
    """
    default_pct = 33.33  # used when there is no history to draw on
    no_history = (default_pct, default_pct, default_pct)

    def pairing_rows(first, second):
        """Rows of matchup_stats stored as (first, second)."""
        return matchup_stats[(matchup_stats['Team1'] == first) & (matchup_stats['Team2'] == second)]

    def venue_record(team):
        """(win %, no-result %, loss %) for team at venue, or the defaults."""
        rows = final_venue_stats[
            (final_venue_stats['Team'] == team)
            & (final_venue_stats['Match Venue (Stadium)'] == venue)
        ]
        if rows.empty:
            return no_history
        return (
            rows['Win %'].iloc[0],
            rows['No Result %'].iloc[0],
            rows['Loss %'].iloc[0],
        )

    # Head-to-head shares from this fixture's point of view. If the pairing
    # is stored with the teams the other way round, swap the two win shares.
    as_listed = pairing_rows(team1, team2)
    reversed_listing = pairing_rows(team2, team1)
    if not as_listed.empty:
        t1_win_pct = as_listed['Team1 Win %'].iloc[0]
        t2_win_pct = as_listed['Team2 Win %'].iloc[0]
        no_result_pct = as_listed['No Result %'].iloc[0]
    elif not reversed_listing.empty:
        t1_win_pct = reversed_listing['Team2 Win %'].iloc[0]
        t2_win_pct = reversed_listing['Team1 Win %'].iloc[0]
        no_result_pct = reversed_listing['No Result %'].iloc[0]
    else:
        t1_win_pct, t2_win_pct, no_result_pct = no_history

    t1_venue_win, t1_venue_no_result, t1_venue_loss = venue_record(team1)
    t2_venue_win, t2_venue_no_result, t2_venue_loss = venue_record(team2)

    # Single-row frame using the feature names of the training matrix, so a
    # model fitted on a named DataFrame receives matching columns.
    features = pd.DataFrame(
        [[t1_win_pct, t2_win_pct, no_result_pct,
          t1_venue_win, t1_venue_no_result, t1_venue_loss,
          t2_venue_win, t2_venue_no_result, t2_venue_loss]],
        columns=[
            'T1_Win_Pct', 'T2_Win_Pct', 'No_Result_Pct',
            'T1_Venue_Win', 'T1_Venue_No_Result', 'T1_Venue_Loss',
            'T2_Venue_Win', 'T2_Venue_No_Result', 'T2_Venue_Loss',
        ],
    )

    # Predict using the model
    prediction = model.predict(features)[0]

    # Decode the prediction
    if prediction == 0:
        return team1
    elif prediction == 1:
        return team2
    else:
        return "No Result"

In [None]:
# Prediction: example fixture scored with the tuned model. Any side or venue
# without recorded history falls back to the default shares inside
# predict_match_outcome.
team1 = "South Africa"
team2 = "New Zealand"
venue = "Lord's"
predicted_winner = predict_match_outcome(team1, team2, venue, final_model, matchup_stats, final_venue_stats)
print(f"Predicted Winner for {team1} vs {team2} at {venue}: {predicted_winner}")

In [None]:

from sklearn.model_selection import cross_validate
from sklearn.metrics import classification_report, confusion_matrix

# Cross-Validation with Detailed Metrics
print("Cross-Validation with Detailed Metrics for Final Model...")

# Scorer name -> label printed in the report; insertion order is print order
metric_labels = {
    'accuracy': "Accuracy: ",
    'precision_macro': "Precision (Macro): ",
    'recall_macro': "Recall (Macro): ",
    'f1_macro': "F1-Score (Macro): ",
}

# Define scoring metrics
scoring = list(metric_labels)

# Perform cross-validation with multiple metrics (5 folds, test scores only)
cv_results = cross_validate(final_model, training_df, target, cv=5, scoring=scoring, return_train_score=False)

print("\nCross-Validation Results:")
for metric_name, report_label in metric_labels.items():
    fold_scores = cv_results[f'test_{metric_name}']
    print(report_label, fold_scores.mean(), "±", fold_scores.std())

In [None]:
# Compute Accuracy, Confusion Matrix, Precision, Recall, and F1-Score
from sklearn.base import clone

# final_model is the grid search's refitted best estimator, so it has already
# been trained on every row of training_df; scoring it on a slice of that
# same data would only measure training fit. Instead, an unfitted clone with
# the same hyperparameters is trained on the 80% portion and scored on the
# untouched 20%. The split is rebuilt here with the same seed rather than
# read from X_test/y_test, which other cells may have rebound.
X_fit, X_holdout, y_fit, y_holdout = train_test_split(
    training_df, target, test_size=0.2, random_state=42
)
holdout_model = clone(final_model).fit(X_fit, y_fit)

# Make predictions on the test data
y_pred = holdout_model.predict(X_holdout)

# Compute accuracy
accuracy = accuracy_score(y_holdout, y_pred)
print("\nAccuracy on Test Set:", accuracy)

# Pin the label order so the matrix and report always have three rows, even
# when a class (typically "No Result") is absent from the hold-out split.
class_labels = [0, 1, 2]

# Compute the confusion matrix
print("\nConfusion Matrix:")
cm = confusion_matrix(y_holdout, y_pred, labels=class_labels)
print(cm)

# Compute precision, recall, and F1-score
print("\nPrecision, Recall, and F1-Score:")
print(classification_report(
    y_holdout, y_pred,
    labels=class_labels,
    target_names=['Team1', 'Team2', 'No Result'],
    zero_division=0,
))