In [51]:
%pip install numpy pandas scikit-learn xgboost

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [52]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
import xgboost as xgb
import pickle
import warnings
from datetime import datetime, timedelta
warnings.filterwarnings('ignore')


In [53]:
df = pd.read_csv('./matches.csv')
df_copy = df.copy()

In [54]:
useful_columns = ['result_margin', 'win_by_runs', 'win_by_wickets', 'target_runs', 'target_overs', 'eliminator', 'dl_applied']

existing_columns = [col for col in useful_columns if col in df_copy.columns]

In [55]:
df_copy['date'] = pd.to_datetime(df_copy['date'], format='%d-%m-%Y', errors='coerce')
df_copy['year'] = df_copy['date'].dt.year
df_copy['month'] = df_copy['date'].dt.month
df_copy['day_of_week'] = df_copy['date'].dt.dayofweek

In [56]:
CURRENT_YEAR = 2025
RECENT_SEASONS = 5
VERY_RECENT_SEASONS = 2
recent_cutoff = CURRENT_YEAR - RECENT_SEASONS
very_recent_cutoff = CURRENT_YEAR - VERY_RECENT_SEASONS

In [57]:
if 'super_over' in df_copy.columns:
    df_copy['super_over'] = df_copy['super_over'].map({'N': 0, 'Y': 1})
else:
    df_copy['super_over'] = 0

In [58]:
if 'method' in df_copy.columns:
    df_copy['method'] = df_copy['method'].map({np.nan: 0, 'D/L': 1})
else:
    df_copy['method'] = 0

In [59]:
df_copy['city'].fillna('Unknown', inplace=True)
df_copy['player_of_match'].fillna('Unknown', inplace=True)

In [60]:
team_names = {
    'Royal Challengers Bangalore': 'Royal Challengers Bengaluru',
    'Kings XI Punjab': 'Punjab Kings',
    'Delhi Daredevils': 'Delhi Capitals',
    'Deccan Chargers': 'Sunrisers Hyderabad',
    'Rising Pune Supergiant': 'Rising Pune Supergiants',
}

In [61]:
for col in ['team1', 'team2', 'winner', 'toss_winner']:
    if col in df_copy.columns:
        df_copy[col].replace(team_names, inplace=True)

In [62]:
venue_mapping = {
    'M.Chinnaswamy Stadium': 'M. Chinnaswamy Stadium, Bengaluru',
    'M Chinnaswamy Stadium': 'M. Chinnaswamy Stadium, Bengaluru',
    'M Chinnaswamy Stadium, Bengaluru': 'M. Chinnaswamy Stadium, Bengaluru',
    'M.Chinnaswamy Stadium, Bengaluru': 'M. Chinnaswamy Stadium, Bengaluru',
    'MA Chidambaram Stadium, Chepauk': 'MA Chidambaram Stadium, Chepauk, Chennai',
    'MA Chidambaram Stadium': 'MA Chidambaram Stadium, Chepauk, Chennai', 
    'Maharashtra Cricket Association Stadium': 'Maharashtra Cricket Association Stadium, Pune',
    'Punjab Cricket Association IS Bindra Stadium, Mohali':'Punjab Cricket Association IS Bindra Stadium, Mohali, Chandigarh',
    'Rajiv Gandhi International Stadium': 'Rajiv Gandhi International Stadium, Uppal, Hyderabad',
    'Brabourne Stadium': 'Brabourne Stadium, Mumbai',
    'Punjab Cricket Association Stadium, Mohali': 'IS Bindra Stadium, Mohali',
    'Punjab Cricket Association IS Bindra Stadium': 'IS Bindra Stadium, Mohali',
    'Feroz Shah Kotla': 'Arun Jaitley Stadium, Delhi',
    'Arun Jaitley Stadium': 'Arun Jaitley Stadium, Delhi',
    'Wankhede Stadium': 'Wankhede Stadium, Mumbai',
    'Eden Gardens': 'Eden Gardens, Kolkata',
    'Dr DY Patil Sports Academy': 'Dr DY Patil Sports Academy, Mumbai',
    'Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium':'Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium, Visakhapatnam',
    'Himachal Pradesh Cricket Association Stadium': 'Himachal Pradesh Cricket Association Stadium, Dharamsala',
    'Rajiv Gandhi International Stadium, Uppal': 'Rajiv Gandhi International Stadium, Uppal, Hyderabad',
    'Sawai Mansingh Stadium':'Sawai Mansingh Stadium, Jaipur'
}

In [63]:
if 'venue' in df_copy.columns:
    df_copy['venue'].replace(venue_mapping, inplace=True)

In [64]:
df_copy.dropna(subset=['team1', 'team2', 'winner', 'toss_winner'], inplace=True)
print("Shape after cleaning:", df_copy.shape)


Shape after cleaning: (1090, 23)


In [65]:
df_copy['season'] = df_copy['year'].astype(str)
print("Available seasons:", df_copy['season'].unique())

Available seasons: ['2008' '2009' '2010' '2011' '2012' '2013' '2014' '2015' '2016' '2017'
 '2018' '2019' '2020' '2021' '2022' '2023' '2024']


In [66]:
recent_data = df_copy[df_copy['year'] >= recent_cutoff] 
team_recent_wins = recent_data['winner'].value_counts().to_dict()

In [67]:
very_recent_data = df_copy[df_copy['year'] >= very_recent_cutoff]
team_form_wins = very_recent_data['winner'].value_counts().to_dict()

In [68]:
h2h_recent_data = df_copy[df_copy['year'] >= recent_cutoff]
h2h_recent_records = {}

In [69]:
for index, match in h2h_recent_data.iterrows():
    team1 = match['team1'] 
    team2 = match['team2']
    winner = match['winner']
    
    # teamA & teamB treat same as teamB & teamA
    teams = tuple(sorted([team1, team2]))
    
    if teams not in h2h_recent_records:
        h2h_recent_records[teams] = {team1: 0, team2: 0}
        
    if winner in h2h_recent_records[teams]:
        h2h_recent_records[teams][winner] += 1

In [70]:
venue_recent_stats = {}

In [71]:
for idx, match in recent_data.iterrows():
    venue = match['venue']
    team1 = match['team1'] 
    team2 = match['team2']
    winner = match['winner']
    
    for team in [team1, team2]:
        key = (venue, team)
        if key not in venue_recent_stats:
            venue_recent_stats[key] = {'wins': 0, 'matches': 0}
            
        venue_recent_stats[key]['matches'] += 1
        
        if winner == team:
            venue_recent_stats[key]['wins'] += 1

In [72]:
venue_recent_avg_score = recent_data.groupby('venue')['target_runs'].mean().astype(int).to_dict()


In [73]:
venue_recent_batting_first = recent_data[recent_data['toss_decision'] == 'bat'].groupby('venue')['target_runs'].mean().astype(int).to_dict()


In [74]:
chasing_matches = recent_data[recent_data['toss_decision'] == 'field']
venue_recent_chase_success = (
    chasing_matches.groupby('venue')
    .apply(lambda x: (x['winner'] == x['team2']).mean() if not x.empty else 0.5)
    .to_dict()
)

In [75]:
venue_recent_bat_preference = recent_data.groupby('venue')['toss_decision'].apply(
    lambda x: (x == 'bat').mean()
).fillna(0.5).to_dict()


In [76]:
def calculate_team_season_performance(data, seasons_back=2):
    team_season_performance = {}
    
    for team in data['winner'].unique():
        team_performance = []
        
        for year in range(CURRENT_YEAR - seasons_back + 1, CURRENT_YEAR + 1):
            season_data = data[data['year'] == year]
            team_matches = season_data[(season_data['team1'] == team) | (season_data['team2'] == team)]
            team_wins = season_data[season_data['winner'] == team]
            
            if len(team_matches) > 0:
                win_rate = len(team_wins) / len(team_matches)
                weight = (year - CURRENT_YEAR + seasons_back) / seasons_back
                team_performance.append(win_rate * weight)
        
        team_season_performance[team] = np.mean(team_performance) if team_performance else 0.5
    
    return team_season_performance



In [77]:
recent_team_performance = calculate_team_season_performance(df_copy, VERY_RECENT_SEASONS)

In [78]:

if 'result_margin' in recent_data.columns:
    venue_recent_close_rate = recent_data.groupby('venue').apply(
        lambda x: (x['result_margin'] <= 10).mean() if len(x) > 0 else 0.5
    ).fillna(0.5).to_dict()
else:
    venue_recent_close_rate = {venue: 0.5 for venue in recent_data['venue'].unique()}

In [79]:
X_features = []
y_labels = []

In [80]:
for idx, match in df_copy.iterrows():
    team1 = match['team1'] 
    team2 = match['team2']
    winner = match['winner']
    venue = match['venue']
    toss_winner = match['toss_winner']
    toss_decision = match['toss_decision']
    match_year = match['year']
    
    # Only use matches for training that have sufficient recent context
    if match_year < recent_cutoff:
        continue
    
    # 1. Recent team strength (weighted combination of recent wins and form)
    team1_recent_wins = team_recent_wins.get(team1, 0)
    team2_recent_wins = team_recent_wins.get(team2, 0)
    team1_form = team_form_wins.get(team1, 0)
    team2_form = team_form_wins.get(team2, 0)
    
    # Combine recent performance with more weight on very recent form
    team1_strength = 0.6 * team1_form + 0.4 * team1_recent_wins
    team2_strength = 0.6 * team2_form + 0.4 * team2_recent_wins
    strength_difference = team1_strength - team2_strength
    
    # 2. Recent H2H advantage
    teams = tuple(sorted([team1, team2]))
    if teams in h2h_recent_records:
        team1_h2h = h2h_recent_records[teams].get(team1, 0)
        team2_h2h = h2h_recent_records[teams].get(team2, 0)
        total_h2h = team1_h2h + team2_h2h
        h2h_advantage = (team1_h2h - team2_h2h) / max(total_h2h, 1) if total_h2h >= 3 else 0
    else:
        h2h_advantage = 0
    
    # 3. Recent venue advantage
    team1_venue_pair = (venue, team1)
    team2_venue_pair = (venue, team2)
    
    team1_venue_rate = 0.5
    team2_venue_rate = 0.5
    
    if team1_venue_pair in venue_recent_stats and venue_recent_stats[team1_venue_pair]['matches'] >= 3:
        team1_venue_rate = venue_recent_stats[team1_venue_pair]['wins'] / venue_recent_stats[team1_venue_pair]['matches']
        
    if team2_venue_pair in venue_recent_stats and venue_recent_stats[team2_venue_pair]['matches'] >= 3:
        team2_venue_rate = venue_recent_stats[team2_venue_pair]['wins'] / venue_recent_stats[team2_venue_pair]['matches']
    
    venue_advantage = team1_venue_rate - team2_venue_rate
    
    # 4. Season performance trend
    team1_trend = recent_team_performance.get(team1, 0.5)
    team2_trend = recent_team_performance.get(team2, 0.5)
    performance_trend_diff = team1_trend - team2_trend
    
    # 5. Toss factors
    toss_advantage = 1 if toss_winner == team1 else 0
    bat_first = 1 if toss_decision == 'bat' else 0
    
    # 6. Recent venue characteristics
    venue_avg_score = venue_recent_avg_score.get(venue, 160)
    venue_bat_first_avg = venue_recent_batting_first.get(venue, 160)
    venue_chase_rate = venue_recent_chase_success.get(venue, 0.5)
    venue_bat_preference = venue_recent_bat_preference.get(venue, 0.5)
    venue_close_rate = venue_recent_close_rate.get(venue, 0.5)
    
    # 7. Smart toss decision
    toss_decision_smart = 1 if (toss_decision == 'bat' and venue_bat_preference > 0.6) or \
                              (toss_decision == 'field' and venue_bat_preference < 0.4) else 0
    
    # 8. Momentum indicators (matches in current season)
    current_season_data = df_copy[(df_copy['year'] == match_year) & (df_copy['date'] < match['date'])]
    
    team1_current_wins = len(current_season_data[current_season_data['winner'] == team1])
    team2_current_wins = len(current_season_data[current_season_data['winner'] == team2])
    
    team1_current_matches = len(current_season_data[(current_season_data['team1'] == team1) | (current_season_data['team2'] == team1)])
    team2_current_matches = len(current_season_data[(current_season_data['team1'] == team2) | (current_season_data['team2'] == team2)])
    
    team1_current_form = team1_current_wins / max(team1_current_matches, 1) if team1_current_matches > 0 else 0.5
    team2_current_form = team2_current_wins / max(team2_current_matches, 1) if team2_current_matches > 0 else 0.5
    current_form_diff = team1_current_form - team2_current_form
    
    # Feature vector (15 features focusing on recent data)
    X_features.append([
        team1_strength, team2_strength, strength_difference,
        h2h_advantage, venue_advantage, performance_trend_diff,
        toss_advantage, bat_first, toss_decision_smart,
        venue_avg_score, venue_chase_rate, venue_bat_preference,
        venue_close_rate, current_form_diff, 
        (venue_bat_first_avg - venue_avg_score) / max(venue_avg_score, 1)  # Batting first advantage
    ])
    
    # Target variable
    y_labels.append(1 if winner == team1 else 0)

In [81]:
X_features = np.array(X_features)
y_labels = np.array(y_labels)

In [82]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_features)
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y_labels, test_size=0.2, random_state=42, stratify=y_labels
)

In [83]:
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train, y_train)
gb_pred = gb_model.predict(X_test)
gb_accuracy = accuracy_score(y_test, gb_pred)


In [84]:
xgb_model = xgb.XGBClassifier(random_state=42)
xgb_model.fit(X_train, y_train)
xgb_pred = xgb_model.predict(X_test)
xgb_accuracy = accuracy_score(y_test, xgb_pred)

In [85]:
gb_accuracy


0.6617647058823529

In [86]:
xgb_accuracy

0.6176470588235294

In [87]:
final_model = gb_model if gb_accuracy >= xgb_accuracy else xgb_model

In [88]:
final_model

0,1,2
,loss,'log_loss'
,learning_rate,0.1
,n_estimators,100
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,3
,min_impurity_decrease,0.0


In [89]:
def predict_match_result_recent(team1, team2, venue, toss_winner, toss_decision, current_season_year=CURRENT_YEAR):
    # Recent team strength
    team1_recent_wins = team_recent_wins.get(team1, 0)
    team2_recent_wins = team_recent_wins.get(team2, 0)
    team1_form = team_form_wins.get(team1, 0)
    team2_form = team_form_wins.get(team2, 0)
    
    team1_strength = 0.6 * team1_form + 0.4 * team1_recent_wins
    team2_strength = 0.6 * team2_form + 0.4 * team2_recent_wins
    strength_difference = team1_strength - team2_strength
    
    # Recent H2H
    teams = tuple(sorted([team1, team2]))
    if teams in h2h_recent_records:
        team1_h2h = h2h_recent_records[teams].get(team1, 0)
        team2_h2h = h2h_recent_records[teams].get(team2, 0)
        total_h2h = team1_h2h + team2_h2h
        h2h_advantage = (team1_h2h - team2_h2h) / max(total_h2h, 1) if total_h2h >= 3 else 0
    else:
        h2h_advantage = 0
    
    # Recent venue advantage
    team1_venue_pair = (venue, team1)
    team2_venue_pair = (venue, team2)
    
    team1_venue_rate = 0.5
    team2_venue_rate = 0.5
    
    if team1_venue_pair in venue_recent_stats and venue_recent_stats[team1_venue_pair]['matches'] >= 3:
        team1_venue_rate = venue_recent_stats[team1_venue_pair]['wins'] / venue_recent_stats[team1_venue_pair]['matches']
        
    if team2_venue_pair in venue_recent_stats and venue_recent_stats[team2_venue_pair]['matches'] >= 3:
        team2_venue_rate = venue_recent_stats[team2_venue_pair]['wins'] / venue_recent_stats[team2_venue_pair]['matches']
    
    venue_advantage = team1_venue_rate - team2_venue_rate
    
    # Performance trends
    team1_trend = recent_team_performance.get(team1, 0.5)
    team2_trend = recent_team_performance.get(team2, 0.5)
    performance_trend_diff = team1_trend - team2_trend
    
    # Toss factors
    toss_advantage = 1 if toss_winner == team1 else 0
    bat_first = 1 if toss_decision == 'bat' else 0
    
    # Venue characteristics
    venue_avg_score = venue_recent_avg_score.get(venue, 160)
    venue_bat_first_avg = venue_recent_batting_first.get(venue, 160)
    venue_chase_rate = venue_recent_chase_success.get(venue, 0.5)
    venue_bat_preference = venue_recent_bat_preference.get(venue, 0.5)
    venue_close_rate = venue_recent_close_rate.get(venue, 0.5)
    
    toss_decision_smart = 1 if (toss_decision == 'bat' and venue_bat_preference > 0.6) or \
                              (toss_decision == 'field' and venue_bat_preference < 0.4) else 0
    
    # Current form (assume average for prediction)
    current_form_diff = 0  # This would need current season data
    
    # Create feature array
    X_input = np.array([[
        team1_strength, team2_strength, strength_difference,
        h2h_advantage, venue_advantage, performance_trend_diff,
        toss_advantage, bat_first, toss_decision_smart,
        venue_avg_score, venue_chase_rate, venue_bat_preference,
        venue_close_rate, current_form_diff,
        (venue_bat_first_avg - venue_avg_score) / max(venue_avg_score, 1)
    ]])
    
    # Scale and predict
    X_input_scaled = scaler.transform(X_input)
    team1_win_probability = final_model.predict_proba(X_input_scaled)[0][1]
    team2_win_probability = 1 - team1_win_probability
    predicted_winner = team1 if team1_win_probability > team2_win_probability else team2
    
    # Additional insights
    insights = {
        'strength_advantage': 'Team1' if strength_difference > 0.1 else 'Team2' if strength_difference < -0.1 else 'Balanced',
        'h2h_advantage': 'Team1' if h2h_advantage > 0.1 else 'Team2' if h2h_advantage < -0.1 else 'Even',
        'venue_advantage': 'Team1' if venue_advantage > 0.1 else 'Team2' if venue_advantage < -0.1 else 'Neutral',
        'toss_impact': 'High' if abs(venue_bat_preference - 0.5) > 0.2 else 'Medium' if abs(venue_bat_preference - 0.5) > 0.1 else 'Low'
    }
    
    return {
        'predicted_winner': predicted_winner,
        f'{team1}_probability': team1_win_probability,
        f'{team2}_probability': team2_win_probability,
        'confidence': max(team1_win_probability, team2_win_probability),
        'insights': insights,
        'recent_h2h_record': h2h_recent_records.get(teams, {team1: 0, team2: 0}) if teams in h2h_recent_records else None
    }

In [90]:
result2 = predict_match_result_recent(
    team1='Mumbai Indians',
    team2='Royal Challengers Bengaluru', 
    venue='Wankhede Stadium, Mumbai',
    toss_winner='Royal Challengers Bengaluru',
    toss_decision='bowl'
)
result2

{'predicted_winner': 'Royal Challengers Bengaluru',
 'Mumbai Indians_probability': np.float64(0.19010488117583724),
 'Royal Challengers Bengaluru_probability': np.float64(0.8098951188241628),
 'confidence': np.float64(0.8098951188241628),
 'insights': {'strength_advantage': 'Team2',
  'h2h_advantage': 'Team2',
  'venue_advantage': 'Neutral',
  'toss_impact': 'High'},
 'recent_h2h_record': {'Royal Challengers Bengaluru': 5, 'Mumbai Indians': 3}}

In [91]:
with open('recent_model.pkl', 'wb') as f:
    pickle.dump(final_model, f)
    
with open('recent_scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)
    
with open('team_recent_wins.pkl', 'wb') as f:
    pickle.dump(team_recent_wins, f)
    
with open('team_form_wins.pkl', 'wb') as f:
    pickle.dump(team_form_wins, f)
    
with open('h2h_recent_records.pkl', 'wb') as f:
    pickle.dump(h2h_recent_records, f)
    
with open('venue_recent_stats.pkl', 'wb') as f:
    pickle.dump(venue_recent_stats, f)
    
with open('recent_team_performance.pkl', 'wb') as f:
    pickle.dump(recent_team_performance, f)
    
with open('venue_recent_features.pkl', 'wb') as f:
    pickle.dump({
        'avg_score': venue_recent_avg_score,
        'batting_first': venue_recent_batting_first,
        'chase_success': venue_recent_chase_success,
        'bat_preference': venue_recent_bat_preference,
        'close_rate': venue_recent_close_rate
    }, f)