### **Step 1: Load and Clean Data**

### **Genders Are male & female**

### **Match Types Are: ODI, T20, Test, MDM, ODM**



In [1]:
import pandas as pd

# Input match type and gender
match_type = input("Enter Match Type:")
gender = input("Enter Gender:")

print("Gender is: ", gender.capitalize())
print("Match Type is: ", match_type.capitalize())

# Load the match summary data
match_summary = pd.read_csv(rf"D:\GITHUB\Predictive-Analytics-for-Cricket-Matches-Using-Machine-Learning\source_data\Summary\match_summary\{gender}\{match_type}_{gender}_match_summary_cleaned.csv")
#match_summary.drop(columns=["Unnamed: 0"], inplace=True)

# Select relevant columns
summary = match_summary[['match_id', 'season', 'match_date', 'event_name', 'team1', 'team2', 'toss_winner', 'toss_decision', 'winner', 'ground']].copy()
summary

Gender is:  Male
Match Type is:  Odi


Unnamed: 0,match_id,season,match_date,event_name,team1,team2,toss_winner,toss_decision,winner,ground
0,1469168,2025,2025-02-14,Australia tour of Sri Lanka,Sri Lanka,Australia,Sri Lanka,bat,Sri Lanka,"R.Premadasa Stadium, Khettarama, Colombo"
1,1468884,2025,2025-02-18,ICC Men's Cricket World Cup League 2,United States Of America,Oman,Oman,field,United States Of America,Al Amerat Cricket Ground Oman Cricket (Ministr...
2,1468883,2025,2025-02-16,ICC Men's Cricket World Cup League 2,Oman,Namibia,Namibia,bat,Oman,Al Amerat Cricket Ground Oman Cricket (Ministr...
3,1468882,2025,2025-02-14,ICC Men's Cricket World Cup League 2,United States Of America,Namibia,Namibia,field,United States Of America,Al Amerat Cricket Ground Oman Cricket (Ministr...
4,1468881,2025,2025-02-12,ICC Men's Cricket World Cup League 2,United States Of America,Oman,United States of America,bat,Oman,Al Amerat Cricket Ground Oman Cricket (Ministr...
...,...,...,...,...,...,...,...,...,...,...
2299,64819,2003,2003-01-14,India tour of New Zealand,New Zealand,India,New Zealand,field,New Zealand,"Westpac Park, Hamilton"
2300,64817,2003,2003-01-08,India tour of New Zealand,New Zealand,India,New Zealand,bat,India,"Westpac Stadium, Wellington"
2301,64816,2003,2003-01-04,India tour of New Zealand,New Zealand,India,New Zealand,field,New Zealand,"Davies Park, Queenstown"
2302,64815,2003,2003-01-01,India tour of New Zealand,New Zealand,India,India,bat,New Zealand,"Jade Stadium, Christchurch"


### **Step 2: Unique Event Names**

In [2]:
# Get unique events and count
unique_events = summary["event_name"].unique()
num_events = summary["event_name"].nunique()

# Save unique events to CSV
events_df = pd.DataFrame({"Unique Events": unique_events})
events_df.to_csv(rf"D:\GITHUB\Predictive-Analytics-for-Cricket-Matches-Using-Machine-Learning\source_data\Summary\match_summary\{gender}\{gender}_unique_events_summary.csv", index=False)

print("No of Events:", num_events)

No of Events: 273


### **Step 3: Swap Teams to Maintain Consistent Order**

In [3]:
# Create a column for sorted team tuples
summary["team_tuple"] = summary.apply(lambda row: tuple(sorted([row["team1"], row["team2"]])), axis=1)

# Identify tuples that appear in reverse order and swap team names where necessary
reverse_tuples = summary["team_tuple"].value_counts()[summary["team_tuple"].value_counts() > 1].index
for team_tuple in reverse_tuples:
    team1, team2 = team_tuple
    summary.loc[(summary["team1"] == team1) & (summary["team2"] == team2), ["team1", "team2"]] = team2, team1

# Drop the temporary column and filter the dataset
summary.drop(columns=["team_tuple"], inplace=True)
summary = summary[(summary["winner"] == summary["team1"]) | (summary["winner"] == summary["team2"])].sort_values(by="match_id", ascending=False).reset_index(drop=True)
summary.reset_index().sort_values(by=["match_date", "match_id"], ascending=False, inplace=True)
summary[:10]

Unnamed: 0,match_id,season,match_date,event_name,team1,team2,toss_winner,toss_decision,winner,ground
0,1469168,2025,2025-02-14,Australia tour of Sri Lanka,Sri Lanka,Australia,Sri Lanka,bat,Sri Lanka,"R.Premadasa Stadium, Khettarama, Colombo"
1,1468884,2025,2025-02-18,ICC Men's Cricket World Cup League 2,United States Of America,Oman,Oman,field,United States Of America,Al Amerat Cricket Ground Oman Cricket (Ministr...
2,1468883,2025,2025-02-16,ICC Men's Cricket World Cup League 2,Oman,Namibia,Namibia,bat,Oman,Al Amerat Cricket Ground Oman Cricket (Ministr...
3,1468882,2025,2025-02-14,ICC Men's Cricket World Cup League 2,United States Of America,Namibia,Namibia,field,United States Of America,Al Amerat Cricket Ground Oman Cricket (Ministr...
4,1468881,2025,2025-02-12,ICC Men's Cricket World Cup League 2,United States Of America,Oman,United States of America,bat,Oman,Al Amerat Cricket Ground Oman Cricket (Ministr...
5,1468880,2025,2025-02-10,ICC Men's Cricket World Cup League 2,Oman,Namibia,Oman,field,Namibia,Al Amerat Cricket Ground Oman Cricket (Ministr...
6,1468879,2025,2025-02-08,ICC Men's Cricket World Cup League 2,United States Of America,Namibia,Namibia,field,United States Of America,Al Amerat Cricket Ground Oman Cricket (Ministr...
7,1467703,2025,2025-02-18,Ireland tour of Zimbabwe,Zimbabwe,Ireland,Zimbabwe,field,Zimbabwe,"Harare Sports Club, Harare"
8,1467702,2025,2025-02-16,Ireland tour of Zimbabwe,Zimbabwe,Ireland,Ireland,field,Ireland,"Harare Sports Club, Harare"
9,1467701,2025,2025-02-14,Ireland tour of Zimbabwe,Zimbabwe,Ireland,Ireland,field,Zimbabwe,"Harare Sports Club, Harare"


### **Step 4: Head-to-Head Winning Count**

In [4]:
# Calculate total matches played and head-to-head wins for team1
summary['h2h'] = summary.groupby(['team1', 'team2'])['match_id'].transform('count')
h2h_wins = summary.groupby(['team1', 'team2']).apply(lambda x: (x['winner'] == x['team1']).sum()).reset_index(name='h2h_team1_wins')

# Merge calculated wins back to the original DataFrame
summary = summary.merge(h2h_wins, on=['team1', 'team2'], how='left')
summary["h2h_team1_wins%"] = round((summary["h2h_team1_wins"] / summary["h2h"]) * 100).fillna(0).astype(int)

# Display the updated DataFrame
summary[['match_id', 'team1', 'team2', 'h2h', 'h2h_team1_wins']]

  h2h_wins = summary.groupby(['team1', 'team2']).apply(lambda x: (x['winner'] == x['team1']).sum()).reset_index(name='h2h_team1_wins')


Unnamed: 0,match_id,team1,team2,h2h,h2h_team1_wins
0,1469168,Sri Lanka,Australia,55,22
1,1468884,United States Of America,Oman,7,2
2,1468883,Oman,Namibia,10,3
3,1468882,United States Of America,Namibia,10,7
4,1468881,United States Of America,Oman,7,2
...,...,...,...,...,...
2299,64819,New Zealand,India,48,21
2300,64817,New Zealand,India,48,21
2301,64816,New Zealand,India,48,21
2302,64815,New Zealand,India,48,21


### **Step 5: Head-to-Head Winning Count at a Venue**

In [5]:
# Standardize text data
summary['team1'] = summary['team1'].str.strip().str.title()
summary['team2'] = summary['team2'].str.strip().str.title()
summary['ground'] = summary['ground'].str.strip().str.title().fillna('Unknown')
summary['winner'] = summary['winner'].str.strip().str.title()

# Calculate matches played and wins at the same venue for team1
summary['h2h_venue'] = summary.groupby(['team1', 'team2', 'ground'])['match_id'].transform('count')
head_to_head = summary.groupby(['team1', 'team2', 'ground', 'winner']).size().unstack(fill_value=0)

# Create a column for wins by team1 at the same venue
def get_team1_wins(row):
    try:
        return head_to_head.loc[(row['team1'], row['team2'], row['ground']), row['team1']]
    except KeyError:
        return 0

summary['h2h_venue_team1_wins'] = summary.apply(get_team1_wins, axis=1)
summary['h2h_venue_team1_wins%'] = round((summary['h2h_venue_team1_wins'] / summary['h2h_venue']) * 100, 2).fillna(0).astype(int)

# Display the updated DataFrame
summary[['match_id', 'match_date', 'team1', 'team2', 'ground', 'winner', 'h2h_venue', 'h2h_venue_team1_wins', 'h2h_venue_team1_wins%']].head()


Unnamed: 0,match_id,match_date,team1,team2,ground,winner,h2h_venue,h2h_venue_team1_wins,h2h_venue_team1_wins%
0,1469168,2025-02-14,Sri Lanka,Australia,"R.Premadasa Stadium, Khettarama, Colombo",Sri Lanka,10,6,60
1,1468884,2025-02-18,United States Of America,Oman,Al Amerat Cricket Ground Oman Cricket (Ministr...,United States Of America,3,1,33
2,1468883,2025-02-16,Oman,Namibia,Al Amerat Cricket Ground Oman Cricket (Ministr...,Oman,3,1,33
3,1468882,2025-02-14,United States Of America,Namibia,Al Amerat Cricket Ground Oman Cricket (Ministr...,United States Of America,2,2,100
4,1468881,2025-02-12,United States Of America,Oman,Al Amerat Cricket Ground Oman Cricket (Ministr...,Oman,3,1,33


In [6]:
norway = summary[((summary["team1"] == "Norway") & (summary["team2"] == "Finland") | (summary["team1"] == "Finland") & (summary["team2"] == "Norway")) 
                 & (summary["ground"] == "Kerava National Cricket Ground, Kerava")].reset_index(drop=True)
norway

Unnamed: 0,match_id,season,match_date,event_name,team1,team2,toss_winner,toss_decision,winner,ground,h2h,h2h_team1_wins,h2h_team1_wins%,h2h_venue,h2h_venue_team1_wins,h2h_venue_team1_wins%


### **Step 6: Team Wins at a Venue**

In [7]:
import pandas as pd

# Assuming 'summary' is your DataFrame
# And 'head_to_head' is a DataFrame with team wins data

# Calculate matches played at the same venue for team1 and team2
summary['team1_venue'] = summary.groupby(['team1', 'ground'])['match_id'].transform('count')
summary['team2_venue'] = summary.groupby(['team2', 'ground'])['match_id'].transform('count')

# Function to get team wins
def get_team_wins(row, team_col):
    try:
        return head_to_head.loc[(row['team1'], row['team2'], row['ground']), row[team_col]]
    except KeyError:
        return 0
    except:
        return 0

# Apply the function to calculate wins at the venue
summary['team1_venue_wins'] = summary.apply(lambda row: get_team_wins(row, 'team1'), axis=1)
summary['team2_venue_wins'] = summary.apply(lambda row: get_team_wins(row, 'team2'), axis=1)

# Calculate win percentages
summary['team1_venue_wins%'] = round((summary['team1_venue_wins'] / summary['team1_venue']) * 100, 2).fillna(0).astype(int)
summary['team2_venue_wins%'] = round((summary['team2_venue_wins'] / summary['team2_venue']) * 100, 2).fillna(0).astype(int)

# Display the updated DataFrame
display_cols = ['match_id', 'match_date', 'team1', 'team2', 'team1_venue_wins', 'team1_venue_wins%', 'team2_venue_wins', 'team2_venue_wins%', 'ground', 'winner']
summary[display_cols].head()


Unnamed: 0,match_id,match_date,team1,team2,team1_venue_wins,team1_venue_wins%,team2_venue_wins,team2_venue_wins%,ground,winner
0,1469168,2025-02-14,Sri Lanka,Australia,6,8,4,36,"R.Premadasa Stadium, Khettarama, Colombo",Sri Lanka
1,1468884,2025-02-18,United States Of America,Oman,1,16,2,22,Al Amerat Cricket Ground Oman Cricket (Ministr...,United States Of America
2,1468883,2025-02-16,Oman,Namibia,1,14,2,28,Al Amerat Cricket Ground Oman Cricket (Ministr...,Oman
3,1468882,2025-02-14,United States Of America,Namibia,2,33,0,0,Al Amerat Cricket Ground Oman Cricket (Ministr...,United States Of America
4,1468881,2025-02-12,United States Of America,Oman,1,16,2,22,Al Amerat Cricket Ground Oman Cricket (Ministr...,Oman


### **Step 7: Last 5 Matches Performance**

In [8]:
import pandas as pd

# Function to calculate last 5 matches wins for each match
def calculate_last_5_wins(df):
    df['team1_last_5_wins'] = 0
    df['team2_last_5_wins'] = 0
    
    teams = set(df['team1']).union(set(df['team2']))
    for team in teams:
        team_matches = df[(df['team1'] == team) | (df['team2'] == team)].sort_values(by=['match_date', 'match_id'], ascending=False)
        wins_count = []
        for i in range(len(team_matches)):
            match = team_matches.iloc[i]
            prev_matches = team_matches.iloc[i+1:i+6]  # next 5 matches in descending order
            wins = len(prev_matches[prev_matches['winner'] == team])
            wins_count.append(wins)
        
        # Ensure wins_count is aligned with the correct matches in the main DataFrame
        df.loc[df[df['team1'] == team].index, 'team1_last_5_wins'] = wins_count[:len(df[df['team1'] == team])]
        df.loc[df[df['team2'] == team].index, 'team2_last_5_wins'] = wins_count[:len(df[df['team2'] == team])]

    return df

# Calculate last 5 matches wins for each match
summary = calculate_last_5_wins(summary)

# Display the DataFrame
summary[['match_id', 'team1', 'team2', 'match_date', 'team1_last_5_wins', 'team2_last_5_wins']][:5]


Unnamed: 0,match_id,team1,team2,match_date,team1_last_5_wins,team2_last_5_wins
0,1469168,Sri Lanka,Australia,2025-02-14,3,1
1,1468884,United States Of America,Oman,2025-02-18,3,4
2,1468883,Oman,Namibia,2025-02-16,4,2
3,1468882,United States Of America,Namibia,2025-02-14,2,2
4,1468881,United States Of America,Oman,2025-02-12,3,4


### **Step 8: Event Name Standardization**

In [9]:
# Standardize event column based on team participation
team_event_counts = summary.groupby(['team1', 'team2'])['event_name'].nunique().reset_index()
multiple_events_teams = team_event_counts[team_event_counts['event_name'] > 1]
teams_in_multiple_events = set(multiple_events_teams['team1']).union(set(multiple_events_teams['team2']))

# List of events to be considered as T20I
t20i_events = [
    'T20I',
    'Central American Cricket Championships',
    'South American Men\'s Championships'
]

# Apply the condition to set the 'event' column
if match_type == "T20":
    summary['event'] = summary.apply(lambda row: 'T20I' if row['event_name'] in t20i_events or row['team1'] in teams_in_multiple_events or row['team2'] 
                                     in teams_in_multiple_events else row['event_name'], axis=1)

# Check if the match type is "ODI"
elif match_type == "ODI":
    summary['event'] = summary.apply(lambda row: 'ODI' if row['team1'] in teams_in_multiple_events or row['team2'] in teams_in_multiple_events else row['event_name'],axis=1)

# Check if the match type is "ODM"
elif match_type == "ODM":
    summary['event'] = summary.apply(lambda row: 'ODM' if row['team1'] in teams_in_multiple_events or row['team2'] in teams_in_multiple_events else row['event_name'],axis=1)

# Check if the match type is "MDM"
elif match_type == "MDM":
    summary['event'] = summary.apply(lambda row: 'MDM' if row['team1'] in teams_in_multiple_events or row['team2'] in teams_in_multiple_events else row['event_name'],axis=1)

# If none of the above match types, assume it is a Test match
elif match_type == "Test":
    summary['event'] = summary.apply(lambda row: 'Test' if row['team1'] in teams_in_multiple_events or row['team2'] in teams_in_multiple_events else row['event_name'],axis=1)

# Display the first 10 rows of the summary DataFrame to check the updates
summary[:5]


Unnamed: 0,match_id,season,match_date,event_name,team1,team2,toss_winner,toss_decision,winner,ground,...,h2h_venue_team1_wins%,team1_venue,team2_venue,team1_venue_wins,team2_venue_wins,team1_venue_wins%,team2_venue_wins%,team1_last_5_wins,team2_last_5_wins,event
0,1469168,2025,2025-02-14,Australia tour of Sri Lanka,Sri Lanka,Australia,Sri Lanka,bat,Sri Lanka,"R.Premadasa Stadium, Khettarama, Colombo",...,60,68,11,6,4,8,36,3,1,ODI
1,1468884,2025,2025-02-18,ICC Men's Cricket World Cup League 2,United States Of America,Oman,Oman,field,United States Of America,Al Amerat Cricket Ground Oman Cricket (Ministr...,...,33,6,9,1,2,16,22,3,4,ODI
2,1468883,2025,2025-02-16,ICC Men's Cricket World Cup League 2,Oman,Namibia,Namibia,bat,Oman,Al Amerat Cricket Ground Oman Cricket (Ministr...,...,33,7,7,1,2,14,28,4,2,ODI
3,1468882,2025,2025-02-14,ICC Men's Cricket World Cup League 2,United States Of America,Namibia,Namibia,field,United States Of America,Al Amerat Cricket Ground Oman Cricket (Ministr...,...,100,6,7,2,0,33,0,2,2,ODI
4,1468881,2025,2025-02-12,ICC Men's Cricket World Cup League 2,United States Of America,Oman,United States of America,bat,Oman,Al Amerat Cricket Ground Oman Cricket (Ministr...,...,33,6,9,1,2,16,22,3,4,ODI


In [10]:
summary['event'].unique()

array(['ODI'], dtype=object)

In [11]:
summary[summary['event'] == "Big Bash League"]

Unnamed: 0,match_id,season,match_date,event_name,team1,team2,toss_winner,toss_decision,winner,ground,...,h2h_venue_team1_wins%,team1_venue,team2_venue,team1_venue_wins,team2_venue_wins,team1_venue_wins%,team2_venue_wins%,team1_last_5_wins,team2_last_5_wins,event


### **Step 9: Capitalizing IPL teams**

In [12]:
# Create a mask for rows where event_name is "IPL"
mask = summary["event_name"] == "IPL"

# Convert the specified columns to uppercase only for the rows where event_name is "IPL"
summary.loc[mask, 'team1'] = summary.loc[mask, 'team1'].str.upper()
summary.loc[mask, 'team2'] = summary.loc[mask, 'team2'].str.upper()
summary.loc[mask, 'toss_winner'] = summary.loc[mask, 'toss_winner'].str.upper()
summary.loc[mask, 'winner'] = summary.loc[mask, 'winner'].str.upper()

### **Step 10: Save Cleaned DataFrame**

In [75]:
# Select and reorder final columns
summary_df = summary[['match_id', 'season', 'event_name', 'event', 'match_date', 'team1', 'team2', 'toss_winner', 'toss_decision', 'winner', 'h2h_team1_wins%', 'h2h_venue_team1_wins%', 'team1_venue_wins%', 'team2_venue_wins%', 'team1_last_5_wins', 'team2_last_5_wins', 'ground']].copy()

# Save the cleaned summary DataFrame to CSV
summary_df.to_csv(fr"D:\GITHUB\Predictive-Analytics-for-Cricket-Matches-Using-Machine-Learning\source_data\Summary\match_summary\{gender}\{match_type}_{gender}_match_summary_cleaned.csv", index=False)

summary_df.head()

Unnamed: 0,match_id,season,event_name,event,match_date,team1,team2,toss_winner,toss_decision,winner,h2h_team1_wins%,h2h_venue_team1_wins%,team1_venue_wins%,team2_venue_wins%,team1_last_5_wins,team2_last_5_wins,ground
0,1471842,2025,CWC Challenge League,ODM,2025-02-16,Tanzania,Singapore,Singapore,field,Tanzania,50,100,50,0,0,0,"Kowloon Cricket Club, Kowloon"
1,1471841,2025,CWC Challenge League,ODM,2025-02-16,Italy,Hong Kong,Italy,field,Italy,67,100,100,0,4,4,"Mission Road Ground, Mong Kok, Hong Kong, Mong..."
2,1471840,2025,CWC Challenge League,ODM,2025-02-15,Bahrain,Italy,Bahrain,bat,Italy,0,0,0,50,3,4,"Kowloon Cricket Club, Kowloon"
3,1471839,2025,CWC Challenge League,ODM,2025-02-15,Uganda,Tanzania,Tanzania,field,Uganda,100,100,50,0,5,0,"Mission Road Ground, Mong Kok, Hong Kong, Mong..."
4,1471838,2025,CWC Challenge League,ODM,2025-02-13,Uganda,Hong Kong,Hong Kong,field,Uganda,50,100,50,0,5,4,"Kowloon Cricket Club, Kowloon"
