### **Step 1: Load and Clean Data**

In [12]:
import pandas as pd

# Input match type and gender
match_type = input("Enter Match Type:")
gender = input("Enter Gender:")

# Load the match summary data
match_summary = pd.read_csv(rf"D:\GITHUB\Predictive-Analytics-for-Cricket-Matches-Using-Machine-Learning\source_data\Summary\match_summary\{gender}\{match_type}_{gender}_match_summary_cleaned.csv")
match_summary.drop(columns=["Unnamed: 0"], inplace=True)

# Select relevant columns
summary = match_summary[['match_id', 'season', 'match_date', 'match_type', 'event_name', 'team1', 'team2', 'toss_winner', 'toss_decision', 'winner', 'ground']].copy()
summary

Unnamed: 0,match_id,season,match_date,match_type,event_name,team1,team2,toss_winner,toss_decision,winner,ground
0,1438262,2024,2024-06-16,T20,Nordic T20 Cup,Finland,Norway,Finland,bat,Norway,"Kerava National Cricket Ground, Kerava"
1,1438074,2024,2024-06-16,T20,Jersey tour of Denmark,Jersey,Denmark,Denmark,field,Jersey,"Svanholm Park, Brondby"
2,1438073,2024,2024-06-16,T20,Jersey tour of Denmark,Denmark,Jersey,Jersey,field,Jersey,"Svanholm Park, Brondby"
3,1436494,2024,2024-06-16,T20,ICC Men's T20 World Cup Sub Regional Europe Qu...,Italy,Romania,Romania,field,Italy,"Simar Cricket Ground, Rome"
4,1436493,2024,2024-06-16,T20,ICC Men's T20 World Cup Sub Regional Europe Qu...,Portugal,Isle of Man,Portugal,bat,Portugal,"Roma Cricket Ground, Spinaceto"
...,...,...,...,...,...,...,...,...,...,...,...
8210,237242,2006,2006-02-16,T20,,New Zealand,West Indies,New Zealand,field,tie,"Eden Park, Auckland"
8211,226374,2006,2006-01-09,T20,,Australia,South Africa,Australia,bat,Australia,"Brisbane Cricket Ground, Woolloongabba, Brisbane"
8212,222678,2005,2005-10-21,T20,,South Africa,New Zealand,New Zealand,field,New Zealand,"New Wanderers Stadium, Johannesburg"
8213,211028,2005,2005-06-13,T20,,England,Australia,England,bat,England,"The Rose Bowl, Southampton"


### **Step 2: Unique Event Names**

In [13]:
# Get unique events and count
unique_events = summary["event_name"].unique()
num_events = summary["event_name"].nunique()

# Save unique events to CSV
events_df = pd.DataFrame({"Unique Events": unique_events})
events_df.to_csv(rf"D:\GITHUB\Predictive-Analytics-for-Cricket-Matches-Using-Machine-Learning\source_data\Summary\match_summary\{gender}\{gender}_unique_events_summary.csv", index=False)

print("No of Events:", num_events)

No of Events: 364


### **Step 3: Swap Teams to Maintain Consistent Order**

In [14]:
# Create a column for sorted team tuples
summary["team_tuple"] = summary.apply(lambda row: tuple(sorted([row["team1"], row["team2"]])), axis=1)

# Identify tuples that appear in reverse order and swap team names where necessary
reverse_tuples = summary["team_tuple"].value_counts()[summary["team_tuple"].value_counts() > 1].index
for team_tuple in reverse_tuples:
    team1, team2 = team_tuple
    summary.loc[(summary["team1"] == team1) & (summary["team2"] == team2), ["team1", "team2"]] = team2, team1

# Drop the temporary column and filter the dataset
summary.drop(columns=["team_tuple"], inplace=True)
summary = summary[(summary["winner"] == summary["team1"]) | (summary["winner"] == summary["team2"])].sort_values(by="match_id", ascending=False).reset_index(drop=True)
summary.sort_values(by=["match_date", "match_id"], ascending=False, inplace=True)
summary[:10]

Unnamed: 0,match_id,season,match_date,match_type,event_name,team1,team2,toss_winner,toss_decision,winner,ground
0,1438262,2024,2024-06-16,T20,Nordic T20 Cup,Norway,Finland,Finland,bat,Norway,"Kerava National Cricket Ground, Kerava"
3,1438074,2024,2024-06-16,T20,Jersey tour of Denmark,Jersey,Denmark,Denmark,field,Jersey,"Svanholm Park, Brondby"
4,1438073,2024,2024-06-16,T20,Jersey tour of Denmark,Jersey,Denmark,Jersey,field,Jersey,"Svanholm Park, Brondby"
5,1436494,2024,2024-06-16,T20,ICC Men's T20 World Cup Sub Regional Europe Qu...,Italy,Romania,Romania,field,Italy,"Simar Cricket Ground, Rome"
6,1436493,2024,2024-06-16,T20,ICC Men's T20 World Cup Sub Regional Europe Qu...,Portugal,Isle of Man,Portugal,bat,Portugal,"Roma Cricket Ground, Spinaceto"
7,1436492,2024,2024-06-16,T20,ICC Men's T20 World Cup Sub Regional Europe Qu...,France,Austria,Austria,field,Austria,"Simar Cricket Ground, Rome"
8,1436491,2024,2024-06-16,T20,ICC Men's T20 World Cup Sub Regional Europe Qu...,Luxembourg,Israel,Luxembourg,bat,Israel,"Roma Cricket Ground, Spinaceto"
316,1415736,2024,2024-06-16,T20,ICC Men's T20 World Cup,Pakistan,Ireland,Pakistan,field,Pakistan,Central Broward Regional Park Stadium Turf Gro...
405,1410427,2024,2024-06-16,T20,Vitality Blast,Yorkshire,Leicestershire,Leicestershire,bat,Leicestershire,"Headingley, Leeds"
406,1410426,2024,2024-06-16,T20,Vitality Blast,Worcestershire,Northamptonshire,Northamptonshire,bat,Northamptonshire,"County Ground, New Road, Worcester"


### **Step 4: Head-to-Head Winning Count**

In [15]:
# Calculate total matches played and head-to-head wins for team1
summary['h2h'] = summary.groupby(['team1', 'team2'])['match_id'].transform('count')
h2h_wins = summary.groupby(['team1', 'team2']).apply(lambda x: (x['winner'] == x['team1']).sum()).reset_index(name='h2h_team1_wins')

# Merge calculated wins back to the original DataFrame
summary = summary.merge(h2h_wins, on=['team1', 'team2'], how='left')
summary["h2h_team1_wins%"] = round((summary["h2h_team1_wins"] / summary["h2h"]) * 100).fillna(0).astype(int)

# Display the updated DataFrame
summary[['match_id', 'team1', 'team2', 'h2h', 'h2h_team1_wins']]

  h2h_wins = summary.groupby(['team1', 'team2']).apply(lambda x: (x['winner'] == x['team1']).sum()).reset_index(name='h2h_team1_wins')


Unnamed: 0,match_id,team1,team2,h2h,h2h_team1_wins
0,1438262,Norway,Finland,4,4
1,1438074,Jersey,Denmark,6,6
2,1438073,Jersey,Denmark,6,6
3,1436494,Italy,Romania,1,1
4,1436493,Portugal,Isle of Man,1,1
...,...,...,...,...,...
8009,238195,South Africa,Australia,25,8
8010,226374,South Africa,Australia,25,8
8011,222678,South Africa,New Zealand,15,11
8012,211028,England,Australia,22,11


### **Step 5: Head-to-Head Winning Count at a Venue**

In [16]:
# Standardize text data
summary['team1'] = summary['team1'].str.strip().str.title()
summary['team2'] = summary['team2'].str.strip().str.title()
summary['ground'] = summary['ground'].str.strip().str.title().fillna('Unknown')
summary['winner'] = summary['winner'].str.strip().str.title()

# Calculate matches played and wins at the same venue for team1
summary['h2h_venue'] = summary.groupby(['team1', 'team2', 'ground'])['match_id'].transform('count')
head_to_head = summary.groupby(['team1', 'team2', 'ground', 'winner']).size().unstack(fill_value=0)

# Create a column for wins by team1 at the same venue
def get_team1_wins(row):
    try:
        return head_to_head.loc[(row['team1'], row['team2'], row['ground']), row['team1']]
    except KeyError:
        return 0

summary['h2h_venue_team1_wins'] = summary.apply(get_team1_wins, axis=1)
summary['h2h_venue_team1_wins%'] = round((summary['h2h_venue_team1_wins'] / summary['h2h_venue']) * 100, 2).fillna(0).astype(int)

# Display the updated DataFrame
summary[['match_id', 'match_date', 'team1', 'team2', 'ground', 'winner', 'h2h_venue', 'h2h_venue_team1_wins', 'h2h_venue_team1_wins%']].head()


Unnamed: 0,match_id,match_date,team1,team2,ground,winner,h2h_venue,h2h_venue_team1_wins,h2h_venue_team1_wins%
0,1438262,2024-06-16,Norway,Finland,"Kerava National Cricket Ground, Kerava",Norway,3,3,100
1,1438074,2024-06-16,Jersey,Denmark,"Svanholm Park, Brondby",Jersey,2,2,100
2,1438073,2024-06-16,Jersey,Denmark,"Svanholm Park, Brondby",Jersey,2,2,100
3,1436494,2024-06-16,Italy,Romania,"Simar Cricket Ground, Rome",Italy,1,1,100
4,1436493,2024-06-16,Portugal,Isle Of Man,"Roma Cricket Ground, Spinaceto",Portugal,1,1,100


### **Step 6: Team Wins at a Venue**

In [17]:
# Calculate matches played and wins at the same venue for team1 and team2
summary['team1_venue'] = summary.groupby(['team1', 'ground'])['match_id'].transform('count')
summary['team2_venue'] = summary.groupby(['team2', 'ground'])['match_id'].transform('count')

# Function to get team wins
def get_team_wins(row, team_col):
    try:
        return head_to_head.loc[(row[team_col], row['ground']), row[team_col]]
    except KeyError:
        return 0

summary['team1_venue_wins'] = summary.apply(lambda row: get_team_wins(row, 'team1'), axis=1)
summary['team2_venue_wins'] = summary.apply(lambda row: get_team_wins(row, 'team2'), axis=1)

# Calculate win percentages
summary['team1_venue_wins%'] = round((summary['team1_venue_wins'] / summary['team1_venue']) * 100, 2).fillna(0).astype(int)
summary['team2_venue_wins%'] = round((summary['team2_venue_wins'] / summary['team2_venue']) * 100, 2).fillna(0).astype(int)

# Display the updated DataFrame
summary[['match_id', 'match_date', 'team1', 'team2', 'team1_venue_wins', 'team1_venue_wins%', 'team2_venue_wins', 'team2_venue_wins%', 'ground', 'winner']].head()

Unnamed: 0,match_id,match_date,team1,team2,team1_venue_wins,team1_venue_wins%,team2_venue_wins,team2_venue_wins%,ground,winner
0,1438262,2024-06-16,Norway,Finland,0,0,0,0,"Kerava National Cricket Ground, Kerava",Norway
1,1438074,2024-06-16,Jersey,Denmark,0,0,0,0,"Svanholm Park, Brondby",Jersey
2,1438073,2024-06-16,Jersey,Denmark,0,0,0,0,"Svanholm Park, Brondby",Jersey
3,1436494,2024-06-16,Italy,Romania,0,0,0,0,"Simar Cricket Ground, Rome",Italy
4,1436493,2024-06-16,Portugal,Isle Of Man,0,0,0,0,"Roma Cricket Ground, Spinaceto",Portugal


### **Step 7: Last 5 Matches Performance**

In [18]:
def last_5_matches_wins(summary_df):
    teams = set(summary_df['team1']).union(set(summary_df['team2']))
    last_5_matches_wins_combined = []

    for team in teams:
        team_matches = summary_df[(summary_df['team1'] == team) | (summary_df['team2'] == team)]
        last_5_team_matches = team_matches.sort_values(by=['match_date', 'match_id'], ascending=False).head(5)
        team_wins_count = len(last_5_team_matches.loc[last_5_team_matches['winner'] == team])
        last_5_matches_wins_combined.append({'team': team, 'team_last_5_wins': team_wins_count})

    return pd.DataFrame(last_5_matches_wins_combined)

last_5_matches_wins_df = last_5_matches_wins(summary)
last_5_team1_matches_wins_df = last_5_matches_wins_df.rename(columns={"team": "team1", "team_last_5_wins": "team1_last_5_wins"})
last_5_team2_matches_wins_df = last_5_matches_wins_df.rename(columns={"team": "team2", "team_last_5_wins": "team2_last_5_wins"})
summary = summary.merge(last_5_team1_matches_wins_df, on="team1", how="left")
summary = summary.merge(last_5_team2_matches_wins_df, on="team2", how="left")


### **Step 8: Event Name Standardization**

In [19]:
replacements = {
    "NatWest T20 Blast": "Vitality Blast",
    "HRV Twenty20": "Super Smash",
    "HRV Cup": "Super Smash",
    "MiWAY T20 Challenge": "CSA T20 Challenge",
    "Ram Slam T20 Challenge": "CSA T20 Challenge"
}

summary["event_name"] = summary["event_name"].replace(replacements, regex=True)

# Standardize event column based on team participation
team_event_counts = summary.groupby(['team1', 'team2'])['event_name'].nunique().reset_index()
multiple_events_teams = team_event_counts[team_event_counts['event_name'] > 1]
teams_in_multiple_events = set(multiple_events_teams['team1']).union(set(multiple_events_teams['team2']))
summary['event'] = summary.apply(lambda row: 'T20I' if row['team1'] in teams_in_multiple_events or row['team2'] in teams_in_multiple_events else row['event_name'], axis=1)
summary['event'] = summary['event'].replace("Indian Premier League", "IPL", regex=True)


### **Step 9: Capitalizing IPL teams**

In [20]:
# Create a mask for rows where event_name is "IPL"
mask = summary["event_name"] == "IPL"

# Convert the specified columns to uppercase only for the rows where event_name is "IPL"
summary.loc[mask, 'team1'] = summary.loc[mask, 'team1'].str.upper()
summary.loc[mask, 'team2'] = summary.loc[mask, 'team2'].str.upper()
summary.loc[mask, 'toss_winner'] = summary.loc[mask, 'toss_winner'].str.upper()
summary.loc[mask, 'winner'] = summary.loc[mask, 'winner'].str.upper()

### **Step 10: Save Cleaned DataFrame**

In [22]:
# Select and reorder final columns
summary_df = summary[['match_id', 'season', 'event_name', 'event', 'match_date', 'team1', 'team2', 'toss_winner', 'toss_decision', 'winner', 'h2h_team1_wins%', 'h2h_venue_team1_wins%', 'team1_venue_wins%', 'team2_venue_wins%', 'team1_last_5_wins', 'team2_last_5_wins', 'ground']].copy()

# Save the cleaned summary DataFrame to CSV
summary_df.to_csv(fr"D:\GITHUB\Predictive-Analytics-for-Cricket-Matches-Using-Machine-Learning\source_data\Summary\match_summary\{gender}\{match_type}_{gender}_match_summary_cleaned_features.csv", index=False)

summary_df.head()

Unnamed: 0,match_id,season,event_name,event,match_date,team1,team2,toss_winner,toss_decision,winner,h2h_team1_wins%,h2h_venue_team1_wins%,team1_venue_wins%,team2_venue_wins%,team1_last_5_wins,team2_last_5_wins,ground
0,1438262,2024,Nordic T20 Cup,T20I,2024-06-16,Norway,Finland,Finland,bat,Norway,100,100,0,0,4,1,"Kerava National Cricket Ground, Kerava"
1,1438074,2024,Jersey tour of Denmark,T20I,2024-06-16,Jersey,Denmark,Denmark,field,Jersey,100,100,0,0,3,0,"Svanholm Park, Brondby"
2,1438073,2024,Jersey tour of Denmark,T20I,2024-06-16,Jersey,Denmark,Jersey,field,Jersey,100,100,0,0,3,0,"Svanholm Park, Brondby"
3,1436494,2024,ICC Men's T20 World Cup Sub Regional Europe Qu...,T20I,2024-06-16,Italy,Romania,Romania,field,Italy,100,100,0,0,5,4,"Simar Cricket Ground, Rome"
4,1436493,2024,ICC Men's T20 World Cup Sub Regional Europe Qu...,T20I,2024-06-16,Portugal,Isle Of Man,Portugal,bat,Portugal,100,100,0,0,2,2,"Roma Cricket Ground, Spinaceto"
