#### **Match Types** 
1. T20I 
2. ODI 
3. Test 
4. ODM 
5. MDM

#### **Genders are**
1. Male 
2. Female

### **Step 1: Load and Clean Data**

In [15]:
import pandas as pd

# Input match type and gender
match_type = input("Enter Match Type:")
gender = input("Enter Gender:")

# Load the match summary data
match_summary = pd.read_csv(rf"D:\GITHUB\Predictive-Analytics-for-Cricket-Matches-Using-Machine-Learning\source_data\Summary\match_summary\{gender}\{match_type}_{gender}_match_summary_cleaned.csv")
#match_summary.drop(columns=["Unnamed: 0"], inplace=True)

# Select relevant columns
summary = match_summary[['match_id', 'season', 'match_date', 'event_name', 'team1', 'team2', 'toss_winner', 'toss_decision', 'winner', 'ground']].copy()
summary

Unnamed: 0,match_id,season,match_date,event_name,team1,team2,toss_winner,toss_decision,winner,ground
0,1415753,2024,2024-06-26,ICC Men's T20 World Cup,Afghanistan,South Africa,Afghanistan,bat,South Africa,"Brian Lara Stadium, Tarouba, Trinidad, Tarouba"
1,1415752,2024,2024-06-24,ICC Men's T20 World Cup,Afghanistan,Bangladesh,Afghanistan,bat,Afghanistan,"Arnos Vale Ground, Kingstown, St Vincent, King..."
2,1415751,2024,2024-06-24,ICC Men's T20 World Cup,India,Australia,Australia,field,India,"Darren Sammy National Cricket Stadium, Gros Is..."
3,1440131,2024,2024-06-23,Jersey tour of Guernsey,Jersey,Guernsey,Guernsey,field,Guernsey,"Guernsey Rovers Athletic Club Ground, Port Soi..."
4,1415750,2024,2024-06-23,ICC Men's T20 World Cup,West Indies,South Africa,South Africa,field,South Africa,"Sir Vivian Richards Stadium, North Sound, Antigua"
...,...,...,...,...,...,...,...,...,...,...
8249,237242,2006,2006-02-16,,New Zealand,West Indies,New Zealand,field,tie,"Eden Park, Auckland"
8250,226374,2006,2006-01-09,,Australia,South Africa,Australia,bat,Australia,"Brisbane Cricket Ground, Woolloongabba, Brisbane"
8251,222678,2005,2005-10-21,,South Africa,New Zealand,New Zealand,field,New Zealand,"New Wanderers Stadium, Johannesburg"
8252,211028,2005,2005-06-13,,England,Australia,England,bat,England,"The Rose Bowl, Southampton"


### **Step 2: Unique Event Names**

In [16]:
# Get unique events and count
unique_events = summary["event_name"].unique()
num_events = summary["event_name"].nunique()

# Save unique events to CSV
events_df = pd.DataFrame({"Unique Events": unique_events})
events_df.to_csv(rf"D:\GITHUB\Predictive-Analytics-for-Cricket-Matches-Using-Machine-Learning\source_data\Summary\match_summary\{gender}\{gender}_unique_events_summary.csv", index=False)

print("No of Events:", num_events)

No of Events: 360


### **Step 3: Swap Teams to Maintain Consistent Order**

In [17]:
# Create a column for sorted team tuples
summary["team_tuple"] = summary.apply(lambda row: tuple(sorted([row["team1"], row["team2"]])), axis=1)

# Identify tuples that appear in reverse order and swap team names where necessary
reverse_tuples = summary["team_tuple"].value_counts()[summary["team_tuple"].value_counts() > 1].index
for team_tuple in reverse_tuples:
    team1, team2 = team_tuple
    summary.loc[(summary["team1"] == team1) & (summary["team2"] == team2), ["team1", "team2"]] = team2, team1

# Drop the temporary column and filter the dataset
summary.drop(columns=["team_tuple"], inplace=True)
summary = summary[(summary["winner"] == summary["team1"]) | (summary["winner"] == summary["team2"])].sort_values(by="match_id", ascending=False).reset_index(drop=True)
summary.reset_index().sort_values(by=["match_date", "match_id"], ascending=False, inplace=True)
summary[:10]

Unnamed: 0,match_id,season,match_date,event_name,team1,team2,toss_winner,toss_decision,winner,ground
0,1440131,2024,2024-06-23,Jersey tour of Guernsey,Jersey,Guernsey,Guernsey,field,Guernsey,"Guernsey Rovers Athletic Club Ground, Port Soi..."
1,1440130,2024,2024-06-22,Jersey tour of Guernsey,Jersey,Guernsey,Jersey,field,Jersey,"King George V Sports Ground, Castel"
2,1440129,2024,2024-06-22,Jersey tour of Guernsey,Jersey,Guernsey,Jersey,field,Jersey,"King George V Sports Ground, Castel"
3,1438262,2024,2024-06-16,Nordic T20 Cup,Norway,Finland,Finland,bat,Norway,"Kerava National Cricket Ground, Kerava"
4,1438259,2024,2024-06-15,Nordic T20 Cup,Norway,Finland,Finland,bat,Norway,"Kerava National Cricket Ground, Kerava"
5,1438257,2024,2024-06-14,Nordic T20 Cup,Norway,Finland,Finland,field,Norway,"Kerava National Cricket Ground, Kerava"
6,1438080,2024,2024-06-19,Estonia tour of Cyprus,Estonia,Cyprus,Estonia,bat,Estonia,"Happy Valley Ground, Episkopi"
7,1438079,2024,2024-06-19,Estonia tour of Cyprus,Estonia,Cyprus,Cyprus,field,Cyprus,"Happy Valley Ground, Episkopi"
8,1438078,2024,2024-06-18,Estonia tour of Cyprus,Estonia,Cyprus,Estonia,bat,Estonia,"Happy Valley Ground, Episkopi"
9,1438077,2024,2024-06-18,Estonia tour of Cyprus,Estonia,Cyprus,Estonia,field,Cyprus,"Happy Valley Ground, Episkopi"


### **Step 4: Head-to-Head Winning Count**

In [18]:
# Calculate total matches played and head-to-head wins for team1
summary['h2h'] = summary.groupby(['team1', 'team2'])['match_id'].transform('count')
h2h_wins = summary.groupby(['team1', 'team2']).apply(lambda x: (x['winner'] == x['team1']).sum()).reset_index(name='h2h_team1_wins')

# Merge calculated wins back to the original DataFrame
summary = summary.merge(h2h_wins, on=['team1', 'team2'], how='left')
summary["h2h_team1_wins%"] = round((summary["h2h_team1_wins"] / summary["h2h"]) * 100).fillna(0).astype(int)

# Display the updated DataFrame
summary[['match_id', 'team1', 'team2', 'h2h', 'h2h_team1_wins']]

  h2h_wins = summary.groupby(['team1', 'team2']).apply(lambda x: (x['winner'] == x['team1']).sum()).reset_index(name='h2h_team1_wins')


Unnamed: 0,match_id,team1,team2,h2h,h2h_team1_wins
0,1440131,Jersey,Guernsey,4,3
1,1440130,Jersey,Guernsey,4,3
2,1440129,Jersey,Guernsey,4,3
3,1438262,Norway,Finland,4,4
4,1438259,Norway,Finland,4,4
...,...,...,...,...,...
8047,225271,Sri Lanka,England,14,4
8048,225263,Pakistan,England,30,9
8049,222678,South Africa,New Zealand,15,11
8050,211048,New Zealand,Australia,19,6


### **Step 5: Head-to-Head Winning Count at a Venue**

In [19]:
# Standardize text data
summary['team1'] = summary['team1'].str.strip().str.title()
summary['team2'] = summary['team2'].str.strip().str.title()
summary['ground'] = summary['ground'].str.strip().str.title().fillna('Unknown')
summary['winner'] = summary['winner'].str.strip().str.title()

# Calculate matches played and wins at the same venue for team1
summary['h2h_venue'] = summary.groupby(['team1', 'team2', 'ground'])['match_id'].transform('count')
head_to_head = summary.groupby(['team1', 'team2', 'ground', 'winner']).size().unstack(fill_value=0)

# Create a column for wins by team1 at the same venue
def get_team1_wins(row):
    try:
        return head_to_head.loc[(row['team1'], row['team2'], row['ground']), row['team1']]
    except KeyError:
        return 0

summary['h2h_venue_team1_wins'] = summary.apply(get_team1_wins, axis=1)
summary['h2h_venue_team1_wins%'] = round((summary['h2h_venue_team1_wins'] / summary['h2h_venue']) * 100, 2).fillna(0).astype(int)

# Display the updated DataFrame
summary[['match_id', 'match_date', 'team1', 'team2', 'ground', 'winner', 'h2h_venue', 'h2h_venue_team1_wins', 'h2h_venue_team1_wins%']].head()


Unnamed: 0,match_id,match_date,team1,team2,ground,winner,h2h_venue,h2h_venue_team1_wins,h2h_venue_team1_wins%
0,1440131,2024-06-23,Jersey,Guernsey,"Guernsey Rovers Athletic Club Ground, Port Soi...",Guernsey,1,0,0
1,1440130,2024-06-22,Jersey,Guernsey,"King George V Sports Ground, Castel",Jersey,2,2,100
2,1440129,2024-06-22,Jersey,Guernsey,"King George V Sports Ground, Castel",Jersey,2,2,100
3,1438262,2024-06-16,Norway,Finland,"Kerava National Cricket Ground, Kerava",Norway,3,3,100
4,1438259,2024-06-15,Norway,Finland,"Kerava National Cricket Ground, Kerava",Norway,3,3,100


In [20]:
norway = summary[((summary["team1"] == "Norway") & (summary["team2"] == "Finland") | (summary["team1"] == "Finland") & (summary["team2"] == "Norway")) 
                 & (summary["ground"] == "Kerava National Cricket Ground, Kerava")].reset_index(drop=True)
norway

Unnamed: 0,match_id,season,match_date,event_name,team1,team2,toss_winner,toss_decision,winner,ground,h2h,h2h_team1_wins,h2h_team1_wins%,h2h_venue,h2h_venue_team1_wins,h2h_venue_team1_wins%
0,1438262,2024,2024-06-16,Nordic T20 Cup,Norway,Finland,Finland,bat,Norway,"Kerava National Cricket Ground, Kerava",4,4,100,3,3,100
1,1438259,2024,2024-06-15,Nordic T20 Cup,Norway,Finland,Finland,bat,Norway,"Kerava National Cricket Ground, Kerava",4,4,100,3,3,100
2,1438257,2024,2024-06-14,Nordic T20 Cup,Norway,Finland,Finland,field,Norway,"Kerava National Cricket Ground, Kerava",4,4,100,3,3,100


### **Step 6: Team Wins at a Venue**

In [21]:
import pandas as pd

# Assuming 'summary' is your DataFrame
# And 'head_to_head' is a DataFrame with team wins data

# Calculate matches played at the same venue for team1 and team2
summary['team1_venue'] = summary.groupby(['team1', 'ground'])['match_id'].transform('count')
summary['team2_venue'] = summary.groupby(['team2', 'ground'])['match_id'].transform('count')

# Function to get team wins
def get_team_wins(row, team_col):
    try:
        return head_to_head.loc[(row['team1'], row['team2'], row['ground']), row[team_col]]
    except KeyError:
        return 0
    except:
        return 0

# Apply the function to calculate wins at the venue
summary['team1_venue_wins'] = summary.apply(lambda row: get_team_wins(row, 'team1'), axis=1)
summary['team2_venue_wins'] = summary.apply(lambda row: get_team_wins(row, 'team2'), axis=1)

# Calculate win percentages
summary['team1_venue_wins%'] = round((summary['team1_venue_wins'] / summary['team1_venue']) * 100, 2).fillna(0).astype(int)
summary['team2_venue_wins%'] = round((summary['team2_venue_wins'] / summary['team2_venue']) * 100, 2).fillna(0).astype(int)

# Display the updated DataFrame
display_cols = ['match_id', 'match_date', 'team1', 'team2', 'team1_venue_wins', 'team1_venue_wins%', 'team2_venue_wins', 'team2_venue_wins%', 'ground', 'winner']
summary[display_cols].head()


Unnamed: 0,match_id,match_date,team1,team2,team1_venue_wins,team1_venue_wins%,team2_venue_wins,team2_venue_wins%,ground,winner
0,1440131,2024-06-23,Jersey,Guernsey,0,0,1,100,"Guernsey Rovers Athletic Club Ground, Port Soi...",Guernsey
1,1440130,2024-06-22,Jersey,Guernsey,2,50,0,0,"King George V Sports Ground, Castel",Jersey
2,1440129,2024-06-22,Jersey,Guernsey,2,50,0,0,"King George V Sports Ground, Castel",Jersey
3,1438262,2024-06-16,Norway,Finland,3,60,0,0,"Kerava National Cricket Ground, Kerava",Norway
4,1438259,2024-06-15,Norway,Finland,3,60,0,0,"Kerava National Cricket Ground, Kerava",Norway


### **Step 7: Last 5 Matches Performance**

In [22]:
import pandas as pd

# Function to calculate last 5 matches wins for each match
def calculate_last_5_wins(df):
    df['team1_last_5_wins'] = 0
    df['team2_last_5_wins'] = 0
    
    teams = set(df['team1']).union(set(df['team2']))
    for team in teams:
        team_matches = df[(df['team1'] == team) | (df['team2'] == team)].sort_values(by=['match_date', 'match_id'], ascending=False)
        wins_count = []
        for i in range(len(team_matches)):
            match = team_matches.iloc[i]
            prev_matches = team_matches.iloc[i+1:i+6]  # next 5 matches in descending order
            wins = len(prev_matches[prev_matches['winner'] == team])
            wins_count.append(wins)
        
        # Ensure wins_count is aligned with the correct matches in the main DataFrame
        df.loc[df[df['team1'] == team].index, 'team1_last_5_wins'] = wins_count[:len(df[df['team1'] == team])]
        df.loc[df[df['team2'] == team].index, 'team2_last_5_wins'] = wins_count[:len(df[df['team2'] == team])]

    return df

# Calculate last 5 matches wins for each match
summary = calculate_last_5_wins(summary)

# Display the DataFrame
summary[['match_id', 'team1', 'team2', 'match_date', 'team1_last_5_wins', 'team2_last_5_wins']][:5]


Unnamed: 0,match_id,team1,team2,match_date,team1_last_5_wins,team2_last_5_wins
0,1440131,Jersey,Guernsey,2024-06-23,4,2
1,1440130,Jersey,Guernsey,2024-06-22,3,3
2,1440129,Jersey,Guernsey,2024-06-22,3,4
3,1438262,Norway,Finland,2024-06-16,3,1
4,1438259,Norway,Finland,2024-06-15,3,1


### **Step 8: Event Name Standardization**

In [23]:
# Standardize event column based on team participation
team_event_counts = summary.groupby(['team1', 'team2'])['event_name'].nunique().reset_index()
multiple_events_teams = team_event_counts[team_event_counts['event_name'] > 1]
teams_in_multiple_events = set(multiple_events_teams['team1']).union(set(multiple_events_teams['team2']))

# List of events to be considered as T20I
t20i_events = [
    'T20I',
    'Central American Cricket Championships',
    'South American Men\'s Championships'
]

# Apply the condition to set the 'event' column
if match_type == "T20":
    summary['event'] = summary.apply(lambda row: 'T20I' if row['event_name'] in t20i_events or row['team1'] in teams_in_multiple_events or row['team2'] 
                                     in teams_in_multiple_events else row['event_name'], axis=1)

# Check if the match type is "ODI"
elif match_type == "ODI":
    summary['event'] = summary.apply(lambda row: 'ODI' if row['team1'] in teams_in_multiple_events or row['team2'] in teams_in_multiple_events else row['event_name'],axis=1)

# Check if the match type is "ODM"
elif match_type == "ODM":
    summary['event'] = summary.apply(lambda row: 'ODM' if row['team1'] in teams_in_multiple_events or row['team2'] in teams_in_multiple_events else row['event_name'],axis=1)

# Check if the match type is "MDM"
elif match_type == "MDM":
    summary['event'] = summary.apply(lambda row: 'MDM' if row['team1'] in teams_in_multiple_events or row['team2'] in teams_in_multiple_events else row['event_name'],axis=1)

# If none of the above match types, assume it is a Test match
elif match_type == "Test":
    summary['event'] = summary.apply(lambda row: 'Test' if row['team1'] in teams_in_multiple_events or row['team2'] in teams_in_multiple_events else row['event_name'],axis=1)

# Display the first 10 rows of the summary DataFrame to check the updates
summary[:5]


Unnamed: 0,match_id,season,match_date,event_name,team1,team2,toss_winner,toss_decision,winner,ground,...,h2h_venue_team1_wins%,team1_venue,team2_venue,team1_venue_wins,team2_venue_wins,team1_venue_wins%,team2_venue_wins%,team1_last_5_wins,team2_last_5_wins,event
0,1440131,2024,2024-06-23,Jersey tour of Guernsey,Jersey,Guernsey,Guernsey,field,Guernsey,"Guernsey Rovers Athletic Club Ground, Port Soi...",...,0,1,1,0,1,0,100,4,2,T20I
1,1440130,2024,2024-06-22,Jersey tour of Guernsey,Jersey,Guernsey,Jersey,field,Jersey,"King George V Sports Ground, Castel",...,100,4,3,2,0,50,0,3,3,T20I
2,1440129,2024,2024-06-22,Jersey tour of Guernsey,Jersey,Guernsey,Jersey,field,Jersey,"King George V Sports Ground, Castel",...,100,4,3,2,0,50,0,3,4,T20I
3,1438262,2024,2024-06-16,Nordic T20 Cup,Norway,Finland,Finland,bat,Norway,"Kerava National Cricket Ground, Kerava",...,100,5,8,3,0,60,0,3,1,T20I
4,1438259,2024,2024-06-15,Nordic T20 Cup,Norway,Finland,Finland,bat,Norway,"Kerava National Cricket Ground, Kerava",...,100,5,8,3,0,60,0,3,1,T20I


In [24]:
summary['event'].unique()

array(['T20I', 'Cricket Ireland Inter-Provincial Twenty20 Trophy', 'IPL',
       'Vitality Blast', 'Pakistan Super League',
       'Bangladesh Premier League', 'Super Smash',
       'International League T20', 'CSA T20 Challenge', 'SA20',
       'Big Bash League', 'Syed Mushtaq Ali Trophy',
       'Lanka Premier League', 'Major League Cricket',
       'Caribbean Premier League', "The Hundred Men's Competition",
       'Mzansi Super League', 'Afghanistan Premier League'], dtype=object)

In [25]:
summary[summary['event'] == "South American Men's Championships"]

Unnamed: 0,match_id,season,match_date,event_name,team1,team2,toss_winner,toss_decision,winner,ground,...,h2h_venue_team1_wins%,team1_venue,team2_venue,team1_venue_wins,team2_venue_wins,team1_venue_wins%,team2_venue_wins%,team1_last_5_wins,team2_last_5_wins,event


### **Step 9: Capitalizing IPL teams**

In [26]:
# Create a mask for rows where event_name is "IPL"
mask = summary["event_name"] == "IPL"

# Convert the specified columns to uppercase only for the rows where event_name is "IPL"
summary.loc[mask, 'team1'] = summary.loc[mask, 'team1'].str.upper()
summary.loc[mask, 'team2'] = summary.loc[mask, 'team2'].str.upper()
summary.loc[mask, 'toss_winner'] = summary.loc[mask, 'toss_winner'].str.upper()
summary.loc[mask, 'winner'] = summary.loc[mask, 'winner'].str.upper()

### **Step 10: Save Cleaned DataFrame**

In [27]:
# Select and reorder final columns
summary_df = summary[['match_id', 'season', 'event_name', 'event', 'match_date', 'team1', 'team2', 'toss_winner', 'toss_decision', 'winner', 'h2h_team1_wins%', 'h2h_venue_team1_wins%', 'team1_venue_wins%', 'team2_venue_wins%', 'team1_last_5_wins', 'team2_last_5_wins', 'ground']].copy()

# Save the cleaned summary DataFrame to CSV
summary_df.to_csv(fr"D:\GITHUB\Predictive-Analytics-for-Cricket-Matches-Using-Machine-Learning\source_data\Summary\match_summary\{gender}\{match_type}_{gender}_match_summary_cleaned.csv", index=False)

summary_df.head()

Unnamed: 0,match_id,season,event_name,event,match_date,team1,team2,toss_winner,toss_decision,winner,h2h_team1_wins%,h2h_venue_team1_wins%,team1_venue_wins%,team2_venue_wins%,team1_last_5_wins,team2_last_5_wins,ground
0,1440131,2024,Jersey tour of Guernsey,T20I,2024-06-23,Jersey,Guernsey,Guernsey,field,Guernsey,75,0,0,100,4,2,"Guernsey Rovers Athletic Club Ground, Port Soi..."
1,1440130,2024,Jersey tour of Guernsey,T20I,2024-06-22,Jersey,Guernsey,Jersey,field,Jersey,75,100,50,0,3,3,"King George V Sports Ground, Castel"
2,1440129,2024,Jersey tour of Guernsey,T20I,2024-06-22,Jersey,Guernsey,Jersey,field,Jersey,75,100,50,0,3,4,"King George V Sports Ground, Castel"
3,1438262,2024,Nordic T20 Cup,T20I,2024-06-16,Norway,Finland,Finland,bat,Norway,100,100,60,0,3,1,"Kerava National Cricket Ground, Kerava"
4,1438259,2024,Nordic T20 Cup,T20I,2024-06-15,Norway,Finland,Finland,bat,Norway,100,100,60,0,3,1,"Kerava National Cricket Ground, Kerava"
