In [1]:
import math, random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings('ignore')

In [2]:
matches = pd.read_csv('matches.csv')
deliveries = pd.read_csv('deliveries.csv')

In [3]:
team_mapping = {
 'Chennai Super Kings' : 'CSK', 'Deccan Chargers' : 'DEC', 'Delhi Capitals' : 'DC', 'Delhi Daredevils' : 'DC',
 'Gujarat Lions' : 'GL', 'Gujarat Titans' : 'GT', 'Kings XI Punjab' : 'PBKS', 'Kochi Tuskers Kerala' : 'KTK',
 'Kolkata Knight Riders' : 'KKR', 'Lucknow Super Giants' : 'LSG', 'Mumbai Indians' : 'MI', 'Pune Warriors' : 'PWI',
 'Punjab Kings' : 'PBKS', 'Rajasthan Royals' : 'RR', 'Rising Pune Supergiant' : 'RPSG', 
 'Rising Pune Supergiants' : 'RPSG', 'Royal Challengers Bangalore' : 'RCB', 'Sunrisers Hyderabad' : 'SRH'
}

In [4]:
matches['season'] = matches['date'].str[:4]
matches['city'].replace({'Bangalore' : 'Bengaluru'}, inplace=True)
matches[['team1', 'team2', 'toss_winner', 'winner']].replace(team_mapping, inplace=True)
venue_mapping = {'Sharjah Cricket Stadium' : 'Sharjah', 'Dubai International Cricket Stadium' : 'Dubai'}
matches['venue'].replace(venue_mapping, inplace=True)
matches['city'] = matches['city'].fillna(matches['venue'])
matches = matches.drop(columns = ['date', 'venue', 'umpire1', 'umpire2'], axis=1)

In [5]:
matches['team1'].replace(team_mapping, inplace=True)
matches['team2'].replace(team_mapping, inplace=True)
matches['toss_winner'].replace(team_mapping, inplace=True)
matches['winner'].replace(team_mapping, inplace=True)
matches

Unnamed: 0,id,season,city,match_type,player_of_match,team1,team2,toss_winner,toss_decision,winner,result,result_margin,target_runs,target_overs,super_over,method
0,335982,2008,Bengaluru,League,BB McCullum,RCB,KKR,RCB,field,KKR,runs,140.0,223.0,20.0,N,
1,335983,2008,Chandigarh,League,MEK Hussey,PBKS,CSK,CSK,bat,CSK,runs,33.0,241.0,20.0,N,
2,335984,2008,Delhi,League,MF Maharoof,DC,RR,RR,bat,DC,wickets,9.0,130.0,20.0,N,
3,335985,2008,Mumbai,League,MV Boucher,MI,RCB,MI,bat,RCB,wickets,5.0,166.0,20.0,N,
4,335986,2008,Kolkata,League,DJ Hussey,KKR,DEC,DEC,bat,KKR,wickets,5.0,111.0,20.0,N,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1019,1359544,2023,Bengaluru,League,Shubman Gill,RCB,GT,GT,field,GT,wickets,6.0,198.0,20.0,N,
1020,1370350,2023,Chennai,Qualifier 1,RD Gaikwad,CSK,GT,GT,field,CSK,runs,15.0,173.0,20.0,N,
1021,1370351,2023,Chennai,Eliminator,Akash Madhwal,MI,LSG,MI,bat,MI,runs,81.0,183.0,20.0,N,
1022,1370352,2023,Ahmedabad,Qualifier 2,Shubman Gill,GT,MI,MI,field,GT,runs,62.0,234.0,20.0,N,


In [6]:
mapping = {'Qualifier 1' : 'Playoffs', 'Qualifier 2' : 'Playoffs', 'Eliminator' : 'Playoffs', 'Semi Final' : 'Playoffs', 'Elimination Final' : 'Playoffs', '3rd Place Play-Off' : 'Playoffs', 'Final' : 'Playoffs'}
matches['match_type'].replace(mapping, inplace = True)

In [7]:
matches = matches[matches['target_overs']==20]
matches = matches[matches['result']!='no result']

In [8]:
matches['result_margin'].fillna(0, inplace=True)
matches

Unnamed: 0,id,season,city,match_type,player_of_match,team1,team2,toss_winner,toss_decision,winner,result,result_margin,target_runs,target_overs,super_over,method
0,335982,2008,Bengaluru,League,BB McCullum,RCB,KKR,RCB,field,KKR,runs,140.0,223.0,20.0,N,
1,335983,2008,Chandigarh,League,MEK Hussey,PBKS,CSK,CSK,bat,CSK,runs,33.0,241.0,20.0,N,
2,335984,2008,Delhi,League,MF Maharoof,DC,RR,RR,bat,DC,wickets,9.0,130.0,20.0,N,
3,335985,2008,Mumbai,League,MV Boucher,MI,RCB,MI,bat,RCB,wickets,5.0,166.0,20.0,N,
4,335986,2008,Kolkata,League,DJ Hussey,KKR,DEC,DEC,bat,KKR,wickets,5.0,111.0,20.0,N,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1018,1359543,2023,Mumbai,League,C Green,SRH,MI,MI,field,MI,wickets,8.0,201.0,20.0,N,
1019,1359544,2023,Bengaluru,League,Shubman Gill,RCB,GT,GT,field,GT,wickets,6.0,198.0,20.0,N,
1020,1370350,2023,Chennai,Playoffs,RD Gaikwad,CSK,GT,GT,field,CSK,runs,15.0,173.0,20.0,N,
1021,1370351,2023,Chennai,Playoffs,Akash Madhwal,MI,LSG,MI,bat,MI,runs,81.0,183.0,20.0,N,


In [9]:
#cols = matches.columns.select_dtypes(include=['number'])
cols = matches.select_dtypes(include=['float']).columns
matches[cols] =matches[cols].astype(int)

In [10]:
matches

Unnamed: 0,id,season,city,match_type,player_of_match,team1,team2,toss_winner,toss_decision,winner,result,result_margin,target_runs,target_overs,super_over,method
0,335982,2008,Bengaluru,League,BB McCullum,RCB,KKR,RCB,field,KKR,runs,140,223,20,N,
1,335983,2008,Chandigarh,League,MEK Hussey,PBKS,CSK,CSK,bat,CSK,runs,33,241,20,N,
2,335984,2008,Delhi,League,MF Maharoof,DC,RR,RR,bat,DC,wickets,9,130,20,N,
3,335985,2008,Mumbai,League,MV Boucher,MI,RCB,MI,bat,RCB,wickets,5,166,20,N,
4,335986,2008,Kolkata,League,DJ Hussey,KKR,DEC,DEC,bat,KKR,wickets,5,111,20,N,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1018,1359543,2023,Mumbai,League,C Green,SRH,MI,MI,field,MI,wickets,8,201,20,N,
1019,1359544,2023,Bengaluru,League,Shubman Gill,RCB,GT,GT,field,GT,wickets,6,198,20,N,
1020,1370350,2023,Chennai,Playoffs,RD Gaikwad,CSK,GT,GT,field,CSK,runs,15,173,20,N,
1021,1370351,2023,Chennai,Playoffs,Akash Madhwal,MI,LSG,MI,bat,MI,runs,81,183,20,N,


In [11]:
### MODIFY DATAFRAME
#Id       : As it is
#Features : Team1, Team2, PlayerOfTheMatch(Remove spaces), City, Season, Match_Type, Toss_Decision, Result (Direct Value in List)
#           Toss : "Toss" + Team Name 
#           Winner : Team + "wins"
#           If Super Over==Y, Add 'Super Over' in features
#           IF Method == D/L, Add 'D/L' in features
#           If Result=='runs', Add Result_Margin + "Runs"
#           If Result=='wickets', Add Result_Margin + "Wickets"
#Target Runs : Separate Column (No Changes)

#Keep this 3 columns in final df, 

In [12]:
df  = pd.DataFrame(matches)

In [13]:
df['PlayerOfTheMatch'] = df['player_of_match'].str.replace(' ','')
df['Toss'] = "Toss" + df['toss_winner']
df['Winner'] = df['winner'] + "wins"
df['Result'] = df.apply(lambda x: str(x['result_margin']) + "Runs" if x['result'] == 'runs' else str(x['result_margin']) + "Wickets", axis=1)

In [14]:
df['Features'] = df[['team1', 'team2', 'PlayerOfTheMatch', 'city', 'season', 'match_type', 'toss_decision']].astype(str).agg(' '.join, axis=1)
df['Features'] = df['Features'] + ' ' + df['Toss'] + ' ' + df['Winner'] + ' ' + df['Result']

In [15]:
df.loc[df['super_over'] == 'Y', 'Features'] = df['Features'] + ' Super Over'
df.loc[df['method'] == 'D/L', 'Features'] = df['Features'] + ' D/L'

In [16]:
final_df = df[['id', 'Features', 'target_runs']]

In [17]:
final_df

Unnamed: 0,id,Features,target_runs
0,335982,RCB KKR BBMcCullum Bengaluru 2008 League field...,223
1,335983,PBKS CSK MEKHussey Chandigarh 2008 League bat ...,241
2,335984,DC RR MFMaharoof Delhi 2008 League bat TossRR ...,130
3,335985,MI RCB MVBoucher Mumbai 2008 League bat TossMI...,166
4,335986,KKR DEC DJHussey Kolkata 2008 League bat TossD...,111
...,...,...,...
1018,1359543,SRH MI CGreen Mumbai 2023 League field TossMI ...,201
1019,1359544,RCB GT ShubmanGill Bengaluru 2023 League field...,198
1020,1370350,CSK GT RDGaikwad Chennai 2023 Playoffs field T...,173
1021,1370351,MI LSG AkashMadhwal Chennai 2023 Playoffs bat ...,183


In [18]:
feature = df['Features']

In [19]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000,stop_words='english')  

In [20]:
cv.fit_transform(final_df['Features'])

<991x489 sparse matrix of type '<class 'numpy.int64'>'
	with 9990 stored elements in Compressed Sparse Row format>

In [21]:
from sklearn.metrics.pairwise import cosine_similarity
vector = cv.fit_transform(final_df['Features']).toarray()

In [22]:
similarity = cosine_similarity(vector)

In [23]:
ipl = matches[matches['season']=='2019']
ipl[ipl['city']=='Hyderabad']

Unnamed: 0,id,season,city,match_type,player_of_match,team1,team2,toss_winner,toss_decision,winner,result,result_margin,target_runs,target_overs,super_over,method
703,1175363,2019,Hyderabad,League,Rashid Khan,RR,SRH,RR,bat,SRH,wickets,5,199,20,N,
706,1175366,2019,Hyderabad,League,JM Bairstow,SRH,RCB,RCB,field,SRH,runs,118,232,20,N,
714,1178394,2019,Hyderabad,League,AS Joseph,MI,SRH,SRH,field,MI,runs,40,137,20,N,
725,1178405,2019,Hyderabad,League,KMA Paul,DC,SRH,SRH,field,DC,runs,39,156,20,N,
728,1178408,2019,Hyderabad,League,DA Warner,CSK,SRH,CSK,bat,SRH,wickets,6,133,20,N,
733,1178413,2019,Hyderabad,League,KK Ahmed,KKR,SRH,SRH,field,SRH,wickets,9,160,20,N,
743,1178423,2019,Hyderabad,League,DA Warner,SRH,PBKS,PBKS,field,SRH,runs,45,213,20,N,
755,1181768,2019,Hyderabad,Playoffs,JJ Bumrah,MI,CSK,MI,bat,MI,runs,1,150,20,N,


In [24]:
def recommend(matchid):
    idx = final_df[final_df['id'] == matchid].index[0]
    loc = final_df.index.get_loc(idx)
    distances = sorted(list(enumerate(similarity[loc])),reverse=True,key = lambda x: x[1])
    locations = [loc]
    for i in distances[1:6]:
        locations.append(i[0])
    print("Similar matches to the first are : ")
    return matches.iloc[locations]

In [30]:
recommend(1178405)

Similar matches to the first are : 


Unnamed: 0,id,season,city,match_type,player_of_match,team1,team2,toss_winner,toss_decision,winner,result,result_margin,target_runs,target_overs,super_over,method
725,1178405,2019,Hyderabad,League,KMA Paul,DC,SRH,SRH,field,DC,runs,39,156,20,N,
558,980983,2016,Hyderabad,League,CH Morris,SRH,DC,DC,field,DC,wickets,7,147,20,N,
711,1175371,2019,Delhi,League,JM Bairstow,DC,SRH,SRH,field,SRH,wickets,5,130,20,N,
714,1178394,2019,Hyderabad,League,AS Joseph,MI,SRH,SRH,field,MI,runs,40,137,20,N,
733,1178413,2019,Hyderabad,League,KK Ahmed,KKR,SRH,SRH,field,SRH,wickets,9,160,20,N,
925,1304096,2022,Mumbai,League,DA Warner,DC,SRH,SRH,field,DC,runs,21,208,20,N,
