### Install libraries


In [98]:
!pip3 install python-espncricinfo
!pip3 install bs4



### Import Libraries

In [99]:
import json
import pandas as pd
import numpy as np
import requests 
import re
from bs4 import BeautifulSoup as bs
from espncricinfo.exceptions import MatchNotFoundError, NoScorecardError
import dateparser
from espncricinfo.exceptions import PlayerNotFoundError
from datetime import datetime,date
import os




class Match(object):
    """Download and expose the ESPNcricinfo data for a single match.

    Constructing an instance performs two HTTP requests (the match JSON and
    the match HTML page).  When the JSON payload is non-empty the following
    attributes are populated from it: ``series``, ``team_1``, ``team_2``,
    ``team_1_players``, ``team_2_players`` and ``date``.

    Raises:
        MatchNotFoundError: if either endpoint answers with HTTP 404.
        NoScorecardError: if the JSON endpoint reports that the scorecard
            is not yet available.
    """

    def __init__(self, match_id):
        self.match_id = match_id
        self.match_url = "https://www.espncricinfo.com/matches/engine/match/{0}.html".format(str(match_id))
        self.json_url = "https://www.espncricinfo.com/matches/engine/match/{0}.json".format(str(match_id))
        self.json = self.get_json()
        self.html = self.get_html()
        # The derived attributes only exist when the JSON payload is truthy.
        if self.json:
            self.series = self._series()
            self.team_1 = self._team_1()
            self.team_2 = self._team_2()
            self.team_1_players = self._team_1_players()
            self.team_2_players = self._team_2_players()
            self.date = self._date()

    def __str__(self):
        # ``description`` is not assigned anywhere in this class, so reading
        # it unconditionally made str()/print() raise AttributeError.  Use it
        # when a caller has set it, otherwise fall back to the repr.
        description = getattr(self, "description", None)
        if description is None:
            return repr(self)
        return description

    def __repr__(self):
        return f"{self.__class__.__name__}({self.match_id!r})"

    def get_json(self):
        """Fetch ``self.json_url`` and return the decoded JSON document."""
        r = requests.get(self.json_url)
        if r.status_code == 404:
            raise MatchNotFoundError
        elif 'Scorecard not yet available' in r.text:
            raise NoScorecardError
        else:
            return r.json()

    def match_json(self):
        """Return the ``'match'`` section of the JSON document."""
        return self.json['match']

    def get_html(self):
        """Fetch ``self.match_url`` and return it parsed with BeautifulSoup."""
        r = requests.get(self.match_url)
        if r.status_code == 404:
            raise MatchNotFoundError
        else:
            return bs(r.text, 'html.parser')

    def _series(self):
        """Return the ``'series'`` section of the JSON document."""
        return self.json['series']

    def _team_1(self):
        """Return the first entry of the JSON ``'team'`` list."""
        return self.json['team'][0]

    def _team_2(self):
        """Return the second entry of the JSON ``'team'`` list."""
        return self.json['team'][1]

    def _team_1_players(self):
        """Return the first team's ``'player'`` list (empty if absent)."""
        return self._team_1().get('player', [])

    def _team_2_players(self):
        """Return the second team's ``'player'`` list (empty if absent)."""
        return self._team_2().get('player', [])

    def _date(self):
        """Return ``start_date_raw`` from the ``'match'`` section."""
        return self.match_json()['start_date_raw']





In [100]:
def scorecard(index, table, m):
    """Turn one batting-scorecard HTML table into a tidy DataFrame.

    Args:
        index: position of the innings table inside ``table``.
        table: sequence of parsed ``<table>`` elements; each must support
            ``find_all('tr')`` and its rows ``find_all('td' / 'th')`` with
            cells exposing ``.text``.
        m: object whose ``date`` attribute is copied into ``match_date``.

    Returns:
        DataFrame with one row per batter listed before the ``Extras`` row,
        followed by one row per "Did not bat" name (``CB`` set to ``'dnb'``
        and the remaining cells set to 0).  The second header column is
        relabelled ``CB``.  When at least one dismissal contains ``" b "``
        the ``caught`` / ``bowled`` columns are added, plus, when the
        corresponding dismissal type occurs, ``run_out_1`` / ``run_out_2``
        or ``run_out_dh``, ``stumped_out_by``, ``hw``,
        ``catch_by_substitute`` and ``lbw_by``.
    """
    rows = table[index].find_all('tr')

    # The first <tr> is treated as the header row; the rest are data rows.
    table_values = [[cell.text for cell in row.find_all('td')] for row in rows[1:]]
    row_headers = [head.text for row in rows for head in row.find_all('th')]

    # Object dtype keeps the later ``fillna(0)`` able to mix ints with text.
    df = pd.DataFrame(table_values, columns=row_headers, dtype=object)

    # Relabel the second column by position (its header is blank), then drop
    # any other blank-header columns.
    columns = list(df.columns)
    columns[1] = 'CB'
    df.columns = columns
    df = df.drop("\xa0", axis=1, errors='ignore')

    dnb_mask = df['BATTING'].str.contains("Did not bat", na=False)

    # Keep only the batter rows: drop rows with no data besides the name and
    # cut everything from the 'Extras' row onwards.
    batters = df.set_index('BATTING').dropna(axis=0, how='all')
    batters = batters.iloc[0:batters.index.get_indexer(['Extras'])[0]].reset_index()

    if dnb_mask.any():
        # "Did not bat: A, B, C" -> one row per name (first piece discarded).
        dnb = (
            df.loc[dnb_mask, 'BATTING']
            .str.replace(":", ",", regex=False)
            .str.split(",", expand=True)
            .drop([0], axis=1)
            .T
        )
        dnb.columns = ['BATTING']
        df = pd.concat([batters, dnb], axis=0).reset_index(drop=True)
        df['CB'] = df['CB'].fillna('dnb')
        df = df.fillna(0)
    else:
        # Same cleaned frame as above, so 'CB' exists exactly once here too.
        df = batters

    df['BATTING'] = df['BATTING'].replace(regex=True, to_replace=r'[^A-Z a-z ().\-]', value=r'')

    # Strip the "c " (caught) marker and stray symbols, then split fielder
    # from bowler.  The look-behind stops the marker pattern from eating the
    # last letter of names that end in "c" (e.g. "Dominic ").
    dismissal = (
        df['CB']
        .replace(regex=True, to_replace=[r'(?<![A-Za-z])c ', r'[^A-Z a-z (/ .\-]'], value=r'')
        .str.replace("(", "-", regex=False)
        .str.split(" b ", expand=True)
    )

    if len(dismissal.columns) > 1:
        df[['caught', 'bowled']] = dismissal

        # Run outs: "run out -A/B" -> fielder column(s); blank 'caught'.
        run_out_parts = df['caught'].str.split("run out -", expand=True)
        if len(run_out_parts.columns) > 1:
            fielders = run_out_parts[1].str.split("/", expand=True)
            run_out_mask = df['caught'].str.contains("run out -", regex=False, na=False)
            if len(fielders.columns) == 2:
                df[['run_out_1', 'run_out_2']] = fielders
                df.loc[run_out_mask, 'caught'] = 'None'
            elif len(fielders.columns) == 1:
                df['run_out_dh'] = fielders[0]
                df.loc[run_out_mask, 'caught'] = 'None'

        # Stumped, hit wicket and substitute catches share one pattern: the
        # text after the marker goes to its own column and 'caught' is
        # blanked.  Order matters because each step sees the previous edits.
        for marker, column in (
            ("st ", 'stumped_out_by'),
            ("hit wicket ", 'hw'),
            ("sub -", 'catch_by_substitute'),
        ):
            parts = df['caught'].str.split(marker, expand=True)
            if len(parts.columns) > 1:
                df[column] = parts[1]
                marker_mask = df['caught'].str.contains(marker, regex=False, na=False)
                df.loc[marker_mask, 'caught'] = 'None'

        # lbw: bowler name moves to 'lbw_by'.  Only the "lbw b " marker is
        # rewritten so bowler names containing "b " are left intact.
        lbw_parts = (
            df['CB']
            .str.replace("lbw b ", "lbw ", regex=False)
            .str.split("lbw ", expand=True)
        )
        if len(lbw_parts.columns) > 1:
            df['lbw_by'] = lbw_parts[1]
            lbw_mask = df['caught'].str.contains("lbw", regex=False, na=False)
            df.loc[lbw_mask, ['bowled', 'caught']] = 'None'

    df['match_date'] = m.date

    df = df.replace(' ', 'None')

    return df

# Impact list


### Total Impact for a player in a match is a numerical value which is the sum of his Batting and Bowling Impacts. These Impacts are calculated based on the context of a batting/bowling performance.

### Context is based on an intelligent algorithm that quantifies the pressure on the batsman/bowler at every ball of an innings. This is the Pressure Index (PI) value for each ball.

### The factors which go into calculating PI include: runs required; overs remaining; quality of batsmen at the crease and those to follow; quality of bowlers and number of overs left for each; pitch/conditions, and how easy/tough it is for batsmen/bowlers.

### The PI value is always between 0 and 1. The closer it is to 1, the higher the pressure on the batsman. (The converse is true for the bowler.)

### The Batting/Bowling Impact is thus a factor not only of the runs scored/wickets taken/economy rate, but also of the pressure under which these performances happened.

### While extra credit for match-winning performances is organically built into the algorithm, it is not unusual for a stand-out performance in a losing cause to be the most Impactful in a match.

In [101]:
def impact_player(index,table):
    """Convert the HTML table at ``table[index]`` into a DataFrame.

    Every ``<th>`` found in any row supplies a column label; every row after
    the first contributes one record made of its ``<td>`` texts.
    """
    all_rows = table[index].find_all('tr')

    # Skip the first row: it is treated as the header row.
    records = [
        [cell.text for cell in html_row.find_all('td')]
        for html_row in all_rows[1:]
    ]
    labels = [
        heading.text
        for html_row in all_rows
        for heading in html_row.find_all('th')
    ]

    return pd.DataFrame(records, columns=labels)

In [102]:
def matches(year=None):
    """Scrape the ESPNcricinfo yearly results page for match links and ids.

    Args:
        year: year to query, as used in the results URL.  Defaults to the
            current year, resolved at call time rather than when the
            function is defined.

    Returns:
        A ``(match_links, match_id)`` pair of equally long lists: absolute
        match URLs, and the first run of digits found in each link's href.
    """
    from urllib.parse import urljoin

    if year is None:
        year = str(datetime.today().year)

    # Get all match_ids with their links using web scraping from the ESPN website
    response = requests.get("https://stats.espncricinfo.com/ci/engine/records/team/match_results.html?class=3;id={};team=6;type=year".format(year))
    soup = bs(response.content, "html.parser")

    match_links = []
    match_id = []
    for tag in soup.find_all('a', {'href': re.compile("/ci/engine/match"), 'class': 'data-link'}):
        href = tag['href']
        # urljoin avoids the "com//ci" double slash that plain concatenation
        # produces when the href already starts with "/".
        match_links.append(urljoin("https://stats.espncricinfo.com/", href))
        match_id.append(re.findall('[0-9]+', href)[0])
    return match_links, match_id

In [103]:
def _innings_frame(index, table, m, team_name):
    """Build one innings' scorecard and tag it with team and match age."""
    innings = scorecard(index=index, table=table, m=m)
    innings['team'] = team_name
    # Drop the captain marker "(c)" and any remaining symbols from names.
    innings['BATTING'] = (
        innings['BATTING']
        .str.replace(")", "", regex=False)
        .str.replace("(", "-", regex=False)
        .str.replace("-c", "", regex=False)
        .replace(regex=True, to_replace=r'[^A-Z a-z .\-]', value=r'')
    )
    today = date.today()
    innings['recent_no_of_days'] = pd.to_datetime(innings['match_date']).apply(
        lambda played: (today - played.date()).days
    )
    return innings


def _player_info(players):
    """Select the player columns of interest and add role indicator flags."""
    info = pd.DataFrame(players)[['age_days','age_years','keeper','known_as','player_id','object_id','player_primary_role','popular_name','alpha_name']].copy()
    role = info['player_primary_role']
    # A wicketkeeper is never flagged as a batsman, even if the role text
    # also contains 'batter'.
    info['batsmen'] = role.apply(lambda x: 0 if 'wicketkeeper' in x else 1 if 'batter' in x else 0)
    info['allrounder'] = role.apply(lambda x: 1 if 'allrounder' in x else 0)
    info['bowler'] = role.apply(lambda x: 1 if 'bowler' in x else 0)
    return info


def match_data(match_id):
    """Collect scorecard, player and impact data for one match.

    Downloads the match via ``Match``, scrapes the full-scorecard page and
    the match-impact-player page, and joins everything on the player name.

    Args:
        match_id: ESPNcricinfo match id.

    Returns:
        ``(scorecard_all, players_info, data_final)``: both innings'
        scorecards stacked, the players of both teams, and the players
        left-joined with their scorecard row (and impact row when the impact
        table is present).  All three are empty DataFrames when the scraped
        result text is "No result" or "Match Abandoned".
    """
    # Get match details needed for the web URL
    m = Match(match_id)
    series = pd.DataFrame(m.series)[['match_number', 'slug', 'object_id']]
    # The series entry without a match number supplies the URL slug.
    is_series_row = series['match_number'].fillna("name").astype(str) == "name"
    series_name = series[is_series_row].reset_index()['slug'][0]
    name = pd.DataFrame(m.team_1)['url_component'].unique()[0] + "-vs-" + pd.DataFrame(m.team_2)['url_component'].unique()[0]

    # Scraping ESPN
    web = requests.get("https://www.espncricinfo.com/series/{}/{}-{}/full-scorecard".format(series_name, name, match_id))
    soup = bs(web.content, "html.parser")
    table = soup.find_all('table', {'class': 'ci-scorecard-table'})
    team_spans = soup.find_all('span', {'class': 'ds-text-tight-l ds-font-bold ds-text-ui-typo hover:ds-text-ui-typo-primary ds-block ds-truncate'})
    # Turning every ">" into "<" lets a single split expose the tag text.
    team1_name = str(team_spans[0]).replace(">", "<").split("<")[2]
    team2_name = str(team_spans[1]).replace(">", "<").split("<")[2]

    result = str(soup.find_all('p', {'class': 'ds-text-tight-m ds-font-regular ds-truncate ds-text-typo-title'})[0]).replace(">", "<").split("<")[4]

    # Avoiding no-result matches.  The previous ``!= ... or != ...`` test was
    # always true, so this early exit could never be taken.
    if result in ("No result", "Match Abandoned"):
        return pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

    df1 = _innings_frame(0, table, m, team1_name)
    df2 = _innings_frame(1, table, m, team2_name)

    # Concat both the scorecards
    scorecard_all = pd.concat([df1, df2], axis=0).reset_index(drop=True).replace(np.nan, "None")
    scorecard_all['BATTING'] = scorecard_all['BATTING'].apply(lambda x: " ".join(x.split()))

    # Concat both the teams' player info
    players_info = pd.concat(
        [_player_info(m.team_1_players), _player_info(m.team_2_players)],
        axis=0,
    ).reset_index(drop=True)
    players_info['known_as'] = players_info['known_as'].apply(lambda x: " ".join(x.split()))

    # Impact metric
    web = requests.get("https://www.espncricinfo.com/series/{}/{}-{}/match-impact-player".format(series_name, name, match_id))
    soup = bs(web.content, "html.parser")
    table = soup.find_all('table', {'class': 'ds-w-full ds-table ds-table-md ds-table-auto'})

    # Merge all the data; the impact table is joined only when it exists.
    data_final = pd.merge(players_info, scorecard_all, how="left", right_on="BATTING", left_on="known_as")
    if len(table) > 0:
        impact_df = impact_player(index=0, table=table)
        data_final = data_final.merge(impact_df, left_on="known_as", right_on="PLAYER", how="left")
    data_final['match_id'] = match_id

    return scorecard_all, players_info, data_final

In [104]:
# Scrape every match of the listed years and write the per-match and combined
# CSV files into a folder that is read from standard input once per year.
df_combined = pd.DataFrame()

for year in ['2021', '2022']:
    match_links, match_id = matches(year)
    matches_db = pd.DataFrame(list(zip(match_links, match_id)), columns=['match_url', 'match_id'])

    # Create folder w.r.t year (the folder name is typed in by the user)
    path = input()
    os.makedirs(path, exist_ok=True)

    # os.path.join replaces the hard-coded "\" separators, which were invalid
    # escape sequences and only acted as path separators on Windows.
    matches_db.to_csv(os.path.join(path, "matches_db_{}.csv".format(year)))

    for m_id in match_id:
        print(m_id)
        scorecard_match, players_info, data_final = match_data(m_id)

        scorecard_match.to_csv(os.path.join(path, 'scorecard_{}.csv'.format(m_id)))
        players_info.to_csv(os.path.join(path, 'players_info_{}.csv'.format(m_id)))
        data_final.to_csv(os.path.join(path, 'data_final_{}.csv'.format(m_id)))

        # combine all matches data into one csv file
        df_combined = pd.concat([data_final, df_combined], axis=0).reset_index(drop=True)

    # NOTE(review): df_combined is not reset between years, so each yearly
    # file also contains the rows of the years processed before it - confirm
    # this is intended.
    df_combined.to_csv(os.path.join(path, 'df_combined_{}.csv'.format(year)))

1243388
1243389
1243390
1243391
1243392
1262758
1262759
1262760
1273727
1273739
1273744
1273748
1273753
1278671
1278672
1278673
1278679
1278680
1278681
1278684
1278685
1278686
1278687
1278688
1278689
1278690
1278691
1303307
1303308
1276904
1276905
1276906
1317903
1317904
1317905
1317906
1317907
1327270
1327272
1327276
1327277
1327279
1327503
1327504
1327505
1327506
1327507
1327508
