# Data Extraction  

- The actual scorecard for the match below can be viewed on the [Cricinfo webpage](https://www.espncricinfo.com/series/8048/scorecard/335982/royal-challengers-bangalore-vs-kolkata-knight-riders-1st-match-indian-premier-league-2007-08).

- The ball-by-ball data can be downloaded from [Cricsheet](https://cricsheet.org/downloads/).



In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os, glob, yaml

from utils import get_match_list

data_dir='../datasets/ipl/yaml/'

In [2]:
# Parse one match file into plain Python containers.
f = '335982.yaml'
fil = os.path.join(data_dir, f)
# safe_load is used instead of yaml.load: it needs no explicit Loader argument
# and does not construct arbitrary Python objects from the file.  The context
# manager closes the file handle as soon as parsing is done.
with open(fil, 'r') as stream:
    data = yaml.safe_load(stream)

In [3]:
# Show the top-level sections of the parsed match and its metadata block,
# each followed by one empty line.
print(data.keys(), end='\n\n')
print(data['meta'], end='\n\n')
# Last expression of the cell, so the notebook renders the info mapping.
data['info']

dict_keys(['meta', 'info', 'innings'])

{'data_version': 0.9, 'created': datetime.date(2011, 5, 6), 'revision': 2}



{'city': 'Bangalore',
 'competition': 'IPL',
 'dates': [datetime.date(2008, 4, 18)],
 'gender': 'male',
 'match_type': 'T20',
 'outcome': {'by': {'runs': 140}, 'winner': 'Kolkata Knight Riders'},
 'overs': 20,
 'player_of_match': ['BB McCullum'],
 'teams': ['Royal Challengers Bangalore', 'Kolkata Knight Riders'],
 'toss': {'decision': 'field', 'winner': 'Royal Challengers Bangalore'},
 'umpires': ['Asad Rauf', 'RE Koertzen'],
 'venue': 'M Chinnaswamy Stadium'}

In [4]:
def read_match_info(fil):
    """Print a short, human-readable summary of one match YAML file.

    The summary lists the two teams and the match date, the toss, the team
    batting first, the result and the player of the match.

    Parameters
    ----------
    fil : str
        Path of the YAML match file to read.

    Returns
    -------
    None
        The summary is written to stdout.  If the file is not valid YAML the
        parser error is printed and nothing else is shown.
    """
    with open(fil, 'r') as stream:
        try:
            data = yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            # Without parsed data there is nothing to summarise.
            print(exc)
            return

    info = data['info']

    # Dates normally parse to date objects; fall back to the textual form
    # when the value has no strftime (e.g. it is already a string).
    first_date = info['dates'][0]
    try:
        match_date = first_date.strftime('%Y-%m-%d')
    except AttributeError:
        match_date = str(first_date)

    teams = info['teams']
    outcome = info.get('outcome') or {}
    toss = info.get('toss') or {}
    winner = outcome.get('winner', None)
    toss_winner = toss.get('winner', None)
    toss_decision = toss.get('decision', None)

    # Each innings is a one-entry mapping {innings name: innings details}.
    innings = data.get('innings') or []
    if innings:
        batting_1st_team = list(innings[0].values())[0]['team']
    else:
        batting_1st_team = None

    # Not every match names a player of the match.
    awarded = info.get('player_of_match') or [None]

    print (teams[0] ,' vs ', teams[1], match_date)
    print ('------------------------------------------------')
    print ( 'Toss \t\t\t', toss_winner, 'Decided to', toss_decision)
    print ('Team Batting First\t', batting_1st_team)

    margin = outcome.get('by') or {}
    if margin:
        win_type, win_margin = list(margin.items())[0]
        print ('Result \t\t\t', winner, 'won by ', win_margin, win_type )
    else:
        # No winning margin recorded -- presumably a tie or no result, which
        # the data is expected to describe under 'result' (TODO confirm
        # against the Cricsheet format description).
        print ('Result \t\t\t', outcome.get('result', 'unknown'))

    print ('Player of the Match\t', awarded[0])



In [5]:
def print_scorecard(f, data_dir='./'):
    """Build batting and bowling scorecards for one match YAML file.

    Every delivery of every innings is folded into one row per batsman and
    one row per bowler.  As a side effect the match summary is printed through
    ``read_match_info``.

    Parameters
    ----------
    f : str
        File name (or path) of the match file.  The text before the first
        '.' of its last '/'-separated component is stored as ``match-id``.
    data_dir : str
        Directory joined in front of ``f`` with ``os.path.join``.

    Returns
    -------
    (df_bat, df_bowl) : tuple of pandas.DataFrame
        ``df_bat`` has one row per batsman: run buckets '0s'..'6s', 'Runs',
        'BF' (balls faced), 'NO' (not out), 'Team', 'Against', 'Win', 'Toss',
        'team-total', 'season', 'date', 'match-id'.
        ``df_bowl`` has one row per bowler: 'O' (overs, as text), 'M', 'R',
        'W', run buckets, 'WD', 'NB' and the same match columns.

    Raises
    ------
    yaml.YAMLError
        If the file cannot be parsed (the parser message is printed first).

    Notes
    -----
    Rows are keyed by player name, so a player who appears again in a later
    innings of the same file replaces the row from the earlier innings.
    """
    fil = os.path.join(data_dir, f)

    with open(fil, 'r') as stream:
        try:
            data = yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            # No scorecard can be built from an unparsable file.
            print(exc)
            raise

    info = data['info']

    # Dates normally parse to date objects; fall back to the textual form
    # when the value has no strftime (e.g. it is already a string).
    first_date = info['dates'][0]
    try:
        match_date = first_date.strftime('%Y-%m-%d')
    except AttributeError:
        match_date = str(first_date)

    season = match_date.split('-')[0]
    teams = info['teams']
    winner = (info.get('outcome') or {}).get('winner', None)
    toss_winner = (info.get('toss') or {}).get('winner', None)
    match_id = f.split('/')[-1].split('.')[0]

    # Extras that are not charged to the bowler's runs conceded.
    not_charged = ('byes', 'legbyes', 'penalty')
    # Dismissals that are not credited to the bowler.
    not_bowler_wickets = ('run out', 'retired hurt', 'retired out',
                          'obstructing the field', 'timed out',
                          'handled the ball')

    def convert_to_ov(balls):
        """Turn a count of legal balls into overs text, e.g. 20 -> '3.2'."""
        ov, rest = divmod(balls, 6)
        if rest == 0:
            return str(ov)
        return str(ov) + '.' + str(rest)

    def add_batsman(player, batting_scorecard, Team=' ', Against=' '):
        """Create the batting row for ``player`` if it does not exist yet."""
        if player not in batting_scorecard:
            batting_scorecard[player] = {'0s':0, '1s':0, '2s':0, '3s':0, '4s':0, '5s':0, '6s':0,
                                         'Runs':0, 'BF':0, 'NO':True, 'Team':Team, 'Against':Against,
                                         'Win':Team == winner, 'Toss':Team == toss_winner,
                                         'team-total':0, 'season':season}

    def add_bowler(player, bowling_scorecard, Team=' ', Against=' '):
        """Create the bowling row for ``player`` if it does not exist yet."""
        if player not in bowling_scorecard:
            bowling_scorecard[player] = {'O':0, 'M':0, 'R':0, 'W':0,
                                         '0s':0, '1s':0, '2s':0, '3s':0, '4s':0, '5s':0, '6s':0,
                                         'WD':0, 'NB':0, 'Team':Team, 'Against':Against,
                                         'Win':Team == winner, 'Toss':Team == toss_winner,
                                         'team-total':0, 'season':season}

    def to_frame(card, name_column):
        """Turn a {player: row} mapping into a frame with match columns."""
        frame = pd.DataFrame(card).transpose()
        frame.reset_index(inplace=True)
        frame.rename(columns={"index": name_column}, inplace=True)
        frame['date'] = match_date
        frame['match-id'] = match_id
        return frame

    batting_card = {}
    bowling_card = {}

    for inn in (data.get('innings') or []):
        # Each innings is a one-entry mapping {innings name: innings details}.
        inn_data = list(inn.values())[0]

        batting_team = inn_data['team']
        bowling_team = [team for team in teams if team != batting_team][0]

        batting_card_inn = {}
        bowling_card_inn = {}
        runs_total = 0
        # (bowler, over number) -> [legal balls bowled, runs charged]; used
        # after the innings to count maiden overs.
        over_log = {}

        for delivery in (inn_data.get('deliveries') or []):
            # Each delivery is a one-entry mapping {ball id: details}.
            ball, details = list(delivery.items())[0]

            batsman  = details['batsman'].strip()
            bowler   = details['bowler'].strip()
            runs_bat = details['runs'].get('batsman', 0)
            runs_tot = details['runs'].get('total',   0)
            extras   = details.get('extras') or {}

            add_batsman(batsman, batting_card_inn, Team=batting_team, Against=bowling_team)
            add_bowler(bowler, bowling_card_inn, Team=bowling_team, Against=batting_team)
            bat_row  = batting_card_inn[batsman]
            bowl_row = bowling_card_inn[bowler]

            # Count the delivery in its runs bucket; values outside 0..6 have
            # no bucket and only contribute to the run totals below.
            bucket = str(runs_bat) + 's'
            if bucket in bat_row:
                bat_row[bucket]  += 1
                bowl_row[bucket] += 1

            is_wide   = 'wides' in extras
            is_noball = 'noballs' in extras
            legal     = not (is_wide or is_noball)

            bat_row['Runs'] += runs_bat
            if not is_wide:
                # a wide is not a ball faced by the batsman
                bat_row['BF'] += 1

            # Byes, leg-byes and penalty runs are removed in full, also when
            # they occur on a no-ball.
            charged = runs_tot - sum(extras.get(kind, 0) for kind in not_charged)
            bowl_row['R'] += charged
            if legal:
                # wides and no-balls have to be re-bowled, so they do not
                # count towards the bowler's overs
                bowl_row['O'] += 1
            if is_wide:
                bowl_row['WD'] += 1
            elif is_noball:
                bowl_row['NB'] += 1

            # The ball id is assumed to be '<over>.<ball>' (TODO confirm
            # against the data format); deliveries with an unreadable id are
            # left out of the maiden count.
            try:
                over_no = int(float(ball))
            except (TypeError, ValueError):
                over_no = None
            if over_no is not None:
                tally = over_log.setdefault((bowler, over_no), [0, 0])
                tally[0] += 1 if legal else 0
                tally[1] += charged

            runs_total += runs_tot

            wicket = details.get('wicket', None)
            if wicket:
                # Normally a single mapping; accept a list as well in case
                # more than one dismissal is recorded on a delivery.
                dismissals = wicket if isinstance(wicket, list) else [wicket]
                for dismissal in dismissals:
                    kind       = dismissal['kind']
                    player_out = dismissal['player_out'].strip()

                    # for case when player is out without facing a ball
                    add_batsman(player_out, batting_card_inn, Team=batting_team, Against=bowling_team)
                    if kind != 'retired hurt':
                        batting_card_inn[player_out]['NO'] = False

                    if kind not in not_bowler_wickets:
                        bowl_row['W'] += 1

        # A maiden is a complete over in which nothing was charged to the bowler.
        for (over_bowler, _), (legal_balls, conceded) in over_log.items():
            if legal_balls >= 6 and conceded == 0:
                bowling_card_inn[over_bowler]['M'] += 1

        for row in batting_card_inn.values():
            row['team-total'] = runs_total

        for row in bowling_card_inn.values():
            row['team-total'] = runs_total
            row['O'] = convert_to_ov(row['O'])

        batting_card.update(batting_card_inn)
        bowling_card.update(bowling_card_inn)

    df_bat  = to_frame(batting_card, "batsman")
    df_bowl = to_frame(bowling_card, "bowler")

    read_match_info(fil)

    return (df_bat, df_bowl)

# Build both scorecards for one match, addressed by its full relative path
# (so the default data_dir only prefixes './').
f2 = '../datasets/ipl/yaml/1178406.yaml'
df_bat, df_bowl = print_scorecard(f2, data_dir='./')

#1178406

Mumbai Indians  vs  Royal Challengers Bangalore 2019-04-15
------------------------------------------------
Toss 			 Mumbai Indians Decided to field
Team Batting First	 Royal Challengers Bangalore
Result 			 Mumbai Indians won by  5 wickets
Player of the Match	 SL Malinga


1178406

In [6]:
# Preview the first two rows of the batting card, then of the bowling card.
for _frame in (df_bat, df_bowl):
    display(_frame.head(2))

Unnamed: 0,batsman,0s,1s,2s,3s,4s,5s,6s,Runs,BF,NO,Team,Against,Win,Toss,team-total,season,date,match-id
0,PA Patel,9,6,0,0,4,0,1,28,20,False,Royal Challengers Bangalore,Mumbai Indians,False,False,171,2019,2019-04-15,1178406
1,V Kohli,5,4,0,0,1,0,0,8,9,False,Royal Challengers Bangalore,Mumbai Indians,False,False,171,2019,2019-04-15,1178406


Unnamed: 0,bowler,O,M,R,W,0s,1s,2s,3s,4s,...,WD,NB,Team,Against,Win,Toss,team-total,season,date,match-id
0,JP Behrendorff,4,0,49,1,11,4,1,0,6,...,1,0,Mumbai Indians,Royal Challengers Bangalore,True,True,171,2019,2019-04-15,1178406
1,SL Malinga,4,0,31,4,8,13,0,0,0,...,0,0,Mumbai Indians,Royal Challengers Bangalore,True,True,171,2019,2019-04-15,1178406


## Random Test of the scorecard

In [7]:
def nice_scorecard(f, data_dir='./'):
    """Display a condensed batting and bowling card for one match file.

    The full scorecards come from ``print_scorecard`` (which also prints the
    match summary); only the headline columns are shown.  Nothing is returned.
    """
    batting_cols = ['batsman', 'Runs', 'BF', '4s', '6s', 'team-total']
    bowling_cols = ['bowler', 'O', 'M', 'R', 'W', 'WD', 'NB']

    full_bat, full_bowl = print_scorecard(f, data_dir=data_dir)

    # Select both views before showing either one.
    compact_bat = full_bat[batting_cols]
    compact_bowl = full_bowl[bowling_cols]

    display(compact_bat)
    display(compact_bowl)

# Match ids tried so far.  Each assignment overwrites the previous one, so only
# the last id is actually rendered below.  The '# correct' notes presumably mark
# ids whose output was checked against the published scorecard -- TODO confirm.
test = '392220'  # correct
test = '829805'  # correct
test = '981017'  # correct
test = '1082637' # correct
test = '419155'  # correct
test = '1082607' # correct
test = '1178406'

# Render the condensed scorecard for the selected match id.
nice_scorecard(test+'.yaml', data_dir=data_dir)

Mumbai Indians  vs  Royal Challengers Bangalore 2019-04-15
------------------------------------------------
Toss 			 Mumbai Indians Decided to field
Team Batting First	 Royal Challengers Bangalore
Result 			 Mumbai Indians won by  5 wickets
Player of the Match	 SL Malinga


Unnamed: 0,batsman,Runs,BF,4s,6s,team-total
0,PA Patel,28,20,4,1,171
1,V Kohli,8,9,1,0,171
2,AB de Villiers,75,51,6,4,171
3,MM Ali,50,32,1,5,171
4,MP Stoinis,0,2,0,0,171
5,AD Nath,2,3,0,0,171
6,P Negi,0,2,0,0,171
7,UT Yadav,0,1,0,0,171
8,Q de Kock,40,26,5,2,172
9,RG Sharma,28,19,2,2,172


Unnamed: 0,bowler,O,M,R,W,WD,NB
0,JP Behrendorff,4,0,49,1,1,0
1,SL Malinga,4,0,31,4,0,0
2,JJ Bumrah,4,0,22,0,0,0
3,HH Pandya,3,0,21,1,0,0
4,RD Chahar,4,0,31,0,0,0
5,KH Pandya,1,0,10,0,0,0
6,UT Yadav,2,0,25,0,1,0
7,NA Saini,3,0,34,0,1,0
8,Mohammed Siraj,2,0,21,1,2,0
9,YS Chahal,4,0,27,2,0,0


## Testing DF

In [16]:
# Toy frame used to check how a descending sort on two keys orders the rows.
d1 = list('abcde')
d2 = [10, 10, 15, 25, 25]
d3 = list(range(0, 25, 5))
d4 = [13, 7, 9, 21, 2]

dff = pd.DataFrame(dict(d1=d1, d2=d2, d3=d3, d4=d4))
# Primary key d2, ties broken by d4, both from largest to smallest.
dfs = dff.sort_values(['d2', 'd4'], ascending=[False, False])
dfs

#df_sorted  = df_summary.sort_values(by=[sort_by], ascending=False)

Unnamed: 0,d1,d2,d3,d4
3,d,25,15,21
4,e,25,20,2
2,c,15,10,9
0,a,10,0,13
1,b,10,5,7


In [12]:
# Scratch check: pull column 'd1' of the toy frame out as a plain array.
dff['d1'].values

array(['a', 'b', 'c', 'd', 'e'], dtype=object)

In [9]:
# Inspect the complete bowling scorecard frame (all bowlers, all columns).
df_bowl

Unnamed: 0,bowler,O,M,R,W,0s,1s,2s,3s,4s,...,WD,NB,Team,Against,Win,Toss,team-total,season,date,match-id
0,JP Behrendorff,4,0,49,1,11,4,1,0,6,...,1,0,Mumbai Indians,Royal Challengers Bangalore,True,True,171,2019,2019-04-15,1178406
1,SL Malinga,4,0,31,4,8,13,0,0,0,...,0,0,Mumbai Indians,Royal Challengers Bangalore,True,True,171,2019,2019-04-15,1178406
2,JJ Bumrah,4,0,22,0,14,5,1,1,3,...,0,0,Mumbai Indians,Royal Challengers Bangalore,True,True,171,2019,2019-04-15,1178406
3,HH Pandya,3,0,21,1,5,11,0,0,1,...,0,0,Mumbai Indians,Royal Challengers Bangalore,True,True,171,2019,2019-04-15,1178406
4,RD Chahar,4,0,31,0,9,11,0,0,2,...,0,0,Mumbai Indians,Royal Challengers Bangalore,True,True,171,2019,2019-04-15,1178406
5,KH Pandya,1,0,10,0,1,4,0,0,0,...,0,0,Mumbai Indians,Royal Challengers Bangalore,True,True,171,2019,2019-04-15,1178406
6,UT Yadav,2,0,25,0,7,0,1,0,4,...,1,0,Royal Challengers Bangalore,Mumbai Indians,False,False,172,2019,2019-04-15,1178406
7,NA Saini,3,0,34,0,6,7,1,0,3,...,1,0,Royal Challengers Bangalore,Mumbai Indians,False,False,172,2019,2019-04-15,1178406
8,Mohammed Siraj,2,0,21,1,6,5,0,0,2,...,2,0,Royal Challengers Bangalore,Mumbai Indians,False,False,172,2019,2019-04-15,1178406
9,YS Chahal,4,0,27,2,13,5,2,0,3,...,0,0,Royal Challengers Bangalore,Mumbai Indians,False,False,172,2019,2019-04-15,1178406


In [10]:
# Scratch check: rounding a whole float to one decimal still yields a float.
round(4.0, 1)

4.0