# Data Extraction  

- The actual scorecard for the match below can be viewed at [Cricinfo webpage.](https://www.espncricinfo.com/series/8048/scorecard/335982/royal-challengers-bangalore-vs-kolkata-knight-riders-1st-match-indian-premier-league-2007-08)

- The ball-by-ball data can be downloaded from (https://cricsheet.org/downloads/).



In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os, glob, yaml

from utils import get_match_list

data_dir='../datasets/test_matches/tests_male/'

In [2]:
df_all = get_match_list(data_dir=data_dir)
df_all.tail()

Unnamed: 0,date,match_id,team1,team2
619,2020-08-13,1198242,Pakistan,England
620,2020-08-21,1198243,England,Pakistan
621,2020-12-03,1233957,New Zealand,West Indies
622,2020-12-11,1233958,New Zealand,West Indies
623,2020-12-17,1223869,India,Australia


In [3]:
f = '1223869.yaml' # latest game
#f = '1198242.yaml' # drawn game

fil=os.path.join(data_dir,  f )
data=yaml.load(open(fil))

In [4]:
print ( data.keys() )
print ()
print ( data['meta'] )
print ()
print ( 'INFO:\n',data['info'] )

dict_keys(['meta', 'info', 'innings'])

{'data_version': 0.9, 'created': '2020-12-22', 'revision': 1}

INFO:
 {'city': 'Adelaide', 'dates': ['2020-12-17', '2020-12-18', '2020-12-19'], 'gender': 'male', 'match_type': 'Test', 'match_type_number': 2396, 'outcome': {'winner': 'Australia', 'by': {'wickets': 8}}, 'player_of_match': ['TD Paine'], 'teams': ['India', 'Australia'], 'toss': {'decision': 'bat', 'winner': 'India'}, 'umpires': ['BNJ Oxenford', 'PR Reiffel'], 'venue': 'Adelaide Oval'}


In [5]:
def read_match_info(fil):
    with open(fil, 'r') as stream:
        try:
            data = yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            print(exc)
    try:
        match_date=data['info']['dates'][0].strftime('%Y-%m-%d')
    except:
        match_date=data['info']['dates'][0]
    
    season = match_date.split('-')[0]
    teams  = data['info']['teams']
    match_type = data['info']['match_type']
    toss_winner   = data['info']['toss'].get('winner', None)
    toss_decision = data['info']['toss'].get('decision', None)
    batting_1st_team = list(data['innings'][0].items())[0][1]['team']
    player_of_match  = data['info'].get('player_of_match', [None])[0]

    print ('-----------------------------------------------------')
    print (teams[0] ,' vs ', teams[1], '\t', match_type, 'match on \t', match_date)
    print ( 'Toss \t\t\t', toss_winner, ', Decided to', toss_decision)
    print ('Team Batting First\t', batting_1st_team)
    print ('Player of the Match\t', player_of_match )

    
    if data['info']['outcome'].get('result', None) == 'tie':
        winner = data['info']['outcome']['eliminator']
        print ('Result \t\t\t', 'Match tied.', winner, 'won the one over eliminator.')
        
    elif data['info']['outcome'].get('result', None) == 'no result':
        print ('Result \t\t\t No Results.')

    elif data['info']['outcome'].get('result', None) == 'draw':
        print ('Result \t\t\t Match Drawn.')
        
    else:
        winner = data['info']['outcome'].get('winner', None)
        win_margin    = list(data['info']['outcome']['by'].values())[0]
        win_type      = list(data['info']['outcome']['by'].keys())[0]
        print ('Result \t\t\t', winner, 'won by ', win_margin, win_type )        
        
    print ('-----------------------------------------------------')

f = '1223869.yaml' # latest game
read_match_info(data_dir+f)

-----------------------------------------------------
India  vs  Australia 	 Test match on 	 2020-12-17
Toss 			 India , Decided to bat
Team Batting First	 India
Player of the Match	 TD Paine
Result 			 Australia won by  8 wickets
-----------------------------------------------------


In [6]:
data['innings']

[{'1st innings': {'team': 'India',
   'deliveries': [{0.1: {'non_striker': 'MA Agarwal',
      'bowler': 'MA Starc',
      'runs': {'extras': 0, 'total': 0, 'batsman': 0},
      'batsman': 'PP Shaw'}},
    {0.2: {'non_striker': 'MA Agarwal',
      'bowler': 'MA Starc',
      'runs': {'extras': 0, 'total': 0, 'batsman': 0},
      'batsman': 'PP Shaw',
      'wicket': {'player_out': 'PP Shaw', 'kind': 'bowled'}}},
    {0.3: {'non_striker': 'MA Agarwal',
      'bowler': 'MA Starc',
      'runs': {'extras': 0, 'total': 0, 'batsman': 0},
      'batsman': 'CA Pujara'}},
    {0.4: {'non_striker': 'MA Agarwal',
      'bowler': 'MA Starc',
      'runs': {'extras': 0, 'total': 1, 'batsman': 1},
      'batsman': 'CA Pujara'}},
    {0.5: {'non_striker': 'CA Pujara',
      'bowler': 'MA Starc',
      'runs': {'extras': 0, 'total': 0, 'batsman': 0},
      'batsman': 'MA Agarwal'}},
    {0.6: {'non_striker': 'CA Pujara',
      'bowler': 'MA Starc',
      'runs': {'extras': 0, 'total': 0, 'batsman': 0

In [7]:
num_inns = len(data['innings'])
inns = {list(data['innings'][i].keys())[0]: list(data['innings'][i].values())[0]['team']
        for i in range(num_inns) }

teams  = data['info']['teams']
print (inns, teams)

for i, k in enumerate(inns):
    batting_team = inns[k]
    bowling_team = [team for team in  teams if team!=batting_team][0]
    #print (i, k, batting_team, bowling_team)
    for delivery in data['innings'][i][k]['deliveries']:
        deliv    = list(delivery.items())[0]
        print (deliv)



{'1st innings': 'India', '2nd innings': 'Australia', '3rd innings': 'India', '4th innings': 'Australia'} ['India', 'Australia']
(0.1, {'non_striker': 'MA Agarwal', 'bowler': 'MA Starc', 'runs': {'extras': 0, 'total': 0, 'batsman': 0}, 'batsman': 'PP Shaw'})
(0.2, {'non_striker': 'MA Agarwal', 'bowler': 'MA Starc', 'runs': {'extras': 0, 'total': 0, 'batsman': 0}, 'batsman': 'PP Shaw', 'wicket': {'player_out': 'PP Shaw', 'kind': 'bowled'}})
(0.3, {'non_striker': 'MA Agarwal', 'bowler': 'MA Starc', 'runs': {'extras': 0, 'total': 0, 'batsman': 0}, 'batsman': 'CA Pujara'})
(0.4, {'non_striker': 'MA Agarwal', 'bowler': 'MA Starc', 'runs': {'extras': 0, 'total': 1, 'batsman': 1}, 'batsman': 'CA Pujara'})
(0.5, {'non_striker': 'CA Pujara', 'bowler': 'MA Starc', 'runs': {'extras': 0, 'total': 0, 'batsman': 0}, 'batsman': 'MA Agarwal'})
(0.6, {'non_striker': 'CA Pujara', 'bowler': 'MA Starc', 'runs': {'extras': 0, 'total': 0, 'batsman': 0}, 'batsman': 'MA Agarwal'})
(1.1, {'non_striker': 'MA Aga

In [8]:
def print_scorecard(f, data_dir='./'):
    fil=os.path.join(data_dir, f )
    match_id = f.split('/')[-1].split('.')[0]
    
    with open(fil, 'r') as stream:
        try:
            data = yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            print(exc)
    try:
        match_date=data['info']['dates'][0].strftime('%Y-%m-%d')
    except:
        match_date=data['info']['dates'][0]
    
    num_inns = len(data['innings'])
    innings  = {list(data['innings'][i].keys())[0]: list(data['innings'][i].values())[0]['team']
                for i in range(num_inns) }
    
    season = match_date.split('-')[0]
    teams  = data['info']['teams']
    winner = data['info']['outcome'].get('winner', None)
    draw   = False
    if winner is None:
        draw = True
        
    toss_winner = data['info']['toss'].get('winner', None)

    
    def convert_to_ov(balls):
        ov, balls_ = (balls//6, balls%6)
        if balls_==0:
            return str(ov)
        else:
            return str(ov)+'.'+str(balls_)
    
    def add_batsman(player, inn, batting_scorecard, match_date=match_date, draw=draw, Team=' ', Against=' '):
        if player in batting_scorecard:
            if batting_scorecard[player]['inn'] != inn:
                print (player, '\t', inn, 'adding batsman for other innings')

                batting_scorecard[player] = {'0s':0, '1s':0, '2s':0, '3s':0, '4s':0, '5s':0, '6s':0,
                                             'R':0, 'BF':0, 'NO':True, 'Team':Team, 'Against':Against, 'Draw':draw,
                                             'Win':False, 'Toss':False, 'inn':inn, 'inn-total':0, 'date':match_date}
        else:
            print (player, '\t', inn, 'adding batsman for first time')
            batting_scorecard[player] = {'0s':0, '1s':0, '2s':0, '3s':0, '4s':0, '5s':0, '6s':0,
                                         'R':0, 'BF':0, 'NO':True, 'Team':Team, 'Against':Against, 'Draw':draw,
                                         'Win':False, 'Toss':False, 'inn':inn, 'inn-total':0, 'date':match_date}

            
    def add_bowler(player, inn, bowling_scorecard, match_date=match_date, draw=draw, Team=' ', Against=' '):
        
        if player in bowling_scorecard:
            #print (inn, bowling_scorecard[player]['inn'])
            if bowling_scorecard[player]['inn'] != inn:
                print (player, '\t', inn, 'adding bowler for other innings')
                bowling_scorecard[player] = {'O':0, 'M':0, 'R':0, 'W':0, 'ovs':{},
                                             '0s':0, '1s':0, '2s':0, '3s':0, '4s':0, '5s':0, '6s':0,
                                             'WD':0, 'NB':0, 'Team':Team, 'Against':Against, 'Win':False, 'Draw':draw,
                                             'Toss':False, 'inn':inn, 'inn-total':0, 'date':match_date}
            
            
        else:
            print (player, '\t', inn, 'adding bowler for first time')
            bowling_scorecard[player] = {'O':0, 'M':0, 'R':0, 'W':0, 'ovs':{},
                                         '0s':0, '1s':0, '2s':0, '3s':0, '4s':0, '5s':0, '6s':0,
                                         'WD':0, 'NB':0, 'Team':Team, 'Against':Against, 'Win':False, 'Draw':draw,
                                         'Toss':False, 'inn':inn, 'inn-total':0, 'date':match_date}

    batting_card = {}
    bowling_card = {}
    
    for i, inn in enumerate(inns):
        batting_team = inns[inn]
        bowling_team = [team for team in  teams if team!=batting_team][0]

        batting_card_inn={}
        bowling_card_inn={}
        
        runs_extra = 0
        runs_total = 0
        wkts       = 0
        
        for delivery in data['innings'][i][inn]['deliveries']:
            deliv    = list(delivery.items())[0]
            ball     = deliv[0]
            ov_num   = int(str(ball+1).split('.')[0])
            
            batsman  = deliv[1]['batsman'].strip()
            bowler   = deliv[1]['bowler'].strip()
            runs_bat = deliv[1]['runs'].get('batsman', 0)
            runs_ext = deliv[1]['runs'].get('extras',  0)
            runs_tot = deliv[1]['runs'].get('total',   0)
            
            add_batsman(batsman, inn, batting_card_inn)
            add_bowler(  bowler, inn, bowling_card_inn)

            # counter for each runs (1, 2, 3, 4, 5, 6)
            batting_card_inn[batsman][str(runs_bat)+'s'] += 1
            bowling_card_inn[bowler ][str(runs_bat)+'s'] += 1

            # first check if the over exists on the dictionary
            if not bowling_card_inn[bowler ]['ovs'].get(ov_num, False):
                bowling_card_inn[bowler ]['ovs'][ov_num] = {'R':0, 'W':0}
            bowling_card_inn[bowler ]['ovs'][ov_num]['R'] += runs_tot
            
            # counter for total batsman run
            batting_card_inn[batsman]['R']           += runs_bat
            batting_card_inn[batsman]['BF']          += 1
            
            bowling_card_inn[bowler ]['R']           += runs_tot
            bowling_card_inn[bowler ]['O']           += 1

            if 'extras' in deliv[1]:
                if 'wides' in deliv[1]['extras']:
                    batting_card_inn[batsman]['BF'] -= 1 # remove the ball from batsman's account
                    bowling_card_inn[bowler]['O']   -= 1 # to count the extra ball
                    bowling_card_inn[bowler]['WD']  += 1 #
                    
                elif 'noballs' in deliv[1]['extras']:
                    bowling_card_inn[bowler]['O']   -= 1 # to count the extra ball
                    bowling_card_inn[bowler]['NB']  += 1 #
                
                elif 'legbyes' in deliv[1]['extras']:
                    #bowling_card_inn[bowler]['R']   -= 1 #
                    bowling_card_inn[bowler]['R']   -= deliv[1]['extras']['legbyes']
                    bowling_card_inn[bowler ]['ovs'][ov_num]['R'] -= deliv[1]['extras']['legbyes']

                elif 'byes' in deliv[1]['extras']:
                    bowling_card_inn[bowler]['R']   -= deliv[1]['extras']['byes']

            runs_extra += runs_ext
            runs_total += runs_tot
            
            if deliv[1].get('wicket', None):
                wkts += 1
                player_out = deliv[1]['wicket']['player_out']
                
                # for case when player is runout without facing a ball
                #add_batsman(player_out, batting_card_inn, Team=batting_team, Against=bowling_team)
                add_batsman(player_out, k, batting_card_inn)
                batting_card_inn[player_out]['NO'] = False
                
                # add wicket to bowler only if it's not RUN OUT
                if deliv[1]['wicket']['kind'] != 'run out':
                    bowling_card_inn[bowler ]['W']                += 1
                    bowling_card_inn[bowler ]['ovs'][ov_num]['W'] += 1

            batting_card_inn[batsman]['Team']    = batting_team
            batting_card_inn[batsman]['Against'] = bowling_team
            
            bowling_card_inn[bowler ]['Team']    = bowling_team
            bowling_card_inn[bowler ]['Against'] = batting_team
            
            if batting_team == winner: 
                batting_card_inn[batsman]['Win'] = True
            else:
                bowling_card_inn[bowler ]['Win'] = True
                
            if batting_team == toss_winner:
                batting_card_inn[batsman]['Toss'] = True
            else:
                bowling_card_inn[bowler]['Toss'] = True

        #for b in batting_card_inn.keys():
        #    batting_card_inn[b]['team-total'] = runs_total

        #for b in bowling_card_inn.keys():
        #    bowling_card_inn[b]['team-total'] = runs_total
        #    bowling_card_inn[b]['O'] = convert_to_ov(bowling_card_inn[b]['O'])
        
        batting_card.update(batting_card_inn)
        bowling_card.update(bowling_card_inn)
    
    df_bat = pd.DataFrame(batting_card).transpose()
    df_bat.reset_index(inplace=True)
    df_bat.rename(columns={"index": "batsman"}, inplace=True)
    df_bat['match-id']= match_id
    
    df_bowl = pd.DataFrame(bowling_card).transpose()    
    df_bowl.reset_index(inplace=True)
    df_bowl.rename(columns={"index": "bowler"}, inplace=True)
    df_bowl['match-id']=f.split('/')[-1].split('.')[0]
    
    read_match_info(fil)

    return (df_bat, df_bowl)

f = data_dir+'1223869.yaml'
(df_bat, df_bowl) = print_scorecard(f)

PP Shaw 	 1st innings adding batsman for first time
MA Starc 	 1st innings adding bowler for first time
PP Shaw 	 4th innings adding batsman for other innings
CA Pujara 	 1st innings adding batsman for first time
MA Agarwal 	 1st innings adding batsman for first time
JR Hazlewood 	 1st innings adding bowler for first time
PJ Cummins 	 1st innings adding bowler for first time
MA Agarwal 	 4th innings adding batsman for other innings
V Kohli 	 1st innings adding batsman for first time
C Green 	 1st innings adding bowler for first time
NM Lyon 	 1st innings adding bowler for first time
CA Pujara 	 4th innings adding batsman for other innings
AM Rahane 	 1st innings adding batsman for first time
M Labuschagne 	 1st innings adding bowler for first time
V Kohli 	 4th innings adding batsman for other innings
GH Vihari 	 1st innings adding batsman for first time
AM Rahane 	 4th innings adding batsman for other innings
WP Saha 	 1st innings adding batsman for first time
GH Vihari 	 4th innings 

In [9]:
#df_bat[df_bat['Team']=='India']
df_bat[df_bat['Team']=='Australia']

Unnamed: 0,batsman,0s,1s,2s,3s,4s,5s,6s,R,BF,NO,Team,Against,Draw,Win,Toss,inn,inn-total,date,match-id
11,MS Wade,39,6,2,1,5,0,0,33,53,False,Australia,India,False,True,False,4th innings,0,2020-12-17,1223869
12,JA Burns,44,5,6,0,7,0,1,51,63,True,Australia,India,False,True,False,4th innings,0,2020-12-17,1223869
13,M Labuschagne,6,2,2,0,0,0,0,6,10,False,Australia,India,False,True,False,4th innings,0,2020-12-17,1223869
14,SPD Smith,0,1,0,0,0,0,0,1,1,True,Australia,India,False,True,False,4th innings,0,2020-12-17,1223869
15,TM Head,0,0,0,0,0,0,0,0,0,False,Australia,India,False,True,False,4th innings,0,2020-12-17,1223869
16,C Green,0,0,0,0,0,0,0,0,0,False,Australia,India,False,True,False,4th innings,0,2020-12-17,1223869
17,TD Paine,64,18,6,1,10,0,0,73,99,True,Australia,India,False,True,False,2nd innings,0,2020-12-17,1223869
18,PJ Cummins,0,0,0,0,0,0,0,0,0,False,Australia,India,False,True,False,4th innings,0,2020-12-17,1223869
19,MA Starc,0,0,0,0,0,0,0,0,0,False,Australia,India,False,True,False,4th innings,0,2020-12-17,1223869
20,NM Lyon,0,0,0,0,0,0,0,0,0,False,Australia,India,False,True,False,4th innings,0,2020-12-17,1223869
