# Data Extraction  

- The actual scorecard for the match below can be viewed at [Cricinfo webpage.](https://www.espncricinfo.com/series/8048/scorecard/335982/royal-challengers-bangalore-vs-kolkata-knight-riders-1st-match-indian-premier-league-2007-08)

- The ball-by-ball data can be downloaded from (https://cricsheet.org/downloads/).



In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os, glob, yaml

data_dir='../datasets/ipl/yaml/'

In [2]:
f = '335982.yaml'
fil=os.path.join(data_dir,  f )
data=yaml.load(open(fil))

In [3]:
print ( data.keys() )
print ()
print ( data['meta'] )
print ()
data['info']

dict_keys(['meta', 'info', 'innings'])

{'data_version': 0.9, 'created': datetime.date(2011, 5, 6), 'revision': 2}



{'city': 'Bangalore',
 'competition': 'IPL',
 'dates': [datetime.date(2008, 4, 18)],
 'gender': 'male',
 'match_type': 'T20',
 'outcome': {'by': {'runs': 140}, 'winner': 'Kolkata Knight Riders'},
 'overs': 20,
 'player_of_match': ['BB McCullum'],
 'teams': ['Royal Challengers Bangalore', 'Kolkata Knight Riders'],
 'toss': {'decision': 'field', 'winner': 'Royal Challengers Bangalore'},
 'umpires': ['Asad Rauf', 'RE Koertzen'],
 'venue': 'M Chinnaswamy Stadium'}

In [4]:
list(data['info']['outcome']['by'].keys())[0]
list(data['info']['outcome']['by'].values())[0]

140

In [50]:
def print_scorecard(f, data_dir='./'):
    fil=os.path.join(data_dir, f )
    
    with open(fil, 'r') as stream:
        try:
            data = yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            print(exc)
    
    try:
        match_date=data['info']['dates'][0].strftime('%Y-%m-%d')
    except:
        match_date=data['info']['dates'][0]
    
    season = match_date.split('-')[0]
    teams  = data['info']['teams']
    winner = data['info']['outcome'].get('winner', None)
    toss_winner = data['info']['toss'].get('winner', None)
    
    
    def add_player(player, scorecard, season=season):
        if player not in scorecard:
            scorecard[player] = {'0':0, '1':0, '2':0, '3':0, '4':0, '5':0, '6':0,
                                 'Runs':0, 'BF':0, 'NO':True, 'Team':' ', 'Against':' ',
                                 'Win':False, 'Toss':False, 'team-total':0, 'season':season}
    batting_card = {}
    
    for i, inn in enumerate(data['innings']):
        inn_name     = list(inn.keys())[0]

        batting_team = data['innings'][i][inn_name]['team']
        bowling_team = [team for team in  teams if team!=batting_team][0]

        batting_card_inn={}
        runs_bowler={}
        runs_extra = 0
        runs_total = 0
        wkts       = 0
        
        for delivery in data['innings'][i][inn_name]['deliveries']:
            deliv    = list(delivery.items())[0]

            ball     = deliv[0]
            batsman  = deliv[1]['batsman'].strip()
            bowler   = deliv[1]['bowler'].strip()
            runs_bat = deliv[1]['runs'].get('batsman', 0)
            runs_ext = deliv[1]['runs'].get('extras',  0)
            runs_tot = deliv[1]['runs'].get('total',   0)
            
            add_player(batsman, batting_card_inn)

            # counter for each runs (1, 2, 3, 4, 5, 6)
            batting_card_inn[batsman][str(runs_bat)] += 1
            
            # counter for total batsman run
            batting_card_inn[batsman]['Runs']        += runs_bat
            
            # counter for toal balls faced [ball will be removed later if it's a wide]
            batting_card_inn[batsman]['BF']          += 1
            
            if 'extras' in deliv[1]:
                if 'wides' in deliv[1]['extras']:
                    batting_card_inn[batsman]['BF']  -= 1 # remove the ball from batsman's account

            runs_extra += runs_ext
            runs_total += runs_tot
            
            if deliv[1].get('wicket', None):
                wkts += 1
                player_out = deliv[1]['wicket']['player_out']
                
                # for case when player is runout without facing a ball
                add_player(player_out, batting_card_inn)
                batting_card_inn[player_out]['NO']=False
                
            batting_card_inn[batsman]['Team']     = batting_team
            batting_card_inn[batsman]['Against']  = bowling_team
            
            if batting_team==winner:
                batting_card_inn[batsman]['Win']  = True
                
            if batting_team==toss_winner:
                batting_card_inn[batsman]['Toss'] = True
            
        for b in batting_card_inn.keys():
            batting_card_inn[b]['team-total']     = runs_total

            
        batting_card_inn['Total_inn'+str(i+1)]= {'0':' ', '1':' ', '2':' ', '3':' ', '4':' ', '5':' ', '6':' ',
                                 'Runs':' ', 'BF':' ', 'NO':' ', 'Team':' ', 'Against':' ',
                                 'Win':' ', 'Toss':' ', 'team-total':str(runs_total)+'-'+str(wkts), 'season':' '}
            
        batting_card.update(batting_card_inn)

    df=pd.DataFrame(batting_card).T
    df.reset_index(inplace=True)
    df.rename(columns={"index": "batsman"}, inplace=True)
    df['date']=match_date
    df['match-id']=f.split('/')[-1].split('.')[0]
    return df

f2='../datasets/ipl/yaml/336002.yaml'
print_scorecard(f2)


Unnamed: 0,batsman,0,1,2,3,4,5,6,Runs,BF,NO,Team,Against,Win,Toss,team-total,season,date,match-id
0,AC Gilchrist,15.0,16.0,1.0,0.0,4.0,0.0,2.0,46.0,37.0,False,Deccan Chargers,Royal Challengers Bangalore,False,True,165,2008.0,2008-05-25,336002
1,HH Gibbs,18.0,9.0,1.0,0.0,6.0,0.0,2.0,47.0,34.0,False,Deccan Chargers,Royal Challengers Bangalore,False,True,165,2008.0,2008-05-25,336002
2,RG Sharma,8.0,5.0,2.0,0.0,2.0,0.0,0.0,17.0,16.0,False,Deccan Chargers,Royal Challengers Bangalore,False,True,165,2008.0,2008-05-25,336002
3,LPC Silva,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,False,Deccan Chargers,Royal Challengers Bangalore,False,True,165,2008.0,2008-05-25,336002
4,Y Venugopal Rao,2.0,6.0,0.0,0.0,2.0,0.0,2.0,26.0,12.0,False,Deccan Chargers,Royal Challengers Bangalore,False,True,165,2008.0,2008-05-25,336002
5,DB Ravi Teja,3.0,3.0,0.0,0.0,1.0,0.0,0.0,7.0,7.0,False,Deccan Chargers,Royal Challengers Bangalore,False,True,165,2008.0,2008-05-25,336002
6,AS Yadav,5.0,2.0,0.0,0.0,0.0,0.0,1.0,8.0,8.0,False,Deccan Chargers,Royal Challengers Bangalore,False,True,165,2008.0,2008-05-25,336002
7,SB Bangar,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,,,False,False,165,2008.0,2008-05-25,336002
8,WPUJC Vaas,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,False,Deccan Chargers,Royal Challengers Bangalore,False,True,165,2008.0,2008-05-25,336002
9,RP Singh,2.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,False,Deccan Chargers,Royal Challengers Bangalore,False,True,165,2008.0,2008-05-25,336002


In [38]:
def print_scorecard(f, data_dir='./'):
    
    fil=os.path.join(data_dir,  f )
    #data=yaml.load(open(fil))
    with open(fil, 'r') as stream:
        try:
            data = yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            print(exc)

    # obtain date
    try:
        match_date=data['info']['dates'][0].strftime('%Y-%m-%d')
    except:
        match_date=data['info']['dates'][0]
    
    batting_card = {}    
    for inn in range(2):
        inn_name = list(data['innings'][inn].keys())[0].strip()

        batting_card_inn={}
        batting_card_inn[inn_name] = {'0':' ', '1':' ', '2':' ', '3':' ', '4':' ','5':' ', '6':' ', 'R':' ', 'B':' ', 'NOT-OUT':' '}
        
        runs_bowler={}
        runs_extra = 0
        runs_total = 0
        wkts       = 0

        for delivery in list(data['innings'][inn].values())[0]['deliveries']:
            deliv = list(delivery.items())[0]
            
            
            ball     = deliv[0]
            batsman  = deliv[1]['batsman'].strip()
            bowler   = deliv[1]['bowler'].strip()
            runs_bat = deliv[1]['runs'].get('batsman', 0)
            runs_ext = deliv[1]['runs'].get('extras',  0)
            runs_tot = deliv[1]['runs'].get('total',   0)
            
            if batsman not in batting_card_inn:
                batting_card_inn[batsman] = {'0':0, '1':0, '2':0, '3':0, '4':0, '5':0, '6':0, 'R':0, 'B':0, 'NOT-OUT':True}

            batting_card_inn[batsman][str(runs_bat)] = batting_card_inn[batsman][str(runs_bat)]+1
            batting_card_inn[batsman]['R'] += runs_bat
            batting_card_inn[batsman]['B'] += 1 # remove the ball later if it's a wide
            
            if 'extras' in deliv[1]:
                if 'wides' in deliv[1]['extras']:
                    batting_card_inn[batsman]['B'] -= 1 # remove the ball from batsman's account

            runs_extra += runs_ext
            runs_total += runs_tot
            
            if deliv[1].get('wicket', None):
                wkts += 1
                player_out=deliv[1]['wicket']['player_out']

                # case with run out without facing the ball
                if  player_out not in batting_card_inn:
                    batting_card_inn[player_out] = {'0':0, '1':0, '2':0, '3':0, '4':0, '5':0, '6':0, 'R':0, 'B':0, 'NOT-OUT':True}

                batting_card_inn[player_out]['NOT-OUT']=False

        batting_card_inn['Total_inn'+str(inn+1)] = {'0':' ', '1':' ', '2':' ', '3':' ', '4':' ','5':' ', '6':' ',
                                                    'R':str(runs_total)+'-'+str(wkts), 'B':' ', 'NOT-OUT':' '}
        batting_card_inn['  '] = {'0':' ', '1':' ', '2':' ', '3':' ', '4':' ','5':' ', '6':' ', 'R':' ', 'B':' ', 'NOT-OUT':' '}
        batting_card.update(batting_card_inn)
    
    df = pd.DataFrame(batting_card).transpose()

    print (data['info']['teams'][0] ,' vs ', data['info']['teams'][1], ', ', match_date)
    print ( 'Toss \t\t\t', data['info']['toss']['winner'], 'Decided to', data['info']['toss']['decision'])
    print ('Result \t\t\t', data['info']['outcome']['winner'], 'won by ', 
           list(data['info']['outcome']['by'].values())[0], list(data['info']['outcome']['by'].keys())[0])
    print ('Player of the Match\t', data['info']['player_of_match'][0])

    #display(df)
    
    df_to_return=df.copy().drop(['0', '1', '2', '3'], axis=1)
    df_to_return=df_to_return.drop(['1st innings', '2nd innings', 'Total_inn1', 'Total_inn2', '  '], axis=0)
    display(df_to_return)

    #return df_to_return
    
#f = '335982.yaml'

#df = print_scorecard(f, data_dir=data_dir)



f2='../datasets/ipl/yaml/336002.yaml'
print (f2)
data = yaml.load(open(fil))

print_scorecard(f2)




../datasets/ipl/yaml/336002.yaml
Deccan Chargers  vs  Royal Challengers Bangalore ,  2008-05-25
Toss 			 Deccan Chargers Decided to bat
Result 			 Royal Challengers Bangalore won by  5 wickets
Player of the Match	 R Vinay Kumar


Unnamed: 0,4,5,6,R,B,NOT-OUT
AC Gilchrist,4,0,2,46,37,False
HH Gibbs,6,0,2,47,34,False
RG Sharma,2,0,0,17,16,False
LPC Silva,0,0,0,0,2,False
Y Venugopal Rao,2,0,2,26,12,False
DB Ravi Teja,1,0,0,7,7,False
AS Yadav,0,0,1,8,8,False
SB Bangar,0,0,0,0,0,False
WPUJC Vaas,0,0,0,0,3,False
RP Singh,0,0,0,1,3,False


In [41]:
import pickle
df=pickle.load(open('./database/scorecard_all.df', 'rb'))

In [46]:
df

Unnamed: 0,batsman,0,1,2,3,4,5,6,Runs,BF,NO,Team,Against,Win,Toss,team-total,date,match-id
0,SC Ganguly,8,2,0,0,2,0,0,10,12,False,Kolkata Knight Riders,Royal Challengers Bangalore,True,False,222,2008-04-18,335982
1,BB McCullum,23,22,9,0,10,0,13,158,73,True,Kolkata Knight Riders,Royal Challengers Bangalore,True,False,222,2008-04-18,335982
2,RT Ponting,9,8,1,0,1,0,1,20,20,False,Kolkata Knight Riders,Royal Challengers Bangalore,True,False,222,2008-04-18,335982
3,DJ Hussey,4,6,1,0,1,0,0,12,12,False,Kolkata Knight Riders,Royal Challengers Bangalore,True,False,222,2008-04-18,335982
4,Mohammad Hafeez,1,1,0,0,1,0,0,5,3,True,Kolkata Knight Riders,Royal Challengers Bangalore,True,False,222,2008-04-18,335982
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11329,AT Rayudu,3,1,0,0,0,0,0,1,4,False,Chennai Super Kings,Mumbai Indians,False,False,148,2019-05-12,1181768
11330,MS Dhoni,6,2,0,0,0,0,0,2,8,False,Chennai Super Kings,Mumbai Indians,False,False,148,2019-05-12,1181768
11331,DJ Bravo,6,7,1,0,0,0,1,15,15,False,Chennai Super Kings,Mumbai Indians,False,False,148,2019-05-12,1181768
11332,RA Jadeja,2,1,2,0,0,0,0,5,5,True,Chennai Super Kings,Mumbai Indians,False,False,148,2019-05-12,1181768
