# Data Extraction  

- The actual scorecard for the match below can be viewed at [Cricinfo webpage.](https://www.espncricinfo.com/series/8048/scorecard/335982/royal-challengers-bangalore-vs-kolkata-knight-riders-1st-match-indian-premier-league-2007-08)

- The ball-by-ball data can be downloaded from (https://cricsheet.org/downloads/).



In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os, glob, yaml

from utils import get_match_list

data_dir='../datasets/ipl/yaml/'

In [2]:
f = '335982.yaml'
fil=os.path.join(data_dir,  f )
data=yaml.load(open(fil))

In [3]:
print ( data.keys() )
print ()
print ( data['meta'] )
print ()
data['info']

dict_keys(['meta', 'info', 'innings'])

{'data_version': 0.9, 'created': datetime.date(2011, 5, 6), 'revision': 2}



{'city': 'Bangalore',
 'competition': 'IPL',
 'dates': [datetime.date(2008, 4, 18)],
 'gender': 'male',
 'match_type': 'T20',
 'outcome': {'by': {'runs': 140}, 'winner': 'Kolkata Knight Riders'},
 'overs': 20,
 'player_of_match': ['BB McCullum'],
 'teams': ['Royal Challengers Bangalore', 'Kolkata Knight Riders'],
 'toss': {'decision': 'field', 'winner': 'Royal Challengers Bangalore'},
 'umpires': ['Asad Rauf', 'RE Koertzen'],
 'venue': 'M Chinnaswamy Stadium'}

In [33]:
def read_match_info(fil):
    with open(fil, 'r') as stream:
        try:
            data = yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            print(exc)
    
    try:
        match_date=data['info']['dates'][0].strftime('%Y-%m-%d')
    except:
        match_date=data['info']['dates'][0]
    
    season = match_date.split('-')[0]
    teams  = data['info']['teams']
    winner = data['info']['outcome'].get('winner', None)
    win_margin    = list(data['info']['outcome']['by'].values())[0]
    win_type      = list(data['info']['outcome']['by'].keys())[0]
    toss_winner   = data['info']['toss'].get('winner', None)
    toss_decision = data['info']['toss'].get('decision', None)
    
    batting_1st_team = list(data['innings'][0].items())[0][1]['team']
    
    print (teams[0] ,' vs ', teams[1], ', On ', match_date)
    print ('------------------------------------------------')
    print ( 'Toss \t\t\t', toss_winner, 'Decided to', toss_decision)
    print ('Team Batting First\t', batting_1st_team)
    print ('Result \t\t\t', winner, 'won by ', win_margin, win_type )
    print ('Player of the Match\t', data['info']['player_of_match'][0])



In [34]:
def print_scorecard(f, data_dir='./'):
    fil=os.path.join(data_dir, f )
    
    with open(fil, 'r') as stream:
        try:
            data = yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            print(exc)
    
    try:
        match_date=data['info']['dates'][0].strftime('%Y-%m-%d')
    except:
        match_date=data['info']['dates'][0]
    
    season = match_date.split('-')[0]
    teams  = data['info']['teams']
    winner = data['info']['outcome'].get('winner', None)
    toss_winner = data['info']['toss'].get('winner', None)
    
    
    def add_player(player, scorecard, season=season, Team=' ', Against=' '):
        if player not in scorecard:
            scorecard[player] = {'0':0, '1':0, '2':0, '3':0, '4':0, '5':0, '6':0,
                                 'Runs':0, 'BF':0, 'NO':True, 'Team':Team, 'Against':Against,
                                 'Win':False, 'Toss':False, 'team-total':0, 'season':season}
    batting_card = {}
    
    for i, inn in enumerate(data['innings']):
        inn_name     = list(inn.keys())[0]

        batting_team = data['innings'][i][inn_name]['team']
        bowling_team = [team for team in  teams if team!=batting_team][0]

        batting_card_inn={}
        runs_bowler={}
        runs_extra = 0
        runs_total = 0
        wkts       = 0
        
        for delivery in data['innings'][i][inn_name]['deliveries']:
            deliv    = list(delivery.items())[0]

            ball     = deliv[0]
            batsman  = deliv[1]['batsman'].strip()
            bowler   = deliv[1]['bowler'].strip()
            runs_bat = deliv[1]['runs'].get('batsman', 0)
            runs_ext = deliv[1]['runs'].get('extras',  0)
            runs_tot = deliv[1]['runs'].get('total',   0)
            
            add_player(batsman, batting_card_inn)

            # counter for each runs (1, 2, 3, 4, 5, 6)
            batting_card_inn[batsman][str(runs_bat)] += 1
            
            # counter for total batsman run
            batting_card_inn[batsman]['Runs']        += runs_bat
            
            # counter for toal balls faced [ball will be removed later if it's a wide]
            batting_card_inn[batsman]['BF']          += 1
            
            if 'extras' in deliv[1]:
                if 'wides' in deliv[1]['extras']:
                    batting_card_inn[batsman]['BF']  -= 1 # remove the ball from batsman's account

            runs_extra += runs_ext
            runs_total += runs_tot
            
            if deliv[1].get('wicket', None):
                wkts += 1
                player_out = deliv[1]['wicket']['player_out']
                
                # for case when player is runout without facing a ball
                add_player(player_out, batting_card_inn, Team=batting_team, Against=bowling_team)
                batting_card_inn[player_out]['NO']=False
                
            batting_card_inn[batsman]['Team']     = batting_team
            batting_card_inn[batsman]['Against']  = bowling_team
            
            if batting_team == winner:
                batting_card_inn[batsman]['Win']  = True
                
            if batting_team == toss_winner:
                batting_card_inn[batsman]['Toss'] = True
            
        for b in batting_card_inn.keys():
            batting_card_inn[b]['team-total']     = runs_total

            
        #batting_card_inn['Total_inn'+str(i+1)]= {'0':' ', '1':' ', '2':' ', '3':' ', '4':' ', '5':' ', '6':' ',
        #                         'Runs':' ', 'BF':' ', 'NO':' ', 'Team':' ', 'Against':' ',
        #                         'Win':' ', 'Toss':' ', 'team-total':str(runs_total)+'-'+str(wkts), 'season':' '}
            
        batting_card.update(batting_card_inn)
    
    df = pd.DataFrame(batting_card).transpose()
    
    df.reset_index(inplace=True)
    df.rename(columns={"index": "batsman"}, inplace=True)
    df['date']=match_date
    df['match-id']=f.split('/')[-1].split('.')[0]
    
    read_match_info(fil)

    return df

f2='../datasets/ipl/yaml/336002.yaml'
print_scorecard(f2)


Deccan Chargers  vs  Royal Challengers Bangalore , On  2008-05-25
------------------------------------------------
Toss 			 Deccan Chargers Decided to bat
Team Batting First	 Deccan Chargers
Result 			 Royal Challengers Bangalore won by  5 wickets
Player of the Match	 R Vinay Kumar


Unnamed: 0,batsman,0,1,2,3,4,5,6,Runs,BF,NO,Team,Against,Win,Toss,team-total,season,date,match-id
0,AC Gilchrist,15,16,1,0,4,0,2,46,37,False,Deccan Chargers,Royal Challengers Bangalore,False,True,165,2008,2008-05-25,336002
1,HH Gibbs,18,9,1,0,6,0,2,47,34,False,Deccan Chargers,Royal Challengers Bangalore,False,True,165,2008,2008-05-25,336002
2,RG Sharma,8,5,2,0,2,0,0,17,16,False,Deccan Chargers,Royal Challengers Bangalore,False,True,165,2008,2008-05-25,336002
3,LPC Silva,2,0,0,0,0,0,0,0,2,False,Deccan Chargers,Royal Challengers Bangalore,False,True,165,2008,2008-05-25,336002
4,Y Venugopal Rao,2,6,0,0,2,0,2,26,12,False,Deccan Chargers,Royal Challengers Bangalore,False,True,165,2008,2008-05-25,336002
5,DB Ravi Teja,3,3,0,0,1,0,0,7,7,False,Deccan Chargers,Royal Challengers Bangalore,False,True,165,2008,2008-05-25,336002
6,AS Yadav,5,2,0,0,0,0,1,8,8,False,Deccan Chargers,Royal Challengers Bangalore,False,True,165,2008,2008-05-25,336002
7,SB Bangar,0,0,0,0,0,0,0,0,0,False,Deccan Chargers,Royal Challengers Bangalore,False,False,165,2008,2008-05-25,336002
8,WPUJC Vaas,3,0,0,0,0,0,0,0,3,False,Deccan Chargers,Royal Challengers Bangalore,False,True,165,2008,2008-05-25,336002
9,RP Singh,2,1,0,0,0,0,0,1,3,False,Deccan Chargers,Royal Challengers Bangalore,False,True,165,2008,2008-05-25,336002


## Random Test of the scorecard

In [46]:
def nice_scorecard(f, data_dir='./'):
    df_ = print_scorecard(f, data_dir=data_dir)
    #df_ = df_.drop(['0', '1', '2', '3', '5', 'Team', 'Against', 'Toss'], axis=1)
    df_nice = df_[['batsman', 'Runs', 'BF', '4', '6', 'team-total']]
    return df_nice
    
    
test = '392220'  # correct
test = '829805'  # correct
test = '981017'  # correct
test = '1082637' # correct
test = '419155'  # correct
test = '1082607' # correct
test = '1178406'

dff_=nice_scorecard(test+'.yaml', data_dir=data_dir)
dff_

Mumbai Indians  vs  Royal Challengers Bangalore , On  2019-04-15
------------------------------------------------
Toss 			 Mumbai Indians Decided to field
Team Batting First	 Royal Challengers Bangalore
Result 			 Mumbai Indians won by  5 wickets
Player of the Match	 SL Malinga


Unnamed: 0,batsman,Runs,BF,4,6,team-total
0,PA Patel,28,20,4,1,171
1,V Kohli,8,9,1,0,171
2,AB de Villiers,75,51,6,4,171
3,MM Ali,50,32,1,5,171
4,MP Stoinis,0,2,0,0,171
5,AD Nath,2,3,0,0,171
6,P Negi,0,2,0,0,171
7,UT Yadav,0,1,0,0,171
8,Q de Kock,40,26,5,2,172
9,RG Sharma,28,19,2,2,172


## Testing DF

In [6]:
d1 = ['a', 'b', 'c', 'd', 'e']
d3 = [0, 5, 10, 15, 20]
d2 = [2, 10, 15, 20, 25]

dff=pd.DataFrame({'d1':d1, 'd2':d2, 'd3':d3})
dff

Unnamed: 0,d1,d2,d3
0,a,2,0
1,b,10,5
2,c,15,10
3,d,20,15
4,e,25,20


In [None]:
#
