# Data Extraction  

- The actual scorecard for the match below can be viewed at [Cricinfo webpage.](https://www.espncricinfo.com/series/8048/scorecard/335982/royal-challengers-bangalore-vs-kolkata-knight-riders-1st-match-indian-premier-league-2007-08)

- The ball-by-ball data can be downloaded from (https://cricsheet.org/downloads/).



In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os, glob, yaml
import datetime

data_dir='../datasets/ipl/yaml/'

In [2]:
def get_match_list(year=None, fil='../datasets/ipl/README.txt'):
    f_in = open(fil, 'r')
    lines= f_in.readlines()
    matches = []
    for line in lines[::-1]:  # reversed to get the games sorted with date
        if (len(line)>2 and line[:2]=='20'):
            line_  = line.strip('\n').split(' - ')
            
            date = line_[0]
            match_id = line_[4]
            match_teams = line_[5].strip().split('vs')
            matches.append([date, match_id, match_teams[0].strip(), match_teams[1].strip()])
            
    df_matches = pd.DataFrame(matches, columns=['date', 'match_id', 'team1', 'team2'])
    df_matches['date'] = pd.to_datetime(df_matches['date'])

    if year:
        df_year = df_matches[(df_matches['date']>=datetime.datetime(year, 1,   1)) & 
                         (df_matches['date']<=datetime.datetime(year, 12, 31))]
        return df_year
    
    return df_matches

In [26]:
dfm=get_match_list(2011)
list(dfm.match_id)
#dfm[ (dfm['team2']=='Royal Challengers Bangalore') & (dfm['team1']=='Rajasthan Royals') ]

#Rajasthan Royals

['501198',
 '501200',
 '501199',
 '501202',
 '501201',
 '501203',
 '501205',
 '501204',
 '501207',
 '501206',
 '501208',
 '501210',
 '501209',
 '501212',
 '501211',
 '501214',
 '501213',
 '501215',
 '501216',
 '501219',
 '501218',
 '501220',
 '501222',
 '501221',
 '501223',
 '501225',
 '501224',
 '501226',
 '501227',
 '501229',
 '501228',
 '501230',
 '501232',
 '501231',
 '501234',
 '501233',
 '501236',
 '501235',
 '501238',
 '501237',
 '501239',
 '501241',
 '501240',
 '501243',
 '501242',
 '501244',
 '501246',
 '501245',
 '501248',
 '501247',
 '501249',
 '501251',
 '501250',
 '501252',
 '501253',
 '501254',
 '501256',
 '501255',
 '501258',
 '501257',
 '501259',
 '501260',
 '501261',
 '501262',
 '501263',
 '501265',
 '501264',
 '501267',
 '501266',
 '501268',
 '501269',
 '501270',
 '501271']

In [76]:
def print_scorecard(f, data_dir='./'):
    
    def add_player(player, scorecard):
        if player not in scorecard:
            scorecard[player] = {'0':0, '1':0, '2':0, '3':0, '4':0, '5':0, '6':0, 'Runs':0, 'BF':0,
                                 'NO':True, 'Team':' ', 'Against':' ', 'Win':False, 'Toss':False,
                                 'team-total':0}

    fil=os.path.join(data_dir, f )
    
    with open(fil, 'r') as stream:
        try:
            data = yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            print(exc)

    try:
        match_date=data['info']['dates']
    except:
        match_date=pd.to_datetime(data['info']['dates'])
        
    teams  = data['info']['teams']
    winner = data['info']['outcome'].get('winner', None)
    toss_winner = data['info']['toss'].get('winner', None)

    batting_card = {}
    
    for i, inn in enumerate(data['innings']):
        inn_name=list(inn.keys())[0]

        batting_team=data['innings'][i][inn_name]['team']
        bowling_team = [team for team in  teams if team!=batting_team][0]

        batting_card_inn={}
        runs_bowler={}
        runs_extra = 0
        runs_total = 0
        wkts       = 0
        
        for delivery in data['innings'][i][inn_name]['deliveries']:
            deliv    = list(delivery.items())[0]
    
    #     for inn in range(2):
    #         inn_name = list(data['innings'][inn].keys())[0].strip()

    #         batting_team = list(data['innings'][inn].values())[0]['team']
    #         bowling_team = [team for team in  teams if team!=batting_team][0]

    #         batting_card_inn={}
    #         runs_bowler={}
    #         runs_extra = 0
    #         runs_total = 0
    #         wkts       = 0

    #         for delivery in list(data['innings'][inn].values())[0]['deliveries']:
    #             deliv = list(delivery.items())[0]

            ball     = deliv[0]
            batsman  = deliv[1]['batsman'].strip()
            bowler   = deliv[1]['bowler'].strip()
            runs_bat = deliv[1]['runs'].get('batsman', 0)
            runs_ext = deliv[1]['runs'].get('extras',  0)
            runs_tot = deliv[1]['runs'].get('total',   0)
            
            add_player(batsman, batting_card_inn)

            # counter for each runs (1, 2, 3, 4, 5, 6)
            batting_card_inn[batsman][str(runs_bat)] += 1
            
            # counter for total batsman run
            batting_card_inn[batsman]['Runs']           += runs_bat
            
            # counter for toal balls faced [ball will be removed later if it's a wide]
            batting_card_inn[batsman]['BF']           += 1
            
            if 'extras' in deliv[1]:
                if 'wides' in deliv[1]['extras']:
                    batting_card_inn[batsman]['BF'] -= 1 # remove the ball from batsman's account

            runs_extra += runs_ext
            runs_total += runs_tot
            
            if deliv[1].get('wicket', None):
                wkts += 1
                player_out=deliv[1]['wicket']['player_out']
                
                # for case when player is runout without facing a ball
                add_player(player_out, batting_card_inn)                
                batting_card_inn[player_out]['NO']=False
                
            batting_card_inn[batsman]['Team'] = batting_team
            batting_card_inn[batsman]['Against'] = bowling_team
            
            if batting_team==winner:
                batting_card_inn[batsman]['Win'] = True
                
            if batting_team==toss_winner:
                batting_card_inn[batsman]['Toss'] = True
            
        for b in batting_card_inn.keys():
            batting_card_inn[b]['team-total'] = runs_total

        batting_card.update(batting_card_inn)

    df=pd.DataFrame(batting_card).T
    df.reset_index(inplace=True)
    df.rename(columns={"index": "batsman"}, inplace=True)
    df['date']=match_date[0]
    df['match-id']=f.split('.')[0]
    return df


In [78]:
match_id='335982'
match_id='501265'
df=print_scorecard(match_id+'.yaml', data_dir=data_dir)
df

Unnamed: 0,batsman,0,1,2,3,4,5,6,Runs,BF,NO,Team,Against,Win,Toss,team-total,date,match-id
0,NV Ojha,15,4,0,0,1,0,0,8,19,False,Delhi Daredevils,Pune Warriors,False,True,56,2011-05-21,501265
1,DA Warner,4,3,0,0,1,0,2,19,10,False,Delhi Daredevils,Pune Warriors,False,True,56,2011-05-21,501265
2,MS Wade,11,7,2,0,0,0,0,11,20,False,Delhi Daredevils,Pune Warriors,False,True,56,2011-05-21,501265
3,CA Ingram,3,7,1,0,1,0,0,13,12,True,Delhi Daredevils,Pune Warriors,False,True,56,2011-05-21,501265
4,Y Venugopal Rao,1,0,0,0,0,0,0,0,1,True,Delhi Daredevils,Pune Warriors,False,True,56,2011-05-21,501265


In [75]:
#match_id='501265'  # faulty
match_id='501264'  

fil=os.path.join(data_dir,  match_id+'.yaml' )
data=yaml.load(open(fil))
#data['innings']
#data['info']
for i, inn in enumerate(data['innings']):
    inn_name=list(inn.keys())[0]
    print (inn_name)
    for delivery in data['innings'][i][inn_name]['deliveries']:
        deliv    = list(delivery.items())[0]
        ball     = deliv[0]
        
        batting_team=data['innings'][i][inn_name]['team']
        
        print (batting_team)
        
    #inn_name=list(inn.keys())[0]
    #data['innings'][inn_name]

#print_scorecard( match_id+'.yaml', data_dir=data_dir )

#dfm=get_match_list(2011)

#for match_id in list(dfm.match_id)[:20]:
#    print (match_id)
#    print_scorecard( match_id+'.yaml', data_dir=data_dir )

1st innings
Deccan Chargers
Deccan Chargers
Deccan Chargers
Deccan Chargers
Deccan Chargers
Deccan Chargers
Deccan Chargers
Deccan Chargers
Deccan Chargers
Deccan Chargers
Deccan Chargers
Deccan Chargers
Deccan Chargers
Deccan Chargers
Deccan Chargers
Deccan Chargers
Deccan Chargers
Deccan Chargers
Deccan Chargers
Deccan Chargers
Deccan Chargers
Deccan Chargers
Deccan Chargers
Deccan Chargers
Deccan Chargers
Deccan Chargers
Deccan Chargers
Deccan Chargers
Deccan Chargers
Deccan Chargers
Deccan Chargers
Deccan Chargers
Deccan Chargers
Deccan Chargers
Deccan Chargers
Deccan Chargers
Deccan Chargers
Deccan Chargers
Deccan Chargers
Deccan Chargers
Deccan Chargers
Deccan Chargers
Deccan Chargers
Deccan Chargers
Deccan Chargers
Deccan Chargers
Deccan Chargers
Deccan Chargers
Deccan Chargers
Deccan Chargers
Deccan Chargers
Deccan Chargers
Deccan Chargers
Deccan Chargers
Deccan Chargers
Deccan Chargers
Deccan Chargers
Deccan Chargers
Deccan Chargers
Deccan Chargers
Deccan Chargers
Deccan Charg

In [6]:
match_id='335982'
df=print_scorecard(match_id+'.yaml', data_dir=data_dir)
df

Unnamed: 0,batsman,0,1,2,3,4,5,6,Runs,BF,NO,Team,Against,Win,Toss,team-total,date,match-id
0,SC Ganguly,8,2,0,0,2,0,0,10,12,False,Kolkata Knight Riders,Royal Challengers Bangalore,True,False,222,2008-04-18,335982
1,BB McCullum,23,22,9,0,10,0,13,158,73,True,Kolkata Knight Riders,Royal Challengers Bangalore,True,False,222,2008-04-18,335982
2,RT Ponting,9,8,1,0,1,0,1,20,20,False,Kolkata Knight Riders,Royal Challengers Bangalore,True,False,222,2008-04-18,335982
3,DJ Hussey,4,6,1,0,1,0,0,12,12,False,Kolkata Knight Riders,Royal Challengers Bangalore,True,False,222,2008-04-18,335982
4,Mohammad Hafeez,1,1,0,0,1,0,0,5,3,True,Kolkata Knight Riders,Royal Challengers Bangalore,True,False,222,2008-04-18,335982
5,R Dravid,1,2,0,0,0,0,0,2,3,False,Royal Challengers Bangalore,Kolkata Knight Riders,False,True,82,2008-04-18,335982
6,W Jaffer,13,4,1,0,0,0,0,6,16,False,Royal Challengers Bangalore,Kolkata Knight Riders,False,True,82,2008-04-18,335982
7,V Kohli,4,1,0,0,0,0,0,1,5,False,Royal Challengers Bangalore,Kolkata Knight Riders,False,True,82,2008-04-18,335982
8,JH Kallis,4,2,0,0,0,0,1,8,7,False,Royal Challengers Bangalore,Kolkata Knight Riders,False,True,82,2008-04-18,335982
9,CL White,5,4,1,0,0,0,0,6,10,False,Royal Challengers Bangalore,Kolkata Knight Riders,False,True,82,2008-04-18,335982


In [18]:
#years=range(2008, 2020, 1)
years=[2011]

df_all=pd.DataFrame({}, columns=['batsman', '0', '1', '2', '3', '4', '5', '6',
                                 'Runs', 'BF', 'NO', 'Team', 'Against', 'Win',
                                 'Toss','team-total', 'date', 'match-id' ])

for year in years:
    df_match=get_match_list(year)
    match_ids = df_match['match_id']

    for match_id in match_ids:
        df=print_scorecard(match_id+'.yaml', data_dir=data_dir)
        df_all=pd.concat([df_all, df], ignore_index=True)

df_all

IndexError: list index out of range

In [None]:
with open('./database/df_all.df', 'wb') as fh:
    pickle.dump(df_all, fh)
    print ('dumped', year)

In [9]:
year=[2008]
with open('./database/database_'+str(year)+'.dat', 'wb') as fh:
    pickle.dump(df_all, fh)
    print ('dumped', year)

NameError: name 'pickle' is not defined

In [None]:
player='SM Pollock'
df_ = df_all[  df_all['batsman']==player ]
df_.R.sum()

In [None]:
import glob
import pickle

def create_database(year):

    fils=glob.glob(data_dir+'/[3-4]*.yaml')
    print ( len(fils) )

    db={'2008': [], '2009': [], '2010': []}
    #db={'2008': [], '2009': [], '2010': []}#, '2011': [],
    #    '2012': [], '2013': [], '2014': [], '2015': [],
    #    '2016': [], '2017': [], '2018': [], '2019': []}

    for fil in fils:
        year, score = print_scorecard(fil)
        db[year].append(score)

    for year in list(db.keys()):
        db_year = db[year]
    
        with open('./database/database_'+year+'.dat', 'wb') as fh:
            pickle.dump(db_year, fh)
            print ('dumped', year)
            
create_database()
