# Creating Test Match Database

In [1]:
import numpy as np
import pandas as pd
import pickle

from utils import get_match_list
from print_scorecard_test import print_scorecard

# Directory of per-match scorecard files; passed to get_match_list and print_scorecard.
data_dir='../datasets/test_matches/tests_male/'
# Directory where the pickled batting/bowling tables are written and later read back.
database_dir = '../database/'

In [2]:
# Fetch the match list (called without a year argument), report its shape
# and preview the first rows.
df_matches = get_match_list(data_dir=data_dir)
print (df_matches.shape)
df_matches.head()

(627, 4)


Unnamed: 0,date,match_id,team1,team2
0,2005-01-02,64118,South Africa,England
1,2005-03-04,64123,Zimbabwe,South Africa
2,2005-03-10,64128,New Zealand,Australia
3,2005-03-11,64124,Zimbabwe,South Africa
4,2005-03-18,64129,Australia,New Zealand


In [4]:
def create_database_test(year):
    """Build and pickle the batting and bowling tables for one year of Tests.

    Looks up the matches for ``year`` with ``get_match_list``, parses each
    match's ``<match_id>.yaml`` scorecard once with ``print_scorecard`` and
    stacks the per-match frames into one batting and one bowling table.
    Both tables are pickled into ``database_dir`` as
    ``batting_record_test_<year>.df`` and ``bowling_record_test_<year>.df``.

    Parameters
    ----------
    year : int
        Year to process; forwarded to ``get_match_list`` and used in the
        output file names.

    Returns
    -------
    None
        The tables are written to disk. Nothing is returned, so calling this
        as the last line of a cell displays no output besides the progress
        prints.
    """
    import os

    df_matches = get_match_list(year=year, data_dir=data_dir)
    print ('Total # of matches:', df_matches.shape[0])
    col_bat = ['batsman', 'runs', 'BF', 'NO', '0s', '1s', '2s', '3s', '4s', '5s', '6s',
               'innings', 'Team', 'Against', 'date',  'match_id', 'Win', 'Draw', 'Toss',
               'runs_tot_inn', 'wkts_tot_inn']

    col_bowl = ['bowler', 'O', 'M', 'R', 'W', 'innings', 'all_overs',
                '0s', '1s', '2s', '3s', '4s', '5s', '6s', 'wds', 'nbs',
                'date', 'match_id', 'Win', 'draw', 'Toss']

    def stack(frames, columns):
        # Expected columns come first, followed by any extra columns the
        # scorecards produced, in order of first appearance.
        if not frames:
            return pd.DataFrame({}, columns=columns)
        stacked = pd.concat(frames, ignore_index=True)
        extras = [c for c in stacked.columns if c not in columns]
        return stacked.reindex(columns=columns + extras)

    # Collect the per-match frames and concatenate once at the end; growing a
    # DataFrame with concat inside the loop re-copies all rows every iteration.
    bat_frames, bowl_frames = [], []
    for match_id in df_matches.match_id:
        print (match_id)
        # Parse each scorecard exactly once (it used to be parsed twice).
        df_bat, df_bowl = print_scorecard(str(match_id) + '.yaml', data_dir=data_dir)
        bat_frames.append(df_bat)
        bowl_frames.append(df_bowl)

    df_bat_all  = stack(bat_frames,  col_bat)
    df_bowl_all = stack(bowl_frames, col_bowl)

    # File names follow the requested year instead of a hard-coded 2020.
    savename_bat  = os.path.join(database_dir, 'batting_record_test_{}.df'.format(year))
    savename_bowl = os.path.join(database_dir, 'bowling_record_test_{}.df'.format(year))

    # save the dataframes
    with open(savename_bat, 'wb') as fbat:
        pickle.dump(df_bat_all, fbat)

    with open(savename_bowl, 'wb') as fbowl:
        pickle.dump(df_bowl_all, fbowl)

# Build and pickle the batting and bowling tables for the 2020 matches.
create_database_test(2020)

Total # of matches: 22
1185305
1183534
1185306
1212541
1185307
1212542
1213062
1187685
1214666
1187686
1225247
1225248
1225249
1198241
1198242
1198243
1233957
1233958
1223869
1237356
1233962
1223870


In [5]:
# Reload the pickled 2020 tables; the `with` blocks close each file handle
# as soon as the table has been read.
with open( database_dir + '/batting_record_test_2020.df', 'rb') as fbat:
    df_bat = pickle.load(fbat)
with open( database_dir + '/bowling_record_test_2020.df', 'rb') as fbowl:
    df_bowl = pickle.load(fbowl)
df_bowl.head()

Unnamed: 0,bowler,O,M,R,W,innings,all_overs,0s,1s,2s,...,5s,6s,wds,nbs,date,match_id,Win,draw,Toss,7s
0,VD Philander,16.0,3,46,2,1st innings,"{1: {'R': 1, 'W': 0}, 3: {'R': 0, 'W': 1}, 5: ...",76,10,2,...,0,0,0,0,2020-01-03,1185305,False,False,False,0.0
1,K Rabada,19.5,3,68,3,1st innings,"{2: {'R': 7, 'W': 0}, 4: {'R': 5, 'W': 0}, 6: ...",93,8,8,...,0,0,0,1,2020-01-03,1185305,False,False,False,0.0
2,A Nortje,18.0,2,56,2,1st innings,"{13: {'R': 2, 'W': 0}, 15: {'R': 1, 'W': 0}, 1...",80,17,4,...,0,0,1,0,2020-01-03,1185305,False,False,False,0.0
3,KA Maharaj,27.0,6,68,1,1st innings,"{14: {'R': 5, 'W': 0}, 16: {'R': 1, 'W': 0}, 2...",127,25,0,...,0,2,0,0,2020-01-03,1185305,False,False,False,0.0
4,D Pretorius,11.0,5,26,2,1st innings,"{18: {'R': 0, 'W': 0}, 20: {'R': 5, 'W': 0}, 2...",57,2,2,...,0,0,0,0,2020-01-03,1185305,False,False,False,0.0


In [7]:
# Preview the first rows of the batting table.
df_bat.head()

Unnamed: 0,batsman,runs,BF,NO,0s,1s,2s,3s,4s,5s,...,Team,Against,date,match_id,Win,Draw,Toss,runs_tot_inn,wkts_tot_inn,7s
0,Z Crawley,4,15,False,13,1,0,1,0,0,...,England,South Africa,2020-01-03,1185305,True,False,True,269,10,0.0
1,DP Sibley,34,76,False,65,2,2,0,7,0,...,England,South Africa,2020-01-03,1185305,True,False,True,269,10,0.0
2,JL Denly,38,130,False,110,12,3,0,5,0,...,England,South Africa,2020-01-03,1185305,True,False,True,269,10,0.0
3,JE Root,35,49,False,32,10,1,1,5,0,...,England,South Africa,2020-01-03,1185305,True,False,True,269,10,0.0
4,BA Stokes,47,77,False,54,15,1,0,6,0,...,England,South Africa,2020-01-03,1185305,True,False,True,269,10,0.0


In [6]:
# Select every row of the bowling table that belongs to one bowler.
player = 'PJ Cummins'
dfp = df_bowl.loc[df_bowl['bowler'].eq(player)]
dfp

Unnamed: 0,bowler,O,M,R,W,innings,all_overs,0s,1s,2s,...,5s,6s,wds,nbs,date,match_id,Win,draw,Toss,7s
29,PJ Cummins,22.0,6,44,3,2nd innings,"{2: {'R': 0, 'W': 0}, 4: {'R': 0, 'W': 0}, 6: ...",112,11,2,...,0,1,0,0,2020-01-03,1183534,True,False,True,0.0
40,PJ Cummins,11.0,3,29,1,4th innings,"{2: {'R': 1, 'W': 0}, 4: {'R': 0, 'W': 0}, 6: ...",50,8,5,...,0,0,0,0,2020-01-03,1183534,True,False,True,0.0
330,PJ Cummins,21.1,8,48,3,1st innings,"{11: {'R': 1, 'W': 0}, 13: {'R': 0, 'W': 0}, 1...",109,11,6,...,0,1,1,3,2020-12-17,1223869,True,False,False,0.0
339,PJ Cummins,10.2,5,21,4,3rd innings,"{2: {'R': 6, 'W': 0}, 4: {'R': 0, 'W': 1}, 6: ...",53,2,4,...,0,0,0,0,2020-12-17,1223869,True,False,False,0.0
392,PJ Cummins,27.0,9,80,2,2nd innings,"{2: {'R': 0, 'W': 0}, 4: {'R': 5, 'W': 0}, 6: ...",124,16,11,...,0,0,0,0,2020-12-26,1223870,False,False,True,0.0
402,PJ Cummins,5.0,0,22,1,4th innings,"{2: {'R': 7, 'W': 0}, 4: {'R': 2, 'W': 0}, 6: ...",21,3,2,...,0,0,0,0,2020-12-26,1223870,False,False,True,0.0


In [None]:
## Testing the database
# The `with` blocks close each file handle as soon as the table has been read.
with open(database_dir+'/batting_record_test_2020.df', 'rb') as fbat:
    df_bat = pickle.load(fbat)
with open(database_dir+'/bowling_record_test_2020.df', 'rb') as fbowl:
    df_bowl = pickle.load(fbowl)
df_bowl.head()

In [8]:
# List the batting table's column names (note that 'runs' is lowercase).
df_bat.columns

Index(['batsman', 'runs', 'BF', 'NO', '0s', '1s', '2s', '3s', '4s', '5s', '6s',
       'innings', 'Team', 'Against', 'date', 'match_id', 'Win', 'Draw', 'Toss',
       'runs_tot_inn', 'wkts_tot_inn', '7s'],
      dtype='object')

In [None]:
def get_player_profile(player, batsman=True, year=2020):
    """Summarise one player's batting record from the pickled database.

    Parameters
    ----------
    player : str
        Name compared for exact equality against the ``batsman`` column.
    batsman : bool, default True
        When True the batting profile is built. A bowling profile is not
        implemented; ``None`` is returned when this is False.
    year : int, default 2020
        Selects ``batting_record_test_<year>.df`` inside ``database_dir``.

    Returns
    -------
    pandas.DataFrame or None
        One row with the columns ``player, Mat, Inns, Team, NO, Runs, BF, HS,
        Ave, SR, 50s, 100s, 4s, 6s``; an empty frame with the same columns
        when the player has no innings in the table; ``None`` when
        ``batsman`` is False.
    """
    import os

    columns = ['player', 'Mat', 'Inns', 'Team', 'NO', 'Runs', 'BF', 'HS',
               'Ave', 'SR', '50s', '100s', '4s', '6s']

    if not batsman:
        # Bowling profiles are not implemented; as before, nothing is produced.
        return None

    path = os.path.join(database_dir, 'batting_record_test_{}.df'.format(year))
    # NOTE: pickle.load can execute code embedded in the file, so only load
    # database files that this project generated itself.
    with open(path, 'rb') as fbat:
        df = pickle.load(fbat)

    dfp = df[df['batsman'] == player]

    Inns_ = dfp.shape[0]
    if Inns_ == 0:
        return pd.DataFrame({}, columns=columns)

    # The database column is lowercase 'runs'; there is no 'Runs' column.
    runs = dfp['runs']

    Mats_ = dfp['match_id'].nunique()
    Team_ = dfp['Team'].values[0]
    NOs_  = int(dfp['NO'].sum())
    Runs_ = runs.sum()
    BF_   = dfp['BF'].sum()
    HS_   = runs.max()

    # Average is runs per dismissal; a player who was never dismissed is
    # reported with the run total instead of dividing by zero.
    if Inns_ == NOs_:
        Ave_ = Runs_
    else:
        Ave_ = np.round(Runs_ / (Inns_ - NOs_), 2)

    # Strike rate is runs per 100 balls faced; 0 when no ball was faced.
    SR_ = np.round(100.0 * Runs_ / BF_, 2) if BF_ > 0 else 0.

    Fifty_   = ((runs >= 50) & (runs < 100)).sum()
    Hundred_ = (runs >= 100).sum()
    Fours_   = dfp['4s'].sum()
    Sixes_   = dfp['6s'].sum()

    row = [player, Mats_, Inns_, Team_, NOs_, Runs_, BF_, HS_, Ave_, SR_,
           Fifty_, Hundred_, Fours_, Sixes_]
    return pd.DataFrame([row], columns=columns)


 
# Reload the pickled 2020 tables; the `with` blocks close each file handle
# as soon as the table has been read.
with open( database_dir + '/batting_record_test_2020.df', 'rb') as fbat:
    df_bat = pickle.load(fbat)
with open( database_dir + '/bowling_record_test_2020.df', 'rb') as fbowl:
    df_bowl = pickle.load(fbowl)
df_bowl.head()
