# Batting and Bowling Stats in the Nepal–Netherlands–Malaysia T20 Tri-Series (April 2021)

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os, glob, yaml, pickle
import datetime

from utils import get_player_profile, get_match_list

from print_scorecard import print_scorecard

data_dir = '../datasets/t20s_male/'

In [2]:
def get_tri_series_match_ids(n_recent=10, teams=("Nepal", "Netherlands", "Malaysia")):
    """Return the tri-series fixtures listed in the dataset's README.

    The README under ``data_dir`` is scanned for lines starting with ``20``
    (i.e. a year); each such line is split on ``' - '`` and the 1st, 5th and
    6th fields are taken as date, match id and ``"<team1> vs <team2>"``.

    Parameters
    ----------
    n_recent : int, default 10
        Only the first ``n_recent`` match lines of the README are considered.
        (Matches are kept in README order; nothing is re-sorted here.)
    teams : sequence of str
        A match is kept when its first-listed team is one of these.

    Returns
    -------
    pandas.DataFrame
        Columns ``date`` (datetime), ``match_id`` (str), ``team1``, ``team2``;
        the index keeps each match's position within the README listing.
    """
    readme_path = data_dir + '/README.txt'

    matches = []
    # The context manager closes the file even if parsing raises.
    with open(readme_path, 'r') as f_in:
        for line in f_in:
            if not (len(line) > 2 and line.startswith('20')):
                continue
            fields = line.strip('\n').split(' - ')
            if len(fields) < 6:
                # Not a full match record; skip rather than fail on indexing.
                continue
            match_teams = fields[5].strip().split('vs')
            if len(match_teams) < 2:
                continue
            matches.append([fields[0], fields[4],
                            match_teams[0].strip(), match_teams[1].strip()])

    df_matches = pd.DataFrame(matches, columns=['date', 'match_id', 'team1', 'team2'])
    df_matches['date'] = pd.to_datetime(df_matches['date'])

    df_recent = df_matches[:n_recent]
    df_tri = df_recent[df_recent['team1'].isin(list(teams))]

    return df_tri
    
# Build the tri-series fixture table once; later cells read this module-level
# `df_tri` (e.g. get_all_data iterates over its 'match_id' column).
df_tri = get_tri_series_match_ids()
df_tri

Unnamed: 0,date,match_id,team1,team2
0,2021-04-24,1257951,Nepal,Netherlands
2,2021-04-22,1257950,Nepal,Malaysia
4,2021-04-21,1257949,Netherlands,Malaysia
5,2021-04-20,1257948,Nepal,Netherlands
6,2021-04-19,1257947,Malaysia,Nepal
7,2021-04-18,1257946,Netherlands,Malaysia
8,2021-04-17,1257945,Netherlands,Nepal


In [3]:
def get_all_data():
    """Load the scorecards of every match in ``df_tri`` into two frames.

    For each ``match_id`` in the module-level ``df_tri`` the id is printed and
    ``print_scorecard('<match_id>.yaml', data_dir=data_dir)`` is called; it is
    unpacked as a ``(batting, bowling)`` pair of DataFrames.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        All batting rows and all bowling rows, each re-indexed from 0. The
        listed template columns come first, in the order given below.
    """
    bat_columns = ['batsman', '0s', '1s', '2s', '3s', '4s', '5s', '6s', '7s',
                   'Runs', 'BF', 'NO', 'Team', 'Against', 'Tied', 'Win',
                   'Toss','team-total', 'season', 'date', 'match-id' ]
    bowl_columns = ['bowler', 'O', 'M', 'Runs', 'W', 'ovs',
                    '0s', '1s', '2s', '3s', '4s', '5s', '6s', '7s',
                    'WD', 'NB', 'Team', 'Against', 'Tied', 'Win', 'Toss',
                    'team-total', 'season', 'date', 'match-id']

    # Start each list with an empty template so the column order is fixed and
    # pd.concat never receives an empty list (which would raise).
    bat_frames = [pd.DataFrame({}, columns=bat_columns)]
    bowl_frames = [pd.DataFrame({}, columns=bowl_columns)]

    for match_id in df_tri['match_id'].values:
        print (match_id)
        df_bat, df_bowl = print_scorecard(str(match_id)+'.yaml', data_dir=data_dir)
        bat_frames.append(df_bat)
        bowl_frames.append(df_bowl)

    # One concat per table instead of one per match: concatenating inside the
    # loop re-copies the accumulated frame on every iteration.
    df_bat_all = pd.concat(bat_frames, ignore_index=True)
    df_bowl_all = pd.concat(bowl_frames, ignore_index=True)

    return df_bat_all, df_bowl_all

# Module-level batting/bowling tables for the tri-series; get_player_profile
# and the get_full_*_profile helpers below read these globals directly.
df_bat_tri, df_bowl_tri = get_all_data()


1257951
1257950
1257949
1257948
1257947
1257946
1257945


In [36]:
def add_overs(overs):
    """Add up over figures written as ``<overs>.<balls>``.

    Each entry is converted to text and split on ``'.'``: the part before the
    dot counts completed overs, the part after it counts extra balls. Balls
    are carried into overs in groups of six, so ``['3.5', '4', '2.3']`` gives
    ``'10.2'`` (9 overs + 8 balls).

    Parameters
    ----------
    overs : sequence of str or numbers
        Over figures, e.g. ``['3.5', '4']`` or ``[3.5, 4]``.

    Returns
    -------
    str, int or float
        The total in the same notation. A string is returned when the first
        entry is a string, otherwise a number (``int`` when there are no
        leftover balls, else a ``float`` rounded to one decimal).
        An empty input returns ``0``.
    """
    if len(overs) == 0:
        return 0

    # The first element decides whether the caller gets text or a number back.
    as_text = isinstance(overs[0], str)

    ov_int = 0
    ov_frac = 0
    for ov in overs:
        ov_split = str(ov).split('.')
        ov_int += int(ov_split[0])
        if len(ov_split) > 1:
            ov_frac += int(ov_split[1])

    # Six balls make one over.
    ov_int += ov_frac // 6
    ov_frac = ov_frac % 6

    if ov_frac == 0:
        return str(ov_int) if as_text else ov_int

    if as_text:
        return str(ov_int) + '.' + str(ov_frac)
    # round() avoids binary artefacts such as 3*0.1 == 0.30000000000000004.
    return round(ov_int + ov_frac / 10, 1)

def Over2Balls(Over):
    """Convert an over figure in ``<overs>.<balls>`` notation to a ball count.

    Parameters
    ----------
    Over : str or number
        E.g. ``'19.5'`` (19 overs and 5 balls) or ``12``. Numbers are accepted
        because ``add_overs`` returns a number for numeric input.

    Returns
    -------
    int
        ``overs * 6 + balls``.
    """
    # str() makes numeric input work; for strings it is a no-op.
    ov_split = str(Over).split('.')
    all_balls = int(ov_split[0]) * 6

    if len(ov_split) > 1:
        all_balls += int(ov_split[1])
    return all_balls

def _batting_profile(player):
    """Summarise one batsman's rows of ``df_bat_tri`` as a one-row frame."""
    columns = ['batsman', 'Team', 'Innings', 'NO',
               'Runs', 'BF', 'HS', 'Ave','SR', '50s',
               '100s', '4s', '6s']
    dfp = df_bat_tri[df_bat_tri['batsman'] == player]
    Inns_ = dfp.shape[0]

    data_player = []
    if Inns_ != 0:
        Team_ = dfp.Team.values[0]
        Runs_ = dfp.Runs.sum()
        BF_   = dfp.BF.sum()
        NOs_  = int(dfp.NO.sum())
        HS_   = max(dfp.Runs)

        # A player can be dismissed without facing a ball; report 0.0 instead
        # of dividing by zero (mirrors the bowling SR/Ave default).
        SR_ = np.round(100*Runs_/BF_, 2) if BF_ > 0 else 0.0

        # Never dismissed: no dismissals to average over, so use total runs.
        if Inns_ == NOs_:
            Ave_ = Runs_
        else:
            Ave_ = np.round(Runs_/(Inns_-NOs_), 2)

        Fours_   = dfp['4s'].sum()
        Sixes_   = dfp['6s'].sum()
        Fifty_   = ((dfp.Runs>=50) & (dfp.Runs<100) ).sum()
        Hundred_ = (dfp.Runs>=100).sum()

        data_player.append([player, Team_, Inns_, NOs_, Runs_, BF_, HS_, Ave_,
                            SR_, Fifty_, Hundred_, Fours_, Sixes_])

    return pd.DataFrame(data_player, columns=columns)


def _bowling_profile(player):
    """Summarise one bowler's rows of ``df_bowl_tri`` as a one-row frame."""
    columns = ['bowler', 'Team', 'Innings', 'Overs', 'Wickets',
               'Runs', 'SR', 'Ave', 'Fours', 'Sixes', 'WDs', 'NBs',
               '0-Fers', '3-Fers', '4-Fers', '5-Fers']
    dfp = df_bowl_tri[df_bowl_tri['bowler'] == player]
    Inns_ = dfp.shape[0]

    data_player = []
    if Inns_ != 0:
        Team_ = dfp.Team.values[0]
        Ovs_  = add_overs(dfp['O'].values)
        Wkts_ = dfp['W'].sum()
        Runs_ = dfp['Runs'].sum()

        # add_overs returns a number for numeric input; Over2Balls parses text.
        Balls_ = Over2Balls(str(Ovs_))

        # Strike rate and average stay at 0 when no wicket was taken.
        SR_  = 0.
        Ave_ = 0.
        if Wkts_ > 0:
            SR_  = round( Balls_ / Wkts_ , 2 )
            Ave_ = round( Runs_ / Wkts_  , 2 )

        Fours_ = dfp['4s'].sum()
        Sixes_ = dfp['6s'].sum()
        WDs_   = dfp['WD'].sum()
        NBs_   = dfp['NB'].sum()

        # Counts are cumulative thresholds: a 4-wicket innings is also
        # counted under '3-Fers', and a 5-wicket innings under both.
        NoWs_    = ((dfp['W']==0)).sum()
        ThreeWs_ = ((dfp['W']>=3)).sum()
        FourWs_  = ((dfp['W']>=4)).sum()
        FiveWs_  = ((dfp['W']>=5)).sum()

        data_player.append([player, Team_, Inns_, Ovs_, Wkts_, Runs_, SR_, Ave_, Fours_, Sixes_,
                            WDs_, NBs_, NoWs_, ThreeWs_, FourWs_, FiveWs_])

    return pd.DataFrame(data_player, columns=columns)


def get_player_profile(player, batsman=True):
    """Return a one-row summary of a player's tri-series batting or bowling.

    NOTE: this definition shadows the ``get_player_profile`` imported from
    ``utils`` at the top of the notebook; after this cell runs, calls resolve
    to this version, which reads the module-level ``df_bat_tri`` /
    ``df_bowl_tri`` tables.

    Parameters
    ----------
    player : str
        Name as it appears in the ``batsman`` / ``bowler`` column.
    batsman : bool, default True
        ``True`` for the batting summary, ``False`` for the bowling summary.

    Returns
    -------
    pandas.DataFrame
        One row of aggregates, or an empty frame with the same columns when
        the player has no rows in the corresponding table.
    """
    if batsman:
        return _batting_profile(player)
    return _bowling_profile(player)

In [37]:
# Batting summary for a single player (batsman=True is the default).
player = 'K Bhurtel'
get_player_profile(player)

Unnamed: 0,batsman,Team,Innings,NO,Runs,BF,HS,Ave,SR,50s,100s,4s,6s
0,K Bhurtel,Nepal,5,1,278,198,77,69.5,140.4,4,0,24,16


In [38]:
# Bowling summary for a single player: batsman=False selects the bowling table.
player = 'S Lamichhane'
get_player_profile(player, batsman=False)

Unnamed: 0,bowler,Team,Innings,Overs,Wickets,Runs,SR,Ave,Fours,Sixes,WDs,NBs,0-Fers,3-Fers,4-Fers,5-Fers
0,S Lamichhane,Nepal,5,19.5,13,163,9.15,12.54,20,8,5,0,0,2,1,0


In [39]:
def get_full_batting_profile():
    """Return one batting-summary row per distinct batsman in ``df_bat_tri``.

    Each row comes from ``get_player_profile(batsman)``; rows appear in the
    order the batsmen first occur in ``df_bat_tri`` and are re-indexed from 0.
    """
    # The empty template fixes the column order and keeps pd.concat valid
    # even when there are no batsmen at all.
    template = pd.DataFrame({}, columns=['batsman', 'Team', 'Innings', 'NO', 'Runs',
                                         'BF', 'HS', 'Ave','SR', '50s', '100s', '4s', '6s'])
    profiles = [get_player_profile(batsman)
                for batsman in df_bat_tri['batsman'].unique()]

    # Single concat instead of growing the frame inside the loop.
    return pd.concat([template] + profiles, ignore_index=True)

def get_full_bowling_profile():
    """Return one bowling-summary row per distinct bowler in ``df_bowl_tri``.

    Each row comes from ``get_player_profile(bowler, batsman=False)``; rows
    appear in the order the bowlers first occur in ``df_bowl_tri`` and are
    re-indexed from 0.
    """
    # The empty template fixes the column order and keeps pd.concat valid
    # even when there are no bowlers at all.
    template = pd.DataFrame({}, columns=['bowler', 'Team', 'Innings', 'Overs', 'Wickets',
                                         'Runs','SR', 'Ave', 'Fours', 'Sixes', 'WDs', 'NBs',
                                         '0-Fers', '3-Fers', '4-Fers', '5-Fers'])
    profiles = [get_player_profile(bowler, batsman=False)
                for bowler in df_bowl_tri['bowler'].unique()]

    # Single concat instead of growing the frame inside the loop.
    return pd.concat([template] + profiles, ignore_index=True)

In [40]:
# Preview the first two rows of the per-bowler summary table.
get_full_bowling_profile().head(2)

Unnamed: 0,bowler,Team,Innings,Overs,Wickets,Runs,SR,Ave,Fours,Sixes,WDs,NBs,0-Fers,3-Fers,4-Fers,5-Fers
0,PA van Meekeren,Netherlands,4,12,0,116,0.0,0.0,10,8,9,1,4,0,0,0
1,VJ Kingma,Netherlands,5,16,3,160,32.0,53.33,17,8,8,5,3,0,0,0


In [41]:
def top_batsman(sort_by='Runs', n=8):
    """Return the ``n`` highest-ranked batsmen.

    Parameters
    ----------
    sort_by : str, default 'Runs'
        Column of the batting profile to rank by (descending).
    n : int, default 8
        Number of rows to return.
    """
    df = get_full_batting_profile()
    return df.sort_values(by=[sort_by], ascending=False).head(n)

def top_bowler(sort_by='Wickets', n=8):
    """Return the ``n`` highest-ranked bowlers, without the least-used columns.

    Parameters
    ----------
    sort_by : str, default 'Wickets'
        Column of the bowling profile to rank by (descending). Any profile
        column works, including the ones hidden from the result.
    n : int, default 8
        Number of rows to return.

    Returns
    -------
    pandas.DataFrame
        The ranked rows with '0-Fers', '5-Fers', 'WDs' and 'NBs' removed.
    """
    df = get_full_bowling_profile()
    ranked = df.sort_values(by=[sort_by], ascending=False).head(n)
    # Drop only after sorting, so ranking by a hidden column does not raise
    # KeyError.
    return ranked.drop(columns=['0-Fers', '5-Fers', 'WDs', 'NBs'])

# `display` is not imported in this notebook; presumably the IPython/Jupyter
# built-in that renders each frame as a table — confirm when running elsewhere.
display ( top_batsman(sort_by='Runs') )
display ( top_bowler(sort_by='Wickets'))


Unnamed: 0,batsman,Team,Innings,NO,Runs,BF,HS,Ave,SR,50s,100s,4s,6s
0,K Bhurtel,Nepal,5,1,278,198,77,69.5,140.4,4,0,24,16
5,MP O'Dowd,Netherlands,4,1,172,111,133,57.33,154.95,0,1,21,7
1,Aasif Sheikh,Nepal,5,1,154,111,54,38.5,138.74,1,0,12,10
4,DS Airee,Nepal,4,2,147,68,60,73.5,216.18,1,0,14,9
7,BN Cooper,Netherlands,5,2,136,94,55,45.33,144.68,2,0,20,2
8,BFW de Leede,Netherlands,4,1,123,96,81,41.0,128.12,1,0,10,7
22,Virandeep Singh,Malaysia,4,0,120,88,87,30.0,136.36,1,0,12,6
2,G Malla,Nepal,4,1,107,65,41,35.67,164.62,0,0,6,8


Unnamed: 0,bowler,Team,Innings,Overs,Wickets,Runs,SR,Ave,Fours,Sixes,3-Fers,4-Fers
9,S Lamichhane,Nepal,5,19.5,13,163,9.15,12.54,20,8,2,1
8,Karan KC,Nepal,5,17.3,7,115,15.0,16.43,14,3,2,0
5,TS Braat,Netherlands,3,8.0,5,92,9.6,18.4,3,9,0,0
6,Sompal Kami,Nepal,5,16.0,5,85,19.2,17.0,12,1,1,0
7,KS Airee,Nepal,2,7.1,5,38,8.6,7.6,2,1,1,0
3,BFW de Leede,Netherlands,5,12.0,4,131,18.0,32.75,15,4,0,0
2,PRP Boissevain,Netherlands,3,11.0,3,117,22.0,39.0,9,7,0,0
4,PM Seelaar,Netherlands,4,12.0,3,78,24.0,26.0,4,5,0,0


In [None]:
# Per-innings rows for one player from a single scorecard.
# NOTE(review): `df_bat` is only assigned in the following cell
# (print_scorecard for one match), so that cell must be run first.
player = 'K Bhurtel'
df_bat[df_bat['batsman']==player]

In [None]:
# Load one match's scorecard; print_scorecard is unpacked as a
# (batting, bowling) pair, and the first two batting rows are previewed.
match_id=1257951
df_bat, df_bowl = print_scorecard(str(match_id)+'.yaml', data_dir=data_dir)
df_bat.head(2)


In [None]:
# 0 2021-04-24	1257951	Nepal	Netherlands
# 1	2021-04-23	1257184	Zimbabwe	Pakistan
# 2	2021-04-22	1257950	Nepal	Malaysia
# 3	2021-04-21	1257183	Pakistan	Zimbabwe
# 4	2021-04-21	1257949	Netherlands	Malaysia
# 5	2021-04-20	1257948	Nepal	Netherlands
# 6	2021-04-19	1257947	Malaysia	Nepal
# 7	2021-04-18	1257946	Netherlands	Malaysia
# 8	2021-04-17	1257945	Netherlands	Nepal

In [None]:
def consistency_(th_runs=30, min_runs=1000):
    """Per-batsman scoring-consistency table built from the pickled batting data.

    Loads ``data_dir + 'batting.df'`` and, for every batsman whose total runs
    are at least ``min_runs``, computes innings/runs/balls totals, the number
    of innings (and the runs scored in them) with at least ``th_runs`` runs,
    and the mean, standard deviation and std/mean ratio of runs and of balls
    faced per innings.

    Parameters
    ----------
    th_runs : int, default 30
        Threshold an innings must reach to count in 'Innings_th', 'Th_plus'
        and 'Runs_th'.
    min_runs : int, default 1000
        Minimum total runs for a batsman to be included.

    Returns
    -------
    pandas.DataFrame
        One row per qualifying batsman, sorted by runs per innings ('RpI')
        in descending order. 'name_and_thplus' is ``"<player> (<Th_plus>)"``.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only load
    # batting.df files that were produced locally / come from a trusted source.
    with open(data_dir+'batting.df', 'rb') as f_in:
        df = pickle.load(f_in)

    data = []
    for player in df['batsman'].unique():
        dfp   = df[ df['batsman'] == player ]
        Runs_ = dfp.Runs.sum()
        if not (Runs_ >= min_runs):
            continue

        Inns_ = dfp.shape[0]
        BF_   = dfp.BF.sum()

        # Innings that reached the threshold; filtered once and reused.
        runs_over_th = dfp[dfp['Runs'] >= th_runs]['Runs']
        Inns_th = runs_over_th.shape[0]
        th_plus = Inns_th  # same count, kept as its own column for callers
        Runs_th = runs_over_th.sum()

        mu_R       = np.mean(dfp['Runs'])
        sigma_R    = np.std(dfp['Runs'])
        sigma_mu_R = sigma_R/mu_R

        mu_B       = np.mean(dfp['BF'])
        sigma_B    = np.std(dfp['BF'])
        sigma_mu_B = sigma_B/mu_B

        RpI = Runs_/Inns_
        BpI = BF_/Inns_
        data.append([player, Inns_, Inns_th, Runs_, Runs_th, BF_, th_plus, RpI, BpI,
                     mu_R, sigma_R, sigma_mu_R, mu_B, sigma_B, sigma_mu_B])

    df_ = pd.DataFrame( data, columns=[ 'player', 'Innings', 'Innings_th', 'Runs', 'Runs_th', 'BF',
                                       'Th_plus', 'RpI', 'BpI', 'mu_R', 'sigma_R', 'sigma_ov_mu_R',
                                       'mu_B', 'sigma_B', 'sigma_ov_mu_B' ])
    df_['name_and_thplus'] = df_['player']+" ("+df_['Th_plus'].astype(str)+")"
    df_sorted = df_.sort_values(by=['RpI'], ascending=False)

    return df_sorted

# Consistency table with the default thresholds (th_runs=30, min_runs=1000).
dfc=consistency_()
dfc

In [None]:
def Plot_sigma_mu():
    """Scatter mean runs per innings against the std/mean ratio of runs.

    Uses ``consistency_(th_runs=30, min_runs=2000)``; every point is labelled
    with the batsman's 'name_and_thplus' text, placed slightly left of and
    below the marker.
    """
    df_c = consistency_(th_runs=30, min_runs=2000)

    plt.figure(figsize=(20, 8))
    plt.plot(df_c['sigma_ov_mu_R'], df_c['mu_R'], '*', markersize=18, color='purple')

    # Walk the rows directly: the frame is sorted by 'RpI', so looking values
    # up by label 0..n-1 only works while the index is a permutation of that
    # range.
    for x, y, label in zip(df_c['sigma_ov_mu_R'], df_c['mu_R'], df_c['name_and_thplus']):
        plt.text(0.98*x, 0.97*y, label, fontsize=16, color='maroon')

    plt.xticks([0.72, 0.75, 0.8, 0.85, 0.9, 0.95, 1.0], fontsize=18);
    plt.yticks([20, 22, 24, 26, 28, 30, 32, 34, 36, 38], fontsize=18);
    plt.xlabel('Normalized Standard Deviation', fontsize=20)
    plt.ylabel('Runs Per Innings',   fontsize=20)
    plt.grid()
# Draw the consistency scatter plot.
Plot_sigma_mu()

## Strike Rate (SR) Consistency

In [None]:
# NOTE(review): the get_player_profile defined in this notebook reads the
# tri-series table df_bat_tri and returns an empty frame for a player with no
# rows there — confirm whether the utils version was intended for this player.
player='SK Raina'
dfs=get_player_profile(player)
dfs