Importing the necessary libraries

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import animation
import warnings
warnings.filterwarnings('ignore')
import arviz

In [2]:
def player_list_dict_init():
    """Build the empty column store used while scraping a player's bowling table.

    Returns:
        dict: one key per column label ('Overs', 'Mdns', 'Runs', 'Wkts', 'Econ',
        'Pos', 'Inns', 'Opp', 'Ground', 'Date', '#'), each mapped to its own
        fresh empty list.
    """
    column_labels = ('Overs', 'Mdns', 'Runs', 'Wkts', 'Econ', 'Pos',
                     'Inns', 'Opp', 'Ground', 'Date', '#')
    # a comprehension gives every column a distinct list object
    return {label: [] for label in column_labels}

In [3]:
def player_df_init(url, **dictionary):
    """Scrape a player's innings-by-innings bowling table into a dataframe.

    Downloads ``url``, takes the fourth-indexed (``[3]``) ``<table class="engineTable">``
    on the page and appends the stripped text of each data row's cells to the
    column lists supplied as keyword arguments.

    Args:
        url: address of the statistics page to download.
        **dictionary: column label -> list, normally the unpacked result of
            ``player_list_dict_init()``. The lists are appended to in place, so
            the caller's lists are filled as well.

    Returns:
        pandas.DataFrame: one row per scraped table row, all values as strings.

    Raises:
        requests.exceptions.RequestException: on a network error, timeout or
            non-2xx HTTP status.
        IndexError: if the page contains fewer than four ``engineTable`` tables.
        KeyError: if ``dictionary`` lacks one of the expected column labels.
    """
    # column label -> position of the <td> cell it is read from (cell 7 is not read)
    cell_index_by_column = (
        ('Overs', 0), ('Mdns', 1), ('Runs', 2), ('Wkts', 3), ('Econ', 4), ('Pos', 5),
        ('Inns', 6), ('Opp', 8), ('Ground', 9), ('Date', 10), ('#', 11),
    )
    cells_needed = 1 + max(position for _, position in cell_index_by_column)
    table_position = 3

    # a timeout stops the notebook hanging forever on a stalled connection, and
    # raise_for_status stops an error page being parsed as if it were data
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    tables = soup.find_all('table', {'class': 'engineTable'})
    if len(tables) <= table_position:
        raise IndexError(
            "expected at least %d 'engineTable' tables at %s, found %d"
            % (table_position + 1, url, len(tables))
        )

    for row in tables[table_position].find_all('tr'):
        cells = row.find_all('td')
        # rows without <td> cells, or with too few to be a data row, carry no
        # innings record, so they are skipped rather than raising IndexError
        if len(cells) < cells_needed:
            continue
        for column, position in cell_index_by_column:
            dictionary[column].append(cells[position].text.strip())

    return pd.DataFrame(dictionary)

In [4]:
def ave_var(df, col_name):
    """Compute the running variance of one column.

    Element ``k`` of the result is ``np.var`` of ``df.loc[0:k, col_name]``, i.e.
    of the rows labelled 0 through k inclusive (``.loc`` slices include the end
    label).

    Args:
        df: dataframe holding the column.
        col_name: label of the column to summarise.

    Returns:
        numpy.ndarray: float array with one running variance per row of ``df``.
    """
    n_rows = len(df)
    running_variances = (np.var(df.loc[0:last_label, col_name]) for last_label in range(n_rows))
    return np.fromiter(running_variances, dtype=float, count=n_rows)

In [5]:
def clean(df):
    """Turn the scraped innings table into numeric per-innings bowling statistics.

    Steps:
      1. keep 'Overs', 'Runs' and 'Wkts' and add 'Inns' (row label + 1, assigned
         before any row is removed, so the numbering counts the removed rows);
      2. remove rows whose 'Overs' is 'DNB', 'TDNB' or 'sub' (did not bowl);
      3. convert 'Runs' and 'Wkts' to int;
      4. add cumulative runs/wickets, the rolling average (cum_run / cum_wkt)
         and the per-innings average (Runs / Wkts);
      5. remove innings with 0 wickets and renumber the index from 0.

    Args:
        df: dataframe with at least the string columns 'Overs', 'Runs', 'Wkts'.
            It is not modified.

    Returns:
        pandas.DataFrame: columns 'Runs', 'Wkts', 'Inns', 'cum_run', 'cum_wkt',
        'ave_roll', 'ave_match', with a fresh 0..n-1 index.
    """
    not_bowled_markers = ['DNB', 'TDNB', 'sub']

    # explicit copy: the new columns are written to our own frame, never to a
    # slice of the caller's data (this is what raised SettingWithCopyWarning)
    innings = df[['Overs', 'Runs', 'Wkts']].copy()
    innings['Inns'] = innings.index + 1

    # drop rows where the player/team didn't bowl or the player was a sub
    innings = innings[~innings['Overs'].isin(not_bowled_markers)].copy()

    # the remaining rows hold numeric text, so the conversion is safe from here on
    innings['Runs'] = innings['Runs'].astype(int)
    innings['Wkts'] = innings['Wkts'].astype(int)

    # cumulative totals are taken before the wicketless innings are removed, so
    # runs conceded in those innings still count towards the rolling average
    innings['cum_run'] = innings['Runs'].cumsum()
    innings['cum_wkt'] = innings['Wkts'].cumsum()

    # rolling (career-to-date) and single-innings averages
    innings['ave_roll'] = innings['cum_run'] / innings['cum_wkt']
    innings['ave_match'] = innings['Runs'] / innings['Wkts']
    stats = innings[['Runs', 'Wkts', 'Inns', 'cum_run', 'cum_wkt', 'ave_roll', 'ave_match']]

    # a wicketless innings has no finite single-innings average, so it is removed
    stats = stats[stats['Wkts'] != 0].reset_index(drop=True)
    return stats

In [6]:
def bayes_calc(stats_clean, prior_mean=31.03, prior_std=6.52, dist_size=100000, hdi_prob=0.9):
    """Combine a normal prior with the player's latest record into a posterior average.

    Only the last row of ``stats_clean`` determines the result, so the posterior
    is computed for that row alone instead of sampling ``dist_size`` values for
    every row and discarding all but the last set.

    With prior N(prior_mean, prior_std**2) and the last row's rolling average
    ``ave_roll``, variance ``ave_var`` and count ``Inns``:

        precision = prior_std**-2 + Inns / ave_var
        pos_mean  = (prior_mean / prior_std**2 + Inns * ave_roll / ave_var) / precision
        pos_std   = precision ** -0.5

    Args:
        stats_clean: dataframe with columns 'ave_var', 'ave_roll' and 'Inns';
            'ave_var' in the last row must be non-zero.
        prior_mean: mean of the normal prior.
        prior_std: standard deviation of the normal prior.
        dist_size: number of posterior samples drawn for the interval estimate.
        hdi_prob: probability mass of the interval (0.9 -> 90% interval).

    Returns:
        tuple: (avg, bayes_avg, std_err, lower, upper), each rounded to 2 decimals:
        the last rolling average, the posterior mean, the posterior standard
        deviation, and the two ends of the ``hdi_prob`` interval.

    Raises:
        ValueError: if ``stats_clean`` has no rows.
        KeyError: if a required column is missing.
    """
    if len(stats_clean) == 0:
        raise ValueError("stats_clean has no rows: cannot compute a posterior without data")

    # float() keeps the negative powers below valid even if an integer is passed
    prior_mean = float(prior_mean)
    prior_std = float(prior_std)

    # likelihood summary taken from the most recent innings on record
    l_var = stats_clean['ave_var'].iloc[-1]
    l_n = stats_clean['Inns'].iloc[-1]
    l_avg = stats_clean['ave_roll'].iloc[-1]

    # precision-weighted update of the prior by the observed rolling average
    pos_mean = ((prior_mean / (prior_std) ** 2) + ((l_n * l_avg) / l_var)) / (prior_std ** -2 + (l_n / l_var))
    pos_std = (prior_std ** (-2) + (l_n / l_var)) ** (-0.5)

    # Monte Carlo estimate of the interval: the ends vary slightly between runs
    # unless numpy's global random state is seeded beforehand
    pos_samples = np.random.normal(pos_mean, pos_std, size = dist_size)
    hpd = arviz.hdi(pos_samples, hdi_prob = hdi_prob)

    # rounding relevant statistics to 2 decimal places
    bayes_avg = round(pos_mean, 2)
    avg = round(l_avg, 2)
    std_err = round(pos_std, 2)
    lower_90 = round(hpd[0], 2)
    upper_90 = round(hpd[1], 2)

    return avg, bayes_avg, std_err, lower_90, upper_90

In [7]:
def total(name, role, player_url=None):
    """Scrape one player's record and return their Bayesian bowling summary.

    Chains the helpers defined earlier in the notebook: build the empty column
    store, scrape the page, clean the table, add the running variance and
    standard error, drop zero-variance rows and compute the posterior.

    Args:
        name: player's name, passed straight through to the output.
        role: player's role (e.g. 'Seamer'), passed straight through to the output.
        player_url: page to scrape. When omitted, the notebook-level variable
            ``url`` is used, as in the original two-argument call.

    Returns:
        tuple: (name, role, avg, bayes_avg, std_err, lower_90, upper_90).

    Raises:
        NameError: if ``player_url`` is omitted and no global ``url`` is defined.
    """
    # prefer the explicit argument; fall back to the global so existing calls keep working
    if player_url is None:
        player_url = url

    # create an empty dictionary
    player_list_dict = player_list_dict_init()

    # create a dataframe of the player's innings-by-innings data
    df = player_df_init(player_url, **player_list_dict)

    # creating a cleaned dataframe
    stats = clean(df)

    # calculating variance and standard error columns
    stats['ave_var'] = ave_var(stats, 'ave_match')
    stats['std_err'] = (stats['ave_var'] / stats['Inns']) ** 0.5

    # rows with zero variance would cause a division by zero in the posterior, so drop them
    stats_clean = stats[stats['ave_var'] != 0].reset_index(drop = True)

    # calculating relevant bayesian statistics
    avg, bayes_avg, std_err, lower_90, upper_90 = bayes_calc(stats_clean)

    return name, role, avg, bayes_avg, std_err, lower_90, upper_90

In [8]:
# One-off bootstrap, kept commented out: creates an empty results table with the
# expected columns and saves it as bowlers.csv. Uncomment only to start a new file.
# bowlers = pd.DataFrame(columns = ["Name",'Role','Avg',"Bayes_Avg","std_err","90%_lower","90%_upper"])
# bowlers.to_csv("bowlers.csv", index = False)

# load the previously saved results table; read_csv needs bowlers.csv to exist
# in the working directory
bowlers = pd.read_csv('bowlers.csv')

In [9]:
# define the url variable: total() reads this module-level `url` when it scrapes
# the page, so it must be assigned before the call below
url = 'https://stats.espncricinfo.com/ci/engine/player/376116.html?class=1;template=results;type=bowling;view=innings'

# run the whole pipeline for this player and unpack the bayesian statistics
name, role, avg, bayes_avg, std_err, lower_90, upper_90 = total('Umesh Yadav', 'Seamer')

In [10]:
# appending the player's values to the dataframe.
# DataFrame.append was removed in pandas 2.0, so the row is added with pd.concat.
# NOTE: re-running this cell adds the same player again.
new_row = pd.DataFrame([{'Name': name, 'Role': role, 'Avg': avg, 'Bayes_Avg': bayes_avg,
                         'std_err': std_err, "90%_lower": lower_90, "90%_upper": upper_90}])
bowlers = pd.concat([bowlers, new_row], ignore_index = True)
bowlers

Unnamed: 0,Name,Role,Avg,Bayes_Avg,std_err,90%_lower,90%_upper
0,Ravichandran Ashwin,Spinner,23.66,24.3,1.93,21.15,27.48
1,Muthiah Muralidaran,Spinner,22.72,23.28,1.68,20.49,26.01
2,Shane Warne,Spinner,25.42,25.74,1.57,23.22,28.39
3,Anil Kumble,Spinner,29.63,29.73,1.7,26.89,32.49
4,Nathan Lyon,Spinner,30.86,30.87,1.8,27.87,33.81
5,James Anderson,Seamer,26.35,26.52,1.26,24.5,28.62
6,Stuart Broad,Seamer,27.68,27.83,1.37,25.55,30.06
7,Glenn Mcgrath,Seamer,21.64,21.93,1.13,20.09,23.8
8,Courtney Walsh,Seamer,24.45,24.68,1.23,22.64,26.69
9,Dale Steyn,Seamer,22.78,23.44,1.84,20.39,26.46


In [11]:
# write as a csv file when needed
# bowlers.to_csv("bowlers.csv", index = False)

In [12]:
# seam bowlers only, best (lowest) Bayesian average first
is_seamer = bowlers['Role'].eq('Seamer')
bowlers.loc[is_seamer].sort_values(by = 'Bayes_Avg')

Unnamed: 0,Name,Role,Avg,Bayes_Avg,std_err,90%_lower,90%_upper
15,Curtly Ambrose,Seamer,20.99,21.39,1.31,19.23,23.54
17,Malcolm Marshall,Seamer,20.95,21.54,1.58,18.97,24.16
7,Glenn Mcgrath,Seamer,21.64,21.93,1.13,20.09,23.8
32,Vernon Philander,Seamer,22.32,22.89,1.68,20.11,25.63
27,Kagisio Rabada,Seamer,22.35,23.08,1.89,19.89,26.09
35,Jasprit Bumrah,Seamer,21.99,23.14,2.33,19.36,26.99
12,Shaun Pollock,Seamer,23.12,23.42,1.27,21.35,25.49
9,Dale Steyn,Seamer,22.78,23.44,1.84,20.39,26.46
29,Pat Cummins,Seamer,22.85,23.61,1.98,20.36,26.89
14,Wasim Akram,Seamer,23.49,23.9,1.53,21.41,26.44


In [13]:
# spin bowlers only, best (lowest) Bayesian average first
is_spinner = bowlers['Role'].eq('Spinner')
bowlers.loc[is_spinner].sort_values(by = 'Bayes_Avg')

Unnamed: 0,Name,Role,Avg,Bayes_Avg,std_err,90%_lower,90%_upper
1,Muthiah Muralidaran,Spinner,22.72,23.28,1.68,20.49,26.01
0,Ravichandran Ashwin,Spinner,23.66,24.3,1.93,21.15,27.48
28,Ravindra Jadeja,Spinner,24.04,24.58,1.82,21.56,27.55
2,Shane Warne,Spinner,25.42,25.74,1.57,23.22,28.39
11,Rangana Herath,Spinner,28.08,28.42,2.23,24.81,32.13
3,Anil Kumble,Spinner,29.63,29.73,1.7,26.89,32.49
4,Nathan Lyon,Spinner,30.86,30.87,1.8,27.87,33.81
13,Harbhajan Singh,Spinner,32.46,32.34,1.89,29.28,35.5
