In [1]:
import pandas as pd
import numpy as np
import glob
import os
import math
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns

Import cricsheet ball-by-ball data for Men's test matches and store in dataframes (one for each match/csv file). 

In [2]:
filepath = '/Users/lkc207/Documents/Cricket/Match_prediction/cricsheet_data/test_csv_male/'
all_files = glob.glob(filepath + "/*.csv")

df_list=[]

for filename in all_files:
    df = pd.read_csv(filename, skiprows=0,  names = ["Ball", "Innings", "Over", "Batting_team","Batter","Non-striker","Bowler","Runs","Extras","How_out","Batter_out"])
    file_path, file_extension = os.path.splitext(filename)
    match_id = file_path.split('/')[-1]
    df["Match_id"] = match_id  ### add the match_id extracted from filename as a column
    df["Start_date"] = df.loc[(df['Innings']=='date') & (df['Match_id']==match_id)]['Over'].iloc[0]
    df_list.append(df)

Concatenate list of dataframes into one df and reset index.

In [3]:
df_all = pd.concat(df_list).reset_index(drop=True)
df_all

Unnamed: 0,Ball,Innings,Over,Batting_team,Batter,Non-striker,Bowler,Runs,Extras,How_out,Batter_out,Match_id,Start_date
0,version,1.3.0,,,,,,,,,,522245,2011/08/04
1,info,team,Zimbabwe,,,,,,,,,522245,2011/08/04
2,info,team,Bangladesh,,,,,,,,,522245,2011/08/04
3,info,gender,male,,,,,,,,,522245,2011/08/04
4,info,season,2011,,,,,,,,,522245,2011/08/04
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1249238,ball,3,30.3,Sri Lanka,TT Samaraweera,HAPW Jayawardene,Umar Gul,0.0,1.0,,,388993,2009/02/21
1249239,ball,3,30.4,Sri Lanka,TT Samaraweera,HAPW Jayawardene,Umar Gul,4.0,0.0,,,388993,2009/02/21
1249240,ball,3,30.5,Sri Lanka,TT Samaraweera,HAPW Jayawardene,Umar Gul,1.0,0.0,,,388993,2009/02/21
1249241,ball,3,30.6,Sri Lanka,HAPW Jayawardene,TT Samaraweera,Umar Gul,0.0,0.0,,,388993,2009/02/21


Create dictionary of venues and corresponding country -- to be able to identify home team
The country of each venue is entered into the dictionary manually.

In [4]:
#### create dictionary of venues and corresponding country -- to be able to identify home team?
venues = df_all[df_all['Innings']=='venue']['Over'].unique()
venues_dict = dict.fromkeys(venues)

### set country of each venue manually.

venues_dict['Bellerive Oval'] = 'Australia'
venues_dict['Manuka Oval'] = 'Australia'
venues_dict['Adelaide Oval'] = 'Australia'
venues_dict['Sydney Cricket Ground'] = 'Australia'
venues_dict['Melbourne Cricket Ground'] = 'Australia'
venues_dict['Brisbane Cricket Ground, Woolloongabba'] = 'Australia'
venues_dict['Western Australia Cricket Association Ground'] = 'Australia'
venues_dict['Perth Stadium'] = 'Australia'
venues_dict['W.A.C.A. Ground'] = 'Australia'

venues_dict['Chittagong Divisional Stadium'] = 'Bangladesh'
venues_dict['Zohur Ahmed Chowdhury Stadium'] = 'Bangladesh'
venues_dict['Zahur Ahmed Chowdhury Stadium'] = 'Bangladesh'
venues_dict['Shere Bangla National Stadium'] = 'Bangladesh'
venues_dict['Sheikh Abu Naser Stadium'] = 'Bangladesh'
venues_dict['Shere Bangla National Stadium, Mirpur'] = 'Bangladesh'
venues_dict['Shaheed Chandu Stadium'] = 'Bangladesh'
venues_dict['Khan Shaheb Osman Ali Stadium'] = 'Bangladesh'
venues_dict['Sylhet International Cricket Stadium'] = 'Bangladesh'

venues_dict['Old Trafford'] = 'England'
venues_dict['Headingley'] = 'England'
venues_dict['Edgbaston'] = 'England'
venues_dict['Lord\'s'] = 'England'
venues_dict['Trent Bridge'] = 'England'
venues_dict['The Rose Bowl'] = 'England'
venues_dict['Riverside Ground'] = 'England'
venues_dict['Kennington Oval'] = 'England'
venues_dict['Sophia Gardens'] = 'England'

venues_dict['Eden Gardens'] = 'India'
venues_dict['Wankhede Stadium'] = 'India'
venues_dict['M Chinnaswamy Stadium'] = 'India'
venues_dict['M.Chinnaswamy Stadium'] = 'India'
venues_dict['Green Park'] = 'India'
venues_dict['Brabourne Stadium'] = 'India'
venues_dict['Sardar Patel Stadium, Motera'] = 'India'
venues_dict['Sardar Patel Stadium'] = 'India'
venues_dict['Punjab Cricket Association Stadium, Mohali'] = 'India'
venues_dict['Punjab Cricket Association IS Bindra Stadium, Mohali'] = 'India'
venues_dict['MA Chidambaram Stadium, Chepauk'] = 'India'
venues_dict['Feroz Shah Kotla'] = 'India'
venues_dict['Arun Jaitley Stadium'] = 'India'
venues_dict['Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium'] = 'India'
venues_dict['Saurashtra Cricket Association Stadium'] = 'India'
venues_dict['Rajiv Gandhi International Stadium, Uppal'] = 'India'
venues_dict['JSCA International Stadium Complex'] = 'India'
venues_dict['Rajiv Gandhi International Cricket Stadium, Dehradun'] = 'India'
venues_dict['Holkar Cricket Stadium'] = 'India'
venues_dict['Vidarbha C.A. Ground'] = 'India'
venues_dict['Maharashtra Cricket Association Stadium'] = 'India'
venues_dict['Himachal Pradesh Cricket Association Stadium'] = 'India'
venues_dict['Vidarbha Cricket Association Stadium, Jamtha'] = 'India'
venues_dict['Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium'] = 'India'

venues_dict['The Village, Malahide'] = 'Ireland'

venues_dict['University Oval'] = 'New Zealand'
venues_dict['Hagley Oval'] = 'New Zealand'
venues_dict['Eden Park'] = 'New Zealand'
venues_dict['Bay Oval'] = 'New Zealand'
venues_dict['Seddon Park'] = 'New Zealand'
venues_dict['McLean Park'] = 'New Zealand'
venues_dict['Basin Reserve'] = 'New Zealand'
venues_dict['AMI Stadium'] = 'New Zealand'

venues_dict['Rawalpindi Cricket Stadium'] = 'Pakistan'
venues_dict['Gaddafi Stadium'] = 'Pakistan'
venues_dict['Iqbal Stadium'] = 'Pakistan'
venues_dict['Multan Cricket Stadium'] = 'Pakistan'
venues_dict['National Stadium'] = 'Pakistan'

venues_dict['Newlands'] = 'South Africa'
venues_dict['Kingsmead'] = 'South Africa'
venues_dict['St George\'s Park'] = 'South Africa'
venues_dict['Mangaung Oval'] = 'South Africa'
venues_dict['OUTsurance Oval'] = 'South Africa'
venues_dict['SuperSport Park'] = 'South Africa'
venues_dict['The Wanderers Stadium'] = 'South Africa'
venues_dict['New Wanderers Stadium'] = 'South Africa'
venues_dict['Senwes Park'] = 'South Africa'

venues_dict['R.Premadasa Stadium, Khettarama'] = 'Sri Lanka'
venues_dict['R.Premadasa Stadium'] = 'Sri Lanka'
venues_dict['R Premadasa Stadium'] = 'Sri Lanka'
venues_dict['Pallekele International Cricket Stadium'] = 'Sri Lanka'
venues_dict['Galle International Stadium'] = 'Sri Lanka'
venues_dict['P Sara Oval'] = 'Sri Lanka'
venues_dict['Sinhalese Sports Club Ground'] = 'Sri Lanka'
venues_dict['P Saravanamuttu Stadium'] = 'Sri Lanka'
venues_dict['Asgiriya Stadium'] = 'Sri Lanka'

venues_dict['Sheikh Zayed Stadium'] = 'UAE'
venues_dict['Dubai International Cricket Stadium'] = 'UAE'
venues_dict['Sharjah Cricket Stadium'] = 'UAE'

venues_dict['Queen\'s Park Oval, Port of Spain'] = 'West Indies'
venues_dict['Sir Vivian Richards Stadium, North Sound'] = 'West Indies'
venues_dict['Daren Sammy National Cricket Stadium, St Lucia'] = 'West Indies'
venues_dict['Daren Sammy National Cricket Stadium, Gros Islet'] = 'West Indies'
venues_dict['Darren Sammy National Cricket Stadium, Gros Islet'] = 'West Indies'
venues_dict['Beausejour Stadium, Gros Islet'] = 'West Indies'
venues_dict['Kensington Oval, Bridgetown'] = 'West Indies'
venues_dict['Kensington Oval'] = 'West Indies'
venues_dict['Kensington Oval, Barbados'] = 'West Indies'
venues_dict['Sabina Park, Jamaica'] = 'West Indies'
venues_dict['Sabina Park, Kingston'] = 'West Indies'
venues_dict['Arnos Vale Ground, Kingstown'] = 'West Indies'
venues_dict['Warner Park, Basseterre'] = 'West Indies'
venues_dict['Antigua Recreation Ground, St John\'s'] = 'West Indies'
venues_dict['Antigua Recreation Ground'] = 'West Indies'
venues_dict['Windsor Park, Roseau'] = 'West Indies'
venues_dict['Providence Stadium'] = 'West Indies'
venues_dict['National Cricket Stadium, St George\'s'] = 'West Indies'
venues_dict['Sir Vivian Richards Stadium, Antigua'] = 'West Indies'

venues_dict['Harare Sports Club'] = 'Zimbabwe'
venues_dict['Queens Sports Club'] = 'Zimbabwe'

#### Check all venues have been assigned their country
for key in venues_dict.keys():
    if not venues_dict[key]:
        print(key,' , ',venues_dict[key])

Remove test matches with unusual features that may impact the modelling

In [5]:
#### Match_id 225258 was awarded to England (Pakistan ball tamepring?) Remove this from records for now
df_all = df_all[df_all['Match_id'] != '225258']

In [6]:
#### Match_id 1183534 involves penalty runs. Remove this from records for now
df_all = df_all[df_all['Match_id'] != '1183534']

In [7]:
#### Match_id 1140386 involves penalty runs. Remove this from records for now
df_all = df_all[df_all['Match_id'] != '1140386']

In [8]:
#### Match_id 1030217 involves penalty runs. Remove this from records for now
df_all = df_all[df_all['Match_id'] != '1030217']

If there was no 4th innings in the match and the match was not won by an innings then assume that the match was weather affected and so remove these records for now as this may affect models.

In [9]:
df_temp = df_all[df_all['Innings'].isin(['1','2','3','4'])][['Match_id','Innings']]
df_temp = df_temp.astype({'Innings': int})

df_max_innings = df_temp.groupby(['Match_id'])[['Innings']].max().reset_index()
no4th_matchid = df_max_innings[df_max_innings['Innings']<4]['Match_id'].tolist()
#print(no4th_matchid, len(no4th_matchid))
winbyinn_match_id = df_all[(df_all['Match_id'].isin(no4th_matchid)) & (df_all['Innings']=='winner_innings')  & \
       (df_all['Over']=='1')]['Match_id'].tolist()
#print(winbyinn_match_id , len(winbyinn_match_id ))
cutshort_match_id = list(set(no4th_matchid) - set(winbyinn_match_id))
#print(cutshort_match_id , len(cutshort_match_id ))

df_all = df_all[~df_all['Match_id'].isin(cutshort_match_id)]

### delete df_temp
del df_temp

Count number of runs scored, wickets taken and overs bowled in each innings and store in columns in df

In [10]:
df_temp = df_all.groupby(['Match_id','Innings'])[['Runs','Extras']].sum().reset_index()
df_temp['Total_runs']=df_temp['Runs']+df_temp['Extras']
df_temp.drop(df_temp[df_temp['Total_runs']==0].index, inplace=True)


df_temp2 = df_all.groupby(['Match_id','Innings'])['How_out'].count().reset_index()
df_temp2.rename(columns={"How_out": "Wickets"},inplace=True)
df_temp2 = df_temp2[df_temp2['Innings'].isin(['1','2','3','4'])]

df_new_a = pd.DataFrame(columns = ['Match_id', 'Innings1_runs', 'Innings1_wickets','Innings2_runs', 'Innings2_wickets',\
'Innings3_runs', 'Innings3_wickets','Innings4_runs', 'Innings4_wickets','Innings1_overs','Innings2_overs',\
                                   'Innings3_overs','Innings4_overs'])

df_new_a['Match_id'] = df_temp2['Match_id'].unique()

df_new_a['Innings1_runs'] = df_temp.pivot(index='Match_id',columns=['Innings'])['Total_runs']['1'].tolist()
df_new_a['Innings2_runs'] = df_temp.pivot(index='Match_id',columns=['Innings'])['Total_runs']['2'].tolist()
df_new_a['Innings3_runs'] = df_temp.pivot(index='Match_id',columns=['Innings'])['Total_runs']['3'].tolist()
df_new_a['Innings4_runs'] = df_temp.pivot(index='Match_id',columns=['Innings'])['Total_runs']['4'].tolist()

df_new_a['Innings1_wickets'] = df_temp2.pivot(index='Match_id',columns=['Innings'])['Wickets']['1'].tolist()
df_new_a['Innings2_wickets'] = df_temp2.pivot(index='Match_id',columns=['Innings'])['Wickets']['2'].tolist()
df_new_a['Innings3_wickets'] = df_temp2.pivot(index='Match_id',columns=['Innings'])['Wickets']['3'].tolist()
df_new_a['Innings4_wickets'] = df_temp2.pivot(index='Match_id',columns=['Innings'])['Wickets']['4'].tolist()

df_temp4 = df_all[df_all['Innings'].isin(['1','2','3','4'])]
df_temp4 = df_temp4.astype({'Over': float})
df_temp4 = df_temp4.groupby(['Match_id','Innings'])[['Over']].max().reset_index()

df_new_a['Innings1_overs'] = df_temp4.pivot(index='Match_id',columns=['Innings'])['Over']['1'].tolist()
df_new_a['Innings2_overs'] = df_temp4.pivot(index='Match_id',columns=['Innings'])['Over']['2'].tolist()
df_new_a['Innings3_overs'] = df_temp4.pivot(index='Match_id',columns=['Innings'])['Over']['3'].tolist()
df_new_a['Innings4_overs'] = df_temp4.pivot(index='Match_id',columns=['Innings'])['Over']['4'].tolist()

df_new_a

### delete df_temp
del df_temp
del df_temp2
del df_temp4

Extract and store the following information: Match_id, Home_team (team name), Away_team (team name), Start_date, Toss_winner (team name), Toss_decision (field or bat), Bat_first (team name), Field_first (team name), Outcome (team name or draw), By_innings (boolean value), Margin. Most variables should be self explantory.

In [11]:
###This is start of more efficient method but still need to extract players (see cell below)

df_new = pd.DataFrame(columns = ['Match_id', 'Home_team', 'Away_team','Toss_winner',\
                                     'Toss_decision','Bat_first','Field_first','Outcome','By_innings','Margin'])

df_new['Match_id'] = df_all['Match_id'].unique()
df_new['Toss_winner'] = df_all.loc[(df_all['Innings']=='toss_winner')]['Over'].reset_index(drop=True)
df_new['Toss_decision'] = df_all.loc[(df_all['Innings']=='toss_decision')]['Over'].reset_index(drop=True)

df_startdate = df_all.groupby(['Match_id'])['Start_date'].first().reset_index()

venue = df_all[df_all['Innings']=='venue']['Over'].reset_index(drop=True)

home_countries = [venues_dict[x] for x in venue.values]

df_new['Venue_country'] = home_countries

team1 = df_all.loc[(df_all['Innings']=='team')]['Over'].iloc[0::2].values
team2 = df_all.loc[(df_all['Innings']=='team')]['Over'].iloc[1::2].values

df_new['Team1'] = team1
df_new['Team2'] = team2

df_new.loc[df_new['Team1']==df_new['Venue_country'],'Home_team']= df_new.loc[df_new['Team1']==df_new['Venue_country'],'Team1']
df_new.loc[df_new['Team2']==df_new['Venue_country'],'Home_team']= df_new.loc[df_new['Team2']==df_new['Venue_country'],'Team2']
df_new.loc[df_new['Team1']==df_new['Venue_country'],'Away_team']= df_new.loc[df_new['Team1']==df_new['Venue_country'],'Team2']
df_new.loc[df_new['Team2']==df_new['Venue_country'],'Away_team']= df_new.loc[df_new['Team2']==df_new['Venue_country'],'Team1']

df_new.loc[df_new['Toss_decision']=='field','Field_first']=df_new.loc[df_new['Toss_decision']=='field','Toss_winner']
df_new.loc[df_new['Toss_decision']=='bat','Bat_first']=df_new.loc[df_new['Toss_decision']=='bat','Toss_winner']

df_new.loc[(df_new['Toss_decision']=='field') & (df_new['Field_first']==df_new['Team1']),'Bat_first']=\
df_new.loc[(df_new['Toss_decision']=='field') & (df_new['Field_first']==df_new['Team1']),'Team2']

df_new.loc[(df_new['Toss_decision']=='field') & (df_new['Field_first']==df_new['Team2']),'Bat_first']=\
df_new.loc[(df_new['Toss_decision']=='field') & (df_new['Field_first']==df_new['Team2']),'Team1']

df_new.loc[(df_new['Toss_decision']=='bat') & (df_new['Bat_first']==df_new['Team1']),'Field_first']=\
df_new.loc[(df_new['Toss_decision']=='bat') & (df_new['Bat_first']==df_new['Team1']),'Team2']

df_new.loc[(df_new['Toss_decision']=='bat') & (df_new['Bat_first']==df_new['Team2']),'Field_first']=\
df_new.loc[(df_new['Toss_decision']=='bat') & (df_new['Bat_first']==df_new['Team2']),'Team1']

df_new['Outcome'] = df_all.loc[(df_all['Innings']=='winner') | (df_all['Innings']=='outcome') ]['Over'].reset_index(drop=True)

by_an_innings_matches = df_all[(df_all['Innings']=='winner_innings')]['Match_id'].values

df_new.loc[df_new['Match_id'].isin(by_an_innings_matches),'By_innings'] = 1
df_new.loc[~df_new['Match_id'].isin(by_an_innings_matches),'By_innings'] = 0

df_new.loc[df_new['Outcome']!='draw','Margin']= (df_all.loc[(df_all['Innings']=='winner_runs') | (df_all['Innings']=='winner_wickets')]['Over']+', '+\
df_all.loc[(df_all['Innings']=='winner_runs') | (df_all['Innings']=='winner_wickets')]['Innings']).values

df_new = df_new.merge(df_startdate, how='inner', on='Match_id' )

df_new.drop(columns=['Team1','Team2','Venue_country'],inplace=True)

df_new

Unnamed: 0,Match_id,Home_team,Away_team,Toss_winner,Toss_decision,Bat_first,Field_first,Outcome,By_innings,Margin,Start_date
0,522245,Zimbabwe,Bangladesh,Bangladesh,field,Zimbabwe,Bangladesh,Zimbabwe,0,"130, winner_runs",2011/08/04
1,298794,South Africa,New Zealand,New Zealand,bat,New Zealand,South Africa,South Africa,1,"59, winner_runs",2007/11/16
2,913613,England,Sri Lanka,Sri Lanka,field,England,Sri Lanka,England,1,"88, winner_runs",2016/05/19
3,1144165,South Africa,Sri Lanka,South Africa,bat,South Africa,Sri Lanka,Sri Lanka,0,"8, winner_wickets",2019/02/21
4,456671,Sri Lanka,India,Sri Lanka,bat,Sri Lanka,India,India,0,"5, winner_wickets",2010/08/03
...,...,...,...,...,...,...,...,...,...,...,...
561,521225,Sri Lanka,England,Sri Lanka,bat,Sri Lanka,England,Sri Lanka,0,"75, winner_runs",2012/03/26
562,1225248,England,West Indies,West Indies,field,England,West Indies,England,0,"113, winner_runs",2020/07/16
563,1187686,New Zealand,India,New Zealand,field,India,New Zealand,New Zealand,0,"7, winner_wickets",2020/02/29
564,657647,,,Pakistan,field,Sri Lanka,Pakistan,draw,0,,2013/12/31


Form two new columns of df_new: Bat_first_players and Bowl_first_players

#### Currently is long winded. There is probably a more efficient way to do this...

In [12]:
df_temp_bat = df_all.groupby(['Match_id','Batting_team','Innings'])['Batter'].unique().reset_index()
df_temp_bowl = df_all.groupby(['Match_id','Batting_team','Innings'])['Bowler'].unique().reset_index()
df_temp = df_temp_bat.merge(df_temp_bowl, how='inner', on=['Match_id','Batting_team','Innings'])

df_temp['Bat_first'] = df_temp[df_temp['Innings']=='1']['Batting_team']
df_temp.sort_values(by=['Match_id','Bat_first'],inplace=True)
df_temp.fillna(method='ffill',inplace=True)

def func_to_try(x):
    x=np.array(x)
    if len(x)==2:
        s = list(set(x[0].tolist()+x[1].tolist()))
    elif len(x)==1:
        s = list(set(x[0].tolist()))
    return s
        

df_temp = df_temp.groupby(['Match_id','Batting_team','Bat_first']).agg({'Batter': func_to_try, \
                                                  'Bowler': func_to_try}).reset_index()


df_temp['Bowler_switch'] = df_temp['Bowler'].copy()
df_temp['Bowler_switch'].iloc[0::2]=df_temp['Bowler_switch'].iloc[1::2].values
df_temp['Bowler_switch'].iloc[1::2]=df_temp['Bowler'].iloc[0::2].values

df_temp['players'] = [list(set(df_temp['Batter'][i] + df_temp['Bowler_switch'][i]))\
 for i in range(len(df_temp['Batter'].values.tolist()))]

df_temp.loc[df_temp['Batting_team']==df_temp['Bat_first'],'Bat_first_players']=df_temp['players']
df_temp.loc[df_temp['Batting_team']!=df_temp['Bat_first'],'Bowl_first_players']=df_temp['players']

df_new_b= pd.DataFrame(columns = ['Match_id','Bat_first_players','Bowl_first_players'])

df_new_b['Match_id']=df_temp['Match_id'].unique().copy()
df_new_b['Bat_first_players']=df_temp['Bat_first_players'].dropna().reset_index(drop=True).values.copy()
df_new_b['Bowl_first_players']=df_temp['Bowl_first_players'].dropna().reset_index(drop=True).values.copy()

df_new = df_new.merge(df_new_b, how='inner', on='Match_id')


In [14]:
del df_temp 
del df_temp_bat 
del df_temp_bowl 
del df_new_b

Merge df_new and df_new_a dataframes using common, unique key of 'Match_id'

In [15]:
df_new = df_new.merge(df_new_a, how='inner', on='Match_id')

Make sure Match_id is an integer

In [16]:
df_new = df_new.astype({'Match_id': int})

Count how many matches have 12 or more Bat_first_players or Bowl_first_players listed. These will be removed from the set for modelling purposes as it is not a usual occurrence. Concussion subs are one possible reason for this.

In [17]:
count_over = df_new[(df_new['Bat_first_players'].str.len()>=12)]['Match_id'].count()
print(f'{count_over} out of {len(df_new)}')
count_over = df_new[(df_new['Bowl_first_players'].str.len()>=12)]['Match_id'].count()
print(f'{count_over} out of {len(df_new)}')

3 out of 566
3 out of 566


In [18]:
matches_to_remove = df_new[(df_new['Bat_first_players'].str.len()>=12)|(df_new['Bowl_first_players'].str.len()>=12)]['Match_id'].values.tolist()
matches_to_remove

df_new = df_new[~df_new['Match_id'].isin(matches_to_remove)]
df_new

Unnamed: 0,Match_id,Home_team,Away_team,Toss_winner,Toss_decision,Bat_first,Field_first,Outcome,By_innings,Margin,...,Innings2_runs,Innings2_wickets,Innings3_runs,Innings3_wickets,Innings4_runs,Innings4_wickets,Innings1_overs,Innings2_overs,Innings3_overs,Innings4_overs
0,522245,Zimbabwe,Bangladesh,Bangladesh,field,Zimbabwe,Bangladesh,Zimbabwe,0,"130, winner_runs",...,287.0,10.0,291.0,5.0,244.0,10.0,130.6,96.2,91.6,57.3
1,298794,South Africa,New Zealand,New Zealand,bat,New Zealand,South Africa,South Africa,1,"59, winner_runs",...,383.0,10.0,136.0,9.0,,,56.4,97.3,34.3,
2,913613,England,Sri Lanka,Sri Lanka,field,England,Sri Lanka,England,1,"88, winner_runs",...,91.0,10.0,119.0,10.0,,,90.3,36.4,35.3,
3,1144165,South Africa,Sri Lanka,South Africa,bat,South Africa,Sri Lanka,Sri Lanka,0,"8, winner_wickets",...,154.0,9.0,128.0,10.0,197.0,2.0,61.2,37.4,44.3,45.4
4,456671,Sri Lanka,India,Sri Lanka,bat,Sri Lanka,India,India,0,"5, winner_wickets",...,436.0,10.0,267.0,10.0,258.0,5.0,137.6,106.1,85.2,68.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
561,521225,Sri Lanka,England,Sri Lanka,bat,Sri Lanka,England,Sri Lanka,0,"75, winner_runs",...,193.0,10.0,214.0,10.0,264.0,10.0,96.3,46.4,84.3,98.6
562,1225248,England,West Indies,West Indies,field,England,West Indies,England,0,"113, winner_runs",...,287.0,10.0,129.0,3.0,198.0,10.0,161.6,98.6,18.7,70.1
563,1187686,New Zealand,India,New Zealand,field,India,New Zealand,New Zealand,0,"7, winner_wickets",...,235.0,10.0,124.0,10.0,132.0,3.0,62.6,73.1,45.6,35.7
564,657647,,,Pakistan,field,Sri Lanka,Pakistan,draw,0,,...,383.0,10.0,480.0,5.0,158.0,2.0,64.6,129.1,168.3,51.6


Split each of the 'Bat_first_players' and 'Bowl_first_players' lists so each player has their own column.

In [19]:
df_new[['p1','p2','p3','p4','p5','p6','p7','p8','p9','p10','p11']] \
= pd.DataFrame(df_new['Bat_first_players'].tolist(), index= df_new.index)

df_new[['p12','p13','p14','p15','p16','p17','p18','p19','p20','p21','p22']] \
= pd.DataFrame(df_new['Bowl_first_players'].tolist(), index= df_new.index)



Reset the index

In [20]:
df_new.reset_index(inplace=True)

Because of the way the data has been extracted from the cricsheet files, if a player didn't bat or bowl in the match then they won't be listed. Find which cases are missing and manually impute the data using cricinfo. First, split the strings in the list:

Print list of match ids to impute. This will help find the relevant info on cricinfo

In [21]:
matches_to_impute = df_new[(df_new['Bat_first_players'].str.len()<11) | (df_new['Bowl_first_players'].str.len()<11)]['Match_id'].values
print(matches_to_impute)

[ 463146  892509   64123  298803  534225  249193 1109602  474463  514034
  291338  258460  530424  518951 1183531  210366  293480 1119550 1187008
  742611 1233957  892517  892515]


Manually enter the missing players for these matches

In [22]:
### 463146
df_new.loc[df_new['Match_id']==463146,'p21'] = 'AG Prince'
df_new.loc[df_new['Match_id']==463146,'p22'] = 'MV Boucher'

### 892509
df_new.loc[df_new['Match_id']==892509,'p11'] = 'PM Nevill'

### 64123
df_new.loc[df_new['Match_id']==64123,'p21'] = 'AG Prince'
df_new.loc[df_new['Match_id']==64123,'p22'] = 'MV Boucher'

### 298803
df_new.loc[df_new['Match_id']==298803,'p22'] = 'MV Boucher'

### 534225
df_new.loc[df_new['Match_id']==534225,'p21'] = 'AB de Villiers'
df_new.loc[df_new['Match_id']==534225,'p22'] = 'JA Rudolph'

### 249193
df_new.loc[df_new['Match_id']==249193,'p22'] = 'HAPW Jayawardene'

### 1109602
df_new.loc[df_new['Match_id']==1109602,'p22'] = 'DAS Gunaratne'

### 474463
df_new.loc[df_new['Match_id']==474463,'p22'] = 'MJ Prior'

### 514034
df_new.loc[df_new['Match_id']==514034,'p11'] = 'MV Boucher'

### 291338
df_new.loc[df_new['Match_id']==291338,'p11'] = 'AC Gilchrist'

### 258460
df_new.loc[df_new['Match_id']==258460,'p22'] = 'RR Sarwan'

### 530424
df_new.loc[df_new['Match_id']==530424,'p22'] = 'A Akmal'

### 518951
df_new.loc[df_new['Match_id']==518951,'p22'] = 'BJ Haddin'

### 1183531
df_new.loc[df_new['Match_id']==1183531,'p10'] = 'TM Head'
df_new.loc[df_new['Match_id']==1183531,'p11'] = 'TD Paine'

### 210366
df_new.loc[df_new['Match_id']==210366,'p22'] = 'GO Jones'

### 293480
df_new.loc[df_new['Match_id']==293480,'p22'] = 'HAPW Jayawardene'

### 1119550
df_new.loc[df_new['Match_id']==1119550,'p22'] = 'AU Rashid'

### 1187008
df_new.loc[df_new['Match_id']==1187008,'p11'] = 'WP Saha'

### 742611
df_new.loc[df_new['Match_id']==742611,'p11'] = 'A Shafiq'

### 1233957
df_new.loc[df_new['Match_id']==1233957,'p22'] = 'SO Dowrich'

### 892517
df_new.loc[df_new['Match_id']==892517,'p11'] = 'PM Nevill'

### 892515
df_new.loc[df_new['Match_id']==892515,'p11'] = 'PM Nevill'

Check if all missing players have been filled out

In [23]:
sw = 0
for ap in ['p1','p2','p3','p4','p5','p6','p7','p8','p9','p10','p11',\
    'p12','p13','p14','p15','p16','p17','p18','p19','p20','p21','p22']:
     if df_new[ap].isnull().sum() != 0:
        print(f'{ap} has missing players')
        sw = 1
if sw==0:
    print(f'No missing players')

No missing players


Drop Bat_first_players and Bowl_first_players columns now as these are incomplete and can be remade later from the individual player columns if necessary

In [24]:
df_new.drop(columns=['Bat_first_players','Bowl_first_players'],inplace=True)

Import data for ICC test batting and bowling rankings in order to add player ratings to dataframe

In [25]:
df_bat = pd.read_csv('./ICC_rankings/ICCtestbattingrankings.csv')
df_bowl = pd.read_csv('./ICC_rankings/ICCtestbowlingrankings.csv')

In [26]:
for df_i in [df_bat, df_bowl]:
    df_i[['Initial','Surname']]=df_i['Name'].str.rsplit(n=1,expand=True)
    df_i['Surname'].fillna(df_i['Name'],inplace=True)
    df_i['Date'] = pd.to_datetime(df_i['Date'])

For each player, get their rating at start of each test match. Give a default rating of 200 batting and 50 bowling to those not in the top 100

#### These default ratings are fairly arbitrary at the moment. May want to amend this... Also this could be more efficient

In [None]:
for ap in ['p1','p2','p3','p4','p5','p6','p7','p8','p9','p10','p11',\
        'p12','p13','p14','p15','p16','p17','p18','p19','p20','p21','p22']:
    df_new[ap+'_bat_rating'] = 200 #### give default rating to those not in top 100  
    df_new[ap+'_bowl_rating'] = 50 #### give default rating to those not in top 100 
    
for df_i in [df_bat, df_bowl]:
    if df_i.equals(df_bat):
        discipline='_bat'
    elif df_i.equals(df_bowl):
        discipline='_bowl'
    else:
        print('something has gone wrong?')
        break
        
    counter=0
    for matchid in df_new['Match_id']:
        date = df_new[df_new['Match_id']==matchid]['Start_date'].values[0]
        counter+=1
        print('##############################',counter, date)
        top_players = df_i[df_i['Date']==date]['Surname'].values.tolist()
        for ap in ['p1','p2','p3','p4','p5','p6','p7','p8','p9','p10','p11',\
            'p12','p13','p14','p15','p16','p17','p18','p19','p20','p21','p22']:
            for player in df_new.loc[df_new['Match_id']==matchid, ap]:
                if isinstance(player, str): ### not nan (float)    
                    #print(player,player.split()[-1])
                    if  player.split()[-1] in top_players:
                        #print(f'{player} IS in top 100 {discipline} on {date}')
                        df_new.loc[df_new['Match_id']==matchid, ap+discipline+'_rating'] = df_i.loc[(df_i['Date']==date) & (df_i['Surname']==player.split()[-1]),'Rating'].values[0] 
                    else:
                        #print(f'{player} not in top 100 {discipline} on {date}')
                        pass

############################## 1 2011/08/04
############################## 2 2007/11/16
############################## 3 2016/05/19
############################## 4 2019/02/21
############################## 5 2010/08/03
############################## 6 2009/03/06
############################## 7 2013/11/06
############################## 8 2012/01/25
############################## 9 2018/12/06
############################## 10 2010/12/26
############################## 11 2005/01/02
############################## 12 2020/12/26
############################## 13 2005/03/11
############################## 14 2012/11/25
############################## 15 2016/02/12
############################## 16 2012/11/17
############################## 17 2008/10/09
############################## 18 2019/08/14
############################## 19 2010/12/16
############################## 20 2007/05/18
############################## 21 2012/01/17
############################## 22 2009/07/17
###################

############################## 182 2015/08/20
############################## 183 2009/01/03
############################## 184 2007/10/01
############################## 185 2015/07/03
############################## 186 2014/12/26
############################## 187 2019/10/02
############################## 188 2015/10/22
############################## 189 2011/07/21
############################## 190 2017/09/28
############################## 191 2013/10/09
############################## 192 2017/07/26
############################## 193 2014/06/08
############################## 194 2006/12/26
############################## 195 2017/07/14
############################## 196 2009/12/04
############################## 197 2017/09/04
############################## 198 2017/12/14
############################## 199 2008/04/11
############################## 200 2016/11/17
############################## 201 2009/04/03
############################## 202 2007/12/08
############################## 203

############################## 361 2019/02/01
############################## 362 2010/10/01
############################## 363 2010/11/12
############################## 364 2009/11/26
############################## 365 2011/12/17
############################## 366 2016/07/22
############################## 367 2014/11/17
############################## 368 2013/09/03
############################## 369 2013/02/22
############################## 370 2005/11/12
############################## 371 2014/08/09
############################## 372 2018/03/30
############################## 373 2009/08/18
############################## 374 2010/01/03
############################## 375 2011/09/01
############################## 376 2013/12/26
############################## 377 2014/02/12
############################## 378 2010/01/14
############################## 379 2009/08/26
############################## 380 2014/02/14
############################## 381 2015/01/02
############################## 382

############################## 540 2013/12/13
############################## 541 2008/10/29
############################## 542 2008/01/02
############################## 543 2013/12/26
############################## 544 2018/03/01
############################## 545 2008/03/22
############################## 546 2016/11/06
############################## 547 2010/06/04
############################## 548 2010/08/26
############################## 549 2012/02/03
############################## 550 2009/05/14
############################## 551 2016/11/03
############################## 552 2016/08/04
############################## 553 2012/07/08
############################## 554 2019/01/03
############################## 555 2018/05/24
############################## 556 2012/03/26
############################## 557 2020/07/16
############################## 558 2020/02/29
############################## 559 2013/12/31
############################## 560 2013/01/03
############################## 1 2

############################## 161 2008/04/03
############################## 162 2017/08/27
############################## 163 2017/12/02
############################## 164 2012/11/21
############################## 165 2008/05/23
############################## 166 2009/12/16
############################## 167 2017/07/14
############################## 168 2013/08/21
############################## 169 2006/12/14
############################## 170 2013/05/24
############################## 171 2017/08/03
############################## 172 2017/10/06
############################## 173 2011/07/29
############################## 174 2011/11/01
############################## 175 2012/01/26
############################## 176 2006/01/29
############################## 177 2007/10/08
############################## 178 2008/12/26
############################## 179 2013/03/22
############################## 180 2005/11/20
############################## 181 2013/03/14
############################## 182

############################## 340 2012/01/13
############################## 341 2016/10/30
############################## 342 2012/01/24
############################## 343 2010/02/14
############################## 344 2011/12/01
############################## 345 2009/07/12
############################## 346 2014/09/05
############################## 347 2018/05/11
############################## 348 2014/06/26
############################## 349 2019/12/12
############################## 350 2008/07/23
############################## 351 2009/12/26
############################## 352 2012/03/15
############################## 353 2018/08/09
############################## 354 2006/07/13
############################## 355 2016/08/11
############################## 356 2018/01/13
############################## 357 2008/11/19
############################## 358 2007/06/25
############################## 359 2014/06/20
############################## 360 2011/11/22
############################## 361

In [None]:
df_new.to_csv('test_match_summary_with_player_rankings.csv',index=False)