In [1]:
import requests
import os
import yaml
import zipfile
import pandas as pd
import datetime
import numpy as np
import pickle
from os import path
import csv

In [2]:
CRICSHEET_URL = 'https://cricsheet.org/downloads/ipl.zip'
TARGET_PATH = os.path.abspath(os.getcwd()) + '//data//ipl.zip'
MATCH_CSV_PATH = os.path.abspath(os.getcwd()) + '//data//match_data.csv'
DELIVERIES_CSV_PATH = os.path.abspath(os.getcwd()) + '//data//deliveries_data.csv'
PROCESSED_FILES_LOG = os.path.abspath(os.getcwd()) + '//data//processed.pkl'
CHUNK_SIZE = 128

In [3]:
def download_data():    
    """
    Description
    -----------
    Downloads data from Cricsheet website and saves in 
    the current working directory
    """
    r = requests.get(CRICSHEET_URL, stream=True)
    with open(TARGET_PATH, 'wb') as fd:
        for chunk in r.iter_content(chunk_size=CHUNK_SIZE):
            fd.write(chunk)
            


In [37]:
def initialize():    
    """
    Description
    -----------
    Initalizes the python script to download and update 
    latest IPL match information.
    
    global variables
    ----------------
    ipl_zip    : Reference to downloaded IPL Data ZIP file
    match_list : List of files within ipl_zip, where each file 
        contains data of one IPL match 
    files_processed : Set Object, that contains list of all the 
        files that are already processed.
    matches_df : Pandas Dataframe, it creates and updates IPL Match 
        overview in MATCH_CSV_PATH
    match_id   : Unique ID that corresponds to each match record
    deliveries_df : Pandas Dataframe, creates and updates ball 
        by ball information of an IPL Match. 
    """
    global ipl_zip, match_list, files_processed, matches_df, match_id, deliveries_df
    download_data()
    ipl_zip = zipfile.ZipFile(TARGET_PATH)
    match_list = ipl_zip.namelist()
    print("Total Match Records: ", len(match_list))

    # Load Files Processed List
    if(path.isfile(PROCESSED_FILES_LOG) == False):
        with open(PROCESSED_FILES_LOG, "wb") as file_handle:
            pickle.dump({"README.txt"}, file_handle)

    with open(PROCESSED_FILES_LOG, "rb") as file_handle:
        files_processed = pickle.load(file_handle)
    
    # Load Match File Data
    matches_columns = ['dates','city','season','host_team','visiting_team', 'toss_winner','toss_decision','venue','result','winner','player_of_match']
    if(path.isfile(MATCH_CSV_PATH) == False):
        matches_df = pd.DataFrame(columns=matches_columns)
        match_id = int(0)
    else:
        matches_df = pd.read_csv(MATCH_CSV_PATH)
        match_id = int(matches_df['match_id'].max())
        
    # Load Deliveries File data
    deliveries_columns = ['batting_team','bowling_team','batsman','bowler','over','ball','non_striker','total_runs','batsman_runs','extras_runs']
    if(path.isfile(MATCH_CSV_PATH) == False):
        deliveries_df = pd.DataFrame(columns=deliveries_columns)
    else:
        deliveries_df = pd.read_csv(DELIVERIES_CSV_PATH)
    

In [5]:
def save_processed_files():
    """
    Description:
    ------------
    This method is called at the end after processing the data to
    store the newly created or updated records to memory.
    """
    with open(PROCESSED_FILES_LOG, "wb") as file_handle:
        pickle.dump(files_processed, file_handle)
    matches_df.to_csv (MATCH_CSV_PATH, index = False, header=True)
    deliveries_df.to_csv (DELIVERIES_CSV_PATH, index = False, header=True)

In [6]:
def get_data(obj, key):
    """
    Returns np.nan if the requested value is not 
    present withiin the object
    """
    try:
        if isinstance(key, list):
            for each in key:
                obj = obj[each]
            return obj
        else:
            if key in obj:
                return obj[key]
    except:
        print('ERROR: key: ',key, ' | obj: ', obj)
        return np.nan

In [7]:
def get_match_data(match_id, info):
    """
    Extracts information from the info object and returns 
    structured information about the match.
    
    Returns
    -------
    A dictionary with values for the following keys
    
    Keys:
    --------
    match_id : int, Unique ID associated with each IPL match
    dates : DateTime, Date of the match
    city : String, City where the match is played
    season : int, year the match is played
    host_team : str, name of the host team
    visiting_team : str, name of the visiting team
    toss_winner : str, name of the team which won the toss
    toss_decision : str, either 'bat' or 'field'
    venue : str, name of the cricket ground
    result : int, 0 - No Result, 1 - One team won, 2 - Tie
    winner : str, Name of the team which won
        In case of No result, 'no result'
        In case of Tie, Team which won the eliminator
    player_of_match : str, Name of player who won the 
        Man of the match award
    """
    
    data = {}

    data['match_id'] = int(match_id)

    data['dates'] = get_data(info, ['dates', 0])
    if(isinstance(data['dates'], datetime.date) != True):
        data['dates'] = datetime.datetime.strptime(data['dates'], '%Y-%m-%d')

    data['city'] = get_data(info,'city')
    data['season'] = data['dates'].year
    data['host_team'] = get_data(info,['teams', 0])
    data['visiting_team'] = get_data(info, ['teams', 1])
    data['toss_winner'] = get_data(info, ['toss', 'winner'])
    data['toss_decision'] = get_data(info, ['toss', 'decision'])
    data['venue'] = get_data(info, 'venue')
    if 'winner' in info['outcome']:
        data['winner'] = get_data(info, ['outcome','winner'])
        # Result : 0 - no result, 1 - winner, 2 - tie
        data['result'] = 1
        if(data['winner'] == 'no result'):
            data['result'] = 0

    elif 'eliminator' in info['outcome']:
            data['result'] = 2
            data['winner'] = get_data(info, ['outcome', 'eliminator'])
    else:
        data['winner'] = 'no result'
        data['result'] = 0

    if 'player_of_match' in info:
        data['player_of_match'] = get_data(info, ['player_of_match', 0])
    
    return data

In [8]:
def get_delivery_data(match_id, reader, batting_team, bowling_team):
    """
    Extract ball by ball information about a match and return a 
    detailed structured information about the match.
    
    Returns:
    --------
    A dictionary filled with values for following keys
    
    Keys:
    -----
    match_id : int, Unique ID associated with each IPL match
    batting_team : str, Name of the batting team
    bowling_team: str, Name of the bowling team
    over : int, Over associated with the record
    ball : int, ball number associated with the record
    batsman : str, Name of Batsman in strike
    bowler : str, Name of Bowler bowling the over
    non_striker : str, Name of the Non Striker Batsman
    total_runs : int, Total runs scored including extras for 
        the delivery
    batsman_runs : int, Runs scored by batsman
    extras_runs : int, Runs as extras for the given delivery
    wicket : int, either 0 or 1
        0 - No wicket has fallen in the given delivery
        1 - Wicket has fallen for the given delivery
    wicket_kind: Possible values: 'lbw', 'caught', 'bowled', 
        'run out', 'retired hurt', 'stumped', 'hit wicket',  
        'caught and bowled', 'obstructing the field'
    wicket_fielders : List, list of fielders involved
    player_out : str, Name of the player that got out
    """
    
    data = {}
    delivery = list(reader.keys())[0]
    delivery_obj = get_data(reader, delivery)
    data['match_id'] = match_id
    data['batting_team'] = batting_team
    data['bowling_team'] = bowling_team
    data['over'], data['ball'] = str(delivery).split('.')
    data['over'],data['ball'] = int(data['over']),int(data['ball'])
    data['batsman'] = get_data(delivery_obj, 'batsman')
    data['bowler'] = get_data(delivery_obj, 'bowler')
    data['non_striker'] = get_data(delivery_obj, 'non_striker')
    data['total_runs'] = get_data(delivery_obj, ['runs', 'total'])
    data['batsman_runs'] = get_data(delivery_obj, ['runs', 'batsman'])
    data['extras_runs'] = get_data(delivery_obj, ['runs', 'extras'])
    
    if 'wicket' in delivery_obj:
        data['wicket'] = 1
        data['wicket_kind'] = get_data(delivery_obj, ['wicket', 'kind'])
        data['wicket_player_out'] = get_data(delivery_obj, ['wicket', 'player_out'])
        if 'fielders' in delivery_obj['wicket']:
            data['wicket_fielders'] = get_data(delivery_obj, ['wicket', 'fielders'])
    else:
        data['wicket'] = 0
    return data
    
    
def get_deliveries_data(match_id, reader): 
    collection = []
    
    innings = reader['innings']
    first_innings = get_data(innings, [0, '1st innings'])
    second_innings = get_data(innings, [1, '2nd innings'])
    
    teams = get_data(reader, ['info', 'teams'])
    if((get_data(reader, ['info', 'toss', 'winner']) == teams[0] and get_data(reader, ['info', 'toss', 'decision']) == 'bat') or 
       (get_data(reader, ['info', 'toss', 'winner']) == teams[1] and get_data(reader, ['info', 'toss', 'decision']) == 'field')):
        team1, team2 = teams[0], teams[1]
    else:
        team1, team2 = teams[1], teams[0]
    
    if isinstance (first_innings, dict):
        for each_delivery in get_data(first_innings, 'deliveries'):
            collection.append(get_delivery_data(match_id, each_delivery, team1, team2))
    if isinstance(second_innings, dict):
        for each_delivery in get_data(second_innings, 'deliveries'):
            collection.append(get_delivery_data(match_id, each_delivery, team2, team1))
        
    return collection

In [9]:
initialize()

Total Match Records:  773


In [10]:
for match in match_list:
    if(match in files_processed):
        continue

    match_id = int(match_id + 1)

    with ipl_zip.open(match) as yamlfile:
        reader = yaml.safe_load(yamlfile)

        print(match_id,". Processing File : ", match)
        info = reader['info']
        
        match_data = get_match_data(match_id, info)
        matches_df = matches_df.append([match_data])
        
        deliveries_data = get_deliveries_data(match_id, reader)
        deliveries_df = deliveries_df.append(deliveries_data)
        
        files_processed.add(match)

save_processed_files()

771 . Processing File :  1216514.yaml
772 . Processing File :  1216515.yaml


### Note: Following code blocks are executed to validate the data

In [36]:
matches_df.tail()

Unnamed: 0,city,dates,host_team,match_id,player_of_match,result,season,toss_decision,toss_winner,venue,visiting_team,winner
767,,2020-09-28 00:00:00,Royal Challengers Bangalore,768.0,AB de Villiers,2,2020,field,Mumbai Indians,Dubai International Cricket Stadium,Mumbai Indians,Royal Challengers Bangalore
768,Abu Dhabi,2020-10-01 00:00:00,Mumbai Indians,769.0,KA Pollard,1,2020,field,Kings XI Punjab,Sheikh Zayed Stadium,Kings XI Punjab,Mumbai Indians
769,,2020-10-02 00:00:00,Sunrisers Hyderabad,770.0,PK Garg,1,2020,bat,Sunrisers Hyderabad,Dubai International Cricket Stadium,Chennai Super Kings,Sunrisers Hyderabad
0,Abu Dhabi,2020-10-03 00:00:00,Rajasthan Royals,771.0,YS Chahal,1,2020,bat,Rajasthan Royals,Sheikh Zayed Stadium,Royal Challengers Bangalore,Royal Challengers Bangalore
0,,2020-10-03 00:00:00,Delhi Capitals,772.0,SS Iyer,1,2020,field,Kolkata Knight Riders,Sharjah Cricket Stadium,Kolkata Knight Riders,Delhi Capitals


In [12]:
# deliveries_df = deliveries_df.set_index(['match_id','batting_team','bowling_team','over','ball'])
deliveries_df

Unnamed: 0,ball,batsman,batsman_runs,batting_team,bowler,bowling_team,extras_runs,match_id,non_striker,over,total_runs,wicket,wicket_fielders,wicket_kind,wicket_player_out
0,1,SC Ganguly,0,Kolkata Knight Riders,P Kumar,Royal Challengers Bangalore,1,1.0,BB McCullum,0,1,0.0,,,
1,2,BB McCullum,0,Kolkata Knight Riders,P Kumar,Royal Challengers Bangalore,0,1.0,SC Ganguly,0,0,0.0,,,
2,3,BB McCullum,0,Kolkata Knight Riders,P Kumar,Royal Challengers Bangalore,1,1.0,SC Ganguly,0,1,0.0,,,
3,4,BB McCullum,0,Kolkata Knight Riders,P Kumar,Royal Challengers Bangalore,0,1.0,SC Ganguly,0,0,0.0,,,
4,5,BB McCullum,0,Kolkata Knight Riders,P Kumar,Royal Challengers Bangalore,0,1.0,SC Ganguly,0,0,0.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
244,2,RA Tripathi,0,Kolkata Knight Riders,MP Stoinis,Delhi Capitals,0,772.0,KL Nagarkoti,19,0,1.0,,bowled,RA Tripathi
245,3,Shivam Mavi,1,Kolkata Knight Riders,MP Stoinis,Delhi Capitals,0,772.0,KL Nagarkoti,19,1,0.0,,,
246,4,KL Nagarkoti,1,Kolkata Knight Riders,MP Stoinis,Delhi Capitals,0,772.0,Shivam Mavi,19,1,0.0,,,
247,5,Shivam Mavi,0,Kolkata Knight Riders,MP Stoinis,Delhi Capitals,0,772.0,KL Nagarkoti,19,0,0.0,,,


In [13]:
matches_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 772 entries, 0 to 0
Data columns (total 12 columns):
city               749 non-null object
dates              772 non-null object
host_team          772 non-null object
match_id           772 non-null float64
player_of_match    768 non-null object
result             772 non-null int64
season             772 non-null int64
toss_decision      772 non-null object
toss_winner        772 non-null object
venue              772 non-null object
visiting_team      772 non-null object
winner             772 non-null object
dtypes: float64(1), int64(2), object(9)
memory usage: 78.4+ KB


In [14]:
deliveries_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 182902 entries, 0 to 248
Data columns (total 15 columns):
ball                 182902 non-null int64
batsman              182902 non-null object
batsman_runs         182902 non-null int64
batting_team         182902 non-null object
bowler               182902 non-null object
bowling_team         182902 non-null object
extras_runs          182902 non-null int64
match_id             182902 non-null float64
non_striker          182902 non-null object
over                 182902 non-null int64
total_runs           182902 non-null int64
wicket               182902 non-null float64
wicket_fielders      6449 non-null object
wicket_kind          9013 non-null object
wicket_player_out    9013 non-null object
dtypes: float64(2), int64(5), object(8)
memory usage: 22.3+ MB


In [15]:
matches_df.describe()

Unnamed: 0,match_id,result,season
count,772.0,772.0,772.0
mean,386.5,1.009067,2013.580311
std,223.001495,0.139187,3.460359
min,1.0,0.0,2008.0
25%,193.75,1.0,2011.0
50%,386.5,1.0,2013.0
75%,579.25,1.0,2017.0
max,772.0,2.0,2020.0


In [31]:
deliveries_df.describe()

Unnamed: 0,match_id,wicket
count,932.0,932.0
mean,2.508584,0.049356
std,1.117641,0.216727
min,1.0,0.0
25%,2.0,0.0
50%,2.0,0.0
75%,4.0,0.0
max,4.0,1.0


In [16]:
matches_df[matches_df.result == 2]

Unnamed: 0,city,dates,host_team,match_id,player_of_match,result,season,toss_decision,toss_winner,venue,visiting_team,winner
66,Cape Town,2009-04-23,Kolkata Knight Riders,67.0,YK Pathan,2,2009,field,Kolkata Knight Riders,Newlands,Rajasthan Royals,Rajasthan Royals
130,Chennai,2010-03-21,Chennai Super Kings,131.0,J Theron,2,2010,field,Chennai Super Kings,"MA Chidambaram Stadium, Chepauk",Kings XI Punjab,Kings XI Punjab
327,Hyderabad,2013-04-07,Sunrisers Hyderabad,328.0,GH Vihari,2,2013,bat,Royal Challengers Bangalore,"Rajiv Gandhi International Stadium, Uppal",Royal Challengers Bangalore,Sunrisers Hyderabad
342,Bangalore,2013-04-16,Royal Challengers Bangalore,343.0,V Kohli,2,2013,field,Royal Challengers Bangalore,M Chinnaswamy Stadium,Delhi Daredevils,Royal Challengers Bangalore
416,Abu Dhabi,2014-04-29,Kolkata Knight Riders,417.0,JP Faulkner,2,2014,bat,Rajasthan Royals,Sheikh Zayed Stadium,Rajasthan Royals,Rajasthan Royals
462,Ahmedabad,2015-04-21,Rajasthan Royals,463.0,SE Marsh,2,2015,field,Kings XI Punjab,"Sardar Patel Stadium, Motera",Kings XI Punjab,Kings XI Punjab
610,Rajkot,2017-04-29,Gujarat Lions,611.0,KH Pandya,2,2017,bat,Gujarat Lions,Saurashtra Cricket Association Stadium,Mumbai Indians,Mumbai Indians
645,Delhi,2019-03-30 00:00:00,Delhi Capitals,646.0,PP Shaw,2,2019,field,Delhi Capitals,Feroz Shah Kotla,Kolkata Knight Riders,Delhi Capitals
746,Mumbai,2019-05-02 00:00:00,Mumbai Indians,747.0,JJ Bumrah,2,2019,bat,Mumbai Indians,Wankhede Stadium,Sunrisers Hyderabad,Mumbai Indians
757,,2020-09-20 00:00:00,Delhi Capitals,758.0,MP Stoinis,2,2020,field,Kings XI Punjab,Dubai International Cricket Stadium,Kings XI Punjab,Delhi Capitals


In [14]:
matches_df['dates'] = pd.to_datetime(matches_df['dates'])

In [15]:
matches_df[matches_df['dates'].dt.year == 2020]

Unnamed: 0,city,dates,host_team,match_id,player_of_match,result,season,toss_decision,toss_winner,venue,visiting_team,winner
756,Abu Dhabi,2020-09-19,Mumbai Indians,757.0,AT Rayudu,1,2020,field,Chennai Super Kings,Sheikh Zayed Stadium,Chennai Super Kings,Chennai Super Kings
757,,2020-09-20,Delhi Capitals,758.0,MP Stoinis,2,2020,field,Kings XI Punjab,Dubai International Cricket Stadium,Kings XI Punjab,Delhi Capitals
758,,2020-09-21,Royal Challengers Bangalore,759.0,YS Chahal,1,2020,field,Sunrisers Hyderabad,Dubai International Cricket Stadium,Sunrisers Hyderabad,Royal Challengers Bangalore
759,,2020-09-22,Rajasthan Royals,760.0,SV Samson,1,2020,field,Chennai Super Kings,Sharjah Cricket Stadium,Chennai Super Kings,Rajasthan Royals
760,Abu Dhabi,2020-09-23,Mumbai Indians,761.0,RG Sharma,1,2020,field,Kolkata Knight Riders,Sheikh Zayed Stadium,Kolkata Knight Riders,Mumbai Indians
761,,2020-09-25,Delhi Capitals,762.0,PP Shaw,1,2020,field,Chennai Super Kings,Dubai International Cricket Stadium,Chennai Super Kings,Delhi Capitals
762,,2020-09-24,Kings XI Punjab,763.0,KL Rahul,1,2020,field,Royal Challengers Bangalore,Dubai International Cricket Stadium,Royal Challengers Bangalore,Kings XI Punjab
763,Abu Dhabi,2020-09-26,Sunrisers Hyderabad,764.0,Shubman Gill,1,2020,bat,Sunrisers Hyderabad,Sheikh Zayed Stadium,Kolkata Knight Riders,Kolkata Knight Riders
764,,2020-09-27,Kings XI Punjab,765.0,SV Samson,1,2020,field,Rajasthan Royals,Sharjah Cricket Stadium,Rajasthan Royals,Rajasthan Royals
765,,2020-09-30,Kolkata Knight Riders,766.0,Shivam Mavi,1,2020,field,Rajasthan Royals,Dubai International Cricket Stadium,Rajasthan Royals,Kolkata Knight Riders


In [85]:
matches_df.to_csv (MATCH_CSV_PATH, index = False, header=True)
deliveries_df.to_csv (DELIVERIES_CSV_PATH, index = False, header=True)

In [38]:
matches_df[matches_df.isna().any(axis=1)]

Unnamed: 0,city,dates,host_team,match_id,player_of_match,result,season,toss_decision,toss_winner,venue,visiting_team,winner
0,Delhi,2011-05-21,Delhi Daredevils,242.0,,0,2011,bat,Delhi Daredevils,Feroz Shah Kotla,Pune Warriors,no result
0,,2014-04-17,Delhi Daredevils,400.0,YS Chahal,1,2014,field,Royal Challengers Bangalore,Sharjah Cricket Stadium,Royal Challengers Bangalore,Royal Challengers Bangalore
0,,2014-04-19,Kolkata Knight Riders,403.0,JP Duminy,1,2014,bat,Kolkata Knight Riders,Dubai International Cricket Stadium,Delhi Daredevils,Delhi Daredevils
0,,2014-04-19,Royal Challengers Bangalore,404.0,PA Patel,1,2014,field,Royal Challengers Bangalore,Dubai International Cricket Stadium,Mumbai Indians,Royal Challengers Bangalore
0,,2014-04-20,Rajasthan Royals,405.0,GJ Maxwell,1,2014,field,Kings XI Punjab,Sharjah Cricket Stadium,Kings XI Punjab,Kings XI Punjab
0,,2014-04-22,Kings XI Punjab,407.0,GJ Maxwell,1,2014,field,Sunrisers Hyderabad,Sharjah Cricket Stadium,Sunrisers Hyderabad,Kings XI Punjab
0,,2014-04-23,Rajasthan Royals,408.0,RA Jadeja,1,2014,field,Rajasthan Royals,Dubai International Cricket Stadium,Chennai Super Kings,Chennai Super Kings
0,,2014-04-24,Royal Challengers Bangalore,409.0,CA Lynn,1,2014,field,Royal Challengers Bangalore,Sharjah Cricket Stadium,Kolkata Knight Riders,Kolkata Knight Riders
0,,2014-04-25,Chennai Super Kings,410.0,MM Sharma,1,2014,bat,Mumbai Indians,Dubai International Cricket Stadium,Mumbai Indians,Chennai Super Kings
0,,2014-04-25,Sunrisers Hyderabad,411.0,AJ Finch,1,2014,bat,Sunrisers Hyderabad,Dubai International Cricket Stadium,Delhi Daredevils,Sunrisers Hyderabad


array(['SC Ganguly', 'BB McCullum', 'RT Ponting', 'DJ Hussey',
       'Mohammad Hafeez', 'R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis',
       'CL White', 'MV Boucher', 'B Akhil', 'AA Noffke', 'P Kumar',
       'Z Khan', 'SB Joshi', 'PA Patel', 'ML Hayden', 'MEK Hussey',
       'MS Dhoni', 'SK Raina', 'JDP Oram', 'S Badrinath', 'K Goel',
       'JR Hopes', 'KC Sangakkara', 'Yuvraj Singh', 'SM Katich',
       'IK Pathan', 'T Kohli', 'YK Pathan', 'SR Watson', 'M Kaif',
       'DS Lehmann', 'RA Jadeja', 'M Rawat', 'D Salunkhe', 'SK Warne',
       'SK Trivedi', 'G Gambhir', 'V Sehwag', 'S Dhawan', 'AC Gilchrist',
       'Y Venugopal Rao', 'VVS Laxman', 'A Symonds', 'RG Sharma',
       'SB Styris', 'AS Yadav', 'SB Bangar', 'WPUJC Vaas', 'RP Singh',
       'WP Saha', 'LR Shukla', 'L Ronchi', 'ST Jayasuriya', 'DJ Thornely',
       'RV Uthappa', 'PR Shah', 'AM Nayar', 'SM Pollock',
       'Harbhajan Singh', 'S Chanderpaul', 'LRPL Taylor',
       'DPMD Jayawardene', 'S Sohal', 'B Lee', 'PP Cha

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,batsman,batsman_runs,bowler,extras_runs,non_striker,total_runs
match_id,batting_team,bowling_team,over,ball,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1.0,Kolkata Knight Riders,Royal Challengers Bangalore,0,1,SC Ganguly,0,P Kumar,1,BB McCullum,1
1.0,Kolkata Knight Riders,Royal Challengers Bangalore,0,2,BB McCullum,0,P Kumar,0,SC Ganguly,0
1.0,Kolkata Knight Riders,Royal Challengers Bangalore,0,3,BB McCullum,0,P Kumar,1,SC Ganguly,1
1.0,Kolkata Knight Riders,Royal Challengers Bangalore,0,4,BB McCullum,0,P Kumar,0,SC Ganguly,0
1.0,Kolkata Knight Riders,Royal Challengers Bangalore,0,5,BB McCullum,0,P Kumar,0,SC Ganguly,0
1.0,Kolkata Knight Riders,Royal Challengers Bangalore,0,6,BB McCullum,0,P Kumar,0,SC Ganguly,0
1.0,Kolkata Knight Riders,Royal Challengers Bangalore,0,7,BB McCullum,0,P Kumar,1,SC Ganguly,1
1.0,Kolkata Knight Riders,Royal Challengers Bangalore,1,1,BB McCullum,0,Z Khan,0,SC Ganguly,0


In [40]:
deliveries_df[deliveries_df['wicket'] == 1]

Unnamed: 0,ball,batsman,batsman_runs,batting_team,bowler,bowling_team,extras_runs,match_id,non_striker,over,total_runs,wicket,wicket_fielders,wicket_kind,wicket_player_out
33,2,SC Ganguly,0,Kolkata Knight Riders,Z Khan,Royal Challengers Bangalore,0,1.0,BB McCullum,5,0,1.0,[JH Kallis],caught,SC Ganguly
74,1,RT Ponting,0,Kolkata Knight Riders,JH Kallis,Royal Challengers Bangalore,0,1.0,BB McCullum,12,0,1.0,[P Kumar],caught,RT Ponting
106,1,DJ Hussey,0,Kolkata Knight Riders,AA Noffke,Royal Challengers Bangalore,0,1.0,BB McCullum,17,0,1.0,[CL White],caught,DJ Hussey
131,1,R Dravid,0,Royal Challengers Bangalore,I Sharma,Kolkata Knight Riders,0,1.0,W Jaffer,1,0,1.0,,bowled,R Dravid
138,2,V Kohli,0,Royal Challengers Bangalore,AB Dinda,Kolkata Knight Riders,0,1.0,W Jaffer,2,0,1.0,,bowled,V Kohli
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143,3,SR Watson,0,Chennai Super Kings,B Kumar,Sunrisers Hyderabad,0,770.0,F du Plessis,2,0,1.0,,bowled,SR Watson
160,1,AT Rayudu,0,Chennai Super Kings,T Natarajan,Sunrisers Hyderabad,0,770.0,F du Plessis,5,0,1.0,,bowled,AT Rayudu
165,6,KM Jadhav,0,Chennai Super Kings,T Natarajan,Sunrisers Hyderabad,0,770.0,F du Plessis,5,0,1.0,,run out,F du Plessis
179,2,KM Jadhav,0,Chennai Super Kings,Abdul Samad,Sunrisers Hyderabad,0,770.0,MS Dhoni,8,0,1.0,[DA Warner],caught,KM Jadhav


In [42]:
deliveries_df['wicket_kind'].unique()

array([nan, 'caught', 'bowled', 'run out', 'lbw', 'retired hurt',
       'stumped', 'caught and bowled', 'hit wicket',
       'obstructing the field'], dtype=object)

In [46]:
print(deliveries_df[deliveries_df['wicket'] == 1]['non_striker'].value_counts())

RG Sharma            195
V Kohli              185
SK Raina             171
KD Karthik           159
RV Uthappa           157
                    ... 
M Prasidh Krishna      1
CK Langeveldt          1
NA Saini               1
Vishnu Vinod           1
RR Sarwan              1
Name: non_striker, Length: 450, dtype: int64


In [52]:
deliveries_df[deliveries_df['wicket'] == 1]['bowler'].value_counts()

SL Malinga         188
DJ Bravo           168
A Mishra           168
PP Chawla          163
Harbhajan Singh    161
                  ... 
MK Tiwary            1
AA Noffke            1
AM Rahane            1
AC Voges             1
CJ McKay             1
Name: bowler, Length: 360, dtype: int64

In [55]:
deliveries_df[deliveries_df['wicket_kind'] == 'stumped']['bowler'].value_counts()

A Mishra           27
Harbhajan Singh    18
PP Chawla          16
YS Chahal          12
PP Ojha            12
                   ..
V Sehwag            1
P Sahu              1
BMAJ Mendis         1
B Kumar             1
N Rana              1
Name: bowler, Length: 80, dtype: int64

In [17]:
grouped_df = deliveries_df[(deliveries_df['match_id']>756) & (deliveries_df['over']==19)].groupby(['match_id','bowler',])['total_runs'].agg('sum')
# for key,item in grouped_df:
#     a_group = grouped_df.get_group(key)
#     print(a_group[['match_id','bowler','batsman','total_runs']].sum())

In [18]:
print(grouped_df)

match_id  bowler       
757.0     DL Chahar         6
          TA Boult          8
758.0     CJ Jordan        30
          MP Stoinis       12
759.0     B Kumar           8
          DW Steyn          7
760.0     L Ngidi          30
          TK Curran        21
761.0     RD Chahar         4
          Shivam Mavi      13
762.0     JR Hazlewood     14
          K Rabada          4
763.0     S Dube           23
764.0     AD Russell        9
765.0     JC Archer        18
          M Ashwin          4
766.0     Kuldeep Yadav    11
          TK Curran        16
767.0     K Rabada          4
          KK Ahmed         12
768.0     I Udana          18
          JL Pattinson     20
769.0     JJ Bumrah         8
          K Gowtham        25
770.0     Abdul Samad      20
          SN Thakur         7
771.0     NA Saini         15
          TK Curran         4
772.0     AD Russell        7
          MP Stoinis        7
Name: total_runs, dtype: int64


In [26]:
deliveries_df['over'].unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19], dtype=int64)

In [27]:
batsman_list = set(deliveries_df['batsman'].unique())
bowler_list = set(deliveries_df['bowler'].unique())

In [28]:
print("Batsman Count",len(batsman_list))
print("Bowler Count",len(bowler_list))

Batsman Count 525
Bowler Count 412


In [29]:
players = batsman_list.union(bowler_list)

In [30]:
len(players)

570

In [35]:
print(bowler_list-batsman_list)

{'K Santokie', 'NB Singh', 'K Khejroliya', 'CJ Dala', 'MB Parmar', 'I Udana', 'SC Kuggeleijn', 'O Thomas', 'AA Kazi', 'C Ganapathy', 'MJ Henry', 'A Nel', 'BW Hilfenhaus', 'AM Salvi', 'P Prasanth', 'SS Sarkar', 'DJ Willey', 'JP Behrendorff', 'Arshdeep Singh', 'JR Hazlewood', 'MG Neser', 'CV Varun', 'Gagandeep Singh', 'B Geeves', 'SS Cottrell', 'RW Price', 'Anand Rajan', 'RR Bose', 'S Midhun', 'P Suyal', 'KM Asif', 'T Shamsi', 'RA Shaikh', 'TP Sudhindra', 'GS Sandhu', 'SM Boland', 'P Amarnath', 'S Sandeep Warrier', 'SS Mundhe', 'L Ngidi', 'Harmeet Singh (2)', 'AS Roy', 'SS Agarwal', 'JW Hastings', 'Tejas Baroka'}


Unnamed: 0,ball,batsman,batsman_runs,batting_team,bowler,bowling_team,extras_runs,match_id,non_striker,over,total_runs,wicket,wicket_fielders,wicket_kind,wicket_player_out
441,5,SM Katich,1,Kings XI Punjab,Joginder Sharma,Chennai Super Kings,0,2.0,KC Sangakkara,14,1,0.0,,,
443,1,SM Katich,2,Kings XI Punjab,P Amarnath,Chennai Super Kings,0,2.0,KC Sangakkara,15,2,0.0,,,
444,2,SM Katich,0,Kings XI Punjab,P Amarnath,Chennai Super Kings,1,2.0,KC Sangakkara,15,1,0.0,,,
447,5,SM Katich,1,Kings XI Punjab,P Amarnath,Chennai Super Kings,0,2.0,KC Sangakkara,15,1,0.0,,,
449,1,SM Katich,0,Kings XI Punjab,MS Gony,Chennai Super Kings,1,2.0,KC Sangakkara,16,1,0.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25785,4,SM Katich,0,Kings XI Punjab,R Ashwin,Chennai Super Kings,0,110.0,LA Pomersbach,5,0,0.0,,,
25786,5,SM Katich,0,Kings XI Punjab,R Ashwin,Chennai Super Kings,0,110.0,LA Pomersbach,5,0,0.0,,,
25787,6,SM Katich,0,Kings XI Punjab,R Ashwin,Chennai Super Kings,0,110.0,LA Pomersbach,5,0,0.0,,,
25789,2,SM Katich,1,Kings XI Punjab,M Muralitharan,Chennai Super Kings,0,110.0,LA Pomersbach,6,1,0.0,,,
