In [1]:
import requests
import os
import yaml
import zipfile
import pandas as pd
import datetime
import numpy as np
import pickle
from os import path
import csv

In [2]:
CRICSHEET_URL = 'https://cricsheet.org/downloads/ipl.zip'
TARGET_PATH = os.path.abspath(os.getcwd()) + '//ipl.zip'
MATCH_CSV_PATH = os.path.abspath(os.getcwd()) + '//match_data.csv'
DELIVERIES_CSV_PATH = os.path.abspath(os.getcwd()) + '//deliveries_data.csv'
PROCESSED_FILES_LOG = os.path.abspath(os.getcwd()) + '//processed.pkl'
CHUNK_SIZE = 128

In [3]:
def download_data():    
    """
    Description
    -----------
    Downloads data from Cricsheet website and saves in 
    the current working directory
    """
    r = requests.get(CRICSHEET_URL, stream=True)
    with open(TARGET_PATH, 'wb') as fd:
        for chunk in r.iter_content(chunk_size=CHUNK_SIZE):
            fd.write(chunk)
            


In [4]:
def initialize():    
    """
    Description
    -----------
    Initalizes the python script to download and update 
    latest IPL match information.
    
    global variables
    ----------------
    ipl_zip    : Reference to downloaded IPL Data ZIP file
    match_list : List of files within ipl_zip, where each file 
        contains data of one IPL match 
    files_processed : Set Object, that contains list of all the 
        files that are already processed.
    matches_df : Pandas Dataframe, it creates and updates IPL Match 
        overview in MATCH_CSV_PATH
    match_id   : Unique ID that corresponds to each match record
    deliveries_df : Pandas Dataframe, creates and updates ball 
        by ball information of an IPL Match. 
    """
    global ipl_zip, match_list, files_processed, matches_df, match_id, deliveries_df
    download_data()
    ipl_zip = zipfile.ZipFile(TARGET_PATH)
    match_list = ipl_zip.namelist()
    print("Total Match Records: ", len(match_list))

    # Load Files Processed List
    if(path.isfile(PROCESSED_FILES_LOG) == False):
        with open(PROCESSED_FILES_LOG, "wb") as file_handle:
            pickle.dump({"README.txt"}, file_handle)

    with open(PROCESSED_FILES_LOG, "rb") as file_handle:
        files_processed = pickle.load(file_handle)
    
    # Load Match File Data
    matches_columns = ['dates','city','season','host_team','visiting_team', 'toss_winner','toss_decision','venue','result','winner','player_of_match']
    if(path.isfile(MATCH_CSV_PATH) == False):
        matches_df = pd.DataFrame(columns=matches_columns)
        match_id = int(0)
    else:
        matches_df = pd.read_csv(MATCH_CSV_PATH)
        match_id = int(matches_df['match_id'].max())
        
    # Load Deliveries File data
    deliveries_columns = ['batting_team','bowling_team','batsman','bowler','over','ball','non_striker','total_runs','batsman_runs','extras_runs']
    if(path.isfile(MATCH_CSV_PATH) == False):
        deliveries_df = pd.DataFrame(columns=deliveries_columns)
    else:
        deliveries_df = pd.read_csv(DELIVERIES_CSV_PATH)
    

In [5]:
def save_processed_files():
    """
    Description:
    ------------
    This method is called at the end after processing the data to
    store the newly created or updated records to memory.
    """
    with open(PROCESSED_FILES_LOG, "wb") as file_handle:
        pickle.dump(files_processed, file_handle)
    matches_df.to_csv (MATCH_CSV_PATH, index = False, header=True)
    deliveries_df.to_csv (DELIVERIES_CSV_PATH, index = False, header=True)

In [6]:
def get_data(obj, key):
    """
    Returns np.nan if the requested value is not 
    present withiin the object
    """
    try:
        if isinstance(key, list):
            for each in key:
                obj = obj[each]
            return obj
        else:
            if key in obj:
                return obj[key]
    except:
        print('ERROR: key: ',key, ' | obj: ', obj)
        return np.nan

In [7]:
def get_match_data(match_id, info):
    """
    Extracts information from the info object and returns 
    structured information about the match.
    
    Returns
    -------
    A dictionary with values for the following keys
    
    Keys:
    --------
    match_id : int, Unique ID associated with each IPL match
    dates : DateTime, Date of the match
    city : String, City where the match is played
    season : int, year the match is played
    host_team : str, name of the host team
    visiting_team : str, name of the visiting team
    toss_winner : str, name of the team which won the toss
    toss_decision : str, either 'bat' or 'field'
    venue : str, name of the cricket ground
    result : int, 0 - No Result, 1 - One team won, 2 - Tie
    winner : str, Name of the team which won
        In case of No result, 'no result'
        In case of Tie, Team which won the eliminator
    player_of_match : str, Name of player who won the 
        Man of the match award
    """
    
    data = {}

    data['match_id'] = int(match_id)

    data['dates'] = get_data(info, ['dates', 0])
    if(isinstance(data['dates'], datetime.date) != True):
        data['dates'] = datetime.datetime.strptime(data['dates'], '%Y-%m-%d')

    data['city'] = get_data(info,'city')
    data['season'] = data['dates'].year
    data['host_team'] = get_data(info,['teams', 0])
    data['visiting_team'] = get_data(info, ['teams', 1])
    data['toss_winner'] = get_data(info, ['toss', 'winner'])
    data['toss_decision'] = get_data(info, ['toss', 'decision'])
    data['venue'] = get_data(info, 'venue')
    if 'winner' in info['outcome']:
        data['winner'] = get_data(info, ['outcome','winner'])
        # Result : 0 - no result, 1 - winner, 2 - tie
        data['result'] = 1
        if(data['winner'] == 'no result'):
            data['result'] = 0

    elif 'eliminator' in info['outcome']:
            data['result'] = 2
            data['winner'] = get_data(info, ['outcome', 'eliminator'])
    else:
        data['winner'] = 'no result'
        data['result'] = 0

    if 'player_of_match' in info:
        data['player_of_match'] = get_data(info, ['player_of_match', 0])
    
    return data

In [20]:
def get_delivery_data(match_id, reader, batting_team, bowling_team):
    """
    Extract ball by ball information about a match and return a 
    detailed structured information about the match.
    
    Returns:
    --------
    A dictionary filled with values for following keys
    
    Keys:
    -----
    match_id : int, Unique ID associated with each IPL match
    batting_team : str, Name of the batting team
    bowling_team: str, Name of the bowling team
    over : int, Over associated with the record
    ball : int, ball number associated with the record
    batsman : str, Name of Batsman in strike
    bowler : str, Name of Bowler bowling the over
    non_striker : str, Name of the Non Striker Batsman
    total_runs : int, Total runs scored including extras for 
        the delivery
    batsman_runs : int, Runs scored by batsman
    extras_runs : int, Runs as extras for the given delivery
    wicket : int, either 0 or 1
        0 - No wicket has fallen in the given delivery
        1 - Wicket has fallen for the given delivery
    wicket_kind: Possible values: 'lbw', 'caught', 'bowled', 
        'run out', 'retired hurt', 'stumped', 'hit wicket',  
        'caught and bowled', 'obstructing the field'
    wicket_fielders : List, list of fielders involved
    player_out : str, Name of the player that got out
    """
    
    data = {}
    delivery = list(reader.keys())[0]
    delivery_obj = get_data(reader, delivery)
    data['match_id'] = match_id
    data['batting_team'] = batting_team
    data['bowling_team'] = bowling_team
    data['over'], data['ball'] = str(delivery).split('.')
    data['over'],data['ball'] = int(data['over']),int(data['ball'])
    data['batsman'] = get_data(delivery_obj, 'batsman')
    data['bowler'] = get_data(delivery_obj, 'bowler')
    data['non_striker'] = get_data(delivery_obj, 'non_striker')
    data['total_runs'] = get_data(delivery_obj, ['runs', 'total'])
    data['batsman_runs'] = get_data(delivery_obj, ['runs', 'batsman'])
    data['extras_runs'] = get_data(delivery_obj, ['runs', 'extras'])
    
    if 'wicket' in delivery_obj:
        data['wicket'] = 1
        data['wicket_kind'] = get_data(delivery_obj, ['wicket', 'kind'])
        data['wicket_player_out'] = get_data(delivery_obj, ['wicket', 'player_out'])
        if 'fielders' in delivery_obj['wicket']:
            data['wicket_fielders'] = get_data(delivery_obj, ['wicket', 'fielders'])
    else:
        data['wicket'] = 0
    return data
    
    
def get_deliveries_data(match_id, reader): 
    collection = []
    
    innings = reader['innings']
    first_innings = get_data(innings, [0, '1st innings'])
    second_innings = get_data(innings, [1, '2nd innings'])
    
    teams = get_data(reader, ['info', 'teams'])
    if((get_data(reader, ['info', 'toss', 'winner']) == teams[0] and get_data(reader, ['info', 'toss', 'decision']) == 'bat') or 
       (get_data(reader, ['info', 'toss', 'winner']) == teams[1] and get_data(reader, ['info', 'toss', 'decision']) == 'field')):
        team1, team2 = teams[0], teams[1]
    else:
        team1, team2 = teams[1], teams[0]
    
    if isinstance (first_innings, dict):
        for each_delivery in get_data(first_innings, 'deliveries'):
            collection.append(get_delivery_data(match_id, each_delivery, team1, team2))
    if isinstance(second_innings, dict):
        for each_delivery in get_data(second_innings, 'deliveries'):
            collection.append(get_delivery_data(match_id, each_delivery, team2, team1))
        
    return collection

In [22]:
initialize()

Total Match Records:  771


In [37]:
for match in match_list:
    if(match in files_processed):
        continue

    match_id = int(match_id + 1)

    with ipl_zip.open(match) as yamlfile:
        reader = yaml.safe_load(yamlfile)

        print(match_id,". Processing File : ", match)
        info = reader['info']
        
        match_data = get_match_data(match_id, info)
        matches_df = matches_df.append([match_data])
        
        deliveries_data = get_deliveries_data(match_id, reader)
        deliveries_df = deliveries_df.append(deliveries_data)
        
        files_processed.add(match)

save_processed_files()

10 . Processing File :  335991.yaml
11 . Processing File :  335993.yaml
12 . Processing File :  335992.yaml
13 . Processing File :  335994.yaml
14 . Processing File :  335995.yaml
15 . Processing File :  335996.yaml
16 . Processing File :  335997.yaml
17 . Processing File :  335998.yaml
18 . Processing File :  336000.yaml
19 . Processing File :  335999.yaml
20 . Processing File :  336001.yaml
21 . Processing File :  336003.yaml
22 . Processing File :  336034.yaml
23 . Processing File :  336005.yaml
24 . Processing File :  336004.yaml
25 . Processing File :  336006.yaml
26 . Processing File :  336007.yaml
27 . Processing File :  336008.yaml
28 . Processing File :  336009.yaml
29 . Processing File :  336010.yaml
30 . Processing File :  336011.yaml
31 . Processing File :  336013.yaml
32 . Processing File :  336014.yaml
33 . Processing File :  336015.yaml
34 . Processing File :  336016.yaml
35 . Processing File :  336017.yaml
36 . Processing File :  336018.yaml
37 . Processing File :  3360

234 . Processing File :  501257.yaml
235 . Processing File :  501258.yaml
236 . Processing File :  501259.yaml
237 . Processing File :  501260.yaml
238 . Processing File :  501261.yaml
239 . Processing File :  501262.yaml
240 . Processing File :  501263.yaml
241 . Processing File :  501264.yaml
242 . Processing File :  501265.yaml
ERROR: key:  [1, '2nd innings']  | obj:  [{'1st innings': {'team': 'Delhi Daredevils', 'deliveries': [{0.1: {'batsman': 'NV Ojha', 'bowler': 'AC Thomas', 'non_striker': 'DA Warner', 'runs': {'batsman': 1, 'extras': 0, 'total': 1}}}, {0.2: {'batsman': 'DA Warner', 'bowler': 'AC Thomas', 'extras': {'legbyes': 1}, 'non_striker': 'NV Ojha', 'runs': {'batsman': 0, 'extras': 1, 'total': 1}}}, {0.3: {'batsman': 'NV Ojha', 'bowler': 'AC Thomas', 'non_striker': 'DA Warner', 'runs': {'batsman': 0, 'extras': 0, 'total': 0}}}, {0.4: {'batsman': 'NV Ojha', 'bowler': 'AC Thomas', 'non_striker': 'DA Warner', 'runs': {'batsman': 0, 'extras': 0, 'total': 0}}}, {0.5: {'batsman

243 . Processing File :  501266.yaml
244 . Processing File :  501267.yaml
245 . Processing File :  501268.yaml
246 . Processing File :  501269.yaml
247 . Processing File :  501270.yaml
248 . Processing File :  501271.yaml
249 . Processing File :  548306.yaml
250 . Processing File :  548307.yaml
251 . Processing File :  548308.yaml
252 . Processing File :  548309.yaml
253 . Processing File :  548311.yaml
254 . Processing File :  548310.yaml
255 . Processing File :  548313.yaml
256 . Processing File :  548312.yaml
257 . Processing File :  548314.yaml
258 . Processing File :  548315.yaml
259 . Processing File :  548316.yaml
260 . Processing File :  548317.yaml
261 . Processing File :  548318.yaml
262 . Processing File :  548319.yaml
263 . Processing File :  548320.yaml
264 . Processing File :  548322.yaml
265 . Processing File :  548323.yaml
266 . Processing File :  548324.yaml
267 . Processing File :  548325.yaml
268 . Processing File :  548326.yaml
269 . Processing File :  548327.yaml
2

465 . Processing File :  829745.yaml
466 . Processing File :  829747.yaml
467 . Processing File :  829749.yaml
468 . Processing File :  829705.yaml
469 . Processing File :  829707.yaml
470 . Processing File :  829709.yaml
471 . Processing File :  829713.yaml
472 . Processing File :  829711.yaml
473 . Processing File :  829715.yaml
474 . Processing File :  829717.yaml
475 . Processing File :  829719.yaml
476 . Processing File :  829721.yaml
477 . Processing File :  829725.yaml
478 . Processing File :  829727.yaml
479 . Processing File :  829729.yaml
480 . Processing File :  829731.yaml
481 . Processing File :  829751.yaml
482 . Processing File :  829753.yaml
483 . Processing File :  829757.yaml
484 . Processing File :  829759.yaml
485 . Processing File :  829765.yaml
486 . Processing File :  829723.yaml
487 . Processing File :  829763.yaml
ERROR: key:  [1, '2nd innings']  | obj:  [{'1st innings': {'team': 'Royal Challengers Bangalore', 'deliveries': [{0.1: {'batsman': 'CH Gayle', 'bowle

488 . Processing File :  829767.yaml
489 . Processing File :  829769.yaml
490 . Processing File :  829771.yaml
491 . Processing File :  829773.yaml
492 . Processing File :  829775.yaml
493 . Processing File :  829777.yaml
494 . Processing File :  829779.yaml
495 . Processing File :  829781.yaml
496 . Processing File :  829783.yaml
497 . Processing File :  829785.yaml
498 . Processing File :  829787.yaml
499 . Processing File :  829761.yaml
500 . Processing File :  829789.yaml
501 . Processing File :  829791.yaml
502 . Processing File :  829793.yaml
503 . Processing File :  829795.yaml
504 . Processing File :  829797.yaml
505 . Processing File :  829799.yaml
506 . Processing File :  829801.yaml
507 . Processing File :  829803.yaml
508 . Processing File :  829805.yaml
509 . Processing File :  829807.yaml
510 . Processing File :  829809.yaml
511 . Processing File :  829811.yaml
512 . Processing File :  829813.yaml
513 . Processing File :  829815.yaml
514 . Processing File :  829817.yaml
5

706 . Processing File :  1136597.yaml
707 . Processing File :  1136598.yaml
708 . Processing File :  1136599.yaml
709 . Processing File :  1136600.yaml
710 . Processing File :  1136601.yaml
711 . Processing File :  1136602.yaml
712 . Processing File :  1136603.yaml
713 . Processing File :  1136604.yaml
714 . Processing File :  1136605.yaml
715 . Processing File :  1136606.yaml
716 . Processing File :  1136607.yaml
717 . Processing File :  1136608.yaml
718 . Processing File :  1136609.yaml
719 . Processing File :  1136610.yaml
720 . Processing File :  1136611.yaml
721 . Processing File :  1136612.yaml
722 . Processing File :  1136613.yaml
723 . Processing File :  1136614.yaml
724 . Processing File :  1136615.yaml
725 . Processing File :  1136616.yaml
726 . Processing File :  1136617.yaml
727 . Processing File :  1136618.yaml
728 . Processing File :  1136619.yaml
729 . Processing File :  1136620.yaml
730 . Processing File :  1178409.yaml
731 . Processing File :  1178410.yaml
732 . Proces

### Note: Following code blocks are executed to validate the data

In [35]:
matches_df

Unnamed: 0,city,dates,host_team,match_id,player_of_match,result,season,toss_decision,toss_winner,venue,visiting_team,winner
0,Bangalore,2008-04-18,Royal Challengers Bangalore,1.0,BB McCullum,1,2008,field,Royal Challengers Bangalore,M Chinnaswamy Stadium,Kolkata Knight Riders,Kolkata Knight Riders
0,Chandigarh,2008-04-19,Kings XI Punjab,2.0,MEK Hussey,1,2008,bat,Chennai Super Kings,"Punjab Cricket Association Stadium, Mohali",Chennai Super Kings,Chennai Super Kings
0,Delhi,2008-04-19 00:00:00,Delhi Daredevils,3.0,MF Maharoof,1,2008,bat,Rajasthan Royals,Feroz Shah Kotla,Rajasthan Royals,Delhi Daredevils
0,Kolkata,2008-04-20,Kolkata Knight Riders,4.0,DJ Hussey,1,2008,bat,Deccan Chargers,Eden Gardens,Deccan Chargers,Kolkata Knight Riders
0,Mumbai,2008-04-20,Mumbai Indians,5.0,MV Boucher,1,2008,bat,Mumbai Indians,Wankhede Stadium,Royal Challengers Bangalore,Royal Challengers Bangalore
0,Jaipur,2008-04-21,Rajasthan Royals,6.0,SR Watson,1,2008,bat,Kings XI Punjab,Sawai Mansingh Stadium,Kings XI Punjab,Rajasthan Royals
0,Hyderabad,2008-04-22,Deccan Chargers,7.0,V Sehwag,1,2008,bat,Deccan Chargers,"Rajiv Gandhi International Stadium, Uppal",Delhi Daredevils,Delhi Daredevils
0,Chennai,2008-04-23,Chennai Super Kings,8.0,ML Hayden,1,2008,field,Mumbai Indians,"MA Chidambaram Stadium, Chepauk",Mumbai Indians,Chennai Super Kings
0,Hyderabad,2008-04-24,Deccan Chargers,9.0,YK Pathan,1,2008,field,Rajasthan Royals,"Rajiv Gandhi International Stadium, Uppal",Rajasthan Royals,Rajasthan Royals


In [39]:
# deliveries_df = deliveries_df.set_index(['match_id','batting_team','bowling_team','over','ball'])
deliveries_df

Unnamed: 0,ball,batsman,batsman_runs,batting_team,bowler,bowling_team,extras_runs,match_id,non_striker,over,total_runs,wicket,wicket_fielders,wicket_kind,wicket_player_out
0,1,SC Ganguly,0,Kolkata Knight Riders,P Kumar,Royal Challengers Bangalore,1,1.0,BB McCullum,0,1,0.0,,,
1,2,BB McCullum,0,Kolkata Knight Riders,P Kumar,Royal Challengers Bangalore,0,1.0,SC Ganguly,0,0,0.0,,,
2,3,BB McCullum,0,Kolkata Knight Riders,P Kumar,Royal Challengers Bangalore,1,1.0,SC Ganguly,0,1,0.0,,,
3,4,BB McCullum,0,Kolkata Knight Riders,P Kumar,Royal Challengers Bangalore,0,1.0,SC Ganguly,0,0,0.0,,,
4,5,BB McCullum,0,Kolkata Knight Riders,P Kumar,Royal Challengers Bangalore,0,1.0,SC Ganguly,0,0,0.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
247,3,MS Dhoni,4,Chennai Super Kings,Abdul Samad,Sunrisers Hyderabad,0,770.0,SM Curran,19,4,0.0,,,
248,4,MS Dhoni,1,Chennai Super Kings,Abdul Samad,Sunrisers Hyderabad,0,770.0,SM Curran,19,1,0.0,,,
249,5,SM Curran,1,Chennai Super Kings,Abdul Samad,Sunrisers Hyderabad,0,770.0,MS Dhoni,19,1,0.0,,,
250,6,MS Dhoni,1,Chennai Super Kings,Abdul Samad,Sunrisers Hyderabad,0,770.0,SM Curran,19,1,0.0,,,


In [28]:
matches_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4 entries, 0 to 0
Data columns (total 12 columns):
city               4 non-null object
dates              4 non-null object
host_team          4 non-null object
match_id           4 non-null float64
player_of_match    4 non-null object
result             4 non-null object
season             4 non-null object
toss_decision      4 non-null object
toss_winner        4 non-null object
venue              4 non-null object
visiting_team      4 non-null object
winner             4 non-null object
dtypes: float64(1), object(11)
memory usage: 416.0+ bytes


In [29]:
deliveries_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 932 entries, 0 to 239
Data columns (total 15 columns):
ball                 932 non-null object
batsman              932 non-null object
batsman_runs         932 non-null object
batting_team         932 non-null object
bowler               932 non-null object
bowling_team         932 non-null object
extras_runs          932 non-null object
match_id             932 non-null float64
non_striker          932 non-null object
over                 932 non-null object
total_runs           932 non-null object
wicket               932 non-null float64
wicket_fielders      30 non-null object
wicket_kind          46 non-null object
wicket_player_out    46 non-null object
dtypes: float64(2), object(13)
memory usage: 116.5+ KB


In [30]:
matches_df.describe()

Unnamed: 0,match_id
count,4.0
mean,2.5
std,1.290994
min,1.0
25%,1.75
50%,2.5
75%,3.25
max,4.0


In [31]:
deliveries_df.describe()

Unnamed: 0,match_id,wicket
count,932.0,932.0
mean,2.508584,0.049356
std,1.117641,0.216727
min,1.0,0.0
25%,2.0,0.0
50%,2.0,0.0
75%,4.0,0.0
max,4.0,1.0


In [41]:
matches_df[matches_df.result == 2]

Unnamed: 0,city,dates,host_team,match_id,player_of_match,result,season,toss_decision,toss_winner,venue,visiting_team,winner
0,Cape Town,2009-04-23,Kolkata Knight Riders,67.0,YK Pathan,2,2009,field,Kolkata Knight Riders,Newlands,Rajasthan Royals,Rajasthan Royals
0,Chennai,2010-03-21,Chennai Super Kings,131.0,J Theron,2,2010,field,Chennai Super Kings,"MA Chidambaram Stadium, Chepauk",Kings XI Punjab,Kings XI Punjab
0,Hyderabad,2013-04-07,Sunrisers Hyderabad,328.0,GH Vihari,2,2013,bat,Royal Challengers Bangalore,"Rajiv Gandhi International Stadium, Uppal",Royal Challengers Bangalore,Sunrisers Hyderabad
0,Bangalore,2013-04-16,Royal Challengers Bangalore,343.0,V Kohli,2,2013,field,Royal Challengers Bangalore,M Chinnaswamy Stadium,Delhi Daredevils,Royal Challengers Bangalore
0,Abu Dhabi,2014-04-29,Kolkata Knight Riders,417.0,JP Faulkner,2,2014,bat,Rajasthan Royals,Sheikh Zayed Stadium,Rajasthan Royals,Rajasthan Royals
0,Ahmedabad,2015-04-21,Rajasthan Royals,463.0,SE Marsh,2,2015,field,Kings XI Punjab,"Sardar Patel Stadium, Motera",Kings XI Punjab,Kings XI Punjab
0,Rajkot,2017-04-29,Gujarat Lions,611.0,KH Pandya,2,2017,bat,Gujarat Lions,Saurashtra Cricket Association Stadium,Mumbai Indians,Mumbai Indians
0,Delhi,2019-03-30 00:00:00,Delhi Capitals,646.0,PP Shaw,2,2019,field,Delhi Capitals,Feroz Shah Kotla,Kolkata Knight Riders,Delhi Capitals
0,Mumbai,2019-05-02 00:00:00,Mumbai Indians,747.0,JJ Bumrah,2,2019,bat,Mumbai Indians,Wankhede Stadium,Sunrisers Hyderabad,Mumbai Indians
0,,2020-09-20 00:00:00,Delhi Capitals,758.0,MP Stoinis,2,2020,field,Kings XI Punjab,Dubai International Cricket Stadium,Kings XI Punjab,Delhi Capitals


In [83]:
matches_df['dates'] = pd.to_datetime(matches_df['dates'])

In [90]:
matches_df[matches_df['dates'].dt.year == 2020]

Unnamed: 0_level_0,city,dates,host_team,player_of_match,result,season,toss_decision,toss_winner,venue,visiting_team,winner
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1


In [85]:
matches_df.to_csv (MATCH_CSV_PATH, index = False, header=True)
deliveries_df.to_csv (DELIVERIES_CSV_PATH, index = False, header=True)

In [38]:
matches_df[matches_df.isna().any(axis=1)]

Unnamed: 0,city,dates,host_team,match_id,player_of_match,result,season,toss_decision,toss_winner,venue,visiting_team,winner
0,Delhi,2011-05-21,Delhi Daredevils,242.0,,0,2011,bat,Delhi Daredevils,Feroz Shah Kotla,Pune Warriors,no result
0,,2014-04-17,Delhi Daredevils,400.0,YS Chahal,1,2014,field,Royal Challengers Bangalore,Sharjah Cricket Stadium,Royal Challengers Bangalore,Royal Challengers Bangalore
0,,2014-04-19,Kolkata Knight Riders,403.0,JP Duminy,1,2014,bat,Kolkata Knight Riders,Dubai International Cricket Stadium,Delhi Daredevils,Delhi Daredevils
0,,2014-04-19,Royal Challengers Bangalore,404.0,PA Patel,1,2014,field,Royal Challengers Bangalore,Dubai International Cricket Stadium,Mumbai Indians,Royal Challengers Bangalore
0,,2014-04-20,Rajasthan Royals,405.0,GJ Maxwell,1,2014,field,Kings XI Punjab,Sharjah Cricket Stadium,Kings XI Punjab,Kings XI Punjab
0,,2014-04-22,Kings XI Punjab,407.0,GJ Maxwell,1,2014,field,Sunrisers Hyderabad,Sharjah Cricket Stadium,Sunrisers Hyderabad,Kings XI Punjab
0,,2014-04-23,Rajasthan Royals,408.0,RA Jadeja,1,2014,field,Rajasthan Royals,Dubai International Cricket Stadium,Chennai Super Kings,Chennai Super Kings
0,,2014-04-24,Royal Challengers Bangalore,409.0,CA Lynn,1,2014,field,Royal Challengers Bangalore,Sharjah Cricket Stadium,Kolkata Knight Riders,Kolkata Knight Riders
0,,2014-04-25,Chennai Super Kings,410.0,MM Sharma,1,2014,bat,Mumbai Indians,Dubai International Cricket Stadium,Mumbai Indians,Chennai Super Kings
0,,2014-04-25,Sunrisers Hyderabad,411.0,AJ Finch,1,2014,bat,Sunrisers Hyderabad,Dubai International Cricket Stadium,Delhi Daredevils,Sunrisers Hyderabad


array(['SC Ganguly', 'BB McCullum', 'RT Ponting', 'DJ Hussey',
       'Mohammad Hafeez', 'R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis',
       'CL White', 'MV Boucher', 'B Akhil', 'AA Noffke', 'P Kumar',
       'Z Khan', 'SB Joshi', 'PA Patel', 'ML Hayden', 'MEK Hussey',
       'MS Dhoni', 'SK Raina', 'JDP Oram', 'S Badrinath', 'K Goel',
       'JR Hopes', 'KC Sangakkara', 'Yuvraj Singh', 'SM Katich',
       'IK Pathan', 'T Kohli', 'YK Pathan', 'SR Watson', 'M Kaif',
       'DS Lehmann', 'RA Jadeja', 'M Rawat', 'D Salunkhe', 'SK Warne',
       'SK Trivedi', 'G Gambhir', 'V Sehwag', 'S Dhawan', 'AC Gilchrist',
       'Y Venugopal Rao', 'VVS Laxman', 'A Symonds', 'RG Sharma',
       'SB Styris', 'AS Yadav', 'SB Bangar', 'WPUJC Vaas', 'RP Singh',
       'WP Saha', 'LR Shukla', 'L Ronchi', 'ST Jayasuriya', 'DJ Thornely',
       'RV Uthappa', 'PR Shah', 'AM Nayar', 'SM Pollock',
       'Harbhajan Singh', 'S Chanderpaul', 'LRPL Taylor',
       'DPMD Jayawardene', 'S Sohal', 'B Lee', 'PP Cha

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,batsman,batsman_runs,bowler,extras_runs,non_striker,total_runs
match_id,batting_team,bowling_team,over,ball,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1.0,Kolkata Knight Riders,Royal Challengers Bangalore,0,1,SC Ganguly,0,P Kumar,1,BB McCullum,1
1.0,Kolkata Knight Riders,Royal Challengers Bangalore,0,2,BB McCullum,0,P Kumar,0,SC Ganguly,0
1.0,Kolkata Knight Riders,Royal Challengers Bangalore,0,3,BB McCullum,0,P Kumar,1,SC Ganguly,1
1.0,Kolkata Knight Riders,Royal Challengers Bangalore,0,4,BB McCullum,0,P Kumar,0,SC Ganguly,0
1.0,Kolkata Knight Riders,Royal Challengers Bangalore,0,5,BB McCullum,0,P Kumar,0,SC Ganguly,0
1.0,Kolkata Knight Riders,Royal Challengers Bangalore,0,6,BB McCullum,0,P Kumar,0,SC Ganguly,0
1.0,Kolkata Knight Riders,Royal Challengers Bangalore,0,7,BB McCullum,0,P Kumar,1,SC Ganguly,1
1.0,Kolkata Knight Riders,Royal Challengers Bangalore,1,1,BB McCullum,0,Z Khan,0,SC Ganguly,0


In [40]:
deliveries_df[deliveries_df['wicket'] == 1]

Unnamed: 0,ball,batsman,batsman_runs,batting_team,bowler,bowling_team,extras_runs,match_id,non_striker,over,total_runs,wicket,wicket_fielders,wicket_kind,wicket_player_out
33,2,SC Ganguly,0,Kolkata Knight Riders,Z Khan,Royal Challengers Bangalore,0,1.0,BB McCullum,5,0,1.0,[JH Kallis],caught,SC Ganguly
74,1,RT Ponting,0,Kolkata Knight Riders,JH Kallis,Royal Challengers Bangalore,0,1.0,BB McCullum,12,0,1.0,[P Kumar],caught,RT Ponting
106,1,DJ Hussey,0,Kolkata Knight Riders,AA Noffke,Royal Challengers Bangalore,0,1.0,BB McCullum,17,0,1.0,[CL White],caught,DJ Hussey
131,1,R Dravid,0,Royal Challengers Bangalore,I Sharma,Kolkata Knight Riders,0,1.0,W Jaffer,1,0,1.0,,bowled,R Dravid
138,2,V Kohli,0,Royal Challengers Bangalore,AB Dinda,Kolkata Knight Riders,0,1.0,W Jaffer,2,0,1.0,,bowled,V Kohli
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143,3,SR Watson,0,Chennai Super Kings,B Kumar,Sunrisers Hyderabad,0,770.0,F du Plessis,2,0,1.0,,bowled,SR Watson
160,1,AT Rayudu,0,Chennai Super Kings,T Natarajan,Sunrisers Hyderabad,0,770.0,F du Plessis,5,0,1.0,,bowled,AT Rayudu
165,6,KM Jadhav,0,Chennai Super Kings,T Natarajan,Sunrisers Hyderabad,0,770.0,F du Plessis,5,0,1.0,,run out,F du Plessis
179,2,KM Jadhav,0,Chennai Super Kings,Abdul Samad,Sunrisers Hyderabad,0,770.0,MS Dhoni,8,0,1.0,[DA Warner],caught,KM Jadhav


In [42]:
deliveries_df['wicket_kind'].unique()

array([nan, 'caught', 'bowled', 'run out', 'lbw', 'retired hurt',
       'stumped', 'caught and bowled', 'hit wicket',
       'obstructing the field'], dtype=object)

In [46]:
print(deliveries_df[deliveries_df['wicket'] == 1]['non_striker'].value_counts())

RG Sharma            195
V Kohli              185
SK Raina             171
KD Karthik           159
RV Uthappa           157
                    ... 
M Prasidh Krishna      1
CK Langeveldt          1
NA Saini               1
Vishnu Vinod           1
RR Sarwan              1
Name: non_striker, Length: 450, dtype: int64


In [52]:
deliveries_df[deliveries_df['wicket'] == 1]['bowler'].value_counts()

SL Malinga         188
DJ Bravo           168
A Mishra           168
PP Chawla          163
Harbhajan Singh    161
                  ... 
MK Tiwary            1
AA Noffke            1
AM Rahane            1
AC Voges             1
CJ McKay             1
Name: bowler, Length: 360, dtype: int64

In [55]:
deliveries_df[deliveries_df['wicket_kind'] == 'stumped']['bowler'].value_counts()

A Mishra           27
Harbhajan Singh    18
PP Chawla          16
YS Chahal          12
PP Ojha            12
                   ..
V Sehwag            1
P Sahu              1
BMAJ Mendis         1
B Kumar             1
N Rana              1
Name: bowler, Length: 80, dtype: int64