In [1]:
import time
notebook_start_time = time.time()

import os
import sys

from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder

pd.set_option('MAX_COLUMNS', 50)

In [2]:
matches = list(filter(lambda x: x[-3:]=='csv', os.listdir()))
print('Number of matches in the data : {0}'.format(len(matches)))

Number of matches in the data : 751


In [3]:
def delete_files(address):
    if address not in os.listdir():
        os.mkdir(address)
        return 0
    for file in os.listdir(address):
        os.remove(address+'/'+file)
    return 1 

In [4]:
hashlist = []

delete_files('balldata')

for match in matches:
    hashmap = {}
    with open(match) as matchdata, open('temp.csv', 'w') as out:
        for line in matchdata:
            balldata = line.split(',')
            if balldata[0]=="info":
                #print(balldata)
                if balldata[1] not in hashmap:
                    hashmap[balldata[1]] = balldata[2][:-1]
                else:
                    hashmap[balldata[1]+'2'] = balldata[2][:-1]
            elif balldata[0]=="ball":
                out.write(line)
        
        hashmap['match_id'] = match        
        hashlist.append(hashmap)
    os.rename('temp.csv', 'balldata/'+match)

In [5]:
hashmap

{'team': 'Royal Challengers Bangalore',
 'team2': 'Sunrisers Hyderabad',
 'gender': 'male',
 'season': '2016',
 'date': '2016/05/29',
 'competition': 'Indian Premier League',
 'venue': 'M Chinnaswamy Stadium',
 'city': 'Bangalore',
 'toss_winner': 'Sunrisers Hyderabad',
 'toss_decision': 'bat',
 'player_of_match': 'BCJ Cutting',
 'umpire': 'HDPK Dharmasena',
 'umpire2': 'BNJ Oxenford',
 'reserve_umpire': 'A Deshmukh',
 'tv_umpire': 'AK Chaudhary',
 'match_referee': 'RS Madugalle',
 'winner': 'Sunrisers Hyderabad',
 'winner_runs': '8',
 'match_id': '981019.csv'}

In [6]:
metadata = pd.DataFrame(hashlist)
meta = metadata.copy()

In [7]:
cols = ['ball', 'innings', 'over', 'team', 'striker', 'non-striker', 'bowler', 'runs', 'extras', 'wicket', 'out_batsmen']

os.chdir('balldata')

for match in os.listdir():
    try:
        df = pd.read_csv(match, header=None)
        df.columns = cols
        df.to_csv(match, index=False)
    except:
        print(match)
        os.remove(match)
        os.remove('../'+match)
        

os.chdir('..')

In [8]:
def convertDate(d):
    if type(d)==float:
        return np.nan
    else:
        return datetime.strptime(d, "%Y/%m/%d")

metadata['date'] = meta['date'].map(lambda x: convertDate(x))

In [9]:
metadata.sort_values(by="date", inplace=True)

In [10]:
metadata.head()

Unnamed: 0,city,competition,date,date2,eliminator,gender,match_id,match_number,match_referee,method,neutralvenue,outcome,player_of_match,reserve_umpire,season,team,team2,toss_decision,toss_winner,tv_umpire,umpire,umpire2,venue,winner,winner_runs,winner_wickets
174,Bangalore,Indian Premier League,2008-04-18,,,male,335982.csv,1,J Srinath,,,,BB McCullum,VN Kulkarni,2007/08,Royal Challengers Bangalore,Kolkata Knight Riders,field,Royal Challengers Bangalore,AM Saheba,Asad Rauf,RE Koertzen,M Chinnaswamy Stadium,Kolkata Knight Riders,140.0,
176,Delhi,Indian Premier League,2008-04-19,,,male,335984.csv,3,GR Viswanath,,,,MF Maharoof,,2007/08,Delhi Daredevils,Rajasthan Royals,bat,Rajasthan Royals,IL Howell,Aleem Dar,GA Pratapkumar,Feroz Shah Kotla,Delhi Daredevils,,9.0
175,Chandigarh,Indian Premier League,2008-04-19,,,male,335983.csv,2,S Venkataraghavan,,,,MEK Hussey,MSS Ranawat,2007/08,Kings XI Punjab,Chennai Super Kings,bat,Chennai Super Kings,RB Tiffin,MR Benson,SL Shastri,"""Punjab Cricket Association Stadiu",Chennai Super Kings,33.0,
177,Mumbai,Indian Premier League,2008-04-20,,,male,335985.csv,5,J Srinath,,,,MV Boucher,SN Bandekar,2007/08,Mumbai Indians,Royal Challengers Bangalore,bat,Mumbai Indians,AV Jayaprakash,SJ Davis,DJ Harper,Wankhede Stadium,Royal Challengers Bangalore,,5.0
178,Kolkata,Indian Premier League,2008-04-20,,,male,335986.csv,4,FM Engineer,,,,DJ Hussey,F Gomes,2007/08,Kolkata Knight Riders,Deccan Chargers,bat,Deccan Chargers,Asad Rauf,BF Bowden,K Hariharan,Eden Gardens,Kolkata Knight Riders,,5.0


# Let's explore NaN values (missing values).

In [11]:
metadata.isnull().sum()

city                 0
competition          0
date                 0
date2              749
eliminator         742
gender               0
match_id             0
match_number        46
match_referee        0
method             732
neutralvenue       674
outcome            738
player_of_match      4
reserve_umpire      25
season               0
team                 0
team2                0
toss_decision        0
toss_winner          0
tv_umpire            2
umpire               1
umpire2              1
venue                0
winner              13
winner_runs        414
winner_wickets     350
dtype: int64

<div style="border: 1px solid black; padding: 1.5em; line-height:2; border-radius: 15px;">
    <div><strong>There are 5 NaN values in several columns.</strong></div>
    <div><strong>Let's see if it is just coincidence(which I don't like) or do we have 5 empty rows.... </strong></div>
</div>

In [12]:
metadata.loc[(metadata.city.isnull())]

Unnamed: 0,city,competition,date,date2,eliminator,gender,match_id,match_number,match_referee,method,neutralvenue,outcome,player_of_match,reserve_umpire,season,team,team2,toss_decision,toss_winner,tv_umpire,umpire,umpire2,venue,winner,winner_runs,winner_wickets


<div style="border: 1px solid black; padding: 1.5em; line-height:2; border-radius: 15px;">

<div><strong>files 1136561.csv-1136565.csv are empty</strong></div>

<div>Deleting these empty rows is a reasonable option now.</div>
</div>

In [13]:
metadata = metadata.loc[np.logical_not(metadata.city.isnull())]
meta = meta.loc[np.logical_not(meta.city.isnull())]
metadata.isnull().sum()

city                 0
competition          0
date                 0
date2              749
eliminator         742
gender               0
match_id             0
match_number        46
match_referee        0
method             732
neutralvenue       674
outcome            738
player_of_match      4
reserve_umpire      25
season               0
team                 0
team2                0
toss_decision        0
toss_winner          0
tv_umpire            2
umpire               1
umpire2              1
venue                0
winner              13
winner_runs        414
winner_wickets     350
dtype: int64

<div style="border: 1px solid black; padding: 1.5em; line-height:2; border-radius: 15px;">
    <div>Yaay..!! now a whole lot of columns are clear with <strong>Zero</strong> missing values.</div>

<div>Don't get too excited already, we still have a lot of missing values to deal with.</div>

<div><strong>Date2</strong> has 749 missing values. Since, having a secondary date for a match is very rare it makes sense to have a lot of missing values.</div>

<h2> Data Processing </h2>
<ul>
    <li>As the matches with a not-NaN value for <strong>date2</strong> are played on date2, we are going to have an ultimate date variable <strong>"match_date"</strong> holding the date on which the match was held on.</li>
    <li><strong>Gender</strong> will be removed all the matches are of Men's category and hence <strong>Gender</strong> variable holds <strong>zero</strong> information.</li>
    <li><strong>Neutral Venue</strong> will also converted to boolean</li>
    <li>Umpires don't have any influence on the outcome of a game(Assuming they are fair). As of now, I am not going to consider Umpire data for further analysis</li>
    <li><strong>Winner</strong> and <strong>outcome</strong> go hand in hand. So, combining these two variables will make our data clear.</li>
</ul>

<div>After cleaning the data as mentioned above, we are going to have a look at <strong>match_number, method, player_of_match, eliminator</strong> columns.</div>
</div>

In [14]:
#merging date and date2
match_dates = []
for date1, date2 in zip(metadata.date.values, metadata.date2.values):
    if type(date2)==float:
        match_dates.append(date1)
    else:
        match_dates.append(date2)

metadata['match_date'] = match_dates
metadata.drop(['date', 'date2'], axis=1, inplace=True)

In [15]:
#removing umpire related variables
metadata.drop('gender', axis=1, inplace=True)
metadata['neutralvenue'] = meta['neutralvenue'].map(lambda x: 0 if type(x)==float else 1)
metadata.drop(['reserve_umpire', 'tv_umpire', 'umpire', 'umpire2', 'match_referee'], axis=1, inplace=True)

In [16]:
#merging outcome and winner attribute values.
outcomes=[]
for out, win in zip(metadata['outcome'], metadata['winner']):
    if type(win)==float:
        outcomes.append(out)
    else:
        outcomes.append(win)

metadata['winner'] = outcomes
metadata.drop('outcome', axis=1, inplace=True)

<h2 style="text-align: center; font-size:2.5em;"> match_number </h2>

In [17]:
metadata.loc[(metadata.match_number.isnull())].season.value_counts()

2018       4
2012       4
2009/10    4
2019       4
2016       4
2011       4
2013       4
2014       4
2017       4
2015       4
2009       3
2007/08    3
Name: season, dtype: int64

<div style="border: 1px solid black; padding: 1.5em; line-height:2; border-radius: 15px;">
    <div>The variable <strong>match_number</strong> has several null values. But the null values are evenly divided among the seasons(3 in first two seasons and 4 in rest of them). </div>
    <div>Again, this cannot be coincidence(or null values have some meaning here). They probably represent knockout-stage matches.</div>
    <div>Lets see if our initial guess is true.</div>
</div>
    

In [18]:
metadata.loc[(metadata.match_number.isnull())]

Unnamed: 0,city,competition,eliminator,match_id,match_number,method,neutralvenue,player_of_match,season,team,team2,toss_decision,toss_winner,venue,winner,winner_runs,winner_wickets,match_date
229,Mumbai,Indian Premier League,,336038.csv,,,0,SR Watson,2007/08,Delhi Daredevils,Rajasthan Royals,field,Delhi Daredevils,Wankhede Stadium,Rajasthan Royals,105.0,,2008-05-30
230,Mumbai,Indian Premier League,,336039.csv,,,0,M Ntini,2007/08,Chennai Super Kings,Kings XI Punjab,bat,Kings XI Punjab,Wankhede Stadium,Chennai Super Kings,,9.0,2008-05-31
231,Mumbai,Indian Premier League,,336040.csv,,,0,YK Pathan,2007/08,Chennai Super Kings,Rajasthan Royals,field,Rajasthan Royals,Dr DY Patil Sports Academy,Rajasthan Royals,,3.0,2008-06-01
286,Centurion,Indian Premier League,,392237.csv,,,1,AC Gilchrist,2009,Delhi Daredevils,Deccan Chargers,field,Deccan Chargers,SuperSport Park,Deccan Chargers,,6.0,2009-05-22
287,Johannesburg,Indian Premier League,,392238.csv,,,1,MK Pandey,2009,Royal Challengers Bangalore,Chennai Super Kings,field,Royal Challengers Bangalore,New Wanderers Stadium,Royal Challengers Bangalore,,6.0,2009-05-23
288,Johannesburg,Indian Premier League,,392239.csv,,,1,A Kumble,2009,Royal Challengers Bangalore,Deccan Chargers,field,Royal Challengers Bangalore,New Wanderers Stadium,Deccan Chargers,6.0,,2009-05-24
345,Mumbai,Indian Premier League,,419162.csv,,,0,KA Pollard,2009/10,Royal Challengers Bangalore,Mumbai Indians,bat,Mumbai Indians,Dr DY Patil Sports Academy,Mumbai Indians,35.0,,2010-04-21
346,Mumbai,Indian Premier League,,419163.csv,,,0,DE Bollinger,2009/10,Chennai Super Kings,Deccan Chargers,bat,Chennai Super Kings,Dr DY Patil Sports Academy,Chennai Super Kings,38.0,,2010-04-22
347,Mumbai,Indian Premier League,,419164.csv,,,0,A Kumble,2009/10,Royal Challengers Bangalore,Deccan Chargers,bat,Deccan Chargers,Dr DY Patil Sports Academy,Royal Challengers Bangalore,,9.0,2010-04-24
348,Mumbai,Indian Premier League,,419165.csv,,,0,SK Raina,2009/10,Chennai Super Kings,Mumbai Indians,bat,Chennai Super Kings,Dr DY Patil Sports Academy,Chennai Super Kings,22.0,,2010-04-25


<div style="border: 1px solid black; padding: 1.5em; line-height:2; border-radius: 15px;">
<ul>
<li>Indeed, all the matches with no match_number attribute correspond to knockout/playoff matches. IPL used to follow two semi-final format during the initial seasons. IPL officials later changed the structure of playoffs/knockouts favouring the top two teams that out-performed rest of the teams in the league stage of the game. Former(2 semi-final) format comprised of three matches after the league stage of IPL whereas the latter format resulted in 4 matches after the round-robin part of the league.</li>

<li>As the <strong>match_number</strong> is just a serial number we can remove <strong>match_number</strong> variable.</li>
<li>According to our exploration, <em><strong>null values in match_number represent knockout stage matches</strong></em>. We can retain this information in a new column before removing the match_number column.</li>
</ul>
</div>

In [19]:
# adding 'is_league_match' attribute to the data
metadata['is_league_match'] = meta['match_number'].isnull().astype(int)

#removing 'match_number' attribute from the data.
metadata.drop('match_number', axis=1, inplace=True)

<h2 style="text-align: center; font-size:2.5em;"> Eliminator </h2>

In [20]:
metadata.loc[np.logical_not(metadata.eliminator.isnull())]

Unnamed: 0,city,competition,eliminator,match_id,method,neutralvenue,player_of_match,season,team,team2,toss_decision,toss_winner,venue,winner,winner_runs,winner_wickets,match_date,is_league_match
240,Cape Town,Indian Premier League,Rajasthan Royals,392190.csv,,1,YK Pathan,2009,Kolkata Knight Riders,Rajasthan Royals,field,Kolkata Knight Riders,Newlands,tie,,,2009-04-23,0
304,Chennai,Indian Premier League,Kings XI Punjab,419121.csv,,0,J Theron,2009/10,Chennai Super Kings,Kings XI Punjab,field,Chennai Super Kings,"""MA Chidambaram Stadiu",tie,,,2010-03-21,0
502,Hyderabad,Indian Premier League,Sunrisers Hyderabad,598004.csv,,0,GH Vihari,2013,Sunrisers Hyderabad,Royal Challengers Bangalore,bat,Royal Challengers Bangalore,"""Rajiv Gandhi International Stadiu",tie,,,2013-04-07,0
515,Bangalore,Indian Premier League,Royal Challengers Bangalore,598017.csv,,0,V Kohli,2013,Royal Challengers Bangalore,Delhi Daredevils,field,Royal Challengers Bangalore,M Chinnaswamy Stadium,tie,,,2013-04-16,0
590,Abu Dhabi,Indian Premier League,Rajasthan Royals,729315.csv,,1,JP Faulkner,2014,Kolkata Knight Riders,Rajasthan Royals,bat,Rajasthan Royals,Sheikh Zayed Stadium,tie,,,2014-04-29,0
650,Ahmedabad,Indian Premier League,Kings XI Punjab,829741.csv,,0,SE Marsh,2015,Rajasthan Royals,Kings XI Punjab,field,Kings XI Punjab,"""Sardar Patel Stadiu",tie,,,2015-04-21,0
33,Rajkot,Indian Premier League,Mumbai Indians,1082625.csv,,0,KH Pandya,2017,Gujarat Lions,Mumbai Indians,bat,Gujarat Lions,Saurashtra Cricket Association Stadium,tie,,,2017-04-29,0
123,Delhi,Indian Premier League,Delhi Capitals,1175365.csv,,0,PP Shaw,2019,Delhi Capitals,Kolkata Knight Riders,field,Delhi Capitals,Feroz Shah Kotla,tie,,,2019-03-30,0
164,Mumbai,Indian Premier League,Mumbai Indians,1178426.csv,,0,JJ Bumrah,2019,Mumbai Indians,Sunrisers Hyderabad,bat,Mumbai Indians,Wankhede Stadium,tie,,,2019-05-02,0


In [21]:
for matchcsv in metadata.loc[np.logical_not(metadata.eliminator.isnull())].match_id.values:
    temp = pd.read_csv('balldata/'+matchcsv)
    print("Number of innings in {0} : {1}".format(matchcsv, temp.loc[:,'innings'].nunique()))

Number of innings in 392190.csv : 4
Number of innings in 419121.csv : 4
Number of innings in 598004.csv : 4
Number of innings in 598017.csv : 4
Number of innings in 729315.csv : 4
Number of innings in 829741.csv : 4
Number of innings in 1082625.csv : 4
Number of innings in 1175365.csv : 4
Number of innings in 1178426.csv : 4


<div style="border: 1px solid black; padding: 1.5em; line-height:2; border-radius: 15px;">
    <ul>
    <li>All the matches which have non-null value for <strong>eliminator</strong> represent the tie matches where the winner is decided by a super-over</li>
    <li>We can merge the <strong>winner</strong> column with <strong>eliminator</strong> column and have a new column <strong>'has_super_over'</strong> to show if the corresponding match led to a super-over ot not. </li>
    </ul>

In [22]:
#merging eliminator and winner attributes.

outcomes = []
for out, win in zip(metadata.eliminator.values, metadata.winner.values):
    if win=="tie":
        outcomes.append(out)
    else:
        outcomes.append(win)

metadata.loc[:, 'winner'] = outcomes
metadata['has_super_over'] = 1 - metadata.eliminator.isnull().astype(int)
metadata.drop('eliminator', axis=1, inplace=True)

In [23]:
metadata.has_super_over.value_counts()

0    742
1      9
Name: has_super_over, dtype: int64

<h2 style="text-align: center; font-size:2.5em;"> Method </h2>

In [24]:
print("Number of matches with 'method' attribute : {}".format(metadata.method.size - metadata.method.isnull().sum()))

Number of matches with 'method' attribute : 19


In [25]:
metadata.loc[np.logical_not(metadata.method.isnull())]

Unnamed: 0,city,competition,match_id,method,neutralvenue,player_of_match,season,team,team2,toss_decision,toss_winner,venue,winner,winner_runs,winner_wickets,match_date,is_league_match,has_super_over
214,Delhi,Indian Premier League,336022.csv,D/L,0,DPMD Jayawardene,2007/08,Delhi Daredevils,Kings XI Punjab,bat,Delhi Daredevils,Feroz Shah Kotla,Kings XI Punjab,6.0,,2008-05-17,0,0
217,Kolkata,Indian Premier League,336025.csv,D/L,0,M Ntini,2007/08,Kolkata Knight Riders,Chennai Super Kings,bat,Kolkata Knight Riders,Eden Gardens,Chennai Super Kings,3.0,,2008-05-18,0,0
234,Cape Town,Indian Premier League,392183.csv,D/L,1,DL Vettori,2009,Delhi Daredevils,Kings XI Punjab,field,Delhi Daredevils,Newlands,Delhi Daredevils,,10.0,2009-04-19,0,0
237,Durban,Indian Premier League,392186.csv,D/L,1,CH Gayle,2009,Kings XI Punjab,Kolkata Knight Riders,field,Kolkata Knight Riders,Kingsmead,Kolkata Knight Riders,11.0,,2009-04-21,0,0
263,Centurion,Indian Premier League,392214.csv,D/L,1,ML Hayden,2009,Chennai Super Kings,Kings XI Punjab,bat,Chennai Super Kings,SuperSport Park,Chennai Super Kings,12.0,,2009-05-07,0,0
366,Kochi,Indian Premier League,501215.csv,D/L,0,BB McCullum,2011,Kochi Tuskers Kerala,Chennai Super Kings,field,Kochi Tuskers Kerala,Nehru Stadium,Kochi Tuskers Kerala,,7.0,2011-04-18,0,0
395,Kolkata,Indian Premier League,501245.csv,D/L,0,Iqbal Abdulla,2011,Kolkata Knight Riders,Chennai Super Kings,bat,Chennai Super Kings,Eden Gardens,Kolkata Knight Riders,10.0,,2011-05-07,0,0
405,Bangalore,Indian Premier League,501255.csv,D/L,0,CH Gayle,2011,Royal Challengers Bangalore,Kolkata Knight Riders,field,Royal Challengers Bangalore,M Chinnaswamy Stadium,Royal Challengers Bangalore,,4.0,2011-05-14,0,0
603,Delhi,Indian Premier League,733993.csv,D/L,0,DW Steyn,2014,Delhi Daredevils,Sunrisers Hyderabad,field,Sunrisers Hyderabad,Feroz Shah Kotla,Sunrisers Hyderabad,,8.0,2014-05-10,0,0
651,Visakhapatnam,Indian Premier League,829743.csv,D/L,0,DA Warner,2015,Sunrisers Hyderabad,Kolkata Knight Riders,field,Kolkata Knight Riders,Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket St...,Sunrisers Hyderabad,16.0,,2015-04-22,0,0


In [26]:
metadata.method.value_counts()

D/L    19
Name: method, dtype: int64

<div style="border: 1px solid black; padding: 1.5em; line-height:2; border-radius: 15px;">
<ul>
    <li>All the matches, where the final outcome is decided by <a href="https://en.wikipedia.org/wiki/Duckworth%E2%80%93Lewis%E2%80%93Stern_method" target="_blank"><b>DLS</b> method</a> has <b>method</b> attribute set to the value <b>"D/L"</b></li>
    <li>DLS method is only used, if rain(or any other disturbance) has delayed the match for significant time.</li>
    <li> Hence, <b>method</b> attribute can be converted into a <b>boolean holder</b></li>
</ul>
</div>

In [27]:
metadata.loc[:, 'method'] = np.logical_not(meta.method.isnull()).astype(int)
metadata.method.value_counts()

0    732
1     19
Name: method, dtype: int64

<h2 style="text-align: center; font-size:2.5em;"> Player_of_match </h2>

In [28]:
metadata.loc[metadata.player_of_match.isnull()]

Unnamed: 0,city,competition,match_id,method,neutralvenue,player_of_match,season,team,team2,toss_decision,toss_winner,venue,winner,winner_runs,winner_wickets,match_date,is_league_match,has_super_over
415,Delhi,Indian Premier League,501265.csv,0,0,,2011,Delhi Daredevils,Pune Warriors,bat,Delhi Daredevils,Feroz Shah Kotla,no result,,,2011-05-21,0,0
660,Bangalore,Indian Premier League,829763.csv,0,0,,2015,Royal Challengers Bangalore,Rajasthan Royals,field,Rajasthan Royals,M Chinnaswamy Stadium,no result,,,2015-04-29,0,0
685,Bangalore,Indian Premier League,829813.csv,0,0,,2015,Royal Challengers Bangalore,Delhi Daredevils,field,Royal Challengers Bangalore,M Chinnaswamy Stadium,no result,,,2015-05-17,0,0
162,Bengaluru,Indian Premier League,1178424.csv,0,0,,2019,Royal Challengers Bangalore,Rajasthan Royals,field,Rajasthan Royals,M.Chinnaswamy Stadium,no result,,,2019-04-30,0,0


In [29]:
metadata.winner.value_counts()['no result']

4

<div style="border: 1px solid black; padding: 1.5em; line-height:2; border-radius: 15px;">
<ul>
    <li>There are 4 matches with no 'player_of_match' attribute.</li>
    <li>Further inspection shows all these four matches have attribute <b>method</b> set to <b>"D/L"</b>.</li>
    <li>Rain wiped out the whole time allocated to the corresponding match. Hence we neither have a winner nor a player_of_match for such matches.</li>
</ul>
</div>

In [30]:
metadata.isnull().sum()

city                 0
competition          0
match_id             0
method               0
neutralvenue         0
player_of_match      4
season               0
team                 0
team2                0
toss_decision        0
toss_winner          0
venue                0
winner               0
winner_runs        414
winner_wickets     350
match_date           0
is_league_match      0
has_super_over       0
dtype: int64

<div style="border: 1px solid black; padding: 1.5em; line-height:2; border-radius: 15px;">
<div>Our data looks much clean now :)</div>
</div>

<h2 style="text-align: center; font-size:2.5em;"> Feature Augmentation </h2>
<div style="border: 1px solid black; padding: 1.5em; line-height:2; border-radius: 15px;">
    <div>This is the time to reinforce our analysis with some custom variables<div>
    <div>
        <div>New variables to add in our data:</div>
        <ul>
            <li>toss_looser :- Team which lost the toss.</li>
            <li>bat1 :- first batting team</li>
            <li>bat2 :- second batting team</li>
            <li>toss_winner_is_match_winner :- <b>True</b> if the team which won the <b>toss</b> eventually <b>wins the match</b></li>
            <li>score1 :- Runs scored by the team batting first.</li>
            <li>score2 :- Runs scored bt the team batting second.</li>
            <li>runrate_diff :- difference in runrate of both the teams</li>
            <li>balls1 :- Balls faced by team batting first</li>
            <li>balls2 :- Balls faced by team batting second</li>
            <li>mom_team :- Team of "player of match"</li>
        </ul>
    </div>
    <div>
        <div>But, before moving ahead let's make some changes to the structure of the data.</div>
        <ul>
            <li>Rename <b>"team"</b> and <b>"team2"</b> as <b>"home_team"</b> and <b>"away_team"</b> respectively</li>
            <li>Create a LabelEncoder mapping team name to an integer</li>
            <li>Encode team names according to the above created LabelEncoder in the whole dataset.</li>
            <li>Remove the redundant <b>competition</b> variable</li>
        </ul>
    </div>
</div>

In [31]:
metadata.rename({'team':'home_team', 'team2':'away_team'}, axis=1, inplace=True)

In [32]:
metadata.drop('competition', axis=1, inplace=True)

In [33]:
team_name_encoder = LabelEncoder()
team_name_encoder.fit(metadata.winner.values)
team_name_encoder.classes_.size

16

In [34]:
for idx,team in enumerate(team_name_encoder.classes_):
    print('{}. {}'.format(idx+1, team))

1. Chennai Super Kings
2. Deccan Chargers
3. Delhi Capitals
4. Delhi Daredevils
5. Gujarat Lions
6. Kings XI Punjab
7. Kochi Tuskers Kerala
8. Kolkata Knight Riders
9. Mumbai Indians
10. Pune Warriors
11. Rajasthan Royals
12. Rising Pune Supergiant
13. Rising Pune Supergiants
14. Royal Challengers Bangalore
15. Sunrisers Hyderabad
16. no result


In [35]:
metadata.replace({"Delhi Capitals": "Delhi Daredevils", "Rising Pune Supergiant": "Rising Pune Supergiants"}, inplace=True)

In [36]:
team_name_encoder = LabelEncoder()
team_name_encoder.fit(metadata.winner.values)
team_name_encoder.classes_.size

14

In [37]:
#encoding "team name"
#for col in ['home_team', 'away_team', 'toss_winner', 'winner']:
#   metadata[col] = team_name_encoder.transform(metadata[col])

In [38]:
loosers = []
for home, away, winner in zip(metadata.home_team.values, metadata.away_team.values, metadata.toss_winner.values):
    if winner == home:
        loosers.append(away)
    else:
        loosers.append(home)
        
metadata['toss_looser'] = loosers

In [39]:
bat1=[]
bat2=[]

for decision, winner, looser in zip(metadata.toss_decision.values, metadata.toss_winner.values, metadata.toss_looser.values):
    if decision == "bat":
        bat1.append(winner)
        bat2.append(looser)
    else:
        bat1.append(looser)
        bat2.append(winner)

metadata['bat1'] = bat1
metadata['bat2'] = bat2

In [40]:
metadata['toss_winner_is_winner'] = (metadata['toss_winner']==metadata['winner'])
metadata['toss_winner_is_winner'] = metadata['toss_winner_is_winner'].astype(int)

In [41]:
def score(match, innings):
    delivery_data = pd.read_csv('balldata/'+match)
    if innings > delivery_data['innings'].nunique():
        return np.nan
    
    delivery_data = delivery_data.groupby('innings').sum().loc[innings]
    return delivery_data['runs']+delivery_data['extras']
    
def extras(match, innings):
    delivery_data = pd.read_csv('balldata/'+match)
    if innings> delivery_data['innings'].nunique():
        return np.nan
    return delivery_data.groupby('innings').sum()['extras'][innings]

def delivery_count(match, innings):
    delivery_data = pd.read_csv('balldata/'+match)
    if innings > delivery_data['innings'].nunique():
        return np.nan
    return delivery_data.loc[delivery_data.innings==innings].shape[0]

def runrate(match, innings):
    runs = score(match, innings)
    delivery_data = pd.read_csv('balldata/'+match)
    if innings > delivery_data['innings'].nunique():
        return None
    
    lastball = delivery_data.loc[delivery_data.innings==innings]['over'].values[-1]
    legal_balls = (lastball//1)*6 + (lastball*10)%10
    
    return (6*runs)/legal_balls

In [42]:
start = time.time()
score1 = []
score2 = []
extras1 = []
extras2 = []
balls1 = []
balls2 =[]
runrate1 = []
runrate2 = []

for match in metadata.match_id.values:
    score1.append(score(match, 1))
    extras1.append(extras(match, 2))
    balls1.append(delivery_count(match, 1))
    runrate1.append(runrate(match, 1))
    
    score2.append(score(match, 2))
    extras2.append(extras(match, 1))
    balls2.append(delivery_count(match, 2))
    runrate2.append(runrate(match, 2))

metadata['score1'] = score1
metadata['score2'] = score2
metadata['extras1'] = extras1
metadata['extras2'] = extras2
metadata['balls1'] = balls1
metadata['balls2'] = balls2
metadata['runrate1'] = runrate1
metadata['runrate2'] = runrate2

print("Excecution Time : {0:.3f} s".format(time.time()-start))

Excecution Time : 26.221 s


In [43]:
metadata.head()

Unnamed: 0,city,match_id,method,neutralvenue,player_of_match,season,home_team,away_team,toss_decision,toss_winner,venue,winner,winner_runs,winner_wickets,match_date,is_league_match,has_super_over,toss_looser,bat1,bat2,toss_winner_is_winner,score1,score2,extras1,extras2,balls1,balls2,runrate1,runrate2
174,Bangalore,335982.csv,0,0,BB McCullum,2007/08,Royal Challengers Bangalore,Kolkata Knight Riders,field,Royal Challengers Bangalore,M Chinnaswamy Stadium,Kolkata Knight Riders,140.0,,2008-04-18,0,0,Kolkata Knight Riders,Kolkata Knight Riders,Royal Challengers Bangalore,0,222.0,82.0,19.0,17,124,101.0,11.1,5.347826
176,Delhi,335984.csv,0,0,MF Maharoof,2007/08,Delhi Daredevils,Rajasthan Royals,bat,Rajasthan Royals,Feroz Shah Kotla,Delhi Daredevils,,9.0,2008-04-19,0,0,Delhi Daredevils,Rajasthan Royals,Delhi Daredevils,0,129.0,132.0,10.0,7,122,97.0,6.45,8.703297
175,Chandigarh,335983.csv,0,0,MEK Hussey,2007/08,Kings XI Punjab,Chennai Super Kings,bat,Chennai Super Kings,"""Punjab Cricket Association Stadiu",Chennai Super Kings,33.0,,2008-04-19,0,0,Kings XI Punjab,Chennai Super Kings,Kings XI Punjab,1,240.0,207.0,11.0,6,124,124.0,12.0,10.35
177,Mumbai,335985.csv,0,0,MV Boucher,2007/08,Mumbai Indians,Royal Challengers Bangalore,bat,Mumbai Indians,Wankhede Stadium,Royal Challengers Bangalore,,5.0,2008-04-20,0,0,Royal Challengers Bangalore,Mumbai Indians,Royal Challengers Bangalore,0,165.0,166.0,5.0,11,123,123.0,8.25,8.369748
178,Kolkata,335986.csv,0,0,DJ Hussey,2007/08,Kolkata Knight Riders,Deccan Chargers,bat,Deccan Chargers,Eden Gardens,Kolkata Knight Riders,,5.0,2008-04-20,0,0,Kolkata Knight Riders,Deccan Chargers,Kolkata Knight Riders,0,110.0,112.0,28.0,10,118,122.0,5.892857,5.894737


In [44]:
metadata.index = np.arange(metadata.shape[0]).astype(int)

In [45]:
metadata.head()

Unnamed: 0,city,match_id,method,neutralvenue,player_of_match,season,home_team,away_team,toss_decision,toss_winner,venue,winner,winner_runs,winner_wickets,match_date,is_league_match,has_super_over,toss_looser,bat1,bat2,toss_winner_is_winner,score1,score2,extras1,extras2,balls1,balls2,runrate1,runrate2
0,Bangalore,335982.csv,0,0,BB McCullum,2007/08,Royal Challengers Bangalore,Kolkata Knight Riders,field,Royal Challengers Bangalore,M Chinnaswamy Stadium,Kolkata Knight Riders,140.0,,2008-04-18,0,0,Kolkata Knight Riders,Kolkata Knight Riders,Royal Challengers Bangalore,0,222.0,82.0,19.0,17,124,101.0,11.1,5.347826
1,Delhi,335984.csv,0,0,MF Maharoof,2007/08,Delhi Daredevils,Rajasthan Royals,bat,Rajasthan Royals,Feroz Shah Kotla,Delhi Daredevils,,9.0,2008-04-19,0,0,Delhi Daredevils,Rajasthan Royals,Delhi Daredevils,0,129.0,132.0,10.0,7,122,97.0,6.45,8.703297
2,Chandigarh,335983.csv,0,0,MEK Hussey,2007/08,Kings XI Punjab,Chennai Super Kings,bat,Chennai Super Kings,"""Punjab Cricket Association Stadiu",Chennai Super Kings,33.0,,2008-04-19,0,0,Kings XI Punjab,Chennai Super Kings,Kings XI Punjab,1,240.0,207.0,11.0,6,124,124.0,12.0,10.35
3,Mumbai,335985.csv,0,0,MV Boucher,2007/08,Mumbai Indians,Royal Challengers Bangalore,bat,Mumbai Indians,Wankhede Stadium,Royal Challengers Bangalore,,5.0,2008-04-20,0,0,Royal Challengers Bangalore,Mumbai Indians,Royal Challengers Bangalore,0,165.0,166.0,5.0,11,123,123.0,8.25,8.369748
4,Kolkata,335986.csv,0,0,DJ Hussey,2007/08,Kolkata Knight Riders,Deccan Chargers,bat,Deccan Chargers,Eden Gardens,Kolkata Knight Riders,,5.0,2008-04-20,0,0,Kolkata Knight Riders,Deccan Chargers,Kolkata Knight Riders,0,110.0,112.0,28.0,10,118,122.0,5.892857,5.894737


In [46]:
print("notebook excecution time : ", end="")
print("{:.3f} s".format(time.time() - notebook_start_time))

notebook excecution time : 51.602 s


In [47]:
metadata.to_csv('metadata/metadata.csv', index=False)