
# CRICKET AND CODING - CRACK THE CHALLENGE 
## Problem Statement:

    Given certain input parameters regarding an innings of a T20 cricket match, predict the total runs scored by the batting team at the end of 6 overs.

## Input Data:
    the link to the Dataset (Source: cricsheet.org) which contains historic data of T20 matches that have occurred in the past. [https://cricsheet.org/downloads/ipl_male_csv2.zip] 



# 1. Download IPL Dataset

In [1]:
import requests, zipfile, io,glob,shutil,json
from pathlib import Path
from sklearn.preprocessing import LabelEncoder
from pprint import pprint

import pandas as pd

In [2]:
data_path = Path(Path.cwd(),'data')

if not data_path.exists():
    # https://internalapp.nptel.ac.in/contest/reference_docs/ipl_csv2.zip
    # https://cricsheet.org/downloads/ipl_csv.zip
    # https://cricsheet.org/downloads/ipl_male_csv2.zip
    res = requests.get(r'https://internalapp.nptel.ac.in/contest/reference_docs/ipl_csv2.zip',stream=True)
    if res.status_code == 200:
        print('### Downloading the Dataset')
        z = zipfile.ZipFile(io.BytesIO(res.content))
        z.extractall(data_path)




In [23]:
df = pd.read_csv('data\\all_matches.csv',parse_dates = ['start_date'])
#df_orig = df
print(df.columns)
df.head()

Index(['match_id', 'season', 'start_date', 'venue', 'innings', 'ball',
       'batting_team', 'bowling_team', 'striker', 'non_striker', 'bowler',
       'runs_off_bat', 'extras', 'wides', 'noballs', 'byes', 'legbyes',
       'penalty', 'wicket_type', 'player_dismissed', 'other_wicket_type',
       'other_player_dismissed'],
      dtype='object')
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,...,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed
0,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,0.1,Kolkata Knight Riders,Royal Challengers Bangalore,SC Ganguly,BB McCullum,...,1,,,,1.0,,,,,
1,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,0.2,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,...,0,,,,,,,,,
2,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,0.3,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,...,1,1.0,,,,,,,,
3,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,0.4,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,...,0,,,,,,,,,
4,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,0.5,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,...,0,,,,,,,,,


In [35]:
df.loc[df['start_date'].dt.year == 2021].groupby('batting_team')['striker'].unique().to_dict()

{'Chennai Super Kings': array(['RD Gaikwad', 'F du Plessis', 'MM Ali', 'SK Raina', 'AT Rayudu',
        'RA Jadeja', 'MS Dhoni', 'SM Curran'], dtype=object),
 'Delhi Capitals': array(['PP Shaw', 'S Dhawan', 'RR Pant', 'MP Stoinis'], dtype=object),
 'Kolkata Knight Riders': array(['N Rana', 'Shubman Gill', 'RA Tripathi', 'AD Russell',
        'EJG Morgan', 'KD Karthik', 'Shakib Al Hasan'], dtype=object),
 'Mumbai Indians': array(['RG Sharma', 'CA Lynn', 'SA Yadav', 'Ishan Kishan', 'HH Pandya',
        'KA Pollard', 'KH Pandya', 'M Jansen', 'JJ Bumrah'], dtype=object),
 'Royal Challengers Bangalore': array(['Washington Sundar', 'V Kohli', 'RM Patidar', 'GJ Maxwell',
        'AB de Villiers', 'Shahbaz Ahmed', 'DT Christian', 'KA Jamieson',
        'HV Patel', 'Mohammed Siraj'], dtype=object),
 'Sunrisers Hyderabad': array(['WP Saha', 'DA Warner', 'MK Pandey', 'JM Bairstow',
        'Mohammad Nabi', 'V Shankar', 'Abdul Samad'], dtype=object)}

# 2. Clean the dataset
1. Filter the current seasion -2021 match details (which we use to test)
1. Drop columns :  'season','start_date'
2. Delete the non existing teams : 
        'Kochi Tuskers Kerala' 'Pune Warriors','Rising Pune Supergiants', 'Rising Pune Supergiant','Gujarat Lions'
        
3. Replace the OLD team names with new team name
4. Replace the same venue details with unique names 
5. Can remove the innings >2 details 


In [4]:
#df = df.loc[df['start_date'].dt.year < 2021]



In [14]:
# 1. Drop columns 
df = df.drop(columns=['season','start_date']) #,inplace=True)

# 2. Delete Non-existing teams : 'Kochi Tuskers Kerala' 'Pune Warriors','Rising Pune Supergiants', 'Rising Pune Supergiant','Gujarat Lions'

mask_bat_team = df['batting_team'].isin(['Kochi Tuskers Kerala',
                                'Pune Warriors',
                                'Rising Pune Supergiants',
                                'Rising Pune Supergiant',
                                'Gujarat Lions'                                
                            ])

mask_bow_team = df['bowling_team'].isin(['Kochi Tuskers Kerala',
                                'Pune Warriors',
                                'Rising Pune Supergiants',
                                'Rising Pune Supergiant',
                                'Gujarat Lions'                                
                            ])
df = df[~mask_bat_team]
df = df[~mask_bow_team]
print(df.shape)

#3. Replace the old team names with new team name:
df.loc[df.batting_team == 'Delhi Daredevils','batting_team'] = 'Delhi Capitals'
df.loc[df.batting_team == 'Deccan Chargers','batting_team'] = 'Sunrisers Hyderabad'

df.loc[df.bowling_team == 'Delhi Daredevils','bowling_team'] = 'Delhi Capitals'
df.loc[df.bowling_team == 'Deccan Chargers','bowling_team'] = 'Sunrisers Hyderabad'



# 4. Replace venue column unique names :
df.loc[df.venue == 'M.Chinnaswamy Stadium','venue'] = 'M Chinnaswamy Stadium'
df.loc[df.venue == 'Punjab Cricket Association IS Bindra Stadium, Mohali','venue'] = 'Punjab Cricket Association Stadium'
df.loc[df.venue == 'Punjab Cricket Association IS Bindra Stadium','venue'] = 'Punjab Cricket Association Stadium'
df.loc[df.venue == 'Wankhede Stadium, Mumbai','venue'] = 'Wankhede Stadium'
df.loc[df.venue == 'Rajiv Gandhi International Stadium, Uppal','venue'] = 'Rajiv Gandhi International Stadium'
df.loc[df.venue == 'MA Chidambaram Stadium, Chepauk' ,'venue'] = 'MA Chidambaram Stadium'
df.loc[df.venue == 'MA Chidambaram Stadium, Chepauk, Chennai' ,'venue'] = 'MA Chidambaram Stadium'

pprint('### Total {} : venue details present '.format(len(df.venue.unique())))
pprint('### Total {}  : Batting teams are there'.format(len(df.batting_team.unique())))
pprint('### Total {}  : Bowlling teams are there'.format(len(df.bowling_team.unique())))

print(df.shape)

  df = df[~mask_bow_team]
(167335, 20)
'### Total 33 : venue details present '
'### Total 8  : Batting teams are there'
'### Total 8  : Bowlling teams are there'
(167335, 20)


In [20]:
#pd.DataFrame(df.groupby(['batting_team'])['striker'].unique())
df.groupby(['bowling_team'])['bowler'].unique().to_dict()

{'Chennai Super Kings': array(['JDP Oram', 'MS Gony', 'M Muralitharan', 'P Amarnath',
        'Joginder Sharma', 'JA Morkel', 'M Ntini', 'SK Raina', 'S Vidyut',
        'L Balaji', 'CK Kapugedera', 'T Thushara', 'A Flintoff', 'S Tyagi',
        'SB Jakati', 'R Ashwin', 'JM Kemp', 'NLTC Perera', 'C Ganapathy',
        'DE Bollinger', 'TG Southee', 'S Randiv', 'SB Styris', 'DJ Bravo',
        'RA Jadeja', 'F du Plessis', 'KMDN Kulasekara', 'VY Mahesh',
        'BW Hilfenhaus', 'DP Nannes', 'AS Rajpoot', 'B Laughlin',
        'CH Morris', 'MM Sharma', 'JO Holder', 'A Nehra', 'P Negi',
        'DR Smith', 'IC Pandey', 'S Badree', 'V Shankar', 'DJ Hussey',
        'JW Hastings', 'RG More', 'DL Chahar', 'Harbhajan Singh',
        'Imran Tahir', 'SN Thakur', 'MJ Santner', 'SC Kuggeleijn',
        'KV Sharma', 'SR Watson', 'MA Wood', 'L Ngidi', 'KM Asif',
        'DJ Willey', 'SM Curran', 'PP Chawla', 'JR Hazlewood',
        'Monu Kumar', 'MM Ali'], dtype=object),
 'Delhi Capitals': array(['GD

In [19]:
list(df.venue.unique())

['M Chinnaswamy Stadium',
 'Punjab Cricket Association Stadium, Mohali',
 'Feroz Shah Kotla',
 'Eden Gardens',
 'Wankhede Stadium',
 'Sawai Mansingh Stadium',
 'Rajiv Gandhi International Stadium',
 'MA Chidambaram Stadium',
 'Dr DY Patil Sports Academy',
 'Newlands',
 "St George's Park",
 'Kingsmead',
 'SuperSport Park',
 'Buffalo Park',
 'New Wanderers Stadium',
 'De Beers Diamond Oval',
 'OUTsurance Oval',
 'Brabourne Stadium',
 'Sardar Patel Stadium, Motera',
 'Barabati Stadium',
 'Vidarbha Cricket Association Stadium, Jamtha',
 'Himachal Pradesh Cricket Association Stadium',
 'Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium',
 'Subrata Roy Sahara Stadium',
 'Shaheed Veer Narayan Singh International Stadium',
 'JSCA International Stadium Complex',
 'Sheikh Zayed Stadium',
 'Sharjah Cricket Stadium',
 'Dubai International Cricket Stadium',
 'Maharashtra Cricket Association Stadium',
 'Punjab Cricket Association Stadium',
 'Holkar Cricket Stadium',
 'Arun Jaitley Stadium']

### Replace the team name in short

In [6]:

df.loc[df.batting_team == 'Kolkata Knight Riders', 'batting_team'] = 'KKR'
df.loc[df.batting_team == 'Royal Challengers Bangalore', 'batting_team'] = 'RCB'
df.loc[df.batting_team == 'Chennai Super Kings', 'batting_team'] = 'CSK'
df.loc[df.batting_team == 'Kings XI Punjab', 'batting_team'] = 'KXIP'
df.loc[df.batting_team == 'Rajasthan Royals', 'batting_team'] = 'RR'
df.loc[df.batting_team == 'Delhi Capitals', 'batting_team'] = 'DC'
df.loc[df.batting_team == 'Sunrisers Hyderabad', 'batting_team'] = 'SRH'
df.loc[df.batting_team == 'Mumbai Indians', 'batting_team'] = 'MI'

df.loc[df.bowling_team == 'Kolkata Knight Riders', 'bowling_team'] = 'KKR'
df.loc[df.bowling_team == 'Royal Challengers Bangalore', 'bowling_team'] = 'RCB'
df.loc[df.bowling_team == 'Chennai Super Kings', 'bowling_team'] = 'CSK'
df.loc[df.bowling_team == 'Kings XI Punjab', 'bowling_team'] = 'KXIP'
df.loc[df.bowling_team == 'Rajasthan Royals', 'bowling_team'] = 'RR'
df.loc[df.bowling_team == 'Delhi Capitals', 'bowling_team'] = 'DC'
df.loc[df.bowling_team == 'Sunrisers Hyderabad', 'bowling_team'] = 'SRH'
df.loc[df.bowling_team == 'Mumbai Indians', 'bowling_team'] = 'MI'

df.batting_team.unique()

array(['KKR', 'RCB', 'CSK', 'KXIP', 'RR', 'DC', 'SRH', 'MI'], dtype=object)

# Rename striker and bowler column names:


In [7]:
df = df.rename(columns= {'striker' : 'batsmen','non_striker' : 'batsmen_non_striker','bowler' : 'bowlers'})
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 167335 entries, 0 to 194353
Data columns (total 20 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   match_id                167335 non-null  int64  
 1   venue                   167335 non-null  object 
 2   innings                 167335 non-null  int64  
 3   ball                    167335 non-null  float64
 4   batting_team            167335 non-null  object 
 5   bowling_team            167335 non-null  object 
 6   batsmen                 167335 non-null  object 
 7   batsmen_non_striker     167335 non-null  object 
 8   bowlers                 167335 non-null  object 
 9   runs_off_bat            167335 non-null  int64  
 10  extras                  167335 non-null  int64  
 11  wides                   5119 non-null    float64
 12  noballs                 674 non-null     float64
 13  byes                    450 non-null     float64
 14  legbyes             

In [8]:
df['Total_score']  = df.runs_off_bat + df.extras
df = df.drop(columns = ['wides','noballs', 'byes', 'legbyes', 'penalty', 'wicket_type', 'other_wicket_type', 'other_player_dismissed'],axis=1)
df[(df.ball<6.0) & (df.innings < 3)].to_csv('1_cleaned_Data.csv',index=False)


In [9]:
df = pd.read_csv('1_cleaned_Data.csv')
df.shape

(52597, 13)

In [None]:
label_encode_dict = {}

le = LabelEncoder()
le.fit(df.batsmen)
batsmen_e = le.transform(df.batsmen)
batsmen_e_inv = le.inverse_transform(batsmen_e)

label_encode_dict['batsmen'] = dict(zip(batsmen_e_inv,map(int,batsmen_e)))

le.fit(df.bowlers)
bowlers_e = le.transform(df.bowlers)
bowlers_e_inv = le.inverse_transform(bowlers_e)

label_encode_dict['bowlers'] = dict(zip(bowlers_e_inv,map(int,bowlers_e)))

le.fit(df.venue)
venue_e = le.transform(df.venue)
venue_e_inv = le.inverse_transform(venue_e)

label_encode_dict['venue'] = dict(zip(venue_e_inv,map(int,venue_e)))


le.fit(df.batting_team)
batting_team_e = le.transform(df.batting_team)
batting_team_e_inv = le.inverse_transform(batting_team_e)

label_encode_dict['batting_team'] = dict(zip(batting_team_e_inv,map(int,batting_team_e)))

le.fit(df.bowling_team)
bowling_team_e = le.transform(df.bowling_team)
bowling_team_e_inv = le.inverse_transform(bowling_team_e)

label_encode_dict['bowling_team'] = dict(zip(bowling_team_e_inv,map(int,bowling_team_e)))



with open('label_encode.json','w') as f:
    json.dump(label_encode_dict,f)