In [1]:
import pandas as pd
import numpy as np

In [2]:
delivery = pd.read_csv('deliveries.csv')
match = pd.read_csv('matches.csv')

In [3]:
delivery.sample(3)

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batsman,non_striker,bowler,is_super_over,...,bye_runs,legbye_runs,noball_runs,penalty_runs,batsman_runs,extra_runs,total_runs,player_dismissed,dismissal_kind,fielder
81226,344,2,Royal Challengers Bangalore,Kolkata Knight Riders,11,1,CH Gayle,SS Tiwary,R Bhatia,0,...,0,0,0,0,0,0,0,,,
12267,52,2,Sunrisers Hyderabad,Gujarat Lions,9,1,DA Warner,V Shankar,MM Patel,0,...,0,0,0,0,4,0,4,,,
24660,106,2,Mumbai Indians,Kings XI Punjab,17,3,SR Tendulkar,RV Uthappa,IK Pathan,0,...,0,0,0,0,1,0,1,,,


In [4]:
match.sample(3)

Unnamed: 0,id,Season,city,date,team1,team2,toss_winner,toss_decision,result,dl_applied,winner,win_by_runs,win_by_wickets,player_of_match,venue,umpire1,umpire2,umpire3
43,44,IPL-2017,Delhi,06-05-2017,Mumbai Indians,Delhi Daredevils,Delhi Daredevils,field,normal,0,Mumbai Indians,146,0,LMP Simmons,Feroz Shah Kotla,Nitin Menon,CK Nandan,
671,7929,IPL-2018,Hyderabad,05-05-2018,Delhi Daredevils,Sunrisers Hyderabad,Delhi Daredevils,bat,normal,0,Sunrisers Hyderabad,0,7,Rashid Khan,"Rajiv Gandhi International Stadium, Uppal",Bruce Oxenford,O Nandan,Virender Kumar Sharma
561,562,IPL-2015,Mumbai,10-05-2015,Royal Challengers Bangalore,Mumbai Indians,Royal Challengers Bangalore,bat,normal,0,Royal Challengers Bangalore,39,0,AB de Villiers,Wankhede Stadium,JD Cloete,C Shamshuddin,


In [5]:
## Calculating the Total runs for each matches

In [6]:
total_score_df = delivery.groupby(['match_id', 'inning']).sum()['total_runs'].reset_index()

total_score_df.head(3)

Unnamed: 0,match_id,inning,total_runs
0,1,1,207
1,1,2,172
2,2,1,184


In [7]:
## Now we only want the data of the 1st innings to train the model so we would remove inning = 2

total_score_df = total_score_df[total_score_df['inning'] == 1]

In [8]:
## Now we will merge this data with the match dataset based on id and match_id

match_df = match.merge(total_score_df[['match_id', 'total_runs']], left_on='id', right_on='match_id')

match_df.head(4)

Unnamed: 0,id,Season,city,date,team1,team2,toss_winner,toss_decision,result,dl_applied,winner,win_by_runs,win_by_wickets,player_of_match,venue,umpire1,umpire2,umpire3,match_id,total_runs
0,1,IPL-2017,Hyderabad,05-04-2017,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,normal,0,Sunrisers Hyderabad,35,0,Yuvraj Singh,"Rajiv Gandhi International Stadium, Uppal",AY Dandekar,NJ Llong,,1,207
1,2,IPL-2017,Pune,06-04-2017,Mumbai Indians,Rising Pune Supergiant,Rising Pune Supergiant,field,normal,0,Rising Pune Supergiant,0,7,SPD Smith,Maharashtra Cricket Association Stadium,A Nand Kishore,S Ravi,,2,184
2,3,IPL-2017,Rajkot,07-04-2017,Gujarat Lions,Kolkata Knight Riders,Kolkata Knight Riders,field,normal,0,Kolkata Knight Riders,0,10,CA Lynn,Saurashtra Cricket Association Stadium,Nitin Menon,CK Nandan,,3,183
3,4,IPL-2017,Indore,08-04-2017,Rising Pune Supergiant,Kings XI Punjab,Kings XI Punjab,field,normal,0,Kings XI Punjab,0,6,GJ Maxwell,Holkar Cricket Stadium,AK Chaudhary,C Shamshuddin,,4,163


In [9]:
match_df['team1'].unique()

array(['Sunrisers Hyderabad', 'Mumbai Indians', 'Gujarat Lions',
       'Rising Pune Supergiant', 'Royal Challengers Bangalore',
       'Kolkata Knight Riders', 'Delhi Daredevils', 'Kings XI Punjab',
       'Chennai Super Kings', 'Rajasthan Royals', 'Deccan Chargers',
       'Kochi Tuskers Kerala', 'Pune Warriors', 'Rising Pune Supergiants',
       'Delhi Capitals'], dtype=object)

In [10]:
## we will remove the teams which are not a part of the IPL any more

teams = [
    'Surisers Hyderabad',
    'Mumbai Indians',
    'Royal Challengers Banglore',
    'Kolkata Knight Riders',
    'Kings XI Punjab',
    'Chennai Super Kings',
    'Rajasthan Royals',
    'Delhi Capitals'
]

In [11]:
match_df['team1'] = match_df['team1'].str.replace('Delhi Daredevils', 'Delhi Capitals')
match_df['team2'] = match_df['team2'].str.replace('Delhi Daredevils', 'Delhi Capitals')

match_df['team1'] = match_df['team1'].str.replace('Deccan Chargers', 'Sunrisers Hyderabad')
match_df['team2'] = match_df['team2'].str.replace('Deccan Chargers', 'Sunrisers Hyderabad')

In [12]:
match_df = match_df[match_df['team1'].isin(teams)]
match_df = match_df[match_df['team2'].isin(teams)]

In [13]:
match_df.shape

(335, 20)

In [14]:
## Finding how many matches were affected by rain and removing it

match_df['dl_applied'].value_counts()

0    326
1      9
Name: dl_applied, dtype: int64

In [15]:
match_df = match_df[match_df['dl_applied'] == 0]

In [16]:
match_df.shape

(326, 20)

In [17]:
## Now merging useful information from match_df to delivery

match_df = match_df[['match_id', 'city', 'winner', 'total_runs']]

In [18]:
delivery_df = match_df.merge(delivery, on='match_id')

In [19]:
delivery_df.head(4)

Unnamed: 0,match_id,city,winner,total_runs_x,inning,batting_team,bowling_team,over,ball,batsman,...,bye_runs,legbye_runs,noball_runs,penalty_runs,batsman_runs,extra_runs,total_runs_y,player_dismissed,dismissal_kind,fielder
0,7,Mumbai,Mumbai Indians,178,1,Kolkata Knight Riders,Mumbai Indians,1,1,G Gambhir,...,0,0,0,0,1,0,1,,,
1,7,Mumbai,Mumbai Indians,178,1,Kolkata Knight Riders,Mumbai Indians,1,2,CA Lynn,...,0,0,0,0,0,1,1,,,
2,7,Mumbai,Mumbai Indians,178,1,Kolkata Knight Riders,Mumbai Indians,1,3,CA Lynn,...,0,0,0,0,1,0,1,,,
3,7,Mumbai,Mumbai Indians,178,1,Kolkata Knight Riders,Mumbai Indians,1,4,G Gambhir,...,0,0,0,0,0,0,0,,,


In [20]:
delivery_df = delivery_df[delivery_df['inning'] == 2]

In [21]:
delivery_df.shape

(37882, 24)

In [22]:
delivery_df.head(2)

Unnamed: 0,match_id,city,winner,total_runs_x,inning,batting_team,bowling_team,over,ball,batsman,...,bye_runs,legbye_runs,noball_runs,penalty_runs,batsman_runs,extra_runs,total_runs_y,player_dismissed,dismissal_kind,fielder
128,7,Mumbai,Mumbai Indians,178,2,Mumbai Indians,Kolkata Knight Riders,1,1,PA Patel,...,0,0,0,0,0,0,0,,,
129,7,Mumbai,Mumbai Indians,178,2,Mumbai Indians,Kolkata Knight Riders,1,2,PA Patel,...,0,0,0,0,0,0,0,,,


In [23]:
## Making a new column named current score

delivery_df['current_score'] = delivery_df.groupby('match_id').cumsum()['total_runs_y']

In [24]:
delivery_df.head(7)

Unnamed: 0,match_id,city,winner,total_runs_x,inning,batting_team,bowling_team,over,ball,batsman,...,legbye_runs,noball_runs,penalty_runs,batsman_runs,extra_runs,total_runs_y,player_dismissed,dismissal_kind,fielder,current_score
128,7,Mumbai,Mumbai Indians,178,2,Mumbai Indians,Kolkata Knight Riders,1,1,PA Patel,...,0,0,0,0,0,0,,,,0
129,7,Mumbai,Mumbai Indians,178,2,Mumbai Indians,Kolkata Knight Riders,1,2,PA Patel,...,0,0,0,0,0,0,,,,0
130,7,Mumbai,Mumbai Indians,178,2,Mumbai Indians,Kolkata Knight Riders,1,3,PA Patel,...,0,0,0,4,0,4,,,,4
131,7,Mumbai,Mumbai Indians,178,2,Mumbai Indians,Kolkata Knight Riders,1,4,PA Patel,...,0,0,0,0,0,0,,,,4
132,7,Mumbai,Mumbai Indians,178,2,Mumbai Indians,Kolkata Knight Riders,1,5,PA Patel,...,0,0,0,1,0,1,,,,5
133,7,Mumbai,Mumbai Indians,178,2,Mumbai Indians,Kolkata Knight Riders,1,6,JC Buttler,...,0,0,0,4,0,4,,,,9
134,7,Mumbai,Mumbai Indians,178,2,Mumbai Indians,Kolkata Knight Riders,2,1,PA Patel,...,0,0,0,0,1,1,,,,10


In [25]:
## Now making a new coulumn named runs left

delivery_df['runs_left'] = delivery_df['total_runs_x'] - delivery_df['current_score']

In [26]:
## Now making a new column named balls left

delivery_df['balls_left'] = 126 - (delivery_df['over']*6 + delivery_df['ball'])

In [27]:
delivery_df.head(5)

Unnamed: 0,match_id,city,winner,total_runs_x,inning,batting_team,bowling_team,over,ball,batsman,...,penalty_runs,batsman_runs,extra_runs,total_runs_y,player_dismissed,dismissal_kind,fielder,current_score,runs_left,balls_left
128,7,Mumbai,Mumbai Indians,178,2,Mumbai Indians,Kolkata Knight Riders,1,1,PA Patel,...,0,0,0,0,,,,0,178,119
129,7,Mumbai,Mumbai Indians,178,2,Mumbai Indians,Kolkata Knight Riders,1,2,PA Patel,...,0,0,0,0,,,,0,178,118
130,7,Mumbai,Mumbai Indians,178,2,Mumbai Indians,Kolkata Knight Riders,1,3,PA Patel,...,0,4,0,4,,,,4,174,117
131,7,Mumbai,Mumbai Indians,178,2,Mumbai Indians,Kolkata Knight Riders,1,4,PA Patel,...,0,0,0,0,,,,4,174,116
132,7,Mumbai,Mumbai Indians,178,2,Mumbai Indians,Kolkata Knight Riders,1,5,PA Patel,...,0,1,0,1,,,,5,173,115


In [28]:
delivery_df['player_dismissed'] = delivery_df['player_dismissed'].fillna("0")
delivery_df['player_dismissed'] = delivery_df['player_dismissed'].apply(lambda x:x if x == "0" else "1")
delivery_df['player_dismissed'] = delivery_df['player_dismissed'].astype('int')
wickets = delivery_df.groupby('match_id').cumsum()['player_dismissed'].values
delivery_df['wickets'] = 10 - wickets
delivery_df.head()

Unnamed: 0,match_id,city,winner,total_runs_x,inning,batting_team,bowling_team,over,ball,batsman,...,batsman_runs,extra_runs,total_runs_y,player_dismissed,dismissal_kind,fielder,current_score,runs_left,balls_left,wickets
128,7,Mumbai,Mumbai Indians,178,2,Mumbai Indians,Kolkata Knight Riders,1,1,PA Patel,...,0,0,0,0,,,0,178,119,10
129,7,Mumbai,Mumbai Indians,178,2,Mumbai Indians,Kolkata Knight Riders,1,2,PA Patel,...,0,0,0,0,,,0,178,118,10
130,7,Mumbai,Mumbai Indians,178,2,Mumbai Indians,Kolkata Knight Riders,1,3,PA Patel,...,4,0,4,0,,,4,174,117,10
131,7,Mumbai,Mumbai Indians,178,2,Mumbai Indians,Kolkata Knight Riders,1,4,PA Patel,...,0,0,0,0,,,4,174,116,10
132,7,Mumbai,Mumbai Indians,178,2,Mumbai Indians,Kolkata Knight Riders,1,5,PA Patel,...,1,0,1,0,,,5,173,115,10


In [29]:
## current runrate

delivery_df['crr'] = (delivery_df['current_score']*6)/(120 - delivery_df['balls_left'])

In [30]:
## required run rate

delivery_df['rrr'] = (delivery_df['runs_left']*6)/(delivery_df['balls_left'])

In [31]:
delivery_df.shape

(37882, 30)

In [32]:
def result(row):
    return 1 if row['batting_team'] == row['winner'] else 0

In [33]:
delivery_df['result'] = delivery_df.apply(result, axis=1)

In [34]:
final_df = delivery_df[['batting_team', 'bowling_team', 'city', 'runs_left', 'balls_left', 'wickets', 'total_runs_x', 'crr', 'rrr', 'result']]

In [35]:
## shuffle the data

final_df = final_df.sample(final_df.shape[0])

In [36]:
final_df.sample(5)

Unnamed: 0,batting_team,bowling_team,city,runs_left,balls_left,wickets,total_runs_x,crr,rrr,result
185,Mumbai Indians,Kolkata Knight Riders,Mumbai,107,68,8,178,8.192308,9.441176,1
36863,Kolkata Knight Riders,Chennai Super Kings,Chennai,182,110,9,190,4.8,9.927273,1
6491,Chennai Super Kings,Delhi Daredevils,Delhi,135,88,10,187,9.75,9.204545,1
29970,Delhi Daredevils,Chennai Super Kings,Delhi,59,81,8,110,7.846154,4.37037,1
30222,Rajasthan Royals,Mumbai Indians,Mumbai,61,30,5,197,9.066667,12.2,0


In [37]:
final_df = final_df[final_df['balls_left'] != 0]

In [38]:
final_df.dropna(inplace=True)

In [39]:
## Implementing the model

In [40]:
X = final_df.iloc[:, :-1]
y = final_df.iloc[:, -1]

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [41]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

trf = ColumnTransformer([
    ('trf', OneHotEncoder(sparse_output=False, drop='first'), ['batting_team', 'bowling_team', 'city'])
], remainder='passthrough')

In [42]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

In [43]:
pipe = Pipeline(steps=[
    ('step1', trf),
    ('step2', LogisticRegression(solver='liblinear'))
])

In [44]:
pipe.fit(X_train, y_train)

In [45]:
y_pred = pipe.predict(X_test)

In [46]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_pred)

0.8402406417112299

In [47]:
teams

['Surisers Hyderabad',
 'Mumbai Indians',
 'Royal Challengers Banglore',
 'Kolkata Knight Riders',
 'Kings XI Punjab',
 'Chennai Super Kings',
 'Rajasthan Royals',
 'Delhi Capitals']

In [48]:
delivery_df['city'].unique()

array(['Mumbai', 'Kolkata', 'Delhi', 'Indore', 'Chandigarh', 'Bangalore',
       'Jaipur', 'Chennai', 'Cape Town', 'Durban', 'Port Elizabeth',
       'Centurion', 'East London', 'Johannesburg', 'Kimberley',
       'Bloemfontein', 'Ahmedabad', 'Dharamsala', 'Pune', 'Raipur',
       'Abu Dhabi', nan, 'Sharjah', 'Ranchi', 'Cuttack', 'Visakhapatnam',
       'Mohali', 'Hyderabad'], dtype=object)

In [49]:
import pickle
pickle.dump(pipe, open('pipe.pkl', 'wb'))

In [50]:
import pickle