In [None]:
import numpy as np
import pandas as pd


: 

Importing Data

In [None]:
# Read both CSV inputs from the working directory: `match` is joined later on
# its `id` column, `delivery` on its `match_id` column.
match, delivery = (
    pd.read_csv(csv_name) for csv_name in ('matches.csv', 'deliveries.csv')
)

A glance at the match dataset

In [None]:
# Preview the first rows of the match table.
match.head()

In [None]:
# Summary statistics for every column; include='all' also covers non-numeric columns.
match.describe(include='all')

In [None]:
# Print the (rows, columns) shape, then display the last rows of the match table.
print(match.shape)
match.tail()

A Glance at delivery dataset

In [None]:
# Preview the first rows of the delivery table.
delivery.head()

In [None]:
# Summary statistics of the delivery table using describe()'s default column selection.
delivery.describe()

In [None]:
# (rows, columns) of the delivery table.
delivery.shape

Calculating the total score at the completion of each innings

In [None]:
# Total runs per (match_id, inning).
# 'total_runs' is selected BEFORE aggregating so that only this one column is
# summed. Summing the whole frame first is wasteful and, on pandas >= 2.0
# (where non-numeric columns are no longer silently dropped), can fail on the
# text columns of the delivery table.
total_score_df = (
    delivery
    .groupby(['match_id', 'inning'])['total_runs']
    .sum()
    .reset_index()
)

total_score_df

In [None]:
# Keep only the rows whose `inning` equals 1 (first-innings totals).
total_score_df = total_score_df.loc[total_score_df['inning'].eq(1)]

In [None]:
# Display the first-innings totals kept by the filter above.
total_score_df

Merging the match dataframe with total_score_df to attach the runs scored in the first innings

In [None]:
# Attach each match's first-innings total: match.id is matched against
# total_score_df.match_id (default inner join).
match_df = pd.merge(
    match,
    total_score_df[['match_id', 'total_runs']],
    left_on='id',
    right_on='match_id',
)

In [None]:
# Preview the merged frame (match columns plus match_id and total_runs).
match_df.head()

Data Preprocessing


Checking which teams have played in the IPL and eliminating those that no longer play.

In [None]:
# Distinct team names appearing in the team1 column.
match_df['team1'].unique()

In [None]:
# Teams playing the present IPL season.
# Used further down to keep only matches whose team1 and team2 are both in this list.

teams = [
    'Sunrisers Hyderabad',
    'Mumbai Indians',
    'Royal Challengers Bengaluru',
    'Kolkata Knight Riders',
    'Punjab Kings',
    'Chennai Super Kings',
    'Rajasthan Royals',
    'Delhi Capitals',
    'Gujarat Titans',
    'Lucknow Super Giants'
]

In [None]:
# Replace previous franchise names with their present-day names.
#
# The mapping is applied to every team-name column that feeds the model, not
# just team1/team2: team1/team2 are only used to filter matches and are dropped
# afterwards, whereas the training features come from the delivery table's
# batting_team/bowling_team and the label from comparing batting_team with
# winner. All of them must be renamed together so the comparison stays
# consistent and old names do not survive as separate categories.
TEAM_RENAMES = {
    'Delhi Daredevils': 'Delhi Capitals',
    'Deccan Chargers': 'Sunrisers Hyderabad',
    'Royal Challengers Bangalore': 'Royal Challengers Bengaluru',
    'Kings XI Punjab': 'Punjab Kings',
}

for team_col in ('team1', 'team2', 'winner'):
    match_df[team_col] = match_df[team_col].replace(TEAM_RENAMES)

for team_col in ('batting_team', 'bowling_team'):
    delivery[team_col] = delivery[team_col].replace(TEAM_RENAMES)


In [None]:
# Keep only matches in which BOTH team1 and team2 appear in `teams`.

match_df = match_df[match_df['team1'].isin(teams) & match_df['team2'].isin(teams)]

In [None]:
# (rows, columns) remaining after filtering to the listed teams.
match_df.shape

In [None]:
# Narrow match_df to the four columns used downstream.
match_df = match_df.loc[:, ['match_id', 'city', 'winner', 'total_runs']]
match_df

In [None]:
# Join the trimmed match_df (required/relevant features) onto the delivery
# table, using match_id as the key. Both frames carry a `total_runs` column, so
# the merge suffixes them: total_runs_x comes from match_df and total_runs_y
# from delivery.

delivery_df = pd.merge(match_df, delivery, on='match_id')

In [None]:
# Keep only second-innings deliveries (the chase).
# .copy() makes delivery_df an independent frame: the cells that follow assign
# new columns to it, which on a filtered slice raises SettingWithCopyWarning.
delivery_df = delivery_df[delivery_df['inning'] == 2].copy()
delivery_df

Feature Engineering

Creating features such as:
1. current_score after every ball bowled.
2. runs_left, balls_left and wickets_left after every ball while chasing.
3. Current and required run rate.

In [None]:
# Running total of per-ball runs (total_runs_y) within each match, i.e. the
# cumulative score up to and including each ball.

delivery_df['current_score'] = (
    delivery_df['total_runs_y'].groupby(delivery_df['match_id']).cumsum()
)

In [None]:
# Runs still needed: (total_runs_x + 1) minus the score so far.
delivery_df['runs_left'] = delivery_df['total_runs_x'].add(1).sub(delivery_df['current_score'])

In [None]:
# Balls remaining out of 120, after subtracting over*6 + ball.
# NOTE(review): this presumes `over` is 0-based and `ball` counts 1..6 within
# the over -- confirm against deliveries.csv (if extras push `ball` past 6 the
# value can drop below zero near the end of an innings).
delivery_df['balls_left'] = 120 - delivery_df['over'].mul(6).add(delivery_df['ball'])

In [None]:
# Cumulative count of is_wicket within each match = wickets fallen so far.
# (Previously this was also written into 'wickets_left' via a chained
# assignment, briefly storing wickets *fallen* under that name before being
# overwritten; only the final value below is needed.)
wickets = delivery_df.groupby('match_id')['is_wicket'].cumsum()

delivery_df['wickets_left'] = 10 - wickets
delivery_df.head(50)

In [None]:
# Current run rate = runs per over so far.
# (120 - balls_left) is the number of balls used; multiplying the score by 6
# converts runs-per-ball into runs-per-over.

delivery_df['crr'] = delivery_df['current_score'].mul(6).div(120 - delivery_df['balls_left'])


In [None]:
# Required run rate = runs still needed per remaining over.
# Rows with balls_left == 0 divide by zero here; those rows are filtered out of
# final_df further down.

delivery_df['rrr'] = delivery_df['runs_left'].mul(6).div(delivery_df['balls_left'])


In [None]:
# Inspect the last 70 rows, including the engineered columns.
delivery_df.tail(70)

In [None]:
def result(row):
    """Return 1 when the row's batting_team equals its winner, otherwise 0.

    `row` is any mapping-like object indexable by 'batting_team' and 'winner'
    (for example a DataFrame row passed by `apply(axis=1)`).
    """
    batting_side_won = row['batting_team'] == row['winner']
    return int(bool(batting_side_won))

In [None]:
# Label each delivery: 1 if the batting (chasing) team is the match winner,
# else 0. Computed as a single vectorised column comparison, which yields the
# same 0/1 values as applying result() row by row but without the per-row
# Python call overhead.
delivery_df['result'] = delivery_df['batting_team'].eq(delivery_df['winner']).astype(int)
delivery_df.head()

Final dataset for model training

In [None]:
# Model table: nine feature columns followed by the `result` label, which must
# stay last because the train/test cell splits features from label by position.
final_df = delivery_df.loc[:, [
    'batting_team',
    'bowling_team',
    'city',
    'runs_left',
    'balls_left',
    'wickets_left',
    'total_runs_x',
    'crr',
    'rrr',
    'result',
]]

In [None]:
# Shuffling the data.
# frac=1 draws every row once (a full permutation); random_state pins the
# permutation so that a fresh Restart & Run All reproduces the same row order
# and therefore the same train/test split and model.

final_df = final_df.sample(frac=1, random_state=1)

In [None]:
# Display one randomly chosen row as a sanity check.
final_df.sample()

In [None]:
# Drop every row containing a missing value. Reassigning (rather than
# inplace=True) keeps the cell a plain "new frame from old frame" step with no
# hidden mutation of an object other names might still reference.
final_df = final_df.dropna()

In [None]:
# Per-column count of missing values in final_df.
final_df.isnull().sum()

In [None]:
# Keep only rows with at least one ball remaining.
# balls_left == 0 makes rrr a division by zero, and a negative balls_left
# (possible whenever over*6 + ball exceeds 120) would give a meaningless
# negative rrr, so both are excluded.
final_df = final_df[final_df['balls_left'] > 0]

Test-Train split for model training

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except the last; the last column is the label.
# NOTE(review): the split is row-wise, so deliveries from one match can land in
# both the train and the test set -- confirm this is acceptable for the
# reported accuracy.
X, y = final_df.iloc[:, :-1], final_df.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

In [None]:
# Display the training feature frame.
X_train

Applying one-hot encoding to the categorical variables

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the three categorical columns (dense output, first category
# of each dropped); every other column is passed through unchanged.
trf = ColumnTransformer(
    transformers=[
        (
            'trf',
            OneHotEncoder(sparse_output=False, drop='first'),
            ['batting_team', 'bowling_team', 'city'],
        ),
    ],
    remainder='passthrough',
)

Model: Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

In [None]:
# Two-stage pipeline: column transformer first, then a liblinear logistic
# regression on the transformed features.
pipe = Pipeline([
    ('step1', trf),
    ('step2', LogisticRegression(solver='liblinear')),
])

In [None]:
# Fit the encoder and the classifier on the training split.
pipe.fit(X_train,y_train)

In [None]:
# Predicted class labels for the held-out test split.
y_pred = pipe.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of test rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Class probabilities at a particular ball: predict_proba returns one row per
# test sample, and [1] selects the row at position 1 (the second test sample).
# Columns follow the order of the fitted classifier's classes_.
pipe.predict_proba(X_test)[1]

In [None]:
def match_summary(row):
    """Print a one-line summary of a chase: batting team, bowling team and target.

    `row` is any mapping-like object indexable by 'batting_team',
    'bowling_team' (both strings) and 'total_runs_x' (converted with str()).
    """
    summary_parts = (
        "Batting Team-" + row['batting_team'],
        "Bowling Team-" + row['bowling_team'],
        "Target- " + str(row['total_runs_x']),
    )
    print(" | ".join(summary_parts))
    

In [None]:
import pickle

# Persist the fitted pipeline. The context manager guarantees the file handle
# is flushed and closed even if pickling fails; a bare open() passed straight
# to pickle.dump is never explicitly closed.
with open('pipe3.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)
