## Modelling the Data

In [1]:
import pandas as pd
import numpy as np
import glob

pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)

### - Load the data

In [2]:
df = pd.read_csv('all_matches.csv')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [3]:
df.head(1)

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,bowler,runs_off_bat,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed
0,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,0.1,Kolkata Knight Riders,Royal Challengers Bangalore,SC Ganguly,BB McCullum,P Kumar,0,1,,,,1.0,,,,,


## Feature Engineering 

### - Subsetting the playing teams

In [4]:
df['batting_team'].unique()

array(['Kolkata Knight Riders', 'Royal Challengers Bangalore',
       'Chennai Super Kings', 'Kings XI Punjab', 'Rajasthan Royals',
       'Delhi Daredevils', 'Deccan Chargers', 'Mumbai Indians',
       'Kochi Tuskers Kerala', 'Pune Warriors', 'Sunrisers Hyderabad',
       'Rising Pune Supergiants', 'Gujarat Lions',
       'Rising Pune Supergiant', 'Delhi Capitals', 'Punjab Kings'],
      dtype=object)

In [5]:
teams_playing = ['Royal Challengers Bangalore', 'Chennai Super Kings','Kolkata Knight Riders', 'Mumbai Indians','Sunrisers Hyderabad', 'Kings XI Punjab','Rajasthan Royals','Delhi Capitals','Punjab Kings']

In [6]:
df = df[df['batting_team'].isin(teams_playing)]

In [7]:
df = df[df['bowling_team'].isin(teams_playing)]

In [8]:
df.head(1)

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,bowler,runs_off_bat,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed
0,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,0.1,Kolkata Knight Riders,Royal Challengers Bangalore,SC Ganguly,BB McCullum,P Kumar,0,1,,,,1.0,,,,,


In [9]:
# Sort the data according to match_id and innings

df = df.sort_values(['match_id','innings'], ascending=[True,True])

In [10]:
df.head(1)

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,bowler,runs_off_bat,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed
0,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,0.1,Kolkata Knight Riders,Royal Challengers Bangalore,SC Ganguly,BB McCullum,P Kumar,0,1,,,,1.0,,,,,


### - Seperating over from ball column

In [11]:
# Since there is no over number and ball number column in the dataframe, we need to get the data from ball column

df['overs'] = df['ball']

df.head(1)

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,bowler,runs_off_bat,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed,overs
0,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,0.1,Kolkata Knight Riders,Royal Challengers Bangalore,SC Ganguly,BB McCullum,P Kumar,0,1,,,,1.0,,,,,,0.1


In [12]:
df['over'] = df['ball'].astype(str)

In [13]:
df['over'] = df['over'].str[0]

df.head(1)

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,bowler,runs_off_bat,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed,overs,over
0,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,0.1,Kolkata Knight Riders,Royal Challengers Bangalore,SC Ganguly,BB McCullum,P Kumar,0,1,,,,1.0,,,,,,0.1,0


In [14]:
df['ball'] = df['ball'].astype(str)

In [15]:
df['ball'] = df['ball'].str[2]

df.head(1)

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,bowler,runs_off_bat,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed,overs,over
0,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,1,Kolkata Knight Riders,Royal Challengers Bangalore,SC Ganguly,BB McCullum,P Kumar,0,1,,,,1.0,,,,,,0.1,0


### - Calculating the total runs

In [16]:
# Total runs is batsman runs plus extras by the bowler

df['total_runs'] = df['runs_off_bat'] + df['extras']

In [17]:
df.head(1)

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,bowler,runs_off_bat,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed,overs,over,total_runs
0,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,1,Kolkata Knight Riders,Royal Challengers Bangalore,SC Ganguly,BB McCullum,P Kumar,0,1,,,,1.0,,,,,,0.1,0,1


In [18]:
# Get the required column

df = df[['match_id','season','innings','batting_team','bowling_team','overs','over','ball','total_runs','player_dismissed']]

In [19]:
df.head(10)

Unnamed: 0,match_id,season,innings,batting_team,bowling_team,overs,over,ball,total_runs,player_dismissed
0,335982,2007/08,1,Kolkata Knight Riders,Royal Challengers Bangalore,0.1,0,1,1,
1,335982,2007/08,1,Kolkata Knight Riders,Royal Challengers Bangalore,0.2,0,2,0,
2,335982,2007/08,1,Kolkata Knight Riders,Royal Challengers Bangalore,0.3,0,3,1,
3,335982,2007/08,1,Kolkata Knight Riders,Royal Challengers Bangalore,0.4,0,4,0,
4,335982,2007/08,1,Kolkata Knight Riders,Royal Challengers Bangalore,0.5,0,5,0,
5,335982,2007/08,1,Kolkata Knight Riders,Royal Challengers Bangalore,0.6,0,6,0,
6,335982,2007/08,1,Kolkata Knight Riders,Royal Challengers Bangalore,0.7,0,7,1,
7,335982,2007/08,1,Kolkata Knight Riders,Royal Challengers Bangalore,1.1,1,1,0,
8,335982,2007/08,1,Kolkata Knight Riders,Royal Challengers Bangalore,1.2,1,2,4,
9,335982,2007/08,1,Kolkata Knight Riders,Royal Challengers Bangalore,1.3,1,3,4,


In [20]:
# Replacing the Nan with 0 to get the wickets data

df = df.replace(np.nan,0)

In [21]:
df.head(10)

Unnamed: 0,match_id,season,innings,batting_team,bowling_team,overs,over,ball,total_runs,player_dismissed
0,335982,2007/08,1,Kolkata Knight Riders,Royal Challengers Bangalore,0.1,0,1,1,0
1,335982,2007/08,1,Kolkata Knight Riders,Royal Challengers Bangalore,0.2,0,2,0,0
2,335982,2007/08,1,Kolkata Knight Riders,Royal Challengers Bangalore,0.3,0,3,1,0
3,335982,2007/08,1,Kolkata Knight Riders,Royal Challengers Bangalore,0.4,0,4,0,0
4,335982,2007/08,1,Kolkata Knight Riders,Royal Challengers Bangalore,0.5,0,5,0,0
5,335982,2007/08,1,Kolkata Knight Riders,Royal Challengers Bangalore,0.6,0,6,0,0
6,335982,2007/08,1,Kolkata Knight Riders,Royal Challengers Bangalore,0.7,0,7,1,0
7,335982,2007/08,1,Kolkata Knight Riders,Royal Challengers Bangalore,1.1,1,1,0,0
8,335982,2007/08,1,Kolkata Knight Riders,Royal Challengers Bangalore,1.2,1,2,4,0
9,335982,2007/08,1,Kolkata Knight Riders,Royal Challengers Bangalore,1.3,1,3,4,0


### Total runs scored per innings :

In [22]:
df['total_team_score'] = df.groupby(['innings','match_id'])['total_runs'].transform('sum')

In [23]:
df.head(1)

Unnamed: 0,match_id,season,innings,batting_team,bowling_team,overs,over,ball,total_runs,player_dismissed,total_team_score
0,335982,2007/08,1,Kolkata Knight Riders,Royal Challengers Bangalore,0.1,0,1,1,0,222


### Runs scored till the curent ball :

In [24]:
df['total_score'] = df.groupby(['innings','match_id'])['total_runs'].apply(lambda x : x.cumsum())

In [25]:
df.head(1)

Unnamed: 0,match_id,season,innings,batting_team,bowling_team,overs,over,ball,total_runs,player_dismissed,total_team_score,total_score
0,335982,2007/08,1,Kolkata Knight Riders,Royal Challengers Bangalore,0.1,0,1,1,0,222,1


### No.of wickets till the current ball :

In [26]:
df['player_dismissed']=np.where(df['player_dismissed']==0, 0, 1)

In [27]:
df['total_wickets'] = df.groupby(['innings','match_id'])['player_dismissed'].apply(lambda x : x.cumsum())

In [28]:
df.head(1)

Unnamed: 0,match_id,season,innings,batting_team,bowling_team,overs,over,ball,total_runs,player_dismissed,total_team_score,total_score,total_wickets
0,335982,2007/08,1,Kolkata Knight Riders,Royal Challengers Bangalore,0.1,0,1,1,0,222,1,0


### Runs scored in previous 5 overs / 30 balls :

In [29]:
tmp= df.groupby(['innings','match_id'])['total_runs'].rolling(min_periods=1,window=30).sum().reset_index()

df['prev_30_runs'] = tmp['total_runs'].to_list()

In [30]:
df.head(1)

Unnamed: 0,match_id,season,innings,batting_team,bowling_team,overs,over,ball,total_runs,player_dismissed,total_team_score,total_score,total_wickets,prev_30_runs
0,335982,2007/08,1,Kolkata Knight Riders,Royal Challengers Bangalore,0.1,0,1,1,0,222,1,0,1.0


### Wickets fallen in previous 5 overs / 30 balls :

In [31]:
tmp= df.groupby(['innings','match_id'])['player_dismissed'].rolling(min_periods=1,window=30).sum().reset_index()

df['prev_30_wickets'] = tmp['player_dismissed'].to_list()

In [32]:
df.head(1)

Unnamed: 0,match_id,season,innings,batting_team,bowling_team,overs,over,ball,total_runs,player_dismissed,total_team_score,total_score,total_wickets,prev_30_runs,prev_30_wickets
0,335982,2007/08,1,Kolkata Knight Riders,Royal Challengers Bangalore,0.1,0,1,1,0,222,1,0,1.0,0.0


### Dot balls in previous 5 overs / 30 balls :

In [33]:
df['prev_30_dot_balls']=df['total_runs']
df['prev_30_dot_balls']=np.where(df['prev_30_dot_balls']==0, 1, 0)

tmp=df.groupby(['match_id','innings'])['prev_30_dot_balls'].rolling(min_periods=1, window=30).sum().reset_index()
df['prev_30_dot_balls']=tmp['prev_30_dot_balls'].to_list()

In [34]:
df.head(1)

Unnamed: 0,match_id,season,innings,batting_team,bowling_team,overs,over,ball,total_runs,player_dismissed,total_team_score,total_score,total_wickets,prev_30_runs,prev_30_wickets,prev_30_dot_balls
0,335982,2007/08,1,Kolkata Knight Riders,Royal Challengers Bangalore,0.1,0,1,1,0,222,1,0,1.0,0.0,0.0


### Boundaries in previous 5 overs/ 30 balls :

In [35]:
df['prev_30_boundaries'] = df['total_runs']

In [36]:
df['prev_30_boundaries'] = np.where(df['total_runs'] > 3,1,0)

In [37]:
tmp = df.groupby(['match_id','innings'])['prev_30_boundaries'].rolling(min_periods=1,window=30).sum().reset_index()

In [38]:
df['prev_30_boundaries'] = tmp['prev_30_boundaries'].to_list()

In [39]:
df.head(1)

Unnamed: 0,match_id,season,innings,batting_team,bowling_team,overs,over,ball,total_runs,player_dismissed,total_team_score,total_score,total_wickets,prev_30_runs,prev_30_wickets,prev_30_dot_balls,prev_30_boundaries
0,335982,2007/08,1,Kolkata Knight Riders,Royal Challengers Bangalore,0.1,0,1,1,0,222,1,0,1.0,0.0,0.0,0.0


In [40]:
convert_column = {'prev_30_runs':int,'prev_30_wickets':int,'prev_30_dot_balls':int,'prev_30_boundaries':int}

In [41]:
df = df.astype(convert_column)

In [42]:
df.head(1)

Unnamed: 0,match_id,season,innings,batting_team,bowling_team,overs,over,ball,total_runs,player_dismissed,total_team_score,total_score,total_wickets,prev_30_runs,prev_30_wickets,prev_30_dot_balls,prev_30_boundaries
0,335982,2007/08,1,Kolkata Knight Riders,Royal Challengers Bangalore,0.1,0,1,1,0,222,1,0,1,0,0,0


# Score Prediction using Machine Learing 

### - Import the libraries

In [43]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

In [44]:
df.head()

Unnamed: 0,match_id,season,innings,batting_team,bowling_team,overs,over,ball,total_runs,player_dismissed,total_team_score,total_score,total_wickets,prev_30_runs,prev_30_wickets,prev_30_dot_balls,prev_30_boundaries
0,335982,2007/08,1,Kolkata Knight Riders,Royal Challengers Bangalore,0.1,0,1,1,0,222,1,0,1,0,0,0
1,335982,2007/08,1,Kolkata Knight Riders,Royal Challengers Bangalore,0.2,0,2,0,0,222,1,0,1,0,1,0
2,335982,2007/08,1,Kolkata Knight Riders,Royal Challengers Bangalore,0.3,0,3,1,0,222,2,0,2,0,1,0
3,335982,2007/08,1,Kolkata Knight Riders,Royal Challengers Bangalore,0.4,0,4,0,0,222,2,0,2,0,2,0
4,335982,2007/08,1,Kolkata Knight Riders,Royal Challengers Bangalore,0.5,0,5,0,0,222,2,0,2,0,3,0


### - Encoding :

In [45]:
# Encoding the batting and bowling team values to 0 and 1 

df = pd.get_dummies(data=df,columns={'batting_team','bowling_team'})

In [46]:
df.head(1)

Unnamed: 0,match_id,season,innings,overs,over,ball,total_runs,player_dismissed,total_team_score,total_score,total_wickets,prev_30_runs,prev_30_wickets,prev_30_dot_balls,prev_30_boundaries,batting_team_Chennai Super Kings,batting_team_Delhi Capitals,batting_team_Kings XI Punjab,batting_team_Kolkata Knight Riders,batting_team_Mumbai Indians,batting_team_Punjab Kings,batting_team_Rajasthan Royals,batting_team_Royal Challengers Bangalore,batting_team_Sunrisers Hyderabad,bowling_team_Chennai Super Kings,bowling_team_Delhi Capitals,bowling_team_Kings XI Punjab,bowling_team_Kolkata Knight Riders,bowling_team_Mumbai Indians,bowling_team_Punjab Kings,bowling_team_Rajasthan Royals,bowling_team_Royal Challengers Bangalore,bowling_team_Sunrisers Hyderabad
0,335982,2007/08,1,0.1,0,1,1,0,222,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [47]:
df = df[['match_id', 'batting_team_Chennai Super Kings', 'batting_team_Delhi Capitals', 'batting_team_Kings XI Punjab','batting_team_Kolkata Knight Riders', 'batting_team_Mumbai Indians','batting_team_Punjab Kings', 'batting_team_Rajasthan Royals','batting_team_Royal Challengers Bangalore','batting_team_Sunrisers Hyderabad', 'bowling_team_Chennai Super Kings','bowling_team_Delhi Capitals', 'bowling_team_Kings XI Punjab','bowling_team_Kolkata Knight Riders', 'bowling_team_Mumbai Indians','bowling_team_Punjab Kings', 'bowling_team_Rajasthan Royals','bowling_team_Royal Challengers Bangalore','bowling_team_Sunrisers Hyderabad','overs', 'total_score', 'total_wickets','prev_30_runs', 'prev_30_wickets', 'prev_30_dot_balls', 'prev_30_boundaries','total_team_score']]

In [48]:
df.head(1)

Unnamed: 0,match_id,batting_team_Chennai Super Kings,batting_team_Delhi Capitals,batting_team_Kings XI Punjab,batting_team_Kolkata Knight Riders,batting_team_Mumbai Indians,batting_team_Punjab Kings,batting_team_Rajasthan Royals,batting_team_Royal Challengers Bangalore,batting_team_Sunrisers Hyderabad,bowling_team_Chennai Super Kings,bowling_team_Delhi Capitals,bowling_team_Kings XI Punjab,bowling_team_Kolkata Knight Riders,bowling_team_Mumbai Indians,bowling_team_Punjab Kings,bowling_team_Rajasthan Royals,bowling_team_Royal Challengers Bangalore,bowling_team_Sunrisers Hyderabad,overs,total_score,total_wickets,prev_30_runs,prev_30_wickets,prev_30_dot_balls,prev_30_boundaries,total_team_score
0,335982,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0.1,1,0,1,0,0,0,222


### - Train_Test_Split :

In [49]:
X = df.drop(labels=['total_team_score','match_id'], axis=1)
y = df['total_team_score'].values

In [50]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,random_state=42,stratify=y)

In [51]:
X_train=X_train.values
X_test=X_test.values
X_train=np.asarray(X_train).astype(np.float32)
X_test=np.asarray(X_test).astype(np.float32)

In [52]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(94123, 25) (31375, 25) (94123,) (31375,)


### - Linear Regression Model 

In [53]:
LR_model = LinearRegression()
LR_model.fit(X_train,y_train)

LinearRegression()

In [54]:
prediction=LR_model.predict(X_test)
mean_absolute_error (y_test, prediction)

17.32765613984492

#### Mean absolute error value is 17. The final predicted score will vary -17 from 17 from the actual total score

### - Function to predict scores :

In [55]:
def score_prediction(Bat_Team,Bowl_Team,overs,total_score,total_wickets,prev_runs_30,prev_wickets_30,prev_30_dot_balls,prev_30_boundaries):

    temp_array = list()

    if Bat_Team == 'CSK':
        temp_array = temp_array + [1,0,0,0,0,0,0,0,0]
    elif Bat_Team == 'DC':
        temp_array = temp_array + [0,1,0,0,0,0,0,0,0]
    elif Bat_Team == 'KXIP':
        temp_array = temp_array + [0,0,1,0,0,0,0,0,0]
    elif Bat_Team == 'KKR':
        temp_array = temp_array + [0,0,0,1,0,0,0,0,0]
    elif Bat_Team == 'MI':
        temp_array = temp_array + [0,0,0,0,1,0,0,0,0]
    elif Bat_Team == 'KXIP':
        temp_array = temp_array + [0,0,0,0,0,1,0,0,0]
    elif Bat_Team == 'RR':
        temp_array = temp_array + [0,0,0,0,0,0,1,0,0]
    elif Bat_Team == 'RCB':
        temp_array = temp_array + [0,0,0,0,0,0,0,1,0]
    elif Bat_Team == 'SRH':
        temp_array = temp_array + [0,0,0,0,0,0,0,0,1]

    if Bowl_Team == 'CSK':
        temp_array = temp_array + [1,0,0,0,0,0,0,0,0]
    elif Bowl_Team == 'DC':
        temp_array = temp_array + [0,1,0,0,0,0,0,0,0]
    elif Bowl_Team == 'KXIP':
        temp_array = temp_array + [0,0,1,0,0,0,0,0,0]
    elif Bowl_Team == 'KKR':
        temp_array = temp_array + [0,0,0,1,0,0,0,0,0]
    elif Bowl_Team == 'MI':
        temp_array = temp_array + [0,0,0,0,1,0,0,0,0]
    elif Bowl_Team == 'KXIP':
        temp_array = temp_array + [0,0,0,0,0,1,0,0,0]
    elif Bowl_Team == 'RR':
        temp_array = temp_array + [0,0,0,0,0,0,1,0,0]
    elif Bowl_Team == 'RCB':
        temp_array = temp_array + [0,0,0,0,0,0,0,1,0]
    elif Bowl_Team == 'SRH':
        temp_array = temp_array + [0,0,0,0,0,0,0,0,1]

    temp_array = temp_array + [overs,total_score,total_wickets,prev_runs_30,prev_wickets_30,prev_30_dot_balls,prev_30_boundaries]
    data = np.array([temp_array])
    my_prediction = int(LR_model.predict(data))

    print('Predicted score: ',my_prediction)
    print('Predicted score range: ',my_prediction - 17, 'to' , my_prediction + 17)

#### TESTING THIS MODEL WITH THE SCORECARD OF MATCH BETWEEN RR AND SRH ON MAY 02,2021

In [56]:
# After powerplay

Bat_Team = 'RR' 

Bowl_Team = 'SRH' 

overs = 7

total_score = 60 # current score

total_wickets = 1 # current wicket 

prev_runs_30 = 49 # runs in last 30 balls

prev_wickets_30 = 1 # wickets in last 30 balls

prev_30_dot_balls = 10 # dots in last 30 balls

prev_30_boundaries = 9 # boundaries in last 30 balls

score_prediction(Bat_Team,Bowl_Team,overs,total_score,total_wickets,
                         prev_runs_30,prev_wickets_30,prev_30_dot_balls,prev_30_boundaries)

Predicted score:  165
Predicted score range:  148 to 182


In [57]:
# After 10 overs

Bat_Team = 'RR' 

Bowl_Team = 'SRH' 

overs = 12.3

total_score = 103 # current score

total_wickets = 1 # current wicket 

prev_runs_30 = 41 # runs in last 30 balls

prev_wickets_30 = 0 # wickets in last 30 balls

prev_30_dot_balls = 7 # dots in last 30 balls

prev_30_boundaries = 4 # boundaries in last 30 balls

score_prediction(Bat_Team,Bowl_Team,overs,total_score,total_wickets,
                         prev_runs_30,prev_wickets_30,prev_30_dot_balls,prev_30_boundaries)

Predicted score:  169
Predicted score range:  152 to 186


In [58]:
# After 15 overs

Bat_Team = 'RR' 

Bowl_Team = 'SRH' 

overs = 16

total_score = 159 # current score

total_wickets = 1 # current wicket 

prev_runs_30 = 82 # runs in last 30 balls

prev_wickets_30 = 0 # wickets in last 30 balls

prev_30_dot_balls = 5 # dots in last 30 balls

prev_30_boundaries = 11 # boundaries in last 30 balls

score_prediction(Bat_Team,Bowl_Team,overs,total_score,total_wickets,
                         prev_runs_30,prev_wickets_30,prev_30_dot_balls,prev_30_boundaries)

Predicted score:  202
Predicted score range:  185 to 219


### Total runs scored by RR in this match after 20 overs is 220-3