# Imports

In [None]:
import numpy as np
import pandas as pd
import plotly
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn
import tensorflow as tf
import sklearn
from tensorflow.keras.activations import relu, softmax
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense, Dropout, LSTM
from tensorflow import one_hot
from tensorflow.keras.utils import normalize
from tensorflow.keras.losses import BinaryCrossentropy, categorical_crossentropy, binary_crossentropy, CategoricalCrossentropy
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
from sklearn.preprocessing import LabelEncoder
import random
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC, SVR
from sklearn.metrics import r2_score
import math

# Data Preprocessing

In [None]:
data_deliveries = pd.read_csv("/content/drive/MyDrive/data/IPL Complete Dataset (2008-2019)/deliveries.csv")
data_deliveries.head(10)

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batsman,non_striker,bowler,is_super_over,wide_runs,bye_runs,legbye_runs,noball_runs,penalty_runs,batsman_runs,extra_runs,total_runs,player_dismissed,dismissal_kind,fielder
0,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,1,DA Warner,S Dhawan,TS Mills,0,0,0,0,0,0,0,0,0,,,
1,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,2,DA Warner,S Dhawan,TS Mills,0,0,0,0,0,0,0,0,0,,,
2,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,3,DA Warner,S Dhawan,TS Mills,0,0,0,0,0,0,4,0,4,,,
3,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,4,DA Warner,S Dhawan,TS Mills,0,0,0,0,0,0,0,0,0,,,
4,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,5,DA Warner,S Dhawan,TS Mills,0,2,0,0,0,0,0,2,2,,,
5,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,6,S Dhawan,DA Warner,TS Mills,0,0,0,0,0,0,0,0,0,,,
6,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,7,S Dhawan,DA Warner,TS Mills,0,0,0,1,0,0,0,1,1,,,
7,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,2,1,S Dhawan,DA Warner,A Choudhary,0,0,0,0,0,0,1,0,1,,,
8,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,2,2,DA Warner,S Dhawan,A Choudhary,0,0,0,0,0,0,4,0,4,,,
9,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,2,3,DA Warner,S Dhawan,A Choudhary,0,0,0,0,1,0,0,1,1,,,


In [None]:
data_deliveries.isnull().sum()

match_id                 0
inning                   0
batting_team             0
bowling_team             0
over                     0
ball                     0
batsman                  0
non_striker              0
bowler                   0
is_super_over            0
wide_runs                0
bye_runs                 0
legbye_runs              0
noball_runs              0
penalty_runs             0
batsman_runs             0
extra_runs               0
total_runs               0
player_dismissed    170244
dismissal_kind      170244
fielder             172630
dtype: int64

In [None]:
data_matches = pd.read_csv("/content/drive/MyDrive/data/IPL Complete Dataset (2008-2019)/matches.csv")
data_matches.head()

Unnamed: 0,id,season,city,date,team1,team2,toss_winner,toss_decision,result,dl_applied,winner,win_by_runs,win_by_wickets,player_of_match,venue,umpire1,umpire2,umpire3
0,1,2017,Hyderabad,2017-04-05,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,normal,0,Sunrisers Hyderabad,35,0,Yuvraj Singh,"Rajiv Gandhi International Stadium, Uppal",AY Dandekar,NJ Llong,
1,2,2017,Pune,2017-04-06,Mumbai Indians,Rising Pune Supergiant,Rising Pune Supergiant,field,normal,0,Rising Pune Supergiant,0,7,SPD Smith,Maharashtra Cricket Association Stadium,A Nand Kishore,S Ravi,
2,3,2017,Rajkot,2017-04-07,Gujarat Lions,Kolkata Knight Riders,Kolkata Knight Riders,field,normal,0,Kolkata Knight Riders,0,10,CA Lynn,Saurashtra Cricket Association Stadium,Nitin Menon,CK Nandan,
3,4,2017,Indore,2017-04-08,Rising Pune Supergiant,Kings XI Punjab,Kings XI Punjab,field,normal,0,Kings XI Punjab,0,6,GJ Maxwell,Holkar Cricket Stadium,AK Chaudhary,C Shamshuddin,
4,5,2017,Bangalore,2017-04-08,Royal Challengers Bangalore,Delhi Daredevils,Royal Challengers Bangalore,bat,normal,0,Royal Challengers Bangalore,15,0,KM Jadhav,M Chinnaswamy Stadium,,,


In [None]:
data_matches.isnull().sum()

id                   0
season               0
city                 7
date                 0
team1                0
team2                0
toss_winner          0
toss_decision        0
result               0
dl_applied           0
winner               4
win_by_runs          0
win_by_wickets       0
player_of_match      4
venue                0
umpire1              2
umpire2              2
umpire3            637
dtype: int64

In [None]:
data_matches.drop(columns=["umpire3"], inplace = True)

In [None]:
data_matches.head()

Unnamed: 0,id,season,city,date,team1,team2,toss_winner,toss_decision,result,dl_applied,winner,win_by_runs,win_by_wickets,player_of_match,venue,umpire1,umpire2
0,1,2017,Hyderabad,2017-04-05,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,normal,0,Sunrisers Hyderabad,35,0,Yuvraj Singh,"Rajiv Gandhi International Stadium, Uppal",AY Dandekar,NJ Llong
1,2,2017,Pune,2017-04-06,Mumbai Indians,Rising Pune Supergiant,Rising Pune Supergiant,field,normal,0,Rising Pune Supergiant,0,7,SPD Smith,Maharashtra Cricket Association Stadium,A Nand Kishore,S Ravi
2,3,2017,Rajkot,2017-04-07,Gujarat Lions,Kolkata Knight Riders,Kolkata Knight Riders,field,normal,0,Kolkata Knight Riders,0,10,CA Lynn,Saurashtra Cricket Association Stadium,Nitin Menon,CK Nandan
3,4,2017,Indore,2017-04-08,Rising Pune Supergiant,Kings XI Punjab,Kings XI Punjab,field,normal,0,Kings XI Punjab,0,6,GJ Maxwell,Holkar Cricket Stadium,AK Chaudhary,C Shamshuddin
4,5,2017,Bangalore,2017-04-08,Royal Challengers Bangalore,Delhi Daredevils,Royal Challengers Bangalore,bat,normal,0,Royal Challengers Bangalore,15,0,KM Jadhav,M Chinnaswamy Stadium,,


In [None]:
data_matches["city"].fillna("Dubai", inplace = True)

In [None]:
data_matches[data_matches["winner"].isnull()]

Unnamed: 0,id,season,city,date,team1,team2,toss_winner,toss_decision,result,dl_applied,winner,win_by_runs,win_by_wickets,player_of_match,venue,umpire1,umpire2
300,301,2011,Delhi,2011-05-21,Delhi Daredevils,Pune Warriors,Delhi Daredevils,bat,no result,0,,0,0,,Feroz Shah Kotla,SS Hazare,RJ Tucker
545,546,2015,Bangalore,2015-04-29,Royal Challengers Bangalore,Rajasthan Royals,Rajasthan Royals,field,no result,0,,0,0,,M Chinnaswamy Stadium,JD Cloete,PG Pathak
570,571,2015,Bangalore,2015-05-17,Delhi Daredevils,Royal Challengers Bangalore,Royal Challengers Bangalore,field,no result,0,,0,0,,M Chinnaswamy Stadium,HDPK Dharmasena,K Srinivasan
744,11340,2019,Bengaluru,30/04/19,Royal Challengers Bangalore,Rajasthan Royals,Rajasthan Royals,field,no result,0,,0,0,,M. Chinnaswamy Stadium,Nigel Llong,Ulhas Gandhe


In [None]:
data_matches["team1"].unique()

array(['Sunrisers Hyderabad', 'Mumbai Indians', 'Gujarat Lions',
       'Rising Pune Supergiant', 'Royal Challengers Bangalore',
       'Kolkata Knight Riders', 'Delhi Daredevils', 'Kings XI Punjab',
       'Chennai Super Kings', 'Rajasthan Royals', 'Deccan Chargers',
       'Kochi Tuskers Kerala', 'Pune Warriors', 'Rising Pune Supergiants',
       'Delhi Capitals'], dtype=object)

In [None]:
data_matches.replace(to_replace = "Deccan Chargers", value = "Sunrisers Hyderabad", inplace = True)

In [None]:
data_deliveries.replace(to_replace = "Deccan Chargers", value = "Sunrisers Hyderabad", inplace = True)

In [None]:
data_matches.replace(to_replace = "Delhi Daredevils", value = "Delhi Capitals", inplace = True)

In [None]:
data_deliveries.replace(to_replace = "Delhi Daredevils", value = "Delhi Capitals", inplace = True)

In [None]:
data_matches.replace(to_replace = "Rising Pune Supergiant", value = "Rising Pune Supergiants", inplace = True)

In [None]:
data_deliveries.replace(to_replace = "Rising Pune Supergiant", value = "Rising Pune Supergiants", inplace = True)

In [None]:
data_matches.replace(to_replace = "Pune Warriors", value = "Rising Pune Supergiants", inplace = True)

In [None]:
data_deliveries.replace(to_replace = "Pune Warriors", value = "Rising Pune Supergiants", inplace = True)

In [None]:
data_matches.drop(index = data_matches[data_matches["winner"].isna()].index, axis = 0, inplace=True)

In [None]:
data_deliveries.drop(index = data_matches[data_matches["winner"].isna()].index, axis = 0, inplace=True)

In [None]:
data_deliveries.drop(columns = ["dismissal_kind", "fielder"], inplace = True)

In [None]:
encoder_teams = LabelEncoder()
teams = data_matches["team1"].unique()
encoder_teams.fit(teams)
data_matches["team1"] = encoder_teams.fit_transform(data_matches["team1"])
data_matches["team2"] = encoder_teams.fit_transform(data_matches["team2"])
data_matches["toss_winner"] = encoder_teams.fit_transform(data_matches["toss_winner"])
data_matches["winner"] = encoder_teams.fit_transform(data_matches["winner"])
data_matches.head()

Unnamed: 0,id,season,city,date,team1,team2,toss_winner,toss_decision,result,dl_applied,winner,win_by_runs,win_by_wickets,player_of_match,venue,umpire1,umpire2
0,1,2017,Hyderabad,2017-04-05,10,9,9,field,normal,0,10,35,0,Yuvraj Singh,"Rajiv Gandhi International Stadium, Uppal",AY Dandekar,NJ Llong
1,2,2017,Pune,2017-04-06,6,8,8,field,normal,0,8,0,7,SPD Smith,Maharashtra Cricket Association Stadium,A Nand Kishore,S Ravi
2,3,2017,Rajkot,2017-04-07,2,5,5,field,normal,0,5,0,10,CA Lynn,Saurashtra Cricket Association Stadium,Nitin Menon,CK Nandan
3,4,2017,Indore,2017-04-08,8,3,3,field,normal,0,3,0,6,GJ Maxwell,Holkar Cricket Stadium,AK Chaudhary,C Shamshuddin
4,5,2017,Bangalore,2017-04-08,9,1,9,bat,normal,0,9,15,0,KM Jadhav,M Chinnaswamy Stadium,,


In [None]:
data_deliveries["batting_team"] = encoder_teams.fit_transform(data_deliveries["batting_team"])
data_deliveries["bowling_team"] = encoder_teams.fit_transform(data_deliveries["bowling_team"])
data_deliveries

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batsman,non_striker,bowler,is_super_over,wide_runs,bye_runs,legbye_runs,noball_runs,penalty_runs,batsman_runs,extra_runs,total_runs,player_dismissed
0,1,1,10,9,1,1,DA Warner,S Dhawan,TS Mills,0,0,0,0,0,0,0,0,0,
1,1,1,10,9,1,2,DA Warner,S Dhawan,TS Mills,0,0,0,0,0,0,0,0,0,
2,1,1,10,9,1,3,DA Warner,S Dhawan,TS Mills,0,0,0,0,0,0,4,0,4,
3,1,1,10,9,1,4,DA Warner,S Dhawan,TS Mills,0,0,0,0,0,0,0,0,0,
4,1,1,10,9,1,5,DA Warner,S Dhawan,TS Mills,0,2,0,0,0,0,0,2,2,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
179073,11415,2,0,6,20,2,RA Jadeja,SR Watson,SL Malinga,0,0,0,0,0,0,1,0,1,
179074,11415,2,0,6,20,3,SR Watson,RA Jadeja,SL Malinga,0,0,0,0,0,0,2,0,2,
179075,11415,2,0,6,20,4,SR Watson,RA Jadeja,SL Malinga,0,0,0,0,0,0,1,0,1,SR Watson
179076,11415,2,0,6,20,5,SN Thakur,RA Jadeja,SL Malinga,0,0,0,0,0,0,2,0,2,


In [None]:
encoder_players = LabelEncoder()
batsmen = data_deliveries["batsman"]
non_strikers = data_deliveries["non_striker"]
bowlers = data_deliveries["bowler"]
players = batsmen + non_strikers + bowlers
players = players.unique()
encoder_players.fit(players)
data_matches["player_of_match"] = encoder_players.fit_transform(data_matches["player_of_match"])
data_deliveries["batsman"] = encoder_players.fit_transform(data_deliveries["batsman"])
data_deliveries["non_striker"] = encoder_players.fit_transform(data_deliveries["non_striker"])
data_deliveries["bowler"] = encoder_players.fit_transform(data_deliveries["bowler"])

In [None]:
data_matches.head()

Unnamed: 0,id,season,city,date,team1,team2,toss_winner,toss_decision,result,dl_applied,winner,win_by_runs,win_by_wickets,player_of_match,venue,umpire1,umpire2
0,1,2017,Hyderabad,2017-04-05,10,9,9,field,normal,0,10,35,0,224,"Rajiv Gandhi International Stadium, Uppal",AY Dandekar,NJ Llong
1,2,2017,Pune,2017-04-06,6,8,8,field,normal,0,8,0,7,199,Maharashtra Cricket Association Stadium,A Nand Kishore,S Ravi
2,3,2017,Rajkot,2017-04-07,2,5,5,field,normal,0,5,0,10,34,Saurashtra Cricket Association Stadium,Nitin Menon,CK Nandan
3,4,2017,Indore,2017-04-08,8,3,3,field,normal,0,3,0,6,61,Holkar Cricket Stadium,AK Chaudhary,C Shamshuddin
4,5,2017,Bangalore,2017-04-08,9,1,9,bat,normal,0,9,15,0,99,M Chinnaswamy Stadium,,


In [None]:
data_deliveries.head()

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batsman,non_striker,bowler,is_super_over,wide_runs,bye_runs,legbye_runs,noball_runs,penalty_runs,batsman_runs,extra_runs,total_runs,player_dismissed
0,1,1,10,9,1,1,112,391,379,0,0,0,0,0,0,0,0,0,
1,1,1,10,9,1,2,112,391,379,0,0,0,0,0,0,0,0,0,
2,1,1,10,9,1,3,112,391,379,0,0,0,0,0,0,4,0,4,
3,1,1,10,9,1,4,112,391,379,0,0,0,0,0,0,0,0,0,
4,1,1,10,9,1,5,112,391,379,0,2,0,0,0,0,0,2,2,


In [None]:
encoder_venue = LabelEncoder()
venues = data_matches["venue"].unique()
encoder_venue.fit(teams)
data_matches["venue"] = encoder_venue.fit_transform(data_matches["venue"])
data_matches.head()

Unnamed: 0,id,season,city,date,team1,team2,toss_winner,toss_decision,result,dl_applied,winner,win_by_runs,win_by_wickets,player_of_match,venue,umpire1,umpire2
0,1,2017,Hyderabad,2017-04-05,10,9,9,field,normal,0,10,35,0,224,28,AY Dandekar,NJ Llong
1,2,2017,Pune,2017-04-06,6,8,8,field,normal,0,8,0,7,199,21,A Nand Kishore,S Ravi
2,3,2017,Rajkot,2017-04-07,2,5,5,field,normal,0,5,0,10,34,31,Nitin Menon,CK Nandan
3,4,2017,Indore,2017-04-08,8,3,3,field,normal,0,3,0,6,61,13,AK Chaudhary,C Shamshuddin
4,5,2017,Bangalore,2017-04-08,9,1,9,bat,normal,0,9,15,0,99,17,,


# Feature Engineering

## Creating some own features

### Feature 1 - Score so far

In [None]:
ids = data_deliveries["match_id"].unique()
score = []
innings = data_deliveries["inning"].unique()

for id in list(ids):
  df = data_deliveries[data_deliveries["match_id"] == id]

  for inning in innings:

    df1 = df[df["inning"] == inning]
    score1 = df1["total_runs"].cumsum()
    score1 = list(score1)
    score = score + score1   

data_deliveries.insert(18, "score", score)
data_deliveries

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batsman,non_striker,bowler,is_super_over,wide_runs,bye_runs,legbye_runs,noball_runs,penalty_runs,batsman_runs,extra_runs,total_runs,score,player_dismissed
0,1,1,10,9,1,1,112,391,379,0,0,0,0,0,0,0,0,0,0,
1,1,1,10,9,1,2,112,391,379,0,0,0,0,0,0,0,0,0,0,
2,1,1,10,9,1,3,112,391,379,0,0,0,0,0,0,4,0,4,4,
3,1,1,10,9,1,4,112,391,379,0,0,0,0,0,0,0,0,0,4,
4,1,1,10,9,1,5,112,391,379,0,2,0,0,0,0,0,2,2,6,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
179073,11415,2,0,6,20,2,361,442,340,0,0,0,0,0,0,1,0,1,152,
179074,11415,2,0,6,20,3,444,360,340,0,0,0,0,0,0,2,0,2,154,
179075,11415,2,0,6,20,4,444,360,340,0,0,0,0,0,0,1,0,1,155,SR Watson
179076,11415,2,0,6,20,5,437,360,340,0,0,0,0,0,0,2,0,2,157,


### Feature 2 - Current Wicket

In [None]:
current_wickets = []
ids = data_deliveries["match_id"].unique()
innings = data_deliveries["inning"].unique()

for id in list(ids):
  df = data_deliveries[data_deliveries["match_id"] == id]

  for inning in innings:

    df1 = df[df["inning"] == inning]
    current_wickets1 = df1["player_dismissed"]
    items = np.array(current_wickets1[current_wickets1.notna()])
    for item in items:
      current_wickets1.replace(item, 1, inplace = True)

    current_wickets1 = current_wickets1.fillna(0)
    current_wickets1 = current_wickets1.cumsum()
    current_wickets1 = list(current_wickets1)
    current_wickets = current_wickets + current_wickets1

data_deliveries.insert(20, "current_wickets", current_wickets)
data_deliveries

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/index

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batsman,non_striker,bowler,is_super_over,wide_runs,bye_runs,legbye_runs,noball_runs,penalty_runs,batsman_runs,extra_runs,total_runs,score,player_dismissed,current_wickets
0,1,1,10,9,1,1,112,391,379,0,0,0,0,0,0,0,0,0,0,,0.0
1,1,1,10,9,1,2,112,391,379,0,0,0,0,0,0,0,0,0,0,,0.0
2,1,1,10,9,1,3,112,391,379,0,0,0,0,0,0,4,0,4,4,,0.0
3,1,1,10,9,1,4,112,391,379,0,0,0,0,0,0,0,0,0,4,,0.0
4,1,1,10,9,1,5,112,391,379,0,2,0,0,0,0,0,2,2,6,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
179073,11415,2,0,6,20,2,361,442,340,0,0,0,0,0,0,1,0,1,152,,5.0
179074,11415,2,0,6,20,3,444,360,340,0,0,0,0,0,0,2,0,2,154,,5.0
179075,11415,2,0,6,20,4,444,360,340,0,0,0,0,0,0,1,0,1,155,SR Watson,6.0
179076,11415,2,0,6,20,5,437,360,340,0,0,0,0,0,0,2,0,2,157,,6.0


### Feature 3 - Current Batsman's Runs

In [None]:
data = pd.DataFrame(columns=['match_id', 'inning', 'batting_team', 'bowling_team', 'over', 'ball',
       'batsman', 'non_striker', 'bowler', 'is_super_over', 'wide_runs',
       'bye_runs', 'legbye_runs', 'noball_runs', 'penalty_runs',
       'batsman_runs', 'extra_runs', 'total_runs', 'score', 'player_dismissed',
       'current_wickets','current_batsman_runs'])
ids = data_deliveries["match_id"].unique()
for id in ids:
  df = data_deliveries[data_deliveries["match_id"] == id]
  batsmen = df["batsman"].unique()
  for batsman in batsmen:
    df1 = df[df["batsman"] == batsman]
    current_batsman_runs = df1["batsman_runs"].cumsum()
    df1.insert(21, "current_batsman_runs", current_batsman_runs)
    data = data.append(df1)

In [None]:
data_deliveries = pd.merge(data_deliveries, data)

### Feature 4 - Current Bowler's Wickets

In [None]:
data = pd.DataFrame(columns=['match_id', 'inning', 'batting_team', 'bowling_team', 'over', 'ball',
       'batsman', 'non_striker', 'bowler', 'is_super_over', 'wide_runs',
       'bye_runs', 'legbye_runs', 'noball_runs', 'penalty_runs',
       'batsman_runs', 'extra_runs', 'total_runs', 'score', 'player_dismissed',
       'current_wickets',
       'current_batsman_runs', 'current_bowler_wicket']
      )
ids = ids = data_deliveries["match_id"].unique()
for id in ids:
  df = data_deliveries[data_deliveries["match_id"] == id]
  bowlers = df["bowler"].unique()
  for bowler in bowlers:
    df1 = df[df["bowler"] == bowler]
    current_wickets1 = df1["player_dismissed"]
    items = np.array(current_wickets1[current_wickets1.notna()])
    for item in items:
      current_wickets1.replace(item, 1, inplace = True)
    current_wickets1 = current_wickets1.fillna(0)
    current_wickets = current_wickets1.cumsum()
    df1.insert(22, "current_bowler_wicket", current_wickets)
    data = data.append(df1)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [None]:
data_deliveries = pd.merge(data_deliveries, data)

### Feature 5 - Runs Scored in last 5 overs

In [None]:
# in first 5 overs we will show score till that point
# after that we will show (score till that point - score before 5 overs)

ids = data_deliveries["match_id"].unique()
last_5_overs_runs = []
innings = data_deliveries["inning"].unique()
for id in ids:
  df = data_deliveries[data_deliveries["match_id"] == id]
  for inning in innings:
    df1 = df[df["inning"] == inning]
    df2 = df1["score"]
    first_5_overs = df2[0:30]
    first_5_overs = list(first_5_overs)
    last_5_overs_runs = last_5_overs_runs + first_5_overs
    df3 = df2[30:]
    df3 = np.array(df3)
    df4 = df2[0:len(df3)]
    df4 = np.array(df4)
    next_5_overs = df3 - df4
    next_5_overs = list(next_5_overs)
    last_5_overs_runs = last_5_overs_runs + next_5_overs

data_deliveries.insert(21, "last_5_overs_runs", last_5_overs_runs)
data_deliveries

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batsman,non_striker,bowler,is_super_over,wide_runs,bye_runs,legbye_runs,noball_runs,penalty_runs,batsman_runs,extra_runs,total_runs,score,player_dismissed,current_wickets,last_5_overs_runs,current_batsman_runs,current_bowler_wicket
0,1,1,10,9,1,1,112,391,379,0,0,0,0,0,0,0,0,0,0,,0.0,0,0,0.0
1,1,1,10,9,1,2,112,391,379,0,0,0,0,0,0,0,0,0,0,,0.0,0,0,0.0
2,1,1,10,9,1,3,112,391,379,0,0,0,0,0,0,4,0,4,4,,0.0,4,4,0.0
3,1,1,10,9,1,4,112,391,379,0,0,0,0,0,0,0,0,0,4,,0.0,4,4,0.0
4,1,1,10,9,1,5,112,391,379,0,2,0,0,0,0,0,2,2,6,,0.0,6,4,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170351,11415,2,0,6,19,6,361,442,153,0,0,4,0,0,0,4,4,8,150,,5.0,61,8,2.0
170352,11415,2,0,6,20,1,444,360,340,0,0,0,0,0,0,1,0,1,151,,5.0,62,81,0.0
170353,11415,2,0,6,20,2,361,442,340,0,0,0,0,0,0,1,0,1,152,,5.0,63,9,0.0
170354,11415,2,0,6,20,3,444,360,340,0,0,0,0,0,0,2,0,2,154,,5.0,65,83,0.0


### Feature 6 - Wickets taken in last 2 overs

In [None]:
# in first 2 overs we will show wickets till that point
# after that we will show (wickets till that point - wickets before 2 overs)

ids = data_deliveries["match_id"].unique()
last_2_overs_wickets = []
innings = data_deliveries["inning"].unique()
for id in ids:
  df = data_deliveries[data_deliveries["match_id"] == id]
  for inning in innings:
    df1 = df[df["inning"] == inning]
    df2 = df1["current_wickets"]
    first_2_overs = df2[0:12]
    first_2_overs = list(first_2_overs)
    last_2_overs_wickets = last_2_overs_wickets + first_2_overs
    df3 = df2[12:]
    df3 = np.array(df3)
    df4 = df2[0:len(df3)]
    df4 = np.array(df4)
    next_2_overs = df3 - df4
    next_2_overs = list(next_2_overs)
    last_2_overs_wickets = last_2_overs_wickets + next_2_overs

data_deliveries.insert(22, "last_2_overs_wickets", last_2_overs_wickets)
data_deliveries

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batsman,non_striker,bowler,is_super_over,wide_runs,bye_runs,legbye_runs,noball_runs,penalty_runs,batsman_runs,extra_runs,total_runs,score,player_dismissed,current_wickets,last_5_overs_runs,last_2_overs_wickets,current_batsman_runs,current_bowler_wicket
0,1,1,10,9,1,1,112,391,379,0,0,0,0,0,0,0,0,0,0,,0.0,0,0.0,0,0.0
1,1,1,10,9,1,2,112,391,379,0,0,0,0,0,0,0,0,0,0,,0.0,0,0.0,0,0.0
2,1,1,10,9,1,3,112,391,379,0,0,0,0,0,0,4,0,4,4,,0.0,4,0.0,4,0.0
3,1,1,10,9,1,4,112,391,379,0,0,0,0,0,0,0,0,0,4,,0.0,4,0.0,4,0.0
4,1,1,10,9,1,5,112,391,379,0,2,0,0,0,0,0,2,2,6,,0.0,6,0.0,4,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170351,11415,2,0,6,19,6,361,442,153,0,0,4,0,0,0,4,4,8,150,,5.0,61,1.0,8,2.0
170352,11415,2,0,6,20,1,444,360,340,0,0,0,0,0,0,1,0,1,151,,5.0,62,1.0,81,0.0
170353,11415,2,0,6,20,2,361,442,340,0,0,0,0,0,0,1,0,1,152,,5.0,63,1.0,9,0.0
170354,11415,2,0,6,20,3,444,360,340,0,0,0,0,0,0,2,0,2,154,,5.0,65,1.0,83,0.0


### Feature 7 - is_wicket

In [None]:
current_wickets = data_deliveries["player_dismissed"]
current_wickets1 = current_wickets[current_wickets.notna()]
items = current_wickets1.unique()
for item in items:
  current_wickets.replace(item, 1, inplace = True)
current_wickets = current_wickets.fillna(0)
data_deliveries.insert(25, "is_wicket", current_wickets)
data_deliveries.head()

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batsman,non_striker,bowler,is_super_over,wide_runs,bye_runs,legbye_runs,noball_runs,penalty_runs,batsman_runs,extra_runs,total_runs,score,player_dismissed,current_wickets,last_5_overs_runs,last_2_overs_wickets,current_batsman_runs,current_bowler_wicket,is_wicket
0,1,1,10,9,1,1,112,391,379,0,0,0,0,0,0,0,0,0,0,,0.0,0,0.0,0,0.0,0
1,1,1,10,9,1,2,112,391,379,0,0,0,0,0,0,0,0,0,0,,0.0,0,0.0,0,0.0,0
2,1,1,10,9,1,3,112,391,379,0,0,0,0,0,0,4,0,4,4,,0.0,4,0.0,4,0.0,0
3,1,1,10,9,1,4,112,391,379,0,0,0,0,0,0,0,0,0,4,,0.0,4,0.0,4,0.0,0
4,1,1,10,9,1,5,112,391,379,0,2,0,0,0,0,0,2,2,6,,0.0,6,0.0,4,0.0,0


# Scaling Features

In [None]:
'''columns = ['batting_team', 'bowling_team', 'over', 'ball',
       'batsman', 'non_striker', 'bowler', 
       'batsman_runs', 'total_runs', 'score',
       'current_wickets', 'last_5_overs_runs', 'last_2_overs_wickets',
       'current_batsman_runs', 'current_bowler_wicket']
for column in columns:
  data = data_deliveries[column].unique()
  data1 = np.array(data_deliveries[column])
  data1 = data1.reshape((-1, 1))
  scalar = MinMaxScaler()
  data = np.array(data)
  data = data.reshape((-1, 1))
  scalar.fit(data)
  data_deliveries[column] = scalar.fit_transform(data1)

data_deliveries.head()'''

"columns = ['batting_team', 'bowling_team', 'over', 'ball',\n       'batsman', 'non_striker', 'bowler', \n       'batsman_runs', 'total_runs', 'score',\n       'current_wickets', 'last_5_overs_runs', 'last_2_overs_wickets',\n       'current_batsman_runs', 'current_bowler_wicket']\nfor column in columns:\n  data = data_deliveries[column].unique()\n  data1 = np.array(data_deliveries[column])\n  data1 = data1.reshape((-1, 1))\n  scalar = MinMaxScaler()\n  data = np.array(data)\n  data = data.reshape((-1, 1))\n  scalar.fit(data)\n  data_deliveries[column] = scalar.fit_transform(data1)\n\ndata_deliveries.head()"

In [None]:
'''columns = ["team1", "team2", "toss_winner", "winner", "win_by_runs"
          , "win_by_wickets", "player_of_match", "venue"]
for column in columns:
  data = data_matches[column].unique()
  data1 = np.array(data_matches[column])
  data1 = data1.reshape((-1, 1))
  scalar = MinMaxScaler()
  data = np.array(data)
  data = data.reshape((-1, 1))
  scalar.fit(data)
  data_matches[column] = scalar.fit_transform(data1)

data_matches.head()'''

'columns = ["team1", "team2", "toss_winner", "winner", "win_by_runs"\n          , "win_by_wickets", "player_of_match", "venue"]\nfor column in columns:\n  data = data_matches[column].unique()\n  data1 = np.array(data_matches[column])\n  data1 = data1.reshape((-1, 1))\n  scalar = MinMaxScaler()\n  data = np.array(data)\n  data = data.reshape((-1, 1))\n  scalar.fit(data)\n  data_matches[column] = scalar.fit_transform(data1)\n\ndata_matches.head()'

In [None]:
data_deliveries_copy = data_deliveries

In [None]:
data_deliveries.head()

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batsman,non_striker,bowler,is_super_over,wide_runs,bye_runs,legbye_runs,noball_runs,penalty_runs,batsman_runs,extra_runs,total_runs,score,player_dismissed,current_wickets,last_5_overs_runs,last_2_overs_wickets,current_batsman_runs,current_bowler_wicket,is_wicket
0,1,1,10,9,1,1,112,391,379,0,0,0,0,0,0,0,0,0,0,,0.0,0,0.0,0,0.0,0
1,1,1,10,9,1,2,112,391,379,0,0,0,0,0,0,0,0,0,0,,0.0,0,0.0,0,0.0,0
2,1,1,10,9,1,3,112,391,379,0,0,0,0,0,0,4,0,4,4,,0.0,4,0.0,4,0.0,0
3,1,1,10,9,1,4,112,391,379,0,0,0,0,0,0,0,0,0,4,,0.0,4,0.0,4,0.0,0
4,1,1,10,9,1,5,112,391,379,0,2,0,0,0,0,0,2,2,6,,0.0,6,0.0,4,0.0,0


In [None]:
data_matches.head()

Unnamed: 0,id,season,city,date,team1,team2,toss_winner,toss_decision,result,dl_applied,winner,win_by_runs,win_by_wickets,player_of_match,venue,umpire1,umpire2
0,1,2017,Hyderabad,2017-04-05,10,9,9,field,normal,0,10,35,0,224,28,AY Dandekar,NJ Llong
1,2,2017,Pune,2017-04-06,6,8,8,field,normal,0,8,0,7,199,21,A Nand Kishore,S Ravi
2,3,2017,Rajkot,2017-04-07,2,5,5,field,normal,0,5,0,10,34,31,Nitin Menon,CK Nandan
3,4,2017,Indore,2017-04-08,8,3,3,field,normal,0,3,0,6,61,13,AK Chaudhary,C Shamshuddin
4,5,2017,Bangalore,2017-04-08,9,1,9,bat,normal,0,9,15,0,99,17,,


In [None]:
data_matches.columns

Index(['id', 'season', 'city', 'date', 'team1', 'team2', 'toss_winner',
       'toss_decision', 'result', 'dl_applied', 'winner', 'win_by_runs',
       'win_by_wickets', 'player_of_match', 'venue', 'umpire1', 'umpire2'],
      dtype='object')

In [None]:
data_matches.columns = ['match_id', 'season', 'city', 'date', 'team1', 'team2', 'toss_winner',
       'toss_decision', 'result', 'dl_applied', 'winner', 'win_by_runs',
       'win_by_wickets', 'player_of_match', 'venue', 'umpire1', 'umpire2']
data_matches.head()

Unnamed: 0,match_id,season,city,date,team1,team2,toss_winner,toss_decision,result,dl_applied,winner,win_by_runs,win_by_wickets,player_of_match,venue,umpire1,umpire2
0,1,2017,Hyderabad,2017-04-05,10,9,9,field,normal,0,10,35,0,224,28,AY Dandekar,NJ Llong
1,2,2017,Pune,2017-04-06,6,8,8,field,normal,0,8,0,7,199,21,A Nand Kishore,S Ravi
2,3,2017,Rajkot,2017-04-07,2,5,5,field,normal,0,5,0,10,34,31,Nitin Menon,CK Nandan
3,4,2017,Indore,2017-04-08,8,3,3,field,normal,0,3,0,6,61,13,AK Chaudhary,C Shamshuddin
4,5,2017,Bangalore,2017-04-08,9,1,9,bat,normal,0,9,15,0,99,17,,


In [None]:
data_merged = pd.merge(data_deliveries, data_matches)
data_merged.drop(columns=["team1", "team2"], inplace = True)
data_merged

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batsman,non_striker,bowler,is_super_over,wide_runs,bye_runs,legbye_runs,noball_runs,penalty_runs,batsman_runs,extra_runs,total_runs,score,player_dismissed,current_wickets,last_5_overs_runs,last_2_overs_wickets,current_batsman_runs,current_bowler_wicket,is_wicket,season,city,date,toss_winner,toss_decision,result,dl_applied,winner,win_by_runs,win_by_wickets,player_of_match,venue,umpire1,umpire2
0,1,1,10,9,1,1,112,391,379,0,0,0,0,0,0,0,0,0,0,,0.0,0,0.0,0,0.0,0,2017,Hyderabad,2017-04-05,9,field,normal,0,10,35,0,224,28,AY Dandekar,NJ Llong
1,1,1,10,9,1,2,112,391,379,0,0,0,0,0,0,0,0,0,0,,0.0,0,0.0,0,0.0,0,2017,Hyderabad,2017-04-05,9,field,normal,0,10,35,0,224,28,AY Dandekar,NJ Llong
2,1,1,10,9,1,3,112,391,379,0,0,0,0,0,0,4,0,4,4,,0.0,4,0.0,4,0.0,0,2017,Hyderabad,2017-04-05,9,field,normal,0,10,35,0,224,28,AY Dandekar,NJ Llong
3,1,1,10,9,1,4,112,391,379,0,0,0,0,0,0,0,0,0,4,,0.0,4,0.0,4,0.0,0,2017,Hyderabad,2017-04-05,9,field,normal,0,10,35,0,224,28,AY Dandekar,NJ Llong
4,1,1,10,9,1,5,112,391,379,0,2,0,0,0,0,0,2,2,6,,0.0,6,0.0,4,0.0,0,2017,Hyderabad,2017-04-05,9,field,normal,0,10,35,0,224,28,AY Dandekar,NJ Llong
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170002,11415,2,0,6,19,6,361,442,153,0,0,4,0,0,0,4,4,8,150,,5.0,61,1.0,8,2.0,0,2019,Hyderabad,12/05/19,6,bat,normal,0,6,1,0,85,29,Nitin Menon,Ian Gould
170003,11415,2,0,6,20,1,444,360,340,0,0,0,0,0,0,1,0,1,151,,5.0,62,1.0,81,0.0,0,2019,Hyderabad,12/05/19,6,bat,normal,0,6,1,0,85,29,Nitin Menon,Ian Gould
170004,11415,2,0,6,20,2,361,442,340,0,0,0,0,0,0,1,0,1,152,,5.0,63,1.0,9,0.0,0,2019,Hyderabad,12/05/19,6,bat,normal,0,6,1,0,85,29,Nitin Menon,Ian Gould
170005,11415,2,0,6,20,3,444,360,340,0,0,0,0,0,0,2,0,2,154,,5.0,65,1.0,83,0.0,0,2019,Hyderabad,12/05/19,6,bat,normal,0,6,1,0,85,29,Nitin Menon,Ian Gould


# ML Models

## Regression Task

In [None]:
data_for_regression = data_merged[["over", "ball", "batting_team", "bowling_team", 
                                   "batsman", "non_striker", "bowler", "score", 
                                   "last_5_overs_runs", "last_2_overs_wickets", 
                                    "current_batsman_runs", "current_bowler_wicket", "venue", "is_wicket"]]

data_for_regression

Unnamed: 0,over,ball,batting_team,bowling_team,batsman,non_striker,bowler,score,last_5_overs_runs,last_2_overs_wickets,current_batsman_runs,current_bowler_wicket,venue,is_wicket
0,1,1,10,9,112,391,379,0,0,0.0,0,0.0,28,0
1,1,2,10,9,112,391,379,0,0,0.0,0,0.0,28,0
2,1,3,10,9,112,391,379,4,4,0.0,4,0.0,28,0
3,1,4,10,9,112,391,379,4,4,0.0,4,0.0,28,0
4,1,5,10,9,112,391,379,6,6,0.0,4,0.0,28,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170002,19,6,0,6,361,442,153,150,61,1.0,8,2.0,29,0
170003,20,1,0,6,444,360,340,151,62,1.0,81,0.0,29,0
170004,20,2,0,6,361,442,340,152,63,1.0,9,0.0,29,0
170005,20,3,0,6,444,360,340,154,65,1.0,83,0.0,29,0


In [None]:
data_for_regression.corr()

Unnamed: 0,last_5_overs_runs,last_2_overs_wickets,current_bowler_wicket,venue,is_wicket
last_5_overs_runs,1.0,0.093695,0.127851,-0.01644,
last_2_overs_wickets,0.093695,1.0,0.425725,-0.000722,
current_bowler_wicket,0.127851,0.425725,1.0,-0.000346,
venue,-0.01644,-0.000722,-0.000346,1.0,
is_wicket,,,,,


In [None]:
data_for_regression_x = data_for_regression.iloc[:,[0,1,2,3,4,5,6,8,9,10,11,12]].values
data_for_regression_y = data_for_regression.iloc[:,7].values

x_train, x_test, y_train, y_test = train_test_split(data_for_regression_x, data_for_regression_y, random_state=0, test_size = 0.25)

In [None]:
y_train = y_train.astype('int')
x_train = x_train.astype('int')

In [None]:
'''minmax_sc_x = MinMaxScaler()
x_train_minmax = minmax_sc_x.fit_transform(x_train)
x_test_minmax = minmax_sc_x.transform(x_test)

y_train = np.array(y_train)
y_train = y_train.reshape((-1, 1))

y_test = np.array(y_test)
y_test = y_test.reshape((-1, 1))

minmax_sc_y = MinMaxScaler()
y_train_minmax = minmax_sc_y.fit_transform(y_train)
y_test_minmax = minmax_sc_y.transform(y_train)

std_sc_x = StandardScaler()
x_train_std = std_sc_x.fit_transform(x_train)
x_test_std = std_sc_x.transform(x_test)

std_sc_y = StandardScaler()
y_train_std = std_sc_y.fit_transform(y_train)
y_test_std = std_sc_y.transform(y_test)'''

In [None]:
print(f"shape of x_train = {x_train.shape}")
print(f"shape of x_test = {x_test.shape}")
print(f"shape of x_train_minmax = {x_train_minmax.shape}")
print(f"shape of x_test_minmax = {x_test_minmax.shape}")
print(f"shape of x_train_std = {x_train_std.shape}")
print(f"shape of x_test_std = {x_test_std.shape}")
print(f"shape of y_train = {y_train.shape}")
print(f"shape of y_test = {y_test.shape}")
print(f"shape of y_train_minmax = {y_train_minmax.shape}")
print(f"shape of y_test_minmax = {y_test_minmax.shape}")
print(f"shape of y_train_std = {y_train_std.shape}")
print(f"shape of y_test_std = {y_test_std.shape}")

shape of x_train = (127505, 12)
shape of x_test = (42502, 12)
shape of x_train_minmax = (127505, 12)
shape of x_test_minmax = (42502, 12)
shape of x_train_std = (127505, 12)
shape of x_test_std = (42502, 12)
shape of y_train = (127505, 1)
shape of y_test = (42502, 1)
shape of y_train_minmax = (127505, 1)
shape of y_test_minmax = (127505, 1)
shape of y_train_std = (127505, 1)
shape of y_test_std = (42502, 1)


In [None]:
# some metrics

def RMSE(y_actual, y_predicted):
  y_actual = np.array(y_actual)
  y_predicted = np.array(y_predicted)
  y_error = y_actual - y_predicted
  y_error_sq = y_error * y_error
  y_rmse = np.sqrt(np.mean(y_error_sq))
  return y_rmse

def RMSLE(y_actual, y_predicted):
  y_actual = np.array(y_actual)
  y_predicted = np.array(y_predicted)
  y_error_log = np.log(y_actual + 1) - np.log(y_predicted + 1)
  y_error_log_sq = y_error_log * y_error_log
  y_rmsle = np.sqrt(np.mean(y_error_log_sq))
  return y_rmsle

def MAE(y_actual, y_predicted):
  y_actual = np.array(y_actual)
  y_predicted = np.array(y_predicted)
  y_error = y_actual - y_predicted
  y_error_abs = np.fabs(y_error)
  y_mae = np.mean(y_error_abs)
  return y_mae

def accuracy_in_range(y_actual, y_predicted, threshold = 10, show_within_Threshold = True):
  y_actual = np.array(y_actual)
  y_predicted = np.array(y_predicted)
  y_error = y_actual - y_predicted
  y_error_abs = abs(y_error)
  correct_prediction = np.where(y_error_abs <= threshold)[0]
  correct_prediction_count = len(correct_prediction)
  percentage_accuracy = (correct_prediction_count / len(y_predicted)) * 100
  if show_within_Threshold:
    print(correct_prediction)
    print(f"There are {correct_prediction_count} prediction within Threshold")
  return percentage_accuracy

def roundup(a, digits=6):
  n = 10**-digits
  return np.round(np.ceil(a / n) * n, digits)

In [None]:
def make_dataframe_and_compair(y_actual, y_predicted):
  d = {
      "y_actual" : y_actual,
      "y_predicted" : y_predicted
  }
  d = pd.DataFrame(data = d)
  return d

### Linear Regression

In [None]:
Linear_regression_minmax = LinearRegression()
Linear_regression_minmax.fit(x_train_minmax, y_train_minmax)
y_predict_train_minmax = Linear_regression_minmax.predict(x_train_minmax)
y_predict_train_minmax = np.array(y_predict_train_minmax)
y_predict_train_minmax = roundup(y_predict_train_minmax)
y_predict_test_minmax = Linear_regression_minmax.predict(x_test_minmax)
y_predict_test_minmax = np.array(y_predict_test_minmax)
y_predict_test_minmax = roundup(y_predict_test_minmax)
y_score_predict_train_minmax = minmax_sc_y.inverse_transform(y_predict_train_minmax)
y_score_predict_test_minmax = minmax_sc_y.inverse_transform(y_predict_test_minmax)
y_train_minmax_inversed = minmax_sc_y.inverse_transform(y_train_minmax)
y_test_minmax_inversed = minmax_sc_y.inverse_transform(y_test_minmax)

Linear_regression_std = LinearRegression()
Linear_regression_std.fit(x_train_std, y_train_std)
y_predict_train_std = Linear_regression_std.predict(x_train_std)
y_predict_train_std = np.array(y_predict_train_std)
y_predict_train_std = roundup(y_predict_train_std)
y_predict_test_std = Linear_regression_std.predict(x_test_std)
y_predict_test_std = np.array(y_predict_test_std)
y_predict_test_std = roundup(y_predict_test_std)
y_score_predict_train_std = std_sc_y.inverse_transform(y_predict_train_std)
y_score_predict_test_std = std_sc_y.inverse_transform(y_predict_test_std)
y_train_std_inversed = std_sc_y.inverse_transform(y_train_std)
y_test_std_inversed = std_sc_y.inverse_transform(y_test_std)

In [None]:
d = make_dataframe_and_compair(y_train[:,0], np.round(y_score_predict_train_std[:,0]))
d

Unnamed: 0,y_actual,y_predicted
0,90,90.0
1,109,95.0
2,25,17.0
3,3,-6.0
4,81,82.0
...,...,...
127500,70,76.0
127501,34,38.0
127502,188,155.0
127503,131,123.0


In [None]:
r2_score_test_minmax = r2_score(y_test, np.round(y_score_predict_test_minmax))
r2_score_test_std = r2_score(y_test, np.round(y_score_predict_test_std))
r2_score_train_minmax = r2_score(y_train, np.round(y_score_predict_train_minmax))
r2_score_train_std = r2_score(y_train, np.round(y_score_predict_train_std))
print(f"R2 score test for minmax = {r2_score_test_minmax}")
print(f"R2 score test for std = {r2_score_test_std}")
print(f"R2 score train for minmax = {r2_score_train_minmax}")
print(f"R2 score train for std = {r2_score_train_std}")

accuracy_in_range_test_minmax = accuracy_in_range(y_test[:,0], np.round(y_score_predict_test_minmax[:,0]), show_within_Threshold=False)
accuracy_in_range_test_std = accuracy_in_range(y_test[:,0], np.round(y_score_predict_test_std[:,0]), show_within_Threshold=False)
accuracy_in_range_train_minmax = accuracy_in_range(y_train[:,0], np.round(y_score_predict_train_minmax[:,0]), show_within_Threshold=False)
accuracy_in_range_train_std = accuracy_in_range(y_train[:,0], np.round(y_score_predict_train_std[:,0]), show_within_Threshold=False)
print(f"Accuracy in range test for minmax = {accuracy_in_range_test_minmax}")
print(f"Accuracy in range test for std = {accuracy_in_range_test_std}")
print(f"Accuracy in range train for minmax = {accuracy_in_range_train_minmax}")
print(f"Accuracy in range train for std = {accuracy_in_range_train_std}")

R2 score test for minmax = 0.9247408645904286
R2 score test for std = 0.9247408340604704
R2 score train for minmax = 0.9241575020951092
R2 score train for std = 0.9241573924347072
Accuracy in range test for minmax = 60.053644534374854
Accuracy in range test for std = 60.053644534374854
Accuracy in range train for minmax = 60.53095957021293
Accuracy in range train for std = 60.53095957021293


### Random Forest

In [None]:
'''Random_Forest_minmax = RandomForestClassifier()
Random_Forest_minmax.fit(x_train, y_train)
y_predict_train_minmax = Random_Forest_minmax.predict(x_train)
y_predict_train_minmax = np.array(y_predict_train_minmax)
#y_predict_train_minmax = roundup(y_predict_train_minmax)
y_predict_test_minmax = Random_Forest_minmax.predict(x_test)
y_predict_test_minmax = np.array(y_predict_test_minmax)
#y_predict_test_minmax = roundup(y_predict_test_minmax)
#y_score_predict_train_minmax = minmax_sc_y.inverse_transform(y_predict_train_minmax)
#y_score_predict_test_minmax = minmax_sc_y.inverse_transform(y_predict_test_minmax)

Random_Forest_std = RandomForestClassifier()
Random_Forest_std.fit(x_train_std, y_train_std)
y_predict_train_std = Random_Forest_std.predict(x_train_std)
y_predict_train_std = np.array(y_predict_train_std)
y_predict_train_std = roundup(y_predict_train_std)
y_predict_test_std = Random_Forest_std.predict(x_test_std)
y_predict_test_std = np.array(y_predict_test_std)
y_predict_test_std = roundup(y_predict_test_std)
y_score_predict_train_std = std_sc_y.inverse_transform(y_predict_train_std)
y_score_predict_test_std = std_sc_y.inverse_transform(y_predict_test_std)'''

In [None]:
'''d = make_dataframe_and_compair(y_train[:,0], np.round(y_score_predict_train_std[:,0]))
d'''

'd = make_dataframe_and_compair(y_train[:,0], np.round(y_score_predict_train_std[:,0]))\nd'

In [None]:
'''r2_score_test_minmax = r2_score(y_test, np.round(y_score_predict_test_minmax))
r2_score_test_std = r2_score(y_test, np.round(y_score_predict_test_std))
r2_score_train_minmax = r2_score(y_train, np.round(y_score_predict_train_minmax))
r2_score_train_std = r2_score(y_train, np.round(y_score_predict_train_std))
print(f"R2 score test for minmax = {r2_score_test_minmax}")
print(f"R2 score test for std = {r2_score_test_std}")
print(f"R2 score train for minmax = {r2_score_train_minmax}")
print(f"R2 score train for std = {r2_score_train_std}")

accuracy_in_range_test_minmax = accuracy_in_range(y_test[:,0], np.round(y_score_predict_test_minmax[:,0]), show_within_Threshold=False)
accuracy_in_range_test_std = accuracy_in_range(y_test[:,0], np.round(y_score_predict_test_std[:,0]), show_within_Threshold=False)
accuracy_in_range_train_minmax = accuracy_in_range(y_train[:,0], np.round(y_score_predict_train_minmax[:,0]), show_within_Threshold=False)
accuracy_in_range_train_std = accuracy_in_range(y_train[:,0], np.round(y_score_predict_train_std[:,0]), show_within_Threshold=False)
print(f"Accuracy in range test for minmax = {accuracy_in_range_test_minmax}")
print(f"Accuracy in range test for std = {accuracy_in_range_test_std}")
print(f"Accuracy in range train for minmax = {accuracy_in_range_train_minmax}")
print(f"Accuracy in range train for std = {accuracy_in_range_train_std}")'''

'r2_score_test_minmax = r2_score(y_test, np.round(y_score_predict_test_minmax))\nr2_score_test_std = r2_score(y_test, np.round(y_score_predict_test_std))\nr2_score_train_minmax = r2_score(y_train, np.round(y_score_predict_train_minmax))\nr2_score_train_std = r2_score(y_train, np.round(y_score_predict_train_std))\nprint(f"R2 score test for minmax = {r2_score_test_minmax}")\nprint(f"R2 score test for std = {r2_score_test_std}")\nprint(f"R2 score train for minmax = {r2_score_train_minmax}")\nprint(f"R2 score train for std = {r2_score_train_std}")\n\naccuracy_in_range_test_minmax = accuracy_in_range(y_test[:,0], np.round(y_score_predict_test_minmax[:,0]), show_within_Threshold=False)\naccuracy_in_range_test_std = accuracy_in_range(y_test[:,0], np.round(y_score_predict_test_std[:,0]), show_within_Threshold=False)\naccuracy_in_range_train_minmax = accuracy_in_range(y_train[:,0], np.round(y_score_predict_train_minmax[:,0]), show_within_Threshold=False)\naccuracy_in_range_train_std = accuracy_

### SVR

In [None]:
'''SVR_minmax = SVR()
SVR_minmax.fit(x_train_minmax, y_train_minmax)
y_predict_train_minmax = SVR_minmax.predict(x_train_minmax)
y_predict_train_minmax = np.array(y_predict_train_minmax)
y_predict_train_minmax = roundup(y_predict_train_minmax)
y_predict_test_minmax = SVR_minmax.predict(x_test_minmax)
y_predict_test_minmax = np.array(y_predict_test_minmax)
y_predict_test_minmax = roundup(y_predict_test_minmax)
y_score_predict_train_minmax = minmax_sc_y.inverse_transform(y_predict_train_minmax)
y_score_predict_test_minmax = minmax_sc_y.inverse_transform(y_predict_test_minmax)

SVR_std = SVR()
SVR_std.fit(x_train_std, y_train_std)
y_predict_train_std = SVR_std.predict(x_train_std)
y_predict_train_std = np.array(y_predict_train_std)
y_predict_train_std = roundup(y_predict_train_std)
y_predict_test_std = SVR_std.predict(x_test_std)
y_predict_test_std = np.array(y_predict_test_std)
y_predict_test_std = roundup(y_predict_test_std)
y_score_predict_train_std = std_sc_y.inverse_transform(y_predict_train_std)
y_score_predict_test_std = std_sc_y.inverse_transform(y_predict_test_std)'''

'SVR_minmax = SVR()\nSVR_minmax.fit(x_train_minmax, y_train_minmax)\ny_predict_train_minmax = SVR_minmax.predict(x_train_minmax)\ny_predict_train_minmax = np.array(y_predict_train_minmax)\ny_predict_train_minmax = roundup(y_predict_train_minmax)\ny_predict_test_minmax = SVR_minmax.predict(x_test_minmax)\ny_predict_test_minmax = np.array(y_predict_test_minmax)\ny_predict_test_minmax = roundup(y_predict_test_minmax)\ny_score_predict_train_minmax = minmax_sc_y.inverse_transform(y_predict_train_minmax)\ny_score_predict_test_minmax = minmax_sc_y.inverse_transform(y_predict_test_minmax)\n\nSVR_std = SVR()\nSVR_std.fit(x_train_std, y_train_std)\ny_predict_train_std = SVR_std.predict(x_train_std)\ny_predict_train_std = np.array(y_predict_train_std)\ny_predict_train_std = roundup(y_predict_train_std)\ny_predict_test_std = SVR_std.predict(x_test_std)\ny_predict_test_std = np.array(y_predict_test_std)\ny_predict_test_std = roundup(y_predict_test_std)\ny_score_predict_train_std = std_sc_y.inverse_

### SVM

In [None]:
'''svc = SVC()
y_train = y_train.astype("int")
y_train_svc = y_train.reshape((y_train.shape[0],))
y_test_svc = y_test.reshape((y_test.shape[0],))
svc.fit(x_train, y_train_svc)
y_predict_train = svc.predict(x_train)
y_predict_train = np.array(y_predict_train)
y_predict_test = svc.predict(x_test)
y_predict_test = np.array(y_predict_test)'''

# DL Models

In [None]:
data_for_dl = data_merged[["over", "ball", "last_5_overs_runs", "last_2_overs_wickets", 
                           "current_batsman_runs", "current_bowler_wicket", "current_wickets",
                           "venue", "is_wicket", "total_runs"]]

'''features = ["over", "ball", "last_5_overs_runs", "last_2_overs_wickets", 
            "current_batsman_runs", "current_bowler_wicket", "current_wickets",
            "venue", "score"]

for feature in features:
  data_for_dl[feature] = normalize(list(data_for_dl[feature]))'''



'features = ["over", "ball", "last_5_overs_runs", "last_2_overs_wickets", \n            "current_batsman_runs", "current_bowler_wicket", "current_wickets",\n            "venue", "score"]\n\nfor feature in features:\n  data_for_dl[feature] = normalize(list(data_for_dl[feature]))'

In [None]:
data_for_dl.isnull().sum()

over                     0
ball                     0
last_5_overs_runs        0
last_2_overs_wickets     0
current_batsman_runs     0
current_bowler_wicket    0
current_wickets          0
venue                    0
is_wicket                0
score                    0
dtype: int64

In [None]:
data_for_dl_x = data_for_dl.iloc[:,[0,1,2,3,4,5,6,7]].values
data_for_dl_wicket = data_for_dl.iloc[:,8].values
data_for_dl_runs = data_for_dl.iloc[:,9].values

x_train_dl_wicket, x_test_dl_wicket, y_train_dl_wicket, y_test_dl_wicket = train_test_split(data_for_dl_x, data_for_dl_wicket, random_state=0, test_size = 0.25)
x_train_dl_runs, x_test_dl_runs, y_train_dl_runs, y_test_dl_runs = train_test_split(data_for_dl_x, data_for_dl_runs, random_state=0, test_size = 0.25)

x_train_dl_wicket = x_train_dl_wicket.astype("float")
x_test_dl_wicket = x_test_dl_wicket.astype("float")
y_train_dl_wicket = y_train_dl_wicket.astype("float")
y_test_dl_wicket = y_test_dl_wicket.astype("float")

x_train_dl_runs = x_train_dl_runs.astype("float")
x_test_dl_runs = x_test_dl_runs.astype("float")
y_train_dl_runs = y_train_dl_runs.astype("float")
score_max_train = max(y_train_dl_runs)
y_train_dl_runs_scaled = y_train_dl_runs/score_max_train
y_test_dl_runs = y_test_dl_runs.astype("float")
score_max_test = max(y_test_dl_runs)
y_test_dl_runs_scaled = y_test_dl_runs/score_max_test
#y_train_dl_runs = y_train_dl_runs.reshape((-1, 1))
#y_test_dl_runs = y_test_dl_runs.reshape((-1, 1))
depth = len(data_for_dl["total_runs"].unique())
y_train_dl_runs_categorical = one_hot(y_train_dl_runs, depth = depth)
y_test_dl_runs_categorical = one_hot(y_test_dl_runs, depth = depth)

minmax_sc_x = MinMaxScaler()
x_train_dl_runs_minmax = minmax_sc_x.fit_transform(x_train_dl_runs)
x_test_dl_runs_minmax = minmax_sc_x.transform(x_test_dl_runs)

'''minmax_sc_y = MinMaxScaler()
y_train_dl_runs_minmax = minmax_sc_y.fit_transform(y_train_dl_runs)
y_test_dl_runs_minmax = minmax_sc_y.transform(y_test_dl_runs)'''

'minmax_sc_y = MinMaxScaler()\ny_train_dl_runs_minmax = minmax_sc_y.fit_transform(y_train_dl_runs)\ny_test_dl_runs_minmax = minmax_sc_y.transform(y_test_dl_runs)'

In [None]:
print(f"x_train_dl_wickets = {x_train_dl_wicket.shape}")
print(f"x_test_dl_wickets = {x_test_dl_wicket.shape}")
print(f"x_train_dl_runs = {x_train_dl_runs.shape}")
print(f"x_test_dl_runs = {x_test_dl_runs.shape}")
print(f"y_train_dl_wicket = {y_train_dl_wicket.shape}")
print(f"y_test_dl_wicket = {y_test_dl_wicket.shape}")
print(f"y_train_dl_runs = {y_train_dl_runs.shape}")
print(f"y_test_dl_runs = {y_test_dl_runs.shape}")
print(f"y_train_dl_runs_categorical = {y_train_dl_runs_categorical.shape}")
print(f"y_test_dl_runs_categorical = {y_test_dl_runs_categorical.shape}")

x_train_dl_wickets = (127505, 8)
x_test_dl_wickets = (42502, 8)
x_train_dl_runs = (127505, 8)
x_test_dl_runs = (42502, 8)
y_train_dl_wicket = (127505,)
y_test_dl_wicket = (42502,)
y_train_dl_runs = (127505,)
y_test_dl_runs = (42502,)
y_train_dl_runs_categorical = (127505, 10)
y_test_dl_runs_categorical = (42502, 10)


In [None]:
model_runs = Sequential()
model_runs.add(Dense(units = 8, activation=relu))
model_runs.add(Dense(units = 16, activation=relu))
model_runs.add(Dense(units = 32, activation = relu))
model_runs.add(Dense(units = 64, activation= relu))
model_runs.add(Dense(units = 32, activation = relu))
model_runs.add(Dense(units = 16, activation = relu))
model_runs.add(Dense(units = 10, activation = softmax))

In [None]:
model_runs.compile(
    optimizer="adagrad",
    loss = categorical_crossentropy,
    metrics = ["accuracy"]
)

In [None]:
model_runs.fit(
    x = x_train_dl_runs_minmax,
    y = y_train_dl_runs_categorical,
    epochs = 100,
    validation_split = 0.2
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100

KeyboardInterrupt: ignored

# Time series Model

In [None]:
data_merged.head()

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batsman,non_striker,bowler,is_super_over,wide_runs,bye_runs,legbye_runs,noball_runs,penalty_runs,batsman_runs,extra_runs,total_runs,score,player_dismissed,current_wickets,last_5_overs_runs,last_2_overs_wickets,current_batsman_runs,current_bowler_wicket,is_wicket,season,city,date,toss_winner,toss_decision,result,dl_applied,winner,win_by_runs,win_by_wickets,player_of_match,venue,umpire1,umpire2
0,1,1,10,9,1,1,112,391,379,0,0,0,0,0,0,0,0,0,0,,0.0,0,0.0,0,0.0,0,2017,Hyderabad,2017-04-05,9,field,normal,0,10,35,0,224,28,AY Dandekar,NJ Llong
1,1,1,10,9,1,2,112,391,379,0,0,0,0,0,0,0,0,0,0,,0.0,0,0.0,0,0.0,0,2017,Hyderabad,2017-04-05,9,field,normal,0,10,35,0,224,28,AY Dandekar,NJ Llong
2,1,1,10,9,1,3,112,391,379,0,0,0,0,0,0,4,0,4,4,,0.0,4,0.0,4,0.0,0,2017,Hyderabad,2017-04-05,9,field,normal,0,10,35,0,224,28,AY Dandekar,NJ Llong
3,1,1,10,9,1,4,112,391,379,0,0,0,0,0,0,0,0,0,4,,0.0,4,0.0,4,0.0,0,2017,Hyderabad,2017-04-05,9,field,normal,0,10,35,0,224,28,AY Dandekar,NJ Llong
4,1,1,10,9,1,5,112,391,379,0,2,0,0,0,0,0,2,2,6,,0.0,6,0.0,4,0.0,0,2017,Hyderabad,2017-04-05,9,field,normal,0,10,35,0,224,28,AY Dandekar,NJ Llong


In [None]:
df = data_merged[data_merged["batsman"] == 112]
#df1 = df[df["season"] == 2019]
ids = df["match_id"].unique()
seasonwise_runs = []
for id in ids:
  df2 = df[df["match_id"] == id]
  runs = max(df2["current_batsman_runs"])
  seasonwise_runs.append(runs)

seasonwise_runs

[14,
 76,
 49,
 26,
 70,
 4,
 43,
 51,
 126,
 30,
 40,
 6,
 69,
 37,
 51,
 36,
 21,
 36,
 4,
 15,
 6,
 57,
 33,
 107,
 4,
 33,
 0,
 6,
 31,
 0,
 5,
 1,
 54,
 45,
 51,
 77,
 7,
 2,
 3,
 13,
 1,
 21,
 29,
 109,
 21,
 8,
 14,
 15,
 79,
 7,
 3,
 21,
 77,
 61,
 0,
 15,
 1,
 7,
 51,
 66,
 8,
 13,
 4,
 40,
 44,
 2,
 32,
 8,
 58,
 0,
 65,
 61,
 6,
 12,
 55,
 43,
 34,
 59,
 90,
 4,
 53,
 57,
 21,
 28,
 91,
 9,
 58,
 61,
 4,
 24,
 17,
 81,
 52,
 6,
 58,
 13,
 90,
 74,
 59,
 0,
 92,
 24,
 48,
 11,
 46,
 52,
 73,
 18,
 28,
 93,
 69,
 88,
 71,
 101,
 14,
 17,
 74,
 51,
 51,
 70,
 59,
 40,
 91]

In [None]:
minmax_scalar = MinMaxScaler()
seasonwise_runs_scalar = minmax_scalar.fit_transform(seasonwise_runs)
seasonwise_runs_scalar

array([[0.11111111],
       [0.6031746 ],
       [0.38888889],
       [0.20634921],
       [0.55555556],
       [0.03174603],
       [0.34126984],
       [0.4047619 ],
       [1.        ],
       [0.23809524],
       [0.31746032],
       [0.04761905],
       [0.54761905],
       [0.29365079],
       [0.4047619 ],
       [0.28571429],
       [0.16666667],
       [0.28571429],
       [0.03174603],
       [0.11904762],
       [0.04761905],
       [0.45238095],
       [0.26190476],
       [0.84920635],
       [0.03174603],
       [0.26190476],
       [0.        ],
       [0.04761905],
       [0.24603175],
       [0.        ],
       [0.03968254],
       [0.00793651],
       [0.42857143],
       [0.35714286],
       [0.4047619 ],
       [0.61111111],
       [0.05555556],
       [0.01587302],
       [0.02380952],
       [0.1031746 ],
       [0.00793651],
       [0.16666667],
       [0.23015873],
       [0.86507937],
       [0.16666667],
       [0.06349206],
       [0.11111111],
       [0.119

In [None]:
generator = TimeseriesGenerator(seasonwise_runs_scalar, seasonwise_runs_scalar, length=3, batch_size=1)
print(len(seasonwise_runs), len(generator))

123 120


In [None]:
model_lstm = Sequential()
model_lstm.add(LSTM(128, activation="relu", return_sequences=True))
model_lstm.add(Dropout(0.2))
model_lstm.add(LSTM(128, activation="relu"))
model_lstm.add(Dropout(0.2))
model_lstm.add(Dense(32, activation="relu"))
model_lstm.add(Dropout(0.2))
model_lstm.add(Dense(1, activation="relu"))

In [None]:
model_lstm.compile(
    loss = "mse",
    optimizer="adam"
)

In [None]:
'''x = range(len(ids))
x = np.array(x)
x = x + 1
x = x.reshape((-1, 1))
seasonwise_runs = np.array(seasonwise_runs)
seasonwise_runs = seasonwise_runs.reshape((-1, 1))'''
model_lstm.fit(
    generator,
    epochs = 100
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7f7411465b90>

In [None]:
model_lstm.compile(
    optimizer = "adam",
    loss = CategoricalCrossentropy
)

In [None]:
trace = go.Line(
    x = np.arange(len(ids)) + 1,
    y = seasonwise_runs
)

Layout = go.Layout(
    title = "Runs in season 2019",
    xaxis = dict(title = "ids",
                 tickmode = "linear"),
    yaxis = dict(title = "runs"),
)

plot = go.Figure(
    data = [trace],
    layout = Layout
)

plot.show()


plotly.graph_objs.Line is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.scatter.Line
  - plotly.graph_objs.layout.shape.Line
  - etc.




In [None]:
seasons = sorted(data_merged["season"].unique())
#batsmen = data_merged["batsman"].unique()
seasonwise_runs = {}
for season in seasons:
  df = data_merged[data_merged["season"] == season]
  df1 = df

[2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019]