In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("ipl.csv")
df.head()

Unnamed: 0,mid,date,venue,bat_team,bowl_team,batsman,bowler,runs,wickets,overs,runs_last_5,wickets_last_5,striker,non-striker,total
0,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,SC Ganguly,P Kumar,1,0,0.1,1,0,0,0,222
1,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,1,0,0.2,1,0,0,0,222
2,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.2,2,0,0,0,222
3,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.3,2,0,0,0,222
4,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.4,2,0,0,0,222


## About Problem Statement
- Here we have the ipl teams and here we are going to predict score on base of more than 5 over and run and wickets are occured on that particular over and bating team, bowling team and venue.

In [3]:
# first let's remove unnecessary variables

df.drop(columns = ["mid", "date", "batsman", "bowler", "striker", "non-striker"], inplace = True)

In [4]:
# so we have to predict runs so our target variable is runs

# now we have lots of teams of IPL from 2008 to 2018 but here we take only that team which currently playing

current_team = ['Kolkata Knight Riders', 'Chennai Super Kings', 'Rajasthan Royals',
                    'Mumbai Indians', 'Kings XI Punjab', 'Royal Challengers Bangalore',
                    'Delhi Daredevils', 'Sunrisers Hyderabad']

df = df.loc[(df.bat_team.isin(['Kolkata Knight Riders', 'Chennai Super Kings', 'Rajasthan Royals',
                    'Mumbai Indians', 'Kings XI Punjab', 'Royal Challengers Bangalore',
                    'Delhi Daredevils', 'Sunrisers Hyderabad'])) & df.bowl_team.isin(['Kolkata Knight Riders', 'Chennai Super Kings', 'Rajasthan Royals',
                    'Mumbai Indians', 'Kings XI Punjab', 'Royal Challengers Bangalore',
                    'Delhi Daredevils', 'Sunrisers Hyderabad'])]

In [5]:
df.head()

Unnamed: 0,venue,bat_team,bowl_team,runs,wickets,overs,runs_last_5,wickets_last_5,total
0,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,1,0,0.1,1,0,222
1,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,1,0,0.2,1,0,222
2,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,2,0,0.2,2,0,222
3,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,2,0,0.3,2,0,222
4,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,2,0,0.4,2,0,222


In [6]:
# for prediction we want at least 5 overs data so that's here we train our model on above 5 overs data
df = df[df.overs > 5]

In [7]:
# let's convert text data into lower case

df.venue = df.venue.str.lower()
df.bat_team = df.bat_team.str.lower()
df.bowl_team = df.bowl_team.str.lower()

In [8]:
# let's check bats and bowl team

print(df.bat_team.unique())
print(df.bowl_team.unique())

['kolkata knight riders' 'chennai super kings' 'rajasthan royals'
 'mumbai indians' 'kings xi punjab' 'royal challengers bangalore'
 'delhi daredevils' 'sunrisers hyderabad']
['royal challengers bangalore' 'kings xi punjab' 'delhi daredevils'
 'rajasthan royals' 'mumbai indians' 'chennai super kings'
 'kolkata knight riders' 'sunrisers hyderabad']


In [9]:
# In 2020 IPL "delhi daredevils" becomes "delhi capitals" so let's replace it

df.bat_team.replace("delhi daredevils", "delhi capitals", inplace = True)
df.bowl_team.replace("delhi daredevils", "delhi capitals", inplace = True)

In [10]:
# let's check bats and bowl team

print(df.bat_team.unique())
print(df.bowl_team.unique())

['kolkata knight riders' 'chennai super kings' 'rajasthan royals'
 'mumbai indians' 'kings xi punjab' 'royal challengers bangalore'
 'delhi capitals' 'sunrisers hyderabad']
['royal challengers bangalore' 'kings xi punjab' 'delhi capitals'
 'rajasthan royals' 'mumbai indians' 'chennai super kings'
 'kolkata knight riders' 'sunrisers hyderabad']


In [11]:
df.bat_team.unique() in df.bowl_team.unique()

True

In [12]:
# let's get unique teams and venue

venue = df.venue.unique()
team = df.bat_team.unique()

print("Venue: ", venue)
print("\n\n Teams: ", team)

Venue:  ['m chinnaswamy stadium' 'punjab cricket association stadium, mohali'
 'feroz shah kotla' 'wankhede stadium' 'sawai mansingh stadium'
 'ma chidambaram stadium, chepauk' 'eden gardens'
 'dr dy patil sports academy' 'newlands' "st george's park" 'kingsmead'
 'supersport park' 'buffalo park' 'new wanderers stadium'
 'de beers diamond oval' 'outsurance oval' 'brabourne stadium'
 'sardar patel stadium, motera'
 'himachal pradesh cricket association stadium'
 'subrata roy sahara stadium' 'rajiv gandhi international stadium, uppal'
 'shaheed veer narayan singh international stadium'
 'jsca international stadium complex' 'sheikh zayed stadium'
 'sharjah cricket stadium' 'dubai international cricket stadium'
 'barabati stadium' 'maharashtra cricket association stadium'
 'dr. y.s. rajasekhara reddy aca-vdca cricket stadium'
 'punjab cricket association is bindra stadium, mohali'
 'holkar cricket stadium']


 Teams:  ['kolkata knight riders' 'chennai super kings' 'rajasthan royals'
 'mumb

In [13]:
# let's encode text data using one hot encoding

# here i' going to encode bat_team and bowl_team using get_dummies
dummy_team = pd.get_dummies(df.drop(columns = "venue"), drop_first = True)
# in dummy team data frame we removed first column for preventing dumy variable trap
# and removed teams are bat_team_chennai super kings and bowl_team_chennai super kings 

# now let's encode the venue
dummy_venue =  pd.get_dummies(df.venue, drop_first = True)
# in venune dummy first column which is barabati stadium is removed bcz preventnig dummy variable trap

# now let's remove text columns from df and concat one hot encoded dummy columns

df = dummy_team

# df = pd.concat([df, dummy_team], axis = 1)
df = pd.concat([df, dummy_venue], axis = 1)

# so our final data for builiding model is look like this
df.head()

Unnamed: 0,runs,wickets,overs,runs_last_5,wickets_last_5,total,bat_team_delhi capitals,bat_team_kings xi punjab,bat_team_kolkata knight riders,bat_team_mumbai indians,...,"rajiv gandhi international stadium, uppal","sardar patel stadium, motera",sawai mansingh stadium,shaheed veer narayan singh international stadium,sharjah cricket stadium,sheikh zayed stadium,st george's park,subrata roy sahara stadium,supersport park,wankhede stadium
32,61,0,5.1,59,0,222,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
33,61,1,5.2,59,1,222,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
34,61,1,5.3,59,1,222,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
35,61,1,5.4,59,1,222,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
36,61,1,5.5,58,1,222,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
df.columns

Index(['runs', 'wickets', 'overs', 'runs_last_5', 'wickets_last_5', 'total',
       'bat_team_delhi capitals', 'bat_team_kings xi punjab',
       'bat_team_kolkata knight riders', 'bat_team_mumbai indians',
       'bat_team_rajasthan royals', 'bat_team_royal challengers bangalore',
       'bat_team_sunrisers hyderabad', 'bowl_team_delhi capitals',
       'bowl_team_kings xi punjab', 'bowl_team_kolkata knight riders',
       'bowl_team_mumbai indians', 'bowl_team_rajasthan royals',
       'bowl_team_royal challengers bangalore',
       'bowl_team_sunrisers hyderabad', 'brabourne stadium', 'buffalo park',
       'de beers diamond oval', 'dr dy patil sports academy',
       'dr. y.s. rajasekhara reddy aca-vdca cricket stadium',
       'dubai international cricket stadium', 'eden gardens',
       'feroz shah kotla', 'himachal pradesh cricket association stadium',
       'holkar cricket stadium', 'jsca international stadium complex',
       'kingsmead', 'm chinnaswamy stadium', 'ma chidamba

In [15]:
dummy_team

Unnamed: 0,runs,wickets,overs,runs_last_5,wickets_last_5,total,bat_team_delhi capitals,bat_team_kings xi punjab,bat_team_kolkata knight riders,bat_team_mumbai indians,bat_team_rajasthan royals,bat_team_royal challengers bangalore,bat_team_sunrisers hyderabad,bowl_team_delhi capitals,bowl_team_kings xi punjab,bowl_team_kolkata knight riders,bowl_team_mumbai indians,bowl_team_rajasthan royals,bowl_team_royal challengers bangalore,bowl_team_sunrisers hyderabad
32,61,0,5.1,59,0,222,0,0,1,0,0,0,0,0,0,0,0,0,1,0
33,61,1,5.2,59,1,222,0,0,1,0,0,0,0,0,0,0,0,0,1,0
34,61,1,5.3,59,1,222,0,0,1,0,0,0,0,0,0,0,0,0,1,0
35,61,1,5.4,59,1,222,0,0,1,0,0,0,0,0,0,0,0,0,1,0
36,61,1,5.5,58,1,222,0,0,1,0,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75884,106,9,18.1,29,4,107,0,0,1,0,0,0,0,0,0,0,1,0,0,0
75885,107,9,18.2,29,4,107,0,0,1,0,0,0,0,0,0,0,1,0,0,0
75886,107,9,18.3,28,4,107,0,0,1,0,0,0,0,0,0,0,1,0,0,0
75887,107,9,18.4,24,4,107,0,0,1,0,0,0,0,0,0,0,1,0,0,0


In [16]:
# let's create feature matrix X and response vector y

X = df.drop(columns = "total")
y = df.total

In [17]:
# let's divide our data into training and testing data set

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

#### it's model building time

In [18]:
# here is regression problem 
# so here we check linear, ridge and lasso regression model for building our model and find best model among them

# Linear Regression

from sklearn.linear_model import LinearRegression

linear_regression = LinearRegression()
linear_regression.fit(X_train, y_train)
linear_regression.score(X_test, y_test)

0.6806234438739456

In [20]:
# let's use Ridge, Lasso and random forest regression if there is any overfitting are available Ridge and Lasso regression will
# removed it

# here we are going to hyper parameter tuning for our ridge and lasso algorithm

from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor

algos = {
    "Ridge": {
        "model": Ridge(),
        "params": {
            "alpha": [1e-15, 1e-10, 1e-8, 1e-3, 1e-2, 1, 5, 10, 20, 30 , 40]
        }
    },
    "Lasso": {
        "model": Lasso(),
        "params": {
            "alpha": [1e-15, 1e-10, 1e-8, 1e-3, 1e-2, 1, 5, 10, 20, 30, 40]
        }
    },
    "Random Forest": {
        "model": RandomForestRegressor(),
        "params": {
            "n_estimators": [int(x) for x in np.linspace(100, 1200, 12)],
            "max_features": ["auto", "sqrt"],
            "max_depth": [int(x) for x in np.linspace(5, 30, 6)],
            "min_samples_split": [2, 5, 10, 15, 100],
            "min_samples_leaf": [1, 2, 5, 10]
        }
    }
}

best_models = {}
score = []
for model_name, values in algos.items():
    rscv = RandomizedSearchCV(values["model"], values["params"], cv = 5, n_jobs = -1)
    rscv.fit(X_train, y_train)
    best_models[model_name] = rscv
    score.append({
        "Model": model_name,
        "Best Parameters": rscv.best_params_,
        "Best Score": rscv.best_score_
    })
    
pd.DataFrame(score)

  model = cd_fast.enet_coordinate_descent(


Unnamed: 0,Model,Best Parameters,Best Score
0,Ridge,{'alpha': 0.01},0.673266
1,Lasso,{'alpha': 1e-08},0.673266
2,Random Forest,"{'n_estimators': 400, 'min_samples_split': 2, ...",0.856804


In [21]:
# now let's test ridge lasso and random forest regressor model on our test data set

print("Ridge: ", best_models["Ridge"].score(X_test, y_test))
print("Lasso: ", best_models["Lasso"].score(X_test, y_test))
print("Random Forest", best_models["Random Forest"].score(X_test, y_test))

Ridge:  0.6806236768292447
Lasso:  0.6806234448109459
Random Forest 0.8813725189065688


In [25]:
# here Random Forest Regression give best result with 88.12% accuracy and best parameters are
# {'n_estimators': 400,
#  'min_samples_split': 2,
#  'min_samples_leaf': 5,
#  'max_features': 'sqrt',
#  'max_depth': 25}

# so we take Random Forest Regression for build this model
import time

start_time = time.time()
final_model = RandomForestRegressor(n_estimators = 400, min_samples_split = 2, min_samples_leaf = 5,
                                    max_depth = 25, max_features = "sqrt")
final_model.fit(X, y)

print("Random Forest Regression take ", time.time() - start_time, " to train our model.")

Random Forest Regression take  18.861131191253662  to train our model.


In [42]:
# let's make a prediction function for taking user input
import numpy as np
def predict_score(runs, wickets, overs, runs_last_5, wickets_last_5, bat_team, bowl_team , venue):
    X_pred = np.zeros(len(X.columns))
    
    if bat_team != "chennai super kings":
        # because we removed first columns for prevent dummy variable trap
        # and first column of bat_team was chennai super kings
        
        bat_team_index = np.where(X.columns == "bat_team_" + bat_team)[0][0]
    
    if bowl_team != "chennai super kings":
        # because we removed first columns for prevent dummy variable trap
        # and first column of bowl_team was chennai super kings
        
        bowl_team_index = np.where(X.columns == "bowl_team_" + bowl_team)[0][0]
        
    if venue != "barabati stadium":
        # because we removed first columns for prevent dummy variable trap
        # and first column of venue was barabati stadium
        venue_index = np.where(X.columns == venue)[0][0]
    
    numeric_columns = [runs, wickets, overs, runs_last_5, wickets_last_5]
    for i in range(5):
        X_pred[i] = numeric_columns[i]
        
    
    result = final_model.predict([X_pred])
    return result
    

In [44]:
predict_score(36, 2, 7.3, 32, 4, "delhi capitals", "mumbai indians", "m chinnaswamy stadium")

array([153.17778711])

In [28]:
# let's export some useful stuff for finding prediction on frontend

# let's save model
import pickle

with open("ipl.pickle", "wb")as f:
    pickle.dump(final_model, f)

In [29]:
# let's save columns, venue, team 
# we save it 3 into json file

import json

features = {"columns": X.columns.to_list(), "team": list(team), "venue": list(venue)}

with open("features.json", "w") as f:
    json.dump(features, f)