In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pylab import rcParams

rcParams["figure.figsize"] = (10, 6)
sns.set()

In [2]:
# importing filtered csv 

main_data = pd.read_csv("eda_feature_engineering.csv")

In [3]:
main_data.sample(5)

Unnamed: 0,id,inning,over,ball,total_runs,is_wicket,batting_team,bowling_team,final_total_runs,wickets,runs,last_5_over_wickets,last_5_over_runs,last_5_over_balls,venue,winner
119954,829803,1,1.6,13,1,0,Kings XI Punjab,Royal Challengers Bangalore,106,0,33,0,33,13,"Punjab Cricket Association Stadium, Mohali",Kings XI Punjab
29248,419114,2,10.5,67,6,0,Delhi Daredevils,Mumbai Indians,120,5,101,5,97,66,Feroz Shah Kotla,Mumbai Indians
58456,501271,2,9.1,56,1,0,Royal Challengers Bangalore,Chennai Super Kings,147,4,68,4,67,55,"MA Chidambaram Stadium, Chepauk",Chennai Super Kings
41546,419165,2,15.5,99,1,0,Mumbai Indians,Chennai Super Kings,146,5,105,5,105,98,Dr DY Patil Sports Academy,Chennai Super Kings
161003,1136606,2,14.5,89,1,0,Chennai Super Kings,Sunrisers Hyderabad,180,2,141,2,139,88,Maharashtra Cricket Association Stadium,Chennai Super Kings


In [4]:
main_data.shape

(193096, 16)

In [5]:
# here in current year 2021 some of team name has been changed and some of team are not playing
# let's drop and rename teams

main_data.batting_team.unique()

array(['Kolkata Knight Riders', 'Royal Challengers Bangalore',
       'Chennai Super Kings', 'Kings XI Punjab', 'Rajasthan Royals',
       'Delhi Daredevils', 'Mumbai Indians', 'Deccan Chargers',
       'Kochi Tuskers Kerala', 'Pune Warriors', 'Sunrisers Hyderabad',
       'Rising Pune Supergiants', 'Gujarat Lions',
       'Rising Pune Supergiant', 'Delhi Capitals'], dtype=object)

In [6]:
# only taking more than 5 over records of match

main_data = main_data.loc[main_data.over >= 5]

In [7]:
# here batting team and balling team values are same

teams_2021 = ["Royal Challengers Bangalore", "Kolkata Knight Riders", "Rajasthan Royals", "Punjab Kings", "Sunrisers Hyderabad", "Delhi Capitals", "Mumbai Indians", "Chennai Super Kings"]

# here some of the teams name changed in year 2021 is 
# Delhi Daredevils becames Delhi Capitals
# Kings XI Punjab becames Punjab Kings

# let's rename this teams with current name

main_data.replace({"Delhi Daredevils": "Delhi Capitals", "Kings XI Punjab": "Punjab Kings"}, inplace=True)

In [8]:
# let's get those teams values which are not in year 2021 IPL
ignoring_team = list(filter(lambda x: x not in teams_2021, main_data.batting_team.unique()))

In [9]:
# let's drop only those teams data which are not in year 2021

for i in ignoring_team:
    main_data.drop(main_data.loc[(main_data.batting_team == i) | (main_data.bowling_team == i)].index, axis=0, inplace=True)
    
main_data = main_data.reset_index(drop=True)

#### Let's verifying 2021 teams with batting_team, bowling_team and winner

In [10]:
len(teams_2021)

8

In [11]:
main_data.batting_team.nunique()

8

In [12]:
main_data.bowling_team.nunique()

8

In [13]:
main_data.winner.nunique()

8

In [14]:
teams_2021.sort()

In [15]:
teams_2021

['Chennai Super Kings',
 'Delhi Capitals',
 'Kolkata Knight Riders',
 'Mumbai Indians',
 'Punjab Kings',
 'Rajasthan Royals',
 'Royal Challengers Bangalore',
 'Sunrisers Hyderabad']

In [16]:
teams_2021 == np.sort(main_data.batting_team.unique())

array([ True,  True,  True,  True,  True,  True,  True,  True])

In [17]:
teams_2021 == np.sort(main_data.bowling_team.unique())

array([ True,  True,  True,  True,  True,  True,  True,  True])

In [18]:
teams_2021 == np.sort(main_data.winner.unique())

array([ True,  True,  True,  True,  True,  True,  True,  True])

### Feature Encoding

In [19]:
# here batting_team and bowling_team are in string format and ML model can not accept
# string values so let's encode it into numeric with the help of winning team


# here i'm going to do label encoding base on counting of highest winning team
# here i'm ordering team base on winning count and making dict using team name as key and encoded values

encoded_teams = {k:v for v, k in enumerate(main_data.winner.value_counts().sort_values().index, 0)}
encoded_teams


{'Sunrisers Hyderabad': 0,
 'Delhi Capitals': 1,
 'Rajasthan Royals': 2,
 'Royal Challengers Bangalore': 3,
 'Punjab Kings': 4,
 'Kolkata Knight Riders': 5,
 'Chennai Super Kings': 6,
 'Mumbai Indians': 7}

In [20]:
# let's create a new data frame and map this encoded values with batting and bowling teams

df1 = main_data.copy()


df1.batting_team = df1.batting_team.map(encoded_teams)
df1.bowling_team = df1.bowling_team.map(encoded_teams)

In [21]:
df1.head()

Unnamed: 0,id,inning,over,ball,total_runs,is_wicket,batting_team,bowling_team,final_total_runs,wickets,runs,last_5_over_wickets,last_5_over_runs,last_5_over_balls,venue,winner
0,335982,1,5.1,33,1,0,5,3,222,0,61,0,60,32,M Chinnaswamy Stadium,Kolkata Knight Riders
1,335982,1,5.2,34,0,1,5,3,222,1,61,1,60,33,M Chinnaswamy Stadium,Kolkata Knight Riders
2,335982,1,5.3,35,0,0,5,3,222,1,61,1,60,34,M Chinnaswamy Stadium,Kolkata Knight Riders
3,335982,1,5.4,36,0,0,5,3,222,1,61,1,60,35,M Chinnaswamy Stadium,Kolkata Knight Riders
4,335982,1,5.5,37,0,0,5,3,222,1,61,1,60,36,M Chinnaswamy Stadium,Kolkata Knight Riders


In [22]:
# let's check the type of the df1 

df1.dtypes

id                       int64
inning                   int64
over                   float64
ball                     int64
total_runs               int64
is_wicket                int64
batting_team             int64
bowling_team             int64
final_total_runs         int64
wickets                  int64
runs                     int64
last_5_over_wickets      int64
last_5_over_runs         int64
last_5_over_balls        int64
venue                   object
winner                  object
dtype: object

In [23]:
# venue column have object data types 
# so let's encoded venue column

# here i'm doing one hot encoding for veune
encoded_venue = pd.get_dummies(df1.venue, drop_first=True)
# here i drop first venue value for prevent dummy variable trap when all values are 0 it indiicates
# first column which value was Barabati Stadium

encoded_venue.sample(3)

Unnamed: 0,Brabourne Stadium,Buffalo Park,De Beers Diamond Oval,Dr DY Patil Sports Academy,Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium,Dubai International Cricket Stadium,Eden Gardens,Feroz Shah Kotla,Himachal Pradesh Cricket Association Stadium,Holkar Cricket Stadium,...,"Rajiv Gandhi International Stadium, Uppal","Sardar Patel Stadium, Motera",Sawai Mansingh Stadium,Shaheed Veer Narayan Singh International Stadium,Sharjah Cricket Stadium,Sheikh Zayed Stadium,St George's Park,Subrata Roy Sahara Stadium,SuperSport Park,Wankhede Stadium
33860,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28666,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
95026,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


### Goal
- Here my goal is to predict score of batting team in IPL match
- Here target variable is final_total_runs

### Feature Selection

In [24]:
# here i'm taking over, batting_team, bowling_team, wickets, runs, balls, last_5_over_wickets, last_5_over_runs, las_5_over_bals, venue feaures
# and final_total_runs target variable

df2 = df1[["over", "wickets", "runs", "last_5_over_wickets", "last_5_over_runs","batting_team", "bowling_team", "final_total_runs"]].copy()


df2.head()

Unnamed: 0,over,wickets,runs,last_5_over_wickets,last_5_over_runs,batting_team,bowling_team,final_total_runs
0,5.1,0,61,0,60,5,3,222
1,5.2,1,61,1,60,5,3,222
2,5.3,1,61,1,60,5,3,222
3,5.4,1,61,1,60,5,3,222
4,5.5,1,61,1,60,5,3,222


In [25]:
# here in df2 i don't have venue column for that i'm concatenate encoded_venue dataframe with df2

df2 = pd.concat([df2, encoded_venue], axis=1).copy()
df2.sample(3)


Unnamed: 0,over,wickets,runs,last_5_over_wickets,last_5_over_runs,batting_team,bowling_team,final_total_runs,Brabourne Stadium,Buffalo Park,...,"Rajiv Gandhi International Stadium, Uppal","Sardar Patel Stadium, Motera",Sawai Mansingh Stadium,Shaheed Veer Narayan Singh International Stadium,Sharjah Cricket Stadium,Sheikh Zayed Stadium,St George's Park,Subrata Roy Sahara Stadium,SuperSport Park,Wankhede Stadium
45603,10.4,4,49,4,49,0,2,136,0,0,...,1,0,0,0,0,0,0,0,0,0
3316,12.5,3,99,3,98,4,3,127,0,0,...,0,0,0,0,0,0,0,0,0,0
99842,12.2,5,83,5,83,7,0,149,0,0,...,0,0,0,0,1,0,0,0,0,0


### Model Building

In [26]:
# here let's creat feature matrix X and target vector y

X = df2.drop(columns="final_total_runs") # 38 features
y = df2.final_total_runs

In [27]:
# let's split the data into train and test part

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

(X_train.shape, y_train.shape), (X_test.shape, y_test.shape)

(((88046, 36), (88046,)), ((22012, 36), (22012,)))

In [28]:
# scaling

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [29]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor


algos = {
    "Linear Regression": {
        "model": LinearRegression(),
        "params": {}
    },
    "Ridge": {
        "model": Ridge(),
        "params": {
            "alpha":np.arange(0.1, 1, 0.01)
        }
    },
    "Lasso": {
        "model": Lasso(),
        "params": {
            "alpha": np.arange(0.1, 1, 0.01)
        }
    },
    "Decision Tree": {
        "model": DecisionTreeRegressor(),
        "params": {
            "criterion": ["mse", "friedman_mse"],
            "splitter": ["best", "random"],
            "max_depth": [1, 3, 5, 7, 9, 10, 11, 12, 14, 15, 18, 20, 25, 28, 30, 33, 38, 40],
            "min_samples_split": [2, 4, 6, 8, 10, 15, 20],
            "min_samples_leaf": [i for i in range(1, 11)],
            "max_leaf_nodes": [None] + [i for i in range(10, 91, 10)],
            "max_features": ["auto", "log2", "sqrt", None]            
        }
    },
    "Random Forest": {
        "model": RandomForestRegressor(),
        "params": {
            "n_estimators": [100, 200, 300],
            "criterion": ["mse", "friedman_mse"],
            "max_depth": [1, 3, 5, 7, 9, 10, 11, 12, 14, 15, 18, 20, 25, 28, 30, 33, 38, 40],
            "min_samples_split": [2, 4, 6, 8, 10, 15, 20],
            "min_samples_leaf": [i for i in range(1, 11)],
            "max_leaf_nodes": [None] + [i for i in range(10, 91, 10)],
            "max_features": ["auto", "log2", "sqrt", None]
        }
    },
    "Ada Boost": {
        "model": AdaBoostRegressor(),
        "params": {
            "n_estimators": [100, 200, 300],
            "learning_rate": np.arange(0.1, 1, 0.01),
            "loss": ['linear', 'square', 'exponential']
        }
    },
    "Gradient Boost": {
        "model": GradientBoostingRegressor(),
        "params": {
            "learning_rate": np.arange(0.1, 1, 0.01),
            "n_estimators": [100, 200, 300],
            "criterion": ['friedman_mse', 'mse'],
            "min_samples_split": [2, 4, 6, 8, 10, 15, 20],
            "min_samples_leaf": [i for i in range(1, 11)],
            "max_depth": [1, 3, 5, 7, 9, 10, 11, 12, 14, 15, 18, 20, 25, 28, 30, 33, 38, 40],
            "max_features": ["auto", "log2", "sqrt", None],
            "max_leaf_nodes": [None] + [i for i in range(10, 91, 10)],
            "alpha": np.arange(0.1, 1, 0.01)
        }
    }
}

In [30]:
from sklearn.model_selection import RandomizedSearchCV
import time

start_time = time.time()
best_model = {}
best_model_details = []

for model_name, values in algos.items():
    rscv = RandomizedSearchCV(values["model"], values["params"], cv=5, n_iter=15, n_jobs=-1, verbose=2, random_state=4)
    rscv.fit(X_train, y_train)
    print("---fitted---")
    best_model[model_name] = rscv
    best_model_details.append({"Model Name": model_name, "Best Score": rscv.best_score_, "Best Parameters": rscv.best_params_})
    print(model_name)


print("--------------------------------------------------------")
print(f"it takes {(time.time() - start_time) / 60} minutes")
print("--------------------------------------------------------")

Fitting 5 folds for each of 1 candidates, totalling 5 fits




---fitted---
Linear Regression
Fitting 5 folds for each of 15 candidates, totalling 75 fits
---fitted---
Ridge
Fitting 5 folds for each of 15 candidates, totalling 75 fits
---fitted---
Lasso
Fitting 5 folds for each of 15 candidates, totalling 75 fits
---fitted---
Decision Tree
Fitting 5 folds for each of 15 candidates, totalling 75 fits
---fitted---
Random Forest
Fitting 5 folds for each of 15 candidates, totalling 75 fits
---fitted---
Ada Boost
Fitting 5 folds for each of 15 candidates, totalling 75 fits
---fitted---
Gradient Boost
--------------------------------------------------------
it takes 19.30253322124481 minutes
--------------------------------------------------------


In [31]:
pd.set_option('display.max_colwidth', None)
pd.DataFrame(best_model_details)

Unnamed: 0,Model Name,Best Score,Best Parameters
0,Linear Regression,0.526094,{}
1,Ridge,0.526093,{'alpha': 0.19999999999999996}
2,Lasso,0.498689,{'alpha': 0.19999999999999996}
3,Decision Tree,0.502078,"{'splitter': 'best', 'min_samples_split': 2, 'min_samples_leaf': 3, 'max_leaf_nodes': 90, 'max_features': 'auto', 'max_depth': 20, 'criterion': 'friedman_mse'}"
4,Random Forest,0.671578,"{'n_estimators': 300, 'min_samples_split': 15, 'min_samples_leaf': 5, 'max_leaf_nodes': None, 'max_features': 'log2', 'max_depth': 20, 'criterion': 'mse'}"
5,Ada Boost,0.405421,"{'n_estimators': 300, 'loss': 'linear', 'learning_rate': 0.30999999999999994}"
6,Gradient Boost,0.845862,"{'n_estimators': 200, 'min_samples_split': 10, 'min_samples_leaf': 5, 'max_leaf_nodes': None, 'max_features': 'log2', 'max_depth': 14, 'learning_rate': 0.6799999999999997, 'criterion': 'mse', 'alpha': 0.7099999999999996}"


In [32]:
test_model = []

for model_name, model in best_model.items():
    test_model.append({"Model Name": model_name, "Test Score": model.score(X_test, y_test)})

pd.DataFrame(test_model)

Unnamed: 0,Model Name,Test Score
0,Linear Regression,0.526871
1,Ridge,0.526864
2,Lasso,0.498622
3,Decision Tree,0.502346
4,Random Forest,0.685654
5,Ada Boost,0.405239
6,Gradient Boost,0.868947


In [33]:
# let's calculate error using MSE, RMSE and MAE for Gradient Boost algorithm

from sklearn.metrics import mean_absolute_error, mean_squared_error

y_pred_train = best_model["Gradient Boost"].predict(X_train)
y_pred_test = best_model["Gradient Boost"].predict(X_test)

train_mae = mean_absolute_error(y_train, y_pred_train)
train_mse = mean_squared_error(y_train, y_pred_train)
train_rmse = np.sqrt(train_mse)

test_mae = mean_absolute_error(y_test, y_pred_test)
test_mse = mean_squared_error(y_test, y_pred_test)
test_rmse = np.sqrt(test_mse)

print("-------- Training Data Error --------")
print("Mean Absolute Error:", train_mae)
print("Mean Squared Error:", train_mse)
print("Root Mean Squared Error:", train_rmse)
print("--------------------------------------")
print()
print("-------- Testing Data Error --------")
print("Mean Absolute Error:", test_mae)
print("Mean Squared Error:", test_mse)
print("Root Mean Squared Error:", test_rmse)
print("--------------------------------------")

-------- Training Data Error --------
Mean Absolute Error: 1.7070478325908822
Mean Squared Error: 8.118174922093411
Root Mean Squared Error: 2.8492411133656996
--------------------------------------

-------- Testing Data Error --------
Mean Absolute Error: 6.585268495874344
Mean Squared Error: 112.7489552759139
Root Mean Squared Error: 10.618331096547795
--------------------------------------


#### Gradient Boost algorithm is performed well for this problem statement
- Gradient Boost give 86.89% accuracy on test dataset
- Here 6.58 Mean Absolute error on test dataset

In [34]:
# let's save gradient boost model and scaler in the form of pickle file
# encoded_team and feature columns as JSON file for prediction purpose which i use in web app 

import pickle
import json

with open("model.pickle", "wb") as f:
    pickle.dump(best_model["Gradient Boost"], f)
    
with open("scaler.pickle", "wb") as f:
    pickle.dump(scaler, f)
    
with open("encodedteams.json", "w") as f:
    json.dump(encoded_teams, f)
    
with open("columns.json", "w") as f:
    json.dump({"columns": list(X.columns)}, f)

[CV] END ..........................alpha=0.23999999999999994; total time=   0.1s
[CV] END ...........................alpha=0.3699999999999999; total time=   0.1s
[CV] END ..........................alpha=0.23999999999999994; total time=   2.0s
[CV] END ...........................alpha=0.8599999999999995; total time=   0.6s
[CV] END ...........................alpha=0.7099999999999996; total time=   0.7s
[CV] END ...........................alpha=0.5299999999999998; total time=   1.2s
[CV] END ..........................alpha=0.29999999999999993; total time=   2.0s
[CV] END ...........................alpha=0.3699999999999999; total time=   1.8s
[CV] END criterion=friedman_mse, max_depth=5, max_features=sqrt, max_leaf_nodes=40, min_samples_leaf=3, min_samples_split=10, splitter=random; total time=   0.1s
[CV] END criterion=friedman_mse, max_depth=10, max_features=auto, max_leaf_nodes=40, min_samples_leaf=2, min_samples_split=15, splitter=best; total time=   0.5s
[CV] END criterion=friedman_m