In [1]:
# Importing standard libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# We want our plots to appear inside the notebook
%matplotlib inline   

# Models from Scikit-Learn
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

# Models Evaluation
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import plot_roc_curve

In [2]:
df = pd.read_csv("verify_data_v2.csv")
df

Unnamed: 0.1,Unnamed: 0,ID,SEASON,CITY,CITY_Corrrected,TEAM1,TEAM1_MODEL,TEAM2,TEAM2_MODEL,TOSS_WINNER,...,TOSS_DECISION,RESULT,WINNER,WIN_BY_RUNS,TOSS_WINNER_MODEL,WINNER_MODEL,TOSS_DECISION_MODEL,CITY_MODEL,WIN_RATE_TEAM1,WIN_RATE_TEAM2
0,0,60,2008,Bangalore,RCB,KKR,1,RCB,0,RCB,...,field,normal,KKR,140,0,1,0,0,0.000000,0.000000
1,1,61,2008,Chandigarh,KXIP,CSK,1,KXIP,0,CSK,...,bat,normal,CSK,33,1,1,1,0,0.000000,0.000000
2,2,62,2008,Delhi,DD,DD,1,RR,0,RR,...,bat,normal,DD,0,0,1,1,1,0.000000,0.000000
3,3,63,2008,Mumbai,MI,RCB,1,MI,0,MI,...,bat,normal,RCB,0,0,1,1,0,0.000000,0.000000
4,4,64,2008,Kolkata,KKR,KKR,1,SRH,0,SRH,...,bat,normal,KKR,0,0,1,1,1,1.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
742,742,11346,2019,Mohali,KXIP,CSK,1,KXIP,0,KXIP,...,field,normal,KXIP,0,0,0,0,0,0.622642,0.465517
743,743,11347,2019,Mumbai,MI,KKR,1,MI,0,MI,...,field,normal,MI,0,0,0,0,0,0.519774,0.576087
744,744,11412,2019,Chennai,CSK,CSK,1,MI,0,CSK,...,bat,normal,MI,0,1,0,1,1,0.618750,0.578378
745,745,11413,2019,Visakhapatnam,Visakhapatnam,SRH,1,DD,0,DD,...,field,normal,DD,0,0,0,0,2,0.478022,0.432749


In [3]:
df.drop(["Unnamed: 0", "ID", "TEAM1", "TEAM2", "CITY", "TOSS_DECISION", "WIN_BY_RUNS", "CITY_Corrrected", "TOSS_WINNER", "SEASON", "WINNER"], axis=1, inplace=True)
df

Unnamed: 0,TEAM1_MODEL,TEAM2_MODEL,TEAM1_STRENGTH,TEAM1_CUMU_STRENGTH,TEAM2_STRENGTH,TEAM2_CUMU_STRENGTH,RESULT,TOSS_WINNER_MODEL,WINNER_MODEL,TOSS_DECISION_MODEL,CITY_MODEL,WIN_RATE_TEAM1,WIN_RATE_TEAM2
0,1,0,260.615385,221.184732,226.714286,279.914873,normal,0,1,0,0,0.000000,0.000000
1,1,0,276.687500,258.815972,311.466667,229.620875,normal,1,1,1,0,0.000000,0.000000
2,1,0,300.785714,301.876082,341.562500,248.544305,normal,0,1,1,1,0.000000,0.000000
3,1,0,226.714286,279.914873,289.785714,333.847403,normal,0,1,1,0,0.000000,0.000000
4,1,0,260.615385,221.184732,265.928571,248.200595,normal,0,1,1,1,1.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
742,1,0,331.588235,305.500955,308.142857,257.462276,normal,0,0,0,0,0.622642,0.465517
743,1,0,309.357143,287.215136,333.875000,295.880335,normal,0,0,0,0,0.519774,0.576087
744,1,0,331.588235,305.500955,333.875000,295.880335,normal,1,0,1,1,0.618750,0.578378
745,1,0,320.400000,301.054683,328.312500,277.032292,normal,0,0,0,2,0.478022,0.432749


In [4]:
df['RESULT'] = df['RESULT'].map({'normal': 1, 'tie': 0})
df

Unnamed: 0,TEAM1_MODEL,TEAM2_MODEL,TEAM1_STRENGTH,TEAM1_CUMU_STRENGTH,TEAM2_STRENGTH,TEAM2_CUMU_STRENGTH,RESULT,TOSS_WINNER_MODEL,WINNER_MODEL,TOSS_DECISION_MODEL,CITY_MODEL,WIN_RATE_TEAM1,WIN_RATE_TEAM2
0,1,0,260.615385,221.184732,226.714286,279.914873,1,0,1,0,0,0.000000,0.000000
1,1,0,276.687500,258.815972,311.466667,229.620875,1,1,1,1,0,0.000000,0.000000
2,1,0,300.785714,301.876082,341.562500,248.544305,1,0,1,1,1,0.000000,0.000000
3,1,0,226.714286,279.914873,289.785714,333.847403,1,0,1,1,0,0.000000,0.000000
4,1,0,260.615385,221.184732,265.928571,248.200595,1,0,1,1,1,1.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
742,1,0,331.588235,305.500955,308.142857,257.462276,1,0,0,0,0,0.622642,0.465517
743,1,0,309.357143,287.215136,333.875000,295.880335,1,0,0,0,0,0.519774,0.576087
744,1,0,331.588235,305.500955,333.875000,295.880335,1,1,0,1,1,0.618750,0.578378
745,1,0,320.400000,301.054683,328.312500,277.032292,1,0,0,0,2,0.478022,0.432749


In [5]:
X = df.drop("WINNER_MODEL", axis=1)
y = df["WINNER_MODEL"]

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=12)

# Naive Bayes

In [7]:
clf1 = GaussianNB()
clf1.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [8]:
clf1.score(X_train, y_train)

0.644891122278057

In [9]:
clf1.score(X_test, y_test)

0.6

# Random Forest Classifier

In [10]:
clf2 = RandomForestClassifier()
clf2.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [11]:
clf2.score(X_train, y_train)

1.0

In [12]:
clf2.score(X_test, y_test)

0.54

# KNeighbors Classifier

In [13]:
clf3 = KNeighborsClassifier()
clf3.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [14]:
clf3.score(X_train, y_train)

0.7269681742043551

In [15]:
clf3.score(X_test, y_test)

0.6133333333333333

# Logistic Regression

In [16]:
clf4 = LogisticRegression()
clf4.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [17]:
clf4.score(X_train, y_train)

0.6314907872696818

In [18]:
clf4.score(X_test, y_test)

0.5866666666666667

## Hyperparameter tuning on RFC

In [19]:
# Create a hyperparameter grid for RandomForestClassifier
rf_grid = {"n_estimators": np.arange(10, 1000, 20),
           "max_depth": [None, 3, 5, 10, 15, 20],
           "min_samples_split": np.arange(2, 20, 2),
           "min_samples_leaf": np.arange(1, 20, 2)}

In [20]:
# Setup random seed
np.random.seed(12)

# Setup random hyperparameter search for RandomForestClassifier
rs_rf = RandomizedSearchCV(RandomForestClassifier(),
                           param_distributions=rf_grid,
                           cv=5,
                           n_iter=100,
                           verbose=True,
                           n_jobs=-1)

# Fit the random hyperparameter search model for RandomForestClassifier()
rs_rf.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   30.5s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  1.7min finished


RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
               

In [21]:
rs_rf.best_params_

{'n_estimators': 970,
 'min_samples_split': 14,
 'min_samples_leaf': 1,
 'max_depth': 3}

In [23]:
rs_rf.score(X_train, y_train)

0.7085427135678392

In [24]:
rs_rf.score(X_test, y_test)

0.58

In [27]:
# Setup random seed
np.random.seed(12)

rf_gs_grid = {"n_estimators": np.arange(900, 1000, 20),
           "max_depth": [None, 3, 5, 8, 10, 12, 20],
           "min_samples_split": [12, 14, 16, 18],
           "min_samples_leaf": [1, 3, 5, 7, 9, 11]}

gs_rfc = GridSearchCV(RandomForestClassifier(),
                          param_grid=rf_gs_grid,
                          cv=5,
                          verbose=True,
                          n_jobs=-1)

# Fit grid hyperparameter search model
gs_rfc.fit(X_train, y_train)

Fitting 5 folds for each of 840 candidates, totalling 4200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   15.9s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  6.1min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  9.2min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed: 13.2min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed: 18.2min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed: 24.5min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed: 31.3min
[Parallel(n_jobs=-1)]: Done 4200 out of 4200 | elapsed: 33.2min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

In [28]:
gs_rfc.best_params_

{'max_depth': 3,
 'min_samples_leaf': 7,
 'min_samples_split': 14,
 'n_estimators': 940}

In [29]:
gs_rfc.score(X_train, y_train)

0.7102177554438861

In [30]:
gs_rfc.score(X_test, y_test)

0.58

## Hyperparameter tuning on LogisticRegression

In [31]:
# Create a hyperparameter grid for LogisticRegression
log_reg_grid = {"C": np.logspace(-4, 4, 20),
                "solver": ["liblinear"]}

In [32]:
# Tune LogisticRegression
np.random.seed(12)

# Setup random hyperparameeter search for LogisticRegression
rs_log_reg = RandomizedSearchCV(LogisticRegression(),
                                param_distributions=log_reg_grid,
                                cv=5,
                                n_iter=20,
                                verbose=True)

# Fit random hyperparameter search model for LogisticRegression
rs_log_reg.fit(X_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.3s finished


RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=LogisticRegression(C=1.0, class_weight=None,
                                                dual=False, fit_intercept=True,
                                                intercept_scaling=1,
                                                l1_ratio=None, max_iter=100,
                                                multi_class='auto', n_jobs=None,
                                                penalty='l2', random_state=None,
                                                solver='lbfgs', tol=0.0001,
                                                verbose=0, warm_start=False),
                   iid='deprecated', n_iter=20, n_jobs=None,
                   param_distributions={'C':...
       4.83293024e-03, 1.27427499e-02, 3.35981829e-02, 8.85866790e-02,
       2.33572147e-01, 6.15848211e-01, 1.62377674e+00, 4.28133240e+00,
       1.12883789e+01, 2.97635144e+01, 7.84759970e+01, 2.06913808e+02,
       5.45559478e+0

In [33]:
rs_log_reg.best_params_

{'solver': 'liblinear', 'C': 4.281332398719396}

In [34]:
rs_log_reg.score(X_train, y_train)

0.6348408710217756

In [35]:
rs_log_reg.score(X_test, y_test)

0.58

In [36]:
# Different hyperparameters for our LogisticRegression model
log_reg_grid = {"C": np.logspace(-4, 4, 30),
                "solver": ["liblinear"]}

# Setup grid hyperparameter search for LogisticRegression
gs_log_reg = GridSearchCV(LogisticRegression(),
                          param_grid=log_reg_grid,
                          cv=5,
                          verbose=True)

# Fit grid hyperparameter search model
gs_log_reg.fit(X_train, y_train)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    0.5s finished


GridSearchCV(cv=5, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': array([1.00000000e-04, 1.8...
       2.04335972e-01, 3.85662042e-01, 7.27895384e-01, 1.37382380e+00,
       2.59294380e+00, 4.89390092e+00, 9.23670857e+00, 1.74332882e+01,
       3.29034456e+01, 6.21016942e+01, 1.17210230e+02, 2.21221629e+02,
       4.17531894e+02, 7.88046282e+02, 1.48735211e+03, 2.80721620e+03,
       5

In [37]:
gs_log_reg.best_params_

{'C': 4.893900918477489, 'solver': 'liblinear'}

In [38]:
gs_log_reg.score(X_train, y_train)

0.6314907872696818

In [39]:
gs_log_reg.score(X_test, y_test)

0.58