# ML on cricket data

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

sns.set(style='whitegrid')
data_dir='./datasets/'

In [2]:
df = pd.read_csv(data_dir+'data_full.csv')

In [3]:
df.head(2)

Unnamed: 0,date,team1,score1,wkts1,score1_at6,wkts1_at6,score1_at10,wkts1_at10,score1_at15,wkts1_at15,...,wkts2,score2_at6,wkts2_at6,score2_at10,wkts2_at10,score2_at15,wkts2_at15,toss_winner,toss_decision,winner
0,2005-02-17,New Zealand,214,5,58,4,89,4,141,5,...,10,52,2,92,3,123,7,Australia,bat,Australia
1,2005-06-13,England,179,8,50,2,93,2,124,5,...,10,31,7,65,7,79,10,England,bat,England


## Data Preparation
- Y will be win or loss so 0 and 1. (and it can only represented by win for 1 team)
- X will have score (run and wkt) at 6 score at 10, score at 15 and final score. one column for toss (0|1) (1 if won the toss by team1). one column for innings (0|1) ( 0 for first innings by team1 and 1 for second innings)

In [4]:
top_teams = ['Afghanistan', 'Australia', 'Bangladesh', 'England', 'India',
             'Ireland', 'New Zealand', 'Pakistan', 'South Africa', 'Sri Lanka', 'West Indies']

def prepare_data(dff, teams=top_teams):
    # remove the bottom teams from df
    bottom_team_index = [i for i in dff.index if (dff.team1[i] not in top_teams or dff.team2[i] not in top_teams)]
    df = dff.drop(index=bottom_team_index) # donot use inplace=True here
    
    df['toss1']=(df.toss_winner==df.team1).astype(int)
    df['toss2']=(df.toss_winner==df.team2).astype(int)
    df['inn1']=[0]*df.shape[0]
    df['inn2']=[1]*df.shape[0]
    df['win1']=(df.winner==df.team1).astype(int)
    df['win2']=(df.winner==df.team2).astype(int)
    
    data = df[["score1", "wkts1", "score1_at6", "wkts1_at6", "score1_at10", "wkts1_at10",
              "score1_at15", "wkts1_at15", "score2", "wkts2", "score2_at6", "wkts2_at6",
              "score2_at10", "wkts2_at10", "score2_at15", "wkts2_at15", "toss1", "inn1", "win1"]]
    
    return data

In [5]:
data = prepare_data(df)
print (data.shape)
data.head(1)

(452, 19)


Unnamed: 0,score1,wkts1,score1_at6,wkts1_at6,score1_at10,wkts1_at10,score1_at15,wkts1_at15,score2,wkts2,score2_at6,wkts2_at6,score2_at10,wkts2_at10,score2_at15,wkts2_at15,toss1,inn1,win1
0,214,5,58,4,89,4,141,5,170,10,52,2,92,3,123,7,0,0,0


In [26]:
from sklearn.model_selection import train_test_split

y = data["win1"]
X = data.drop("win1", axis=1)
(X_train, X_test, y_train, y_test) = train_test_split(X, y, test_size=0.2)
print (f"X_train.shape: {X_train.shape} X_test.shape: {X_test.shape}")

X_train.shape: (361, 18) X_test.shape: (91, 18)


In [27]:
from sklearn.model_selection import train_test_split

y = data["win1"]
#X = data.drop(["win1", axis=1)
to_drop = ["win1", "score1_at6", "wkts1_at6", "score1_at10", "wkts1_at10", "score1_at15", "wkts1_at15", "score2_at6", "wkts2_at6", "score2_at10", "wkts2_at10", "score2_at15", "wkts2_at15"]
X = data.drop(to_drop, axis=1)

(X_train, X_test, y_train, y_test) = train_test_split(X, y, test_size=0.2)
print (f"X_train.shape: {X_train.shape} X_test.shape: {X_test.shape}")


X_train.shape: (361, 6) X_test.shape: (91, 6)


## Model

In [28]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier


In [29]:
clf_dict={"LR" : LogisticRegression(max_iter=1000),
          "XGB" : XGBClassifier(),
          "ADB": AdaBoostClassifier(n_estimators = 100)}

def train(clf_name):
    
    model = clf_dict[clf_name]
    model.fit(X_train, y_train)
    y_preds_train = model.predict(X_train)
    y_preds_test = model.predict(X_test)

    acc_train = accuracy_score(y_train, y_preds_train)
    acc_test  = accuracy_score(y_test,  y_preds_test)

    return (acc_train, acc_test)

#clf = LogisticRegression(max_iter=1000)
#train(clf)

In [None]:
train(clf)

In [30]:


res = []
for clf in ["LR", "XGB", "ADB"]:
    (acc_train, acc_test) = train(clf)
    res.append([clf, acc_train, acc_test])
    

res_df = pd.DataFrame(res, columns=["model", "accuracy train ", "accuracy test"])
res_df


Unnamed: 0,model,accuracy train,accuracy test
0,LR,0.523546,0.483516
1,XGB,1.0,0.483516
2,ADB,0.750693,0.450549
