In [1]:
import numpy as np
import pandas as pd


In [2]:
# Load the PSL "most sixes in an innings" table; the path is relative to the
# notebook's working directory. The bare `df` below renders the frame as the cell output.
df = pd.read_csv(filepath_or_buffer='Most sixes in an innings in PSL.csv')
df

Unnamed: 0,Player,Runs,Balls,4s,6s,SR,Team,Opposition,Ground,Match Date
0,BR Dunk,99*,40,3,12,247.5,Qalandars,v Kings,Lahore,8 Mar 2020
1,BR Dunk,93,43,3,10,216.27,Qalandars,v Gladiators,Lahore,3 Mar 2020
2,Umar Akmal,93,40,6,8,232.5,Qalandars,v Gladiators,Dubai (DSC),8 Feb 2016
3,Sharjeel Khan,117,62,12,8,188.7,United,v Zalmi,Dubai (DSC),21 Feb 2016
4,KP Pietersen,88*,42,3,8,209.52,Gladiators,v Qalandars,Sharjah,18 Feb 2017
5,Kamran Akmal,77,27,5,8,285.18,Zalmi,v Kings,Lahore,21 Mar 2018
6,CA Ingram,127*,59,12,8,215.25,Kings,v Gladiators,Sharjah,24 Feb 2019
7,CA Lynn,113*,55,12,8,205.45,Qalandars,v Sultans,Lahore,15 Mar 2020
8,SR Watson,79,47,4,7,168.08,United,v Qalandars,Sharjah,10 Feb 2016
9,Sharjeel Khan,79*,43,5,7,183.72,United,v Qalandars,Sharjah,10 Feb 2016


In [3]:
# Count missing values per column (`isna` is the canonical name; `isnull` is its alias).
df.isna().sum()

Player        0
Runs          0
Balls         0
4s            0
6s            0
SR            0
Team          0
Opposition    0
Ground        0
Match Date    0
dtype: int64

In [4]:

# Some scores carry an asterisk suffix (e.g. '99*'); strip it so the column can be
# cast to integers. Casting to str first keeps this cell safe to re-run.
runs_as_text = df['Runs'].astype(str)
df['Runs'] = runs_as_text.str.replace('*', '', regex=False).astype(int)

# Label an innings True when the runs scored from sixes exceed half of its total runs.
runs_from_sixes = df['6s'] * 6
df['mostly_sixes'] = runs_from_sixes > (0.5 * df['Runs'])

# NOTE(review): the label is computed from '6s' and 'Runs', and '6s' is also a
# feature below — confirm this overlap between features and target is intended.
features = df[['Balls', '4s', '6s', 'SR']]
target = df['mostly_sixes']

# Preview the cleaned columns, the feature matrix and the target side by side.
(
    df[['Runs', '6s', 'mostly_sixes']].head(),
    features.head(),
    target.head(),
)


(   Runs  6s  mostly_sixes
 0    99  12          True
 1    93  10          True
 2    93   8          True
 3   117   8         False
 4    88   8          True,
    Balls  4s  6s      SR
 0     40   3  12  247.50
 1     43   3  10  216.27
 2     40   6   8  232.50
 3     62  12   8  188.70
 4     42   3   8  209.52,
 0     True
 1     True
 2     True
 3    False
 4     True
 Name: mostly_sixes, dtype: bool)

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=42)

# Random forest with default hyperparameters, seeded so repeated runs build the same trees.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Keep both results in variables so later cells can reuse them.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Print instead of echoing an `(accuracy, report)` tuple: the tuple repr shows the
# multi-line report as one string full of literal '\n' escapes, which is unreadable.
print(f"Accuracy: {accuracy:.4f}")
print(report)


(0.7272727272727273,
 '              precision    recall  f1-score   support\n\n       False       0.60      0.75      0.67         4\n        True       0.83      0.71      0.77         7\n\n    accuracy                           0.73        11\n   macro avg       0.72      0.73      0.72        11\nweighted avg       0.75      0.73      0.73        11\n')

In [6]:
# Summarize the held-out performance of the classifier in a small Plotly table.
# Every value is derived from y_test / y_pred rather than typed in by hand, so the
# table cannot drift from the model that was actually evaluated.
import plotly.graph_objects as go

test_accuracy = accuracy_score(y_test, y_pred)

# `output_dict=True` returns the classification report as a nested dict; the
# 'weighted avg' entry averages the per-class scores weighted by class support.
report_scores = classification_report(y_test, y_pred, output_dict=True)
weighted_avg = report_scores['weighted avg']

metric_names = ['Accuracy', 'Precision', 'Recall', 'F1']
metric_values = [
    test_accuracy,
    weighted_avg['precision'],
    weighted_avg['recall'],
    weighted_avg['f1-score'],
]
# Two decimals keeps the table readable.
metric_values = [round(float(value), 2) for value in metric_values]

fig = go.Figure(data=[go.Table(
    header=dict(values=['Metric', 'Value']),
    cells=dict(values=[metric_names, metric_values]))
])

fig.show()