In [1]:
# Import our dependencies
from pathlib import Path

import pandas as pd

from imblearn.ensemble import BalancedRandomForestClassifier, EasyEnsembleClassifier
from imblearn.metrics import classification_report_imbalanced
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import StandardScaler,OneHotEncoder

## Model Starts

In [2]:
# Read the match statistics CSV and drop its 'index' column in one chained step.
file_dir = 'Data/DB/'
data = (
    pd.read_csv(f'{file_dir}match_stats_random.csv', low_memory=False)
    .drop(columns='index')
)
data.head()

Unnamed: 0,hand,age,ace,df,svpt,svGms,bpSaved,bpFaced,rank,result,1stIn(%),1stWon(%),2ndWon(%)
0,0,27.1,5,3,127,17,8,15,35,1,72,61,31
1,0,27.5,2,3,49,8,8,13,70,0,53,50,45
2,0,23.7,14,4,91,15,2,4,43,0,53,88,54
3,1,32.5,6,3,52,9,3,3,46,1,52,85,73
4,0,20.5,2,3,63,9,6,7,79,1,52,88,41


In [3]:
# Separate the feature columns from the 'result' target column.
X = data.drop(columns='result')
y = data['result']

In [4]:
# Hold out a test set for the logistic-regression model.
# train_test_split is already imported in the first cell, so it is not
# re-imported here. stratify=y asks for the same class proportions of
# `result` in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
    stratify=y,
)
X_train.shape

(75000, 12)

In [5]:
# Logistic-regression baseline (LogisticRegression is imported in the first cell).
# The features are passed in unscaled in this section; max_iter=2000 is
# presumably there to give lbfgs room to converge on them — TODO confirm.
classifier = LogisticRegression(
    solver='lbfgs',
    max_iter=2000,
    random_state=1,
)

In [6]:
# Fit the logistic regression on the stratified training partition.
classifier.fit(X_train, y_train)

In [7]:
# Predict the held-out rows, then place predictions next to the true labels.
# The index is reset so the preview is numbered 0..n-1 rather than by the
# original row labels carried by y_test.
y_pred = classifier.predict(X_test)
results_data = (
    pd.DataFrame({"Prediction": y_pred, "Actual": y_test})
    .reset_index(drop=True)
)
results_data.head(20)

Unnamed: 0,Prediction,Actual
0,0,1
1,0,0
2,1,1
3,1,1
4,1,1
5,1,0
6,0,0
7,1,1
8,0,0
9,1,0


In [8]:
# Share of held-out rows the logistic regression classified correctly.
# accuracy_score comes from the import cell at the top of the notebook.
print(accuracy_score(y_test, y_pred))

0.799


In [9]:

# Decision Tree Model

In [10]:
# Define features set: every column of `data` except the 'result' target.
# DataFrame.drop returns a new frame and leaves `data` untouched, so the
# previous data.copy() was a redundant full copy and has been removed.
X = data.drop(columns="result")
X.head()

Unnamed: 0,hand,age,ace,df,svpt,svGms,bpSaved,bpFaced,rank,1stIn(%),1stWon(%),2ndWon(%)
0,0,27.1,5,3,127,17,8,15,35,72,61,31
1,0,27.5,2,3,49,8,8,13,70,53,50,45
2,0,23.7,14,4,91,15,2,4,43,53,88,54
3,1,32.5,6,3,52,9,3,3,46,52,85,73
4,0,20.5,2,3,63,9,6,7,79,52,88,41


In [11]:
# Target vector, reshaped into a single-column 2-D array (n_samples, 1).
y = data["result"].to_numpy().reshape(-1, 1)
y[:5]

array([[1],
       [0],
       [0],
       [1],
       [1]])

In [12]:
# Partition features and target for the decision-tree model (seed 78).
# NOTE(review): unlike the logistic-regression split earlier in the notebook,
# no stratify= argument is passed here — confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=78,
)

In [13]:
# Print the shape of each partition, one per line.
print(
    *(part.shape for part in (X_train, X_test, y_train, y_test)),
    sep="\n",
)

(75000, 12)
(25000, 12)
(75000, 1)
(25000, 1)


In [14]:
# Alternative 80/20 partition using the same seed. In the visible part of
# the notebook these *2 variables are only inspected via their shapes.
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X,
    y,
    train_size=0.80,
    random_state=78,
)

In [15]:
# Print the shape of each 80/20 partition, one per line.
print(
    *(part.shape for part in (X_train2, X_test2, y_train2, y_test2)),
    sep="\n",
)

(80000, 12)
(20000, 12)
(80000, 1)
(20000, 1)


In [16]:
# Create the StandardScaler instance used to scale the decision-tree features.
scaler = StandardScaler()

In [17]:
# Fit the StandardScaler on the training features only, so that no
# statistics from the test partition are used; the result is kept as X_scaler.
X_scaler = scaler.fit(X_train)

In [18]:
# Apply the fitted scaler to the training and test features alike.
X_train_scaled, X_test_scaled = map(X_scaler.transform, (X_train, X_test))


In [19]:
## Fitting the Decision Tree Model
# Create the decision tree classifier instance.
# random_state is pinned so that Restart & Run All reproduces the same tree:
# per the scikit-learn documentation, candidate features are randomly
# permuted at each split, so an unseeded tree can differ between runs.
# Seed 78 matches the train/test split used for this model. The recorded
# scores in this section may shift slightly the first time it is re-run.
model = tree.DecisionTreeClassifier(random_state=78)

In [20]:
# Fit the decision tree on the scaled training features.
# y_train is still the (n_samples, 1) column array produced by the reshape above.
model = model.fit(X_train_scaled, y_train)

In [21]:
# Predict labels for the scaled test features with the fitted decision tree.
predictions = model.predict(X_test_scaled)

In [22]:
# Model Evaluation
# Overall accuracy of the decision tree on the test partition
acc_score = accuracy_score(y_test, predictions)

# Confusion matrix, wrapped in a DataFrame so rows read as actual classes
# and columns as predicted classes
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)

In [23]:
# Show the labelled confusion matrix, then accuracy and the per-class report.
print("Confusion Matrix")
display(cm_df)
print(
    f"Accuracy Score : {acc_score}",
    "Classification Report",
    classification_report(y_test, predictions),
    sep="\n",
)


Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,9022,3395
Actual 1,3358,9225


Accuracy Score : 0.72988
Classification Report
              precision    recall  f1-score   support

           0       0.73      0.73      0.73     12417
           1       0.73      0.73      0.73     12583

    accuracy                           0.73     25000
   macro avg       0.73      0.73      0.73     25000
weighted avg       0.73      0.73      0.73     25000



In [24]:
# Fresh split (seed 1) for the ensemble models that follow. This rebinds
# X_train / X_test / y_train / y_test from the decision-tree section.
# train_test_split is already imported in the first cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [25]:
# Train a BalancedRandomForestClassifier (imported in the first cell) on the
# training partition. y_train is a single-column 2-D array at this point,
# so it is flattened to 1-D with ravel() before fitting.
brfc = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
brfc = brfc.fit(X_train, y_train.ravel())

In [26]:
# Predict the test partition with the balanced random forest and
# calculate the balanced accuracy score of those predictions.
y_pred = brfc.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.7990126571792904

In [27]:
# Display the confusion matrix for the balanced random forest predictions.
confusion_matrix(y_test, y_pred)

array([[ 9858,  2525],
       [ 2499, 10118]])

In [28]:
# Print imbalanced-learn's per-class classification report for the
# balanced random forest predictions.
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.80      0.80      0.80      0.80      0.80      0.64     12383
          1       0.80      0.80      0.80      0.80      0.80      0.64     12617

avg / total       0.80      0.80      0.80      0.80      0.80      0.64     25000



In [29]:
# Train the EasyEnsembleClassifier (imported in the first cell) with 100
# estimators and a fixed seed. As with the random forest, the column-shaped
# y_train is flattened to 1-D with ravel() before fitting.
eec = EasyEnsembleClassifier(n_estimators=100, random_state=1)
eec = eec.fit(X_train, y_train.ravel())

In [30]:
# Predict the test partition with the easy-ensemble model (overwriting the
# earlier y_pred) and calculate the balanced accuracy score.
y_pred = eec.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.7923668333413223

In [31]:
# Display the confusion matrix for the easy-ensemble predictions.
confusion_matrix(y_test, y_pred)

array([[ 9768,  2615],
       [ 2575, 10042]])

In [32]:
# Print imbalanced-learn's per-class classification report for the
# easy-ensemble predictions.
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.79      0.79      0.80      0.79      0.79      0.63     12383
          1       0.79      0.80      0.79      0.79      0.79      0.63     12617

avg / total       0.79      0.79      0.79      0.79      0.79      0.63     25000

