In [None]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv("PremierLeagueMatches.csv")

# Cleaning
# Attendance is read through the .str accessor, so it is assumed to arrive as
# text (presumably with "," thousands separators — confirm against the CSV).
# regex=False makes the comma a literal on every pandas version.
df['Attendance'] = df['Attendance'].str.replace(',', '', regex=False).astype(float)
# Binary label: 1 where Result == 'H', otherwise 0 (vectorised, no per-row lambda).
df['HomeWin'] = (df['Result'] == 'H').astype(int)

# Select features
# NOTE(review): homeXG/awayXG look like post-match expected-goals figures —
# confirm they are meant to be available at prediction time.
features = ['homeXG', 'awayXG', 'Attendance']
target = 'HomeWin'

# Drop rows missing any feature OR the raw Result. HomeWin itself is never NaN
# (a missing Result compares unequal to 'H' and becomes 0), so dropping on the
# label alone would silently keep result-less matches as "not a home win".
df_cleaned = df.dropna(subset=features + ['Result'])[features + [target]]

# Separate the feature matrix from the label vector, then hold out 20% of the
# rows for evaluation (seeded so the split is repeatable).
X, y = df_cleaned[features], df_cleaned[target]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a decision tree (random_state fixed) on the training portion only.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict the held-out rows and print the per-class precision/recall/F1 table.
y_pred = model.predict(X_test)
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")


# Attach the true and predicted labels to a copy of the test features, then
# keep only the rows where the two disagree and print the first five.
# y_test is a Series and aligns on the index; y_pred is assigned positionally.
X_test_results = X_test.assign(Actual=y_test, Predicted=y_pred)
is_wrong = X_test_results["Actual"].ne(X_test_results["Predicted"])
incorrect_preds = X_test_results.loc[is_wrong]
print("\n5 Incorrect Predictions:", incorrect_preds.head(), sep="\n")


Classification Report:
              precision    recall  f1-score   support

           0       0.63      0.71      0.67       113
           1       0.60      0.52      0.56        96

    accuracy                           0.62       209
   macro avg       0.62      0.61      0.61       209
weighted avg       0.62      0.62      0.62       209


5 Incorrect Predictions:
      homeXG  awayXG  Attendance  Actual  Predicted
1009     2.2     1.5     61383.0       1          0
424      2.1     0.7     61706.0       1          0
459      0.4     0.5     60233.0       1          0
550      0.9     0.6     41651.0       0          1
362      1.6     2.0     31684.0       0          1
