In [18]:
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.utils.class_weight import compute_sample_weight

from xgboost import XGBClassifier

In [20]:
# Read the raw match results; the "date" column is parsed into datetimes so
# that the year can be extracted from it later on.
international_results = pd.read_csv(
    "results.csv",
    parse_dates=["date"],
)
international_results.head(n=5)

FileNotFoundError: [Errno 2] No such file or directory: 'results.csv'

In [22]:
# Keep only matches from 2000 onwards in which both sides appear in the UEFA
# team list, then normalise the frame so the row-by-row processing in later
# cells is safe.
international_results = international_results[international_results["date"].dt.year >= 2000]

uefa_teams = [
    "Albania", "Andorra", "Armenia", "Austria", "Azerbaijan", "Belarus", "Belgium", "Bosnia and Herzegovina",
    "Bulgaria", "Croatia", "Cyprus", "Czech Republic", "Denmark", "England", "Estonia", "Faroe Islands",
    "Finland", "France", "Georgia", "Germany", "Gibraltar", "Greece", "Hungary", "Iceland", "Ireland",
    "Israel", "Italy", "Kazakhstan", "Kosovo", "Latvia", "Liechtenstein", "Lithuania", "Luxembourg",
    "Malta", "Moldova", "Monaco", "Montenegro", "Netherlands", "North Macedonia", "Northern Ireland",
    "Norway", "Poland", "Portugal", "Romania", "Russia", "San Marino", "Scotland", "Serbia", "Slovakia",
    "Slovenia", "Spain", "Sweden", "Switzerland", "Turkey", "Ukraine", "Wales",
    # NOTE(review): some result datasets list the team as "Republic of Ireland"
    # rather than "Ireland"; both spellings are accepted here. Confirm against
    # the team names actually present in the CSV.
    "Republic of Ireland",
]

both_sides_uefa = (
    international_results["home_team"].isin(uefa_teams)
    & international_results["away_team"].isin(uefa_teams)
)

euro_matches = (
    international_results[both_sides_uefa]
    # A row without a final score cannot be labelled: every score comparison
    # against NaN is False, so such a row would silently count as a draw.
    .dropna(subset=["home_score", "away_score"])
    # Elo ratings are accumulated in row order, so the rows must be
    # chronological. mergesort is stable, which keeps the existing order of
    # matches played on the same date.
    .sort_values("date", kind="mergesort")
    .reset_index(drop=True)
)

print(f"Processed {len(euro_matches)} UEFA matches from 2000 onwards.")

NameError: name 'international_results' is not defined

In [None]:
def get_result(row):
    """Encode the outcome of one match as an integer class label.

    Args:
        row: Any object indexable by the keys 'home_score' and 'away_score'.

    Returns:
        2 if the home score is greater, 0 if the away score is greater, and
        1 otherwise (equal scores, or values for which neither comparison
        holds, such as NaN).
    """
    home_goals = row['home_score']
    away_goals = row['away_score']

    if home_goals > away_goals:
        return 2
    if home_goals < away_goals:
        return 0
    return 1

# Label every match row with its outcome class (see get_result).
match_outcomes = euro_matches.apply(get_result, axis=1)
euro_matches['result'] = match_outcomes

In [None]:
K = 40  # multiplier applied to (actual - expected) in each rating update
team_elos = {}  # latest rating per team name, filled in as matches are processed

def get_elo(team):
    """Return the rating stored for `team`, or 1500 when none is recorded yet."""
    return team_elos[team] if team in team_elos else 1500

# Walk the matches in row order. For each one, record both sides' ratings as
# they stood *before* the match, then update the ratings from the result.
home_elos, away_elos = [], []

fixtures = zip(
    euro_matches['home_team'],
    euro_matches['away_team'],
    euro_matches['home_score'],
    euro_matches['away_score'],
)

for home_side, away_side, home_goals, away_goals in fixtures:
    rating_home = get_elo(home_side)
    rating_away = get_elo(away_side)

    # Pre-match ratings become the model features.
    home_elos.append(rating_home)
    away_elos.append(rating_away)

    # Expected score of the home side from the rating gap (400-point scale).
    expected_home = 1 / (1 + 10 ** ((rating_away - rating_home) / 400))
    expected_away = 1 - expected_home

    # Actual score: 1 for a win, 0 for a loss, 0.5 each otherwise.
    if home_goals > away_goals:
        actual_home, actual_away = 1, 0
    elif home_goals < away_goals:
        actual_home, actual_away = 0, 1
    else:
        actual_home, actual_away = 0.5, 0.5

    team_elos[home_side] = rating_home + K * (actual_home - expected_home)
    team_elos[away_side] = rating_away + K * (actual_away - expected_away)

euro_matches['home_elo'] = home_elos
euro_matches['away_elo'] = away_elos

In [None]:
# Derive rating-based features and cast the `neutral` flag to int
# (0/1 when the column is boolean, as the original comment states).
euro_matches = euro_matches.assign(
    elo_diff=euro_matches['home_elo'] - euro_matches['away_elo'],
    elo_sum=euro_matches['home_elo'] + euro_matches['away_elo'],
    neutral=euro_matches['neutral'].astype(int),
)

In [None]:
# Model inputs (X) and the outcome label to predict (y).
feature_cols = ['home_elo', 'away_elo', 'elo_diff', 'neutral']
X = euro_matches.loc[:, feature_cols]
y = euro_matches.loc[:, 'result']

In [None]:
# Hold out 20% of the matches for evaluation; stratifying on y keeps the
# outcome proportions the same in both splits, and the fixed seed makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Logistic Regression Baseline
# Fit an unweighted model on the training split and score it on the hold-out.
lr = LogisticRegression(random_state=42, max_iter=500).fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

lr_report = classification_report(y_test, y_pred_lr, zero_division=0)
lr_confusion = confusion_matrix(y_test, y_pred_lr)
print("Logistic Regression:", lr_report, lr_confusion, sep="\n")

In [None]:
# Same logistic regression, but with 'balanced' per-class weights computed
# from the training labels and passed to the model explicitly.
train_classes = np.unique(y_train)
class_weights = compute_class_weight(class_weight='balanced', classes=train_classes, y=y_train)
class_weight_dict = {label: weight for label, weight in zip(train_classes, class_weights)}

baseline_model = LogisticRegression(
    class_weight=class_weight_dict,
    max_iter=500,
    random_state=42,
)
baseline_model.fit(X_train, y_train)
y_pred_baseline = baseline_model.predict(X_test)

baseline_report = classification_report(y_test, y_pred_baseline, zero_division=0)
baseline_confusion = confusion_matrix(y_test, y_pred_baseline)
print("=== Logistic Regression with Class Weights ===", baseline_report, baseline_confusion, sep="\n")

In [None]:
# Check for class imbalance: number of training rows per outcome label.
train_class_counts = y_train.value_counts()
print(train_class_counts)

In [None]:
# XGBoost
# Unweighted three-class gradient-boosted model, trained on the training split
# and evaluated on the hold-out split.
xgb = XGBClassifier(
    objective='multi:softprob',
    num_class=3,
    eval_metric='mlogloss',
    random_state=42,
)
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)

print("XGBoost:")
# zero_division=0 matches the other reports in this notebook: a class that is
# never predicted is scored 0 without emitting an undefined-metric warning.
print(classification_report(y_test, y_pred_xgb, zero_division=0))
print(confusion_matrix(y_test, y_pred_xgb))

In [None]:
# XGBoost again, this time with per-row 'balanced' weights derived from the
# training labels. Note that this rebinds `xgb` and `y_pred_xgb`.
sample_weights = compute_sample_weight(class_weight='balanced', y=y_train)

xgb_params = dict(
    objective='multi:softprob',
    num_class=3,
    eval_metric='mlogloss',
    random_state=42,
)
xgb = XGBClassifier(**xgb_params)

xgb.fit(X_train, y_train, sample_weight=sample_weights)
y_pred_xgb = xgb.predict(X_test)

weighted_report = classification_report(y_test, y_pred_xgb, zero_division=0)
weighted_confusion = confusion_matrix(y_test, y_pred_xgb)
print("XGBoost with Class Weights:", weighted_report, weighted_confusion, sep="\n")

In [None]:
# Bar chart of how many matches fall into each outcome class.
class_counts = euro_matches['result'].value_counts().sort_index()
class_labels = ['Away Win (0)', 'Draw (1)', 'Home Win (2)']

fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x=class_labels, y=class_counts.values, palette='Set2', ax=ax)
ax.set_title('Class Distribution of Match Results')
ax.set_ylabel('Number of Matches')
ax.set_xlabel('Match Outcome')
fig.tight_layout()
plt.show()

In [None]:
# Heatmap of the confusion matrix for whatever `y_pred_xgb` currently holds
# (the most recently executed XGBoost cell).
cm = confusion_matrix(y_test, y_pred_xgb)
labels = ['Away Win (0)', 'Draw (1)', 'Home Win (2)']

fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=labels,
    yticklabels=labels,
    ax=ax,
)
ax.set_title('Confusion Matrix - XGBoost')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
fig.tight_layout()
plt.show()

In [None]:
# Horizontal bar chart of the fitted XGBoost model's importance per feature.
importances = xgb.feature_importances_
feature_names = X_train.columns

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=importances, y=feature_names, ax=ax)
ax.set_title("Feature Importance (XGBoost)")
ax.set_xlabel("Importance")
ax.set_ylabel("Feature")
fig.tight_layout()
plt.show()