<a href="https://colab.research.google.com/github/izaskunsas/izaskunthesis2025/blob/main/Prediction_FINAL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ***PREDICTION ANALYSIS***
## **SETTING THE SESSION**

In [None]:
!pip install catboost
!pip install optuna

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier, Pool

from sklearn.model_selection import GridSearchCV
import shap

In [None]:
data = pd.read_csv("/content/drive/MyDrive/THESIS/Code/data/data_FINAL", index_col=0)

In [None]:
data = data[~data['league'].isin(['ukraine_league', 'finland_league', 'czechia_league', 'greece_league'])]

In [None]:
data.value_counts('league')

In [None]:
#SPLIT - by game edition ('Year'): every edition belongs to exactly one partition.
# NOTE(review): the original comment stopped mid-sentence ("...not removing temporal
# components because"); the reason for keeping them still has to be written down.
train_years = ['FIFA 15', 'FIFA 16', 'FIFA 17', 'FIFA 18', 'FIFA 19', 'FIFA 20', 'FIFA 21']
val_years   = ['FIFA 22', 'FIFA 23']
test_years  = ['FC 24', 'FC 25']

data_trainval = data[data['Year'].isin(train_years)]
data_val = data[data['Year'].isin(val_years)]
data_test = data[data['Year'].isin(test_years)]

#SEPARATE OUTPUT VARIABLE - 'league' is the target; the other listed columns are
# dropped from the feature matrices together with it.
non_feature_cols = ['league', 'Year', 'name', 'team', 'nation', 'position']

X_trainval, y_trainval = data_trainval.drop(columns=non_feature_cols), data_trainval['league']
X_val, y_val = data_val.drop(columns=non_feature_cols), data_val['league']
X_test, y_test = data_test.drop(columns=non_feature_cols), data_test['league']

#TRAIN SPLIT -- 80/20 split of the training editions used for hyperparameter tuning,
# stratified by league so both parts have roughly the same class proportions.
X_train, X_val_strat, y_train, y_val_strat = train_test_split(
    X_trainval,
    y_trainval,
    test_size=0.2,
    stratify=y_trainval,
    random_state=22,
)

In [None]:
# Target classes. The list is sorted because the SHAP cells below use the position
# in this list as the class index, and scikit-learn estimators expose their classes
# in sorted order (`classes_`); an unsorted list mislabels the per-league results.
leagues = sorted([
    'english_premier_league', 'spain_la_liga', 'italy_serie_a', 'france_ligue_1',
    'german_bundesliga', 'poland_league', 'nl_eredivisie', 'portugal_liga',
    'belgium_league', 'norway_league', 'sweden_league', 'denmark_superliga',
    'austria_bundesliga', 'switzerland_league', 'scotland_prem', 'ireland_league',
])

## **EVALUATION FUNCTIONS**

In [None]:
# CONFUSION MATRIX

def plot_confusion_matrix(model, X_val, y_val, title, labels=None):
    """Draw, save and show the confusion matrix of `model` on (X_val, y_val).

    Parameters
    ----------
    model : fitted estimator with a ``predict`` method.
    X_val : feature matrix to predict on.
    y_val : true labels belonging to ``X_val``.
    title : str
        Used as the file name (``<title>.png``) of the saved figure.
    labels : sequence of class labels, optional
        Fixes both the row/column order of the matrix and the tick labels.
        When omitted, the sorted union of true and predicted labels is used.

    The figure is written to the Drive plot folder and the accuracy is printed.
    Returns None.
    """
    out_dir = '/content/drive/MyDrive/THESIS/Code/plots/Prediction/'
    os.makedirs(out_dir, exist_ok=True)

    # ravel() flattens column-shaped predictions (n, 1) into a 1-D label vector
    y_predicted = np.asarray(model.predict(X_val)).ravel()

    if labels is None:
        labels = np.unique(np.concatenate([np.asarray(y_val).ravel(), y_predicted]))

    # Passing `labels` here guarantees the matrix rows/columns match the tick labels
    cm = confusion_matrix(y_val, y_predicted, labels=labels)

    fig, ax = plt.subplots()
    sns.heatmap(cm, annot=True, cmap='Blues', fmt='g', xticklabels=labels, yticklabels=labels,
                annot_kws={"size": 8}, ax=ax)
    ax.set_xlabel('Predicted label')
    ax.set_ylabel('True Label')
    fig.tight_layout()
    fig.savefig(f"{out_dir}{title}.png")
    plt.show()
    plt.close(fig)

    acc = accuracy_score(y_val, y_predicted)
    print('test_accuracy: %.3f' % (acc))

In [None]:
def plot_confusion_matrix_filtered(model, X_val, y_val, title, labels=None, threshold=100):
    """Confusion-matrix heatmap that only annotates cells with more than `threshold` samples.

    Parameters
    ----------
    model : fitted estimator with a ``predict`` method.
    X_val : feature matrix to predict on.
    y_val : true labels belonging to ``X_val``.
    title : str
        Used as the file name (``<title>.png``) of the saved figure.
    labels : sequence of class labels, optional
        Fixes both the row/column order of the matrix and the tick labels.
        When omitted, the sorted union of true and predicted labels is used.
    threshold : int, default 100
        Cells whose count is not strictly greater than this are left blank.

    The figure is written to the Drive plot folder and the accuracy is printed.
    Returns None.
    """
    out_dir = '/content/drive/MyDrive/THESIS/Code/plots/Prediction/'
    os.makedirs(out_dir, exist_ok=True)

    # ravel() flattens column-shaped predictions (n, 1) into a 1-D label vector
    y_predicted = np.asarray(model.predict(X_val)).ravel()

    if labels is None:
        labels = np.unique(np.concatenate([np.asarray(y_val).ravel(), y_predicted]))

    # Passing `labels` here guarantees the matrix rows/columns match the tick labels
    cm = confusion_matrix(y_val, y_predicted, labels=labels)

    # Text shown in each cell: the count where it exceeds the threshold, otherwise nothing
    annotations = np.where(cm > threshold, cm.astype(str), '')

    fig, ax = plt.subplots()
    sns.heatmap(cm, annot=annotations, cmap='Blues', fmt='', xticklabels=labels, yticklabels=labels,
                cbar=True, linewidths=0.5, linecolor='white', annot_kws={"size": 8}, ax=ax)
    ax.set_xlabel('Predicted label')
    ax.set_ylabel('True Label')
    fig.tight_layout()
    fig.savefig(f"{out_dir}{title}.png")
    plt.show()
    plt.close(fig)

    acc = accuracy_score(y_val, y_predicted)
    print('test_accuracy: %.3f' % acc)

In [None]:
# PERFORMANCE MEASURES DATA

def calculate_metrics(model, X_val, y_val):
    """Print accuracy and the per-class precision, recall and F1 of `model` on (X_val, y_val).

    `model` must expose ``predict``. Nothing is returned; the scores are only printed.
    """
    # Hard class predictions (not probabilities)
    y_hat = model.predict(X_val)

    # Share of samples whose predicted class equals the true class
    print('Accuracy: %f' % accuracy_score(y_val, y_hat))

    # average=None -> one value per class instead of a single averaged number
    per_class_scores = (
        ('Precision:', precision_score),   # tp / (tp + fp)
        ('Recall:', recall_score),         # tp / (tp + fn)
        ('F1 score:', f1_score),           # 2 tp / (2 tp + fp + fn)
    )
    for heading, scorer in per_class_scores:
        print(heading, scorer(y_val, y_hat, average=None))

In [None]:
#GRID SEARCH
def calculate_best_params(model, grid, X_train, y=None, X_val=None, y_val=None):
    """Run a 3-fold grid search for `model` and print the best setting and its scores.

    Parameters
    ----------
    model : estimator class (not an instance); it is instantiated with default arguments.
    grid : dict or list of dicts accepted by ``GridSearchCV(param_grid=...)``.
    X_train : feature matrix the search is fitted on.
    y : labels for ``X_train``; defaults to the notebook-level ``y_train``.
    X_val, y_val : data used for the final "Val Score"; default to the
        notebook-level ``X_val_strat`` / ``y_val_strat``.

    Returns None; results are printed.
    """
    # Fall back to the notebook-level split so existing three-argument calls behave as before
    if y is None:
        y = y_train
    if X_val is None:
        X_val = X_val_strat
    if y_val is None:
        y_val = y_val_strat

    grid_search = GridSearchCV(estimator=model(), param_grid=grid, cv=3, verbose=3, n_jobs=-1)
    grid_search.fit(X_train, y)

    print("Best Parameters:", grid_search.best_params_)
    # best_score_ is the mean cross-validated score of the best candidate,
    # not a score measured on the training data itself
    print("Train Score:", grid_search.best_score_)
    print("Val Score:", grid_search.score(X_val, y_val))

##**BASELINE MODEL**
Multinomial Logistic Regression

In [None]:
# Multinomial logistic regression baseline.
# `multi_class='multinomial'` is intentionally not passed: the parameter is deprecated
# in recent scikit-learn releases, and with the lbfgs solver a target with more than
# two classes is fitted as a multinomial model by default.
baseline = LogisticRegression(
    solver='lbfgs',
    max_iter=1000,
    random_state=22,
)

baseline.fit(X_train, y_train)

In [None]:
calculate_metrics(baseline, X_val_strat, y_val_strat)

In [None]:
plot_confusion_matrix(baseline, X_val_strat, y_val_strat, title = "CM Baseline", labels=np.unique(y_train))

In [None]:
plot_confusion_matrix_filtered(baseline, X_val_strat, y_val_strat, title = "SIMPLE CM Baseline", labels=np.unique(y_train))

###Grid Search
Hyperparameter Tuning

In [None]:
# One sub-grid per penalty family so no candidate is fitted with parameters that
# scikit-learn would ignore: l1_ratio is only used by 'elasticnet', and C has no
# effect when penalty is None.
_C_VALUES = [0.001, 0.01, 0.1, 1, 10]   # inverse of regularization strength

log_grid = [
    {'penalty': ['l1', 'l2'], 'C': _C_VALUES, 'solver': ['saga'], 'max_iter': [5000]},
    {'penalty': ['elasticnet'], 'C': _C_VALUES, 'l1_ratio': [0, 0.5, 1],
     'solver': ['saga'], 'max_iter': [5000]},
    {'penalty': [None], 'solver': ['saga'], 'max_iter': [5000]},
]

# X_train must be passed explicitly: calculate_best_params has no default for it
calculate_best_params(LogisticRegression, log_grid, X_train)

# Recorded output (obtained with the original flat grid, where l1_ratio was listed for every penalty):
# Best Parameters: {'C': 10, 'l1_ratio': 0, 'max_iter': 5000, 'penalty': 'l2', 'solver': 'saga'}
# Train Score: 0.26744124064105573
# Val Score: 0.2700522608498069

### Optuna
(did not work)

In [None]:
import optuna
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification
import numpy as np

#https://www.kaggle.com/code/bextuychiev/no-bs-guide-to-hyperparameter-tuning-with-optuna
# Source/inspiration: https://www.kaggle.com/discussions/general/261870

def objective(trial):
    """Optuna objective: mean 5-fold CV accuracy of a saga logistic regression.

    Samples the penalty type, the inverse regularization strength C (log scale,
    1e-4 to 1e2) and, for elastic net only, the l1_ratio. The model is evaluated
    on the notebook-level ``X_train`` / ``y_train``.

    Returns the mean cross-validated accuracy (to be maximized).
    """
    # saga is used throughout because it supports every penalty in the list
    penalty = trial.suggest_categorical("penalty", ["l1", "l2", "elasticnet", None])

    # suggest_float(..., log=True) is the replacement for the deprecated suggest_loguniform
    C = trial.suggest_float("C", 1e-4, 1e2, log=True)

    # l1_ratio is only sampled for elastic net; the other penalties do not use it
    l1_ratio = trial.suggest_float("l1_ratio", 0.0, 1.0) if penalty == "elasticnet" else None

    # multi_class is not passed (deprecated); saga fits a multinomial model for >2 classes.
    # random_state pins saga's data shuffling so a trial gives the same score when repeated.
    model = LogisticRegression(
        penalty=penalty,
        C=C,
        l1_ratio=l1_ratio,
        solver="saga",
        max_iter=5000,
        random_state=22,
    )

    #Cross-validation
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    return scores.mean()

# Seed the sampler so the sequence of suggested hyperparameters is repeatable between runs
study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=22))
study.optimize(objective, n_trials=50)
print("Best params:", study.best_params)
print("Best accuracy:", study.best_value)

##**SVM**

In [None]:
#SVM
svm_model = SVC(
    C=10,
    kernel = 'rbf',
    gamma = 0.01,
    random_state=22
)

svm_model.fit(X_train, y_train)

In [None]:
calculate_metrics(svm_model, X_val_strat, y_val_strat)

In [None]:
plot_confusion_matrix(svm_model, X_val_strat, y_val_strat, title = "CM SVM")

In [None]:
plot_confusion_matrix_filtered(svm_model, X_val_strat, y_val_strat, title = "SIMPLE CM SVM")

###Grid Search

In [None]:
#SVM - GridSearch
#https://www.kaggle.com/code/gorkemgunay/understanding-parameters-of-svm

svm_grid = {
    'C':[0.01,0.1,1,10],
    'kernel' : ["rbf","sigmoid"],
    'gamma' : [0.01,1,10,500]
}

calculate_best_params(SVC, svm_grid, X_train)

#Best Parameters: {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}
#Train Score: 0.40176234514046416
#Val Score: 0.4289868558799581

##**CatBoost**

In [None]:
#CATBOOST
#https://www.kaggle.com/code/prashant111/catboost-classifier-in-python#
#used the hyperparameters of (Malamatinos, Vrochidou & Papakostas, 2022)

cb_model = CatBoostClassifier(
    iterations = 40,
    learning_rate = 0.05,
    l2_leaf_reg = 3,
    depth = 6,
    loss_function = 'MultiClass'
)

cb_model.fit(X_train, y_train)

In [None]:
calculate_metrics(cb_model, X_val_strat, y_val_strat)

In [None]:
plot_confusion_matrix(cb_model, X_val_strat, y_val_strat, title = "CM CatBoost", labels=np.unique(y_train))

In [None]:
plot_confusion_matrix_filtered(cb_model, X_val_strat, y_val_strat, title = "SIMPLE CM CatBoost")

### Grid Search

In [None]:
cb_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'depth': [4, 6, 8],
    'l2_leaf_reg': [1, 3, 5, 7],
    'iterations': [100, 300],
    'border_count': [32, 64, 128],  # Number of splits for numeric features
    'loss_function':['MultiClass']
}
calculate_best_params(CatBoostClassifier, cb_grid, X_train)

#Best Parameters: {'border_count': 32, 'depth': 8, 'iterations': 300, 'l2_leaf_reg': 1, 'learning_rate': 0.1, 'loss_function': 'MultiClass'}
#Train Score: 0.5321060896876636
#Val Score: 0.5699662673025474

In [None]:
cb_model_hypertuned = CatBoostClassifier(
    iterations = 300,
    learning_rate = 0.1,
    border_count = 32,
    l2_leaf_reg = 1,
    depth = 8,
    loss_function = 'MultiClass'
)

cb_model_hypertuned.fit(X_train, y_train)

In [None]:
calculate_metrics(cb_model_hypertuned, X_val_strat, y_val_strat)

In [None]:
plot_confusion_matrix(cb_model_hypertuned, X_val_strat, y_val_strat, title = "CM CatBoost Best", labels=np.unique(y_train))

In [None]:
plot_confusion_matrix_filtered(cb_model_hypertuned, X_val_strat, y_val_strat, title = "SIMPLE CM CatBoost Best")

##**RANDOM FOREST**
Best Performing Model

In [None]:
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    random_state=22,
    n_jobs=-1
)

rf_model.fit(X_train, y_train)

In [None]:
calculate_metrics(rf_model, X_val_strat, y_val_strat)

In [None]:
plot_confusion_matrix(rf_model, X_val_strat, y_val_strat, title = "CM Random Forest", labels=np.unique(y_train))

In [None]:
plot_confusion_matrix_filtered(rf_model, X_val_strat, y_val_strat, title = "SIMPLE CM Random Forest", labels=np.unique(y_train))

### Grid Search

In [None]:
#https://www.blog.trainindata.com/random-forest-with-grid-search/
rf_grid = {
    'n_estimators': [50, 100, 150, 200, 250],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None]
}

calculate_best_params(RandomForestClassifier, rf_grid, X_train)

#Best Parameters: {'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 150}
#Train Score: 0.5606060606060606
#Val Score: 0.6274281726183553

###BEST PERFORMANCE

In [None]:
rf_model_best = RandomForestClassifier(
    n_estimators=150,
    max_depth=None,
    max_features = 'log2',
    min_samples_leaf = 1,
    min_samples_split = 2,
    random_state=22,
    n_jobs=-1
)

rf_model_best.fit(X_train, y_train)

In [None]:
calculate_metrics(rf_model_best, X_val_strat, y_val_strat)

In [None]:
plot_confusion_matrix(rf_model_best, X_val_strat, y_val_strat, title = "CM Random Forest Best", labels=np.unique(y_train))

In [None]:
plot_confusion_matrix_filtered(rf_model_best, X_val_strat, y_val_strat, title = "SIMPLE CM Random Forest Best", labels=np.unique(y_train))

###HOLD OUT SET

In [None]:
# Final model for the hold-out evaluation; same hyperparameters as rf_model_best.
rf_model_test = RandomForestClassifier(
    n_estimators=150,
    max_depth=None,
    max_features = 'log2',
    min_samples_leaf = 1,
    min_samples_split = 2,
    random_state=22,
    n_jobs=-1
)

# NOTE(review): this fits on X_val / y_val only (the FIFA 22 / FIFA 23 partition),
# not on the FIFA 15-21 training data. Confirm this is intended before reporting
# hold-out results; the usual choice would be to train on the training (or
# training + validation) editions.
rf_model_test.fit(X_val, y_val)

In [None]:
calculate_metrics(rf_model_test, X_test, y_test)

In [None]:
plot_confusion_matrix(rf_model_test, X_test, y_test, title = "CM Random Forest Test", labels=np.unique(y_train))

In [None]:
plot_confusion_matrix_filtered(rf_model_test, X_test, y_test, title = "SIMPLE CM Random Forest Test", labels=np.unique(y_train))

###FEATURE IMPORTANCE
SHAP

In [None]:
#RANDOM FOREST FEATURE IMPORTANCE
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    random_state=22,
    n_jobs=-1
)

rf_model.fit(X_train, y_train)

In [None]:
explainer = shap.TreeExplainer(rf_model)

In [None]:
shap_values = explainer.shap_values(X_val_strat)

In [None]:
#SUMMARY PLOTS PER LEAGUE
# shap_values is indexed as [sample, feature, class]; take one 2-D slice per class.
shap_values_by_class = [shap_values[:, :, i] for i in range(shap_values.shape[2])]

# The class axis follows rf_model.classes_ (sorted label order), so the class names
# are taken from the model; the hand-written `leagues` list is in a different order
# and would attach the wrong league name to each plot.
for i, league in enumerate(rf_model.classes_):
    print(f"Plotting SHAP summary for class: {league}")
    shap.summary_plot(shap_values_by_class[i], X_val_strat, show=False)
    plt.title(f"SHAP Summary Plot - {league}")
    plt.tight_layout()
    plt.savefig(f"shap_summary_{league.replace(' ', '_')}.png")
    plt.close()  # release the figure before drawing the next class

In [None]:
#SUMMARY PLOT -- OVERALL MEAN
#debugged by ChatGPT
shap_values_mean = np.mean(np.abs(shap_values), axis=2)  # shape = (n_samples, n_features)

shap.summary_plot(shap_values_mean, X_val_strat)

In [None]:
#SUMMARY PLOT -- OVERALL MAX
shap_values_max = np.max(np.abs(shap_values), axis=2)

shap.summary_plot(shap_values_max, X_val_strat)

In [None]:
#SUMMARY PLOT -- OVERALL SUM
# Sum of absolute SHAP values over the class axis -> one value per (sample, feature)
shap_values_sum = np.sum(np.abs(shap_values), axis=2)

# Plot the sum computed above (the previous version re-plotted shap_values_max)
shap.summary_plot(shap_values_sum, X_val_strat)

Evaluation of fewer features

In [None]:
#VERSION 1 - Subset 8 leagues
# SHAP values of each validation sample for its own (true) league. They are computed
# here so this cell does not rely on a variable that is only created further down the
# notebook. The class axis of shap_values is indexed in rf_model.classes_ order.
# NOTE(review): the variable is called "predicted_class" but is built from the true labels.
class_position = {str(name): pos for pos, name in enumerate(rf_model.classes_)}
true_class_idx = y_val_strat.map(class_position).to_numpy().reshape(-1, 1, 1)
predicted_class_shap = np.take_along_axis(shap_values, true_class_idx, axis=2).squeeze(-1)

# Mean absolute SHAP value per feature, most important first
shap_feature_importance = np.abs(predicted_class_shap).mean(axis=0)
feature_importance_df = pd.DataFrame({
    'Feature': X_val_strat.columns,
    'Importance': shap_feature_importance
}).sort_values(by='Importance', ascending=False)

feature_importance_df.to_csv('feature_importance_SHAP.csv')

# --- Helper Function to Train with Top-N Features ---
def evaluate_rf_with_top_features(
    top_ns,
    leagues_to_keep=(
        'english_premier_league', 'spain_la_liga', 'italy_serie_a',
        'france_ligue_1', 'german_bundesliga', 'nl_eredivisie',
        'portugal_liga', 'scotland_prem',
    ),
    random_state=42,
):
    """Grid-search a random forest on the top-N SHAP features for a subset of leagues.

    Parameters
    ----------
    top_ns : iterable of int
        Feature counts to try; for each N the first N rows of the notebook-level
        ``feature_importance_df`` are used.
    leagues_to_keep : iterable of str
        Only samples whose league is in this collection are used for fitting and scoring.
    random_state : int, default 42
        Seed of the random forest.

    Reads the notebook-level ``X_train``/``y_train`` (fit) and ``X_val``/``y_val`` (score).

    Returns
    -------
    list of (top_n, accuracy, best_params) tuples, one per entry of ``top_ns``.
    """
    param_grid = {
        'n_estimators': [100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5]
    }

    # The league filter does not depend on the feature count, so it is built once
    leagues_to_keep = list(leagues_to_keep)
    mask_train = y_train.isin(leagues_to_keep)
    mask_val = y_val.isin(leagues_to_keep)
    y_train_top = y_train[mask_train].reset_index(drop=True)
    y_val_top = y_val[mask_val].reset_index(drop=True)

    best_results = []
    for top_n in top_ns:
        print(f"\nEvaluating top {top_n} features...")

        selected_features = feature_importance_df.head(top_n)['Feature'].tolist()

        X_train_top = X_train[selected_features][mask_train].reset_index(drop=True)
        X_val_top = X_val[selected_features][mask_val].reset_index(drop=True)

        grid = GridSearchCV(RandomForestClassifier(random_state=random_state), param_grid, cv=3, n_jobs=-1)
        grid.fit(X_train_top, y_train_top)

        y_pred = grid.best_estimator_.predict(X_val_top)
        acc = accuracy_score(y_val_top, y_pred)

        print(f"Top-{top_n} features | Accuracy: {acc:.4f} | Best Params: {grid.best_params_}")
        best_results.append((top_n, acc, grid.best_params_))

    return best_results


top_n_list = [5, 10, 15, 20, 25, 30, 35]
results = evaluate_rf_with_top_features(top_n_list)


results_df = pd.DataFrame(results, columns=['Top_N', 'Accuracy', 'Best_Params'])
print("\nSummary:\n", results_df)

In [None]:
#VERSION 2 - 16 leagues
# SHAP values of each validation sample for its own (true) league. They are computed
# here so this cell does not rely on a variable that is only created further down the
# notebook. The class axis of shap_values is indexed in rf_model.classes_ order.
# NOTE(review): the variable is called "predicted_class" but is built from the true labels.
class_position = {str(name): pos for pos, name in enumerate(rf_model.classes_)}
true_class_idx = y_val_strat.map(class_position).to_numpy().reshape(-1, 1, 1)
predicted_class_shap = np.take_along_axis(shap_values, true_class_idx, axis=2).squeeze(-1)

# Get mean absolute SHAP value per feature, most important first
shap_feature_importance = np.abs(predicted_class_shap).mean(axis=0)
feature_importance_df = pd.DataFrame({
    'Feature': X_val_strat.columns,
    'Importance': shap_feature_importance
}).sort_values(by='Importance', ascending=False)

feature_importance_df.to_csv('feature_importance_SHAP.csv')

# --- Helper Function to Train with Top-N Features ---
def evaluate_rf_with_top_features(top_ns):
    """Grid-search a random forest on the top-N SHAP features, using all leagues.

    For every N in `top_ns`, the first N features of the notebook-level
    ``feature_importance_df`` are selected, a 3-fold grid search is fitted on
    ``X_train``/``y_train`` and the best estimator is scored on ``X_val``/``y_val``.

    Returns a list of (top_n, accuracy, best_params) tuples in the order of `top_ns`.
    """
    def _fit_and_score(n_features):
        print(f"\nEvaluating top {n_features} features...")

        # Column subset for this run
        columns = feature_importance_df.head(n_features)['Feature'].tolist()

        # Train RF with grid search
        search = GridSearchCV(
            RandomForestClassifier(random_state=22),
            {
                'n_estimators': [100, 200],
                'max_depth': [None, 10, 20],
                'min_samples_split': [2, 5],
            },
            cv=3,
            n_jobs=-1,
        ).fit(X_train[columns], y_train)

        accuracy = accuracy_score(y_val, search.best_estimator_.predict(X_val[columns]))

        print(f"Top-{n_features} features | Accuracy: {accuracy:.4f} | Best Params: {search.best_params_}")
        return n_features, accuracy, search.best_params_

    return [_fit_and_score(n) for n in top_ns]

# Run evaluation
top_n_list = [5, 10, 15, 20, 25, 30, 35]
results = evaluate_rf_with_top_features(top_n_list)

# Display summary
results_df = pd.DataFrame(results, columns=['Top_N', 'Accuracy', 'Best_Params'])
print("\nSummary:\n", results_df)

Other plots

In [None]:
# Position of every league along the class axis of shap_values. That axis follows
# rf_model.classes_ (sorted label order), not the order of the hand-written `leagues`
# list, so the mapping is built from the model.
class_to_index = {str(name): i for i, name in enumerate(rf_model.classes_)}
y_val_int = y_val_strat.map(class_to_index).to_numpy().reshape(-1, 1, 1)

# NOTE(review): despite the names, these are the TRUE classes of the validation
# samples (taken from y_val_strat), not the model's predictions.
predicted_classes = y_val_int
# For every sample keep only the SHAP values of its own class -> shape (n_samples, n_features)
predicted_class_shap = np.take_along_axis(shap_values, predicted_classes, axis=2).squeeze(-1)

# Plot SHAP summary
shap.summary_plot(predicted_class_shap, X_val_strat, plot_type="bar")

## NOT PART OF THE THESIS
(additional experiments)

Other Feature Importance methods (permutation)

In [None]:
#https://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html
import time

import numpy as np

feature_names = list(X_train.columns)

start_time = time.time()
importances = rf_model.feature_importances_
std = np.std([tree.feature_importances_ for tree in rf_model.estimators_], axis=0)
elapsed_time = time.time() - start_time

print(f"Elapsed time to compute the importances: {elapsed_time:.3f} seconds")

forest_importances = pd.Series(importances, index=feature_names)

fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=std, ax=ax)
ax.set_title("Feature importances using MDI")
ax.set_ylabel("Mean decrease in impurity")
fig.tight_layout()

In [None]:
from sklearn.inspection import permutation_importance

start_time = time.time()
result = permutation_importance(
    rf_model, X_val, y_val, n_repeats=10, random_state=22, n_jobs=-1
)
elapsed_time = time.time() - start_time
print(f"Elapsed time to compute the importances: {elapsed_time:.3f} seconds")

forest_importances = pd.Series(result.importances_mean, index=feature_names)

fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=result.importances_std, ax=ax)
ax.set_title("Feature importances using permutation on full model")
ax.set_ylabel("Mean accuracy decrease")
fig.tight_layout()
plt.show()

In [None]:
# Permutation feature importance
from sklearn.inspection import permutation_importance
result = permutation_importance(rf_model, X_val, y_val, n_repeats=10, random_state=22, n_jobs=-1)
perm_imp_df = pd.DataFrame({'Feature': feature_names, 'Permutation Importance': result.importances_mean}).sort_values('Permutation Importance', ascending=False)
print(perm_imp_df)

In [None]:
#Permutation - Mean Decrease Accuracy
# Seeded generator: the shuffles, and therefore the ranking and any feature filter
# derived from it, are the same on every run.
perm_rng = np.random.default_rng(22)

initial_accuracy = accuracy_score(y_val_strat, rf_model.predict(X_val_strat))  # initial accuracy

importances = []
for i in range(X_val_strat.shape[1]):
    # Shuffle one column at a time and measure how much accuracy is lost
    X_val_copy = X_val_strat.copy()
    X_val_copy.iloc[:, i] = perm_rng.permutation(X_val_copy.iloc[:, i].to_numpy())
    shuff_accuracy = accuracy_score(y_val_strat, rf_model.predict(X_val_copy))
    importances.append(initial_accuracy - shuff_accuracy)

accuracy_df = pd.DataFrame({
    'Feature': feature_names,
    'Decrease in Accuracy': importances
}).sort_values('Decrease in Accuracy', ascending=False)

print(accuracy_df)

plt.figure(figsize=(8, 4))
plt.barh(accuracy_df['Feature'], accuracy_df['Decrease in Accuracy'], color='skyblue')
plt.xlabel('Mean Decrease Accuracy')
plt.title('Feature Importance - Mean Decrease Accuracy (All Features)')
plt.gca().invert_yaxis()
plt.show()

# Filtered and sorted -- features with importance above threshold
threshold = 0.015
filtered_df = accuracy_df[accuracy_df['Decrease in Accuracy'] > threshold] \
    .sort_values('Decrease in Accuracy', ascending=False)

plt.figure(figsize=(8, 4))
plt.barh(filtered_df['Feature'], filtered_df['Decrease in Accuracy'], color='orange')
plt.xlabel('Mean Decrease Accuracy')
plt.title(f'Feature Importance - Features with Decrease > {threshold}')
plt.gca().invert_yaxis()
plt.show()

In [None]:
def evaluate_rf_with_top_features(
    top_ns,
    leagues_to_keep=(
        'english_premier_league', 'spain_la_liga', 'italy_serie_a',
        'france_ligue_1', 'german_bundesliga', 'nl_eredivisie',
        'portugal_liga', 'scotland_prem',
    ),
    random_state=42,
):
    """Grid-search a random forest on the top-N permutation-importance features.

    Parameters
    ----------
    top_ns : iterable of int
        Feature counts to try; for each N the first N rows of the notebook-level
        ``accuracy_df`` (sorted by decrease in accuracy) are used.
    leagues_to_keep : iterable of str
        Only samples whose league is in this collection are used for fitting and scoring.
    random_state : int, default 42
        Seed of the random forest.

    Reads the notebook-level ``X_train``/``y_train`` (fit) and ``X_val``/``y_val`` (score).

    Returns
    -------
    list of (top_n, accuracy, best_params) tuples, one per entry of ``top_ns``.
    """
    param_grid = {
        'n_estimators': [100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5]
    }

    # The league filter does not depend on the feature count, so it is built once
    leagues_to_keep = list(leagues_to_keep)
    mask_train = y_train.isin(leagues_to_keep)
    mask_val = y_val.isin(leagues_to_keep)
    y_train_top = y_train[mask_train].reset_index(drop=True)
    y_val_top = y_val[mask_val].reset_index(drop=True)

    best_results = []
    for top_n in top_ns:
        print(f"\nEvaluating top {top_n} features...")

        selected_features = accuracy_df.head(top_n)['Feature'].tolist()

        X_train_top = X_train[selected_features][mask_train].reset_index(drop=True)
        X_val_top = X_val[selected_features][mask_val].reset_index(drop=True)

        # Train RF with grid search
        grid = GridSearchCV(RandomForestClassifier(random_state=random_state), param_grid, cv=3, n_jobs=-1)
        grid.fit(X_train_top, y_train_top)

        y_pred = grid.best_estimator_.predict(X_val_top)
        acc = accuracy_score(y_val_top, y_pred)

        print(f"Top-{top_n} features | Accuracy: {acc:.4f} | Best Params: {grid.best_params_}")
        best_results.append((top_n, acc, grid.best_params_))

    return best_results


top_n_list = [5, 10, 15, 20, 25, 30, 35]
results = evaluate_rf_with_top_features(top_n_list)


results_df = pd.DataFrame(results, columns=['Top_N', 'Accuracy', 'Best_Params'])
print("\nSummary:\n", results_df)

RF Evaluation with fewer features

In [None]:
# mask of selected features
selected_features = filtered_df['Feature'].tolist()

X_train_filter = X_train[selected_features]
X_val_filter = X_val[selected_features]

In [None]:
#Best Parameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 150}
# random_state is fixed (same seed as the other forests in this notebook) so the
# reduced-feature model gives the same scores on every run.
rf_model_filter = RandomForestClassifier(
    max_depth = None,
    max_features = 'sqrt',
    min_samples_leaf = 1,
    min_samples_split = 2,
    n_estimators = 150,
    random_state = 22
)
rf_model_filter.fit(X_train_filter, y_train)

In [None]:
calculate_metrics(rf_model_filter, X_val_filter, y_val)

In [None]:
plot_confusion_matrix(rf_model_filter, X_val_filter, y_val, title = "Confusion Matrix Random Forest", labels=np.unique(y_train))

Hypertune Filter Model

In [None]:
#RANDOM FOREST -- Hypertune
#https://www.blog.trainindata.com/random-forest-with-grid-search/
def calculate_best_params(model, grid, X_train, y=None, X_val=None, y_val=None):
    """Run a 3-fold grid search for `model` and print the best setting and its scores.

    This redefinition keeps the defaults of the first definition in the notebook
    (``y_train``, ``X_val_strat``, ``y_val_strat``), so re-running earlier grid-search
    cells after this one still scores against the stratified validation split.
    A different validation set is passed explicitly through ``X_val`` / ``y_val``.

    Parameters
    ----------
    model : estimator class (not an instance); it is instantiated with default arguments.
    grid : dict or list of dicts accepted by ``GridSearchCV(param_grid=...)``.
    X_train : feature matrix the search is fitted on.
    y : labels for ``X_train``; defaults to the notebook-level ``y_train``.
    X_val, y_val : data used for the final "Val Score".

    Returns None; results are printed.
    """
    if y is None:
        y = y_train
    if X_val is None:
        X_val = X_val_strat
    if y_val is None:
        y_val = y_val_strat

    grid_search = GridSearchCV(estimator=model(), param_grid=grid, cv=3, verbose=3, n_jobs=-1)
    grid_search.fit(X_train, y)

    print("Best Parameters:", grid_search.best_params_)
    # best_score_ is the mean cross-validated score of the best candidate
    print("Train Score:", grid_search.best_score_)
    print("Val Score:", grid_search.score(X_val, y_val))

rf_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None]
}

# Score on the reduced-feature temporal validation set (X_val_filter / y_val)
calculate_best_params(RandomForestClassifier, rf_grid, X_train_filter, X_val=X_val_filter, y_val=y_val)

#Best Parameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 150}
#Train Score: 0.5380968999011225
#Val Score: 0.39417518897287684