In [1]:
import os
import warnings

import category_encoders as ce
import pandas as pd
# enable_iterative_imputer must be imported before IterativeImputer can be imported.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.metrics import f1_score
from sklearn.model_selection import ParameterGrid, train_test_split
from xgboost import XGBClassifier

warnings.filterwarnings('ignore')

In [2]:
# Load the binned feature matrix and its target.
# (Un-binned alternative kept for reference:
#  '../../data/start_dataset.csv' with '../../data/y.csv'.)
X = pd.read_csv('../../data/binned/df.csv')
y = pd.read_csv('../../data/binned/y.csv')

# Every column whose name contains '_binned' is cast to pandas' category dtype.
binned_columns = X.filter(like='_binned').columns
X = X.astype({column: 'category' for column in binned_columns})

X[binned_columns].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3040 entries, 0 to 3039
Data columns (total 9 columns):
 #   Column                               Non-Null Count  Dtype   
---  ------                               --------------  -----   
 0   avg_home_rating_attack_binned        3040 non-null   category
 1   avg_away_rating_attack_binned        3040 non-null   category
 2   avg_away_rating_defence_binned       3040 non-null   category
 3   avg_home_rating_defence_binned       3040 non-null   category
 4   points_home_binned                   3040 non-null   category
 5   home_weighted_wins_binned            3040 non-null   category
 6   away_weighted_wins_binned            3040 non-null   category
 7   ewm_home_team_goals_binned           3024 non-null   category
 8   ewm_away_team_goals_conceded_binned  3022 non-null   category
dtypes: category(9)
memory usage: 28.6 KB


In [3]:
# Hold out 20% of the rows for validation; the split is seeded and stratified on y.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [4]:
def evaluate_model(X_train, X_val, y_train, y_val):
    """Fit an XGBoost classifier and return its weighted F1 on the validation split.

    Object- and category-typed columns of ``X_train`` are one-hot encoded with an
    encoder fitted on the training split only; the same fitted encoder is then
    applied to ``X_val``.

    Parameters
    ----------
    X_train, X_val : pandas.DataFrame
        Training and validation features.
    y_train, y_val : array-like
        Training and validation targets.

    Returns
    -------
    float
        ``f1_score(..., average='weighted')`` of the predictions on ``X_val``.
    """
    categorical_columns = X_train.select_dtypes(include=['object', 'category']).columns
    one_hot = ce.OneHotEncoder(
        cols=categorical_columns,
        use_cat_names=True,
        drop_invariant=True,
        return_df=True,
    )

    train_features = one_hot.fit_transform(X_train, y_train)
    val_features = one_hot.transform(X_val)

    # NOTE(review): the validation split is passed in eval_set (with
    # early_stopping_rounds set) and is also the data the returned score is
    # computed on -- confirm that this is intended.
    classifier = XGBClassifier(
        random_state=42,
        enable_categorical=True,
        early_stopping_rounds=100,
    )
    classifier.fit(
        train_features,
        y_train,
        eval_set=[(train_features, y_train), (val_features, y_val)],
        verbose=0,
    )

    val_predictions = classifier.predict(val_features)
    return f1_score(y_val, val_predictions, average='weighted')

In [5]:
# Baseline: score the model on the split exactly as loaded (no imputation step).
evaluate_model(X_train=X_train, X_val=X_val, y_train=y_train, y_val=y_val)

0.5322813298157089

In [12]:
def evaluate_imputation(imputer, X_train, X_val, y_train, y_val, num_cols, cat_cols):
    """Impute missing values with ``imputer`` and score the result with ``evaluate_model``.

    ``imputer`` is fitted on the ``num_cols`` of the training split and then
    applied to the validation split. When ``cat_cols`` is non-empty, those
    columns are filled separately with ``SimpleImputer(strategy='most_frequent')``,
    also fitted on the training split only.

    Parameters
    ----------
    imputer : object
        Unfitted imputer exposing ``fit_transform`` and ``transform``.
    X_train, X_val : pandas.DataFrame
        Training and validation features.
    y_train, y_val : pandas.Series or pandas.DataFrame
        Training and validation targets (must support ``reset_index``).
    num_cols : sequence of str
        Columns handed to ``imputer``.
    cat_cols : sequence of str
        Columns filled with their most frequent value; may be empty.

    Returns
    -------
    The value returned by ``evaluate_model`` for the imputed data.
    """
    # Work on list copies so that a pandas Index can be passed as well: the
    # truthiness test below is ambiguous (raises ValueError) on an Index.
    num_cols = list(num_cols)
    cat_cols = list(cat_cols)

    # The imputed frames are rebuilt with a fresh 0..n-1 index, so features and
    # targets are reset to the same positional index to stay aligned.
    X_train = X_train.reset_index(drop=True)
    X_val = X_val.reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)
    y_val = y_val.reset_index(drop=True)

    X_train_imputed = pd.DataFrame(imputer.fit_transform(X_train[num_cols]), columns=num_cols)
    X_val_imputed = pd.DataFrame(imputer.transform(X_val[num_cols]), columns=num_cols)

    # Previously `len(cat_cols) is not 0`: an identity comparison against an int
    # literal, which only works by CPython's small-int caching.
    if cat_cols:
        most_frequent_imputer = SimpleImputer(strategy='most_frequent')
        X_train_cat = pd.DataFrame(
            most_frequent_imputer.fit_transform(X_train[cat_cols]), columns=cat_cols
        )
        X_val_cat = pd.DataFrame(
            most_frequent_imputer.transform(X_val[cat_cols]), columns=cat_cols
        )

        X_train_imputed = pd.concat([X_train_imputed, X_train_cat], axis=1)
        X_val_imputed = pd.concat([X_val_imputed, X_val_cat], axis=1)

    return evaluate_model(X_train_imputed, X_val_imputed, y_train, y_val)

In [13]:
# Hyper-parameter grids searched for each imputer.
knn_params = {
    'n_neighbors': [2, 5, 10],
    'weights': ['uniform', 'distance']
}

iterative_params = {
    'max_iter': [10, 20, 30],
    'initial_strategy': ['mean', 'median', 'most_frequent'],
    'n_nearest_features': [1, 3, 5, 7, 10]
}

# (label, imputer class, searched grid, fixed keyword arguments).
# IterativeImputer selects its n_nearest_features at random, so it is seeded to
# make the scores reproducible; the seed is kept out of the grid so the result
# labels stay unchanged. KNNImputer takes no random_state.
imputers_to_test = [
    ('KNN', KNNImputer, knn_params, {}),
    ('Iterative', IterativeImputer, iterative_params, {'random_state': 42}),
]

# Column groups required by evaluate_imputation. They were not defined in any
# earlier cell, so they are derived here from the dtypes of X_train.
# NOTE(review): assumes every non object/category column is numeric and should
# go through the imputer under test -- confirm against the intended column lists.
cat_cols = X_train.select_dtypes(include=['object', 'category']).columns.tolist()
num_cols = [column for column in X_train.columns if column not in cat_cols]

f1_scores = {}

for name, imputer_class, params, fixed_kwargs in imputers_to_test:
    for param_set in ParameterGrid(params):
        # Fresh, unfitted imputer for the current parameter combination.
        imputer = imputer_class(**param_set, **fixed_kwargs)

        f1 = evaluate_imputation(imputer, X_train, X_val, y_train, y_val, num_cols, cat_cols)

        # Key each score by imputer name plus the searched parameters.
        param_set_str = ', '.join(f'{key}={value}' for key, value in param_set.items())
        f1_scores[f'{name} ({param_set_str})'] = f1

# Print the results
for name, score in f1_scores.items():
    print(f"{name} Imputation: F1 Score = {score}")

KNN (n_neighbors=2, weights=uniform) Imputation: F1 Score = 0.4901854057809184
KNN (n_neighbors=2, weights=distance) Imputation: F1 Score = 0.4901854057809184
KNN (n_neighbors=5, weights=uniform) Imputation: F1 Score = 0.47736031996968437
KNN (n_neighbors=5, weights=distance) Imputation: F1 Score = 0.47739212357839955
KNN (n_neighbors=10, weights=uniform) Imputation: F1 Score = 0.4660618589943719
KNN (n_neighbors=10, weights=distance) Imputation: F1 Score = 0.4831382575158494
Iterative (initial_strategy=mean, max_iter=10, n_nearest_features=1) Imputation: F1 Score = 0.4848433865472411
Iterative (initial_strategy=mean, max_iter=10, n_nearest_features=3) Imputation: F1 Score = 0.4873945876086695
Iterative (initial_strategy=mean, max_iter=10, n_nearest_features=5) Imputation: F1 Score = 0.49791926173899503
Iterative (initial_strategy=mean, max_iter=10, n_nearest_features=7) Imputation: F1 Score = 0.4870576378499358
Iterative (initial_strategy=mean, max_iter=10, n_nearest_features=10) Impu

In [None]:
output_dir = "../../data/imputed/"

# exist_ok=True replaces the separate exists() check and is safe on re-runs.
os.makedirs(output_dir, exist_ok=True)

# `knn_imputer` was not defined in any earlier cell, so it is created here.
# NOTE(review): n_neighbors=2 is the best-scoring KNN setting in the recorded
# grid-search output; that output also shows the imputed runs scoring below the
# un-imputed baseline, so confirm this is the configuration to export.
knn_imputer = KNNImputer(n_neighbors=2, weights='uniform')

# Mirror evaluate_imputation: KNN for the non-categorical columns, most-frequent
# for object/category columns (KNNImputer only accepts numeric input).
export_cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
export_num_cols = [column for column in X.columns if column not in export_cat_cols]

imputed_parts = [
    pd.DataFrame(
        knn_imputer.fit_transform(X[export_num_cols]),
        columns=export_num_cols,
        index=X.index,
    )
]
if export_cat_cols:
    most_frequent_imputer = SimpleImputer(strategy='most_frequent')
    imputed_parts.append(
        pd.DataFrame(
            most_frequent_imputer.fit_transform(X[export_cat_cols]),
            columns=export_cat_cols,
            index=X.index,
        )
    )

# Re-assemble in the original column order.
X_imputed = pd.concat(imputed_parts, axis=1)[X.columns]

X_imputed.to_csv(os.path.join(output_dir, "df.csv"), index=False)
y.to_csv(os.path.join(output_dir, 'y.csv'), index=False)