In [10]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.metrics import f1_score

from xgboost import XGBClassifier

import category_encoders as ce

import warnings
warnings.filterwarnings('ignore')

In [11]:
X = pd.read_csv('../../data/start_dataset.csv')
y = pd.read_csv('../../data/y.csv')

# X = pd.read_csv('../../data/binned/df.csv')
# y = pd.read_csv('../../data/binned/y.csv')

for col in X.filter(like='_binned').columns:
    X[col] = X[col].astype('category')

X.filter(like='_binned').info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3040 entries, 0 to 3039
Empty DataFrame


In [12]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [13]:
def evaluate_model(X_train, X_val, y_train, y_val):
    # encoder = ce.OneHotEncoder(cols=X_train.select_dtypes(include=['object', 'category']).num_cols, use_cat_names=True, drop_invariant=True, return_df=True)

    # X_train_encoded = encoder.fit_transform(X_train, y_train)
    # X_val_encoded = encoder.transform(X_val)
    
    model = XGBClassifier(random_state=42, enable_categorical=True, early_stopping_rounds=100)
    model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)], verbose=0)
    y_pred = model.predict(X_val)
    return f1_score(y_val, y_pred, average='weighted')

In [14]:
evaluate_model(X_train, X_val, y_train, y_val)

0.647575516121548

In [15]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3040 entries, 0 to 3039
Data columns (total 38 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   player_rating_home_player_1   3040 non-null   int64  
 1   player_rating_home_player_2   3040 non-null   int64  
 2   player_rating_home_player_3   3040 non-null   int64  
 3   player_rating_home_player_4   3040 non-null   int64  
 4   player_rating_home_player_5   3040 non-null   int64  
 5   player_rating_home_player_6   3040 non-null   int64  
 6   player_rating_home_player_7   3040 non-null   int64  
 7   player_rating_home_player_8   3040 non-null   int64  
 8   player_rating_home_player_9   3040 non-null   int64  
 9   player_rating_home_player_10  3040 non-null   int64  
 10  player_rating_home_player_11  3040 non-null   int64  
 11  player_rating_away_player_1   3040 non-null   int64  
 12  player_rating_away_player_2   3040 non-null   int64  
 13  pla

In [16]:
def evaluate_imputation(imputer, X_train, X_val, y_train, y_val, num_cols, cat_cols):
    X_train = X_train.reset_index(drop=True)
    X_val = X_val.reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)
    y_val = y_val.reset_index(drop=True)
    
    X_train_num = pd.DataFrame(imputer.fit_transform(X_train[num_cols]), columns=num_cols)
    X_val_num = pd.DataFrame(imputer.transform(X_val[num_cols]), columns=num_cols)

    if len(cat_cols) is not 0:
        categorical_columns = ['avg_home_team_rating_x_ewm_shoton_away_binned', 'average_rating_away_binned', 'average_rating_home_binned', 'ewm_possession_home_binned', 'avg_away_team_rating_binned']

        for column in categorical_columns:
            X_train_num[column] = X_train[column].cat.add_categories('Missing').fillna('Missing')
            X_val_num[column] = X_val[column].cat.add_categories('Missing').fillna('Missing')

    f1 = evaluate_model(X_train_num, X_val_num, y_train, y_val)

    return f1

In [17]:
num_cols = X_train.select_dtypes(include='number').columns.tolist()
cat_cols = X_train.select_dtypes(exclude='number').columns.tolist()

In [18]:
from sklearn.model_selection import ParameterGrid

# Define parameter grids for each imputer
knn_params = {
    'n_neighbors': [2, 5, 10],
    'weights': ['uniform', 'distance']
}

iterative_params = {
    'max_iter': [10, 20, 30],
    'initial_strategy': ['mean', 'median', 'most_frequent'],
    'n_nearest_features': [1, 3, 5, 7, 10]
}

# Imputers to test with their corresponding parameter grids
imputers_to_test = [
    ('KNN', KNNImputer, knn_params),
    ('Iterative', IterativeImputer, iterative_params)
]

f1_scores = {}

for name, imputer_class, params in imputers_to_test:
    for param_set in ParameterGrid(params):
        # Create an imputer instance with the current set of parameters
        imputer = imputer_class(**param_set)

        # Evaluate the imputation method
        f1 = evaluate_imputation(imputer, X_train, X_val, y_train, y_val, num_cols, cat_cols)
        
        # Store the F1 score with a name indicating the imputer and parameter set
        param_set_str = ', '.join(f'{key}={value}' for key, value in param_set.items())
        f1_scores[f'{name} ({param_set_str})'] = f1

# Print the results
for name, score in f1_scores.items():
    print(f"{name} Imputation: F1 Score = {score}")

KNN (n_neighbors=2, weights=uniform) Imputation: F1 Score = 0.6395870535080364
KNN (n_neighbors=2, weights=distance) Imputation: F1 Score = 0.6395870535080364
KNN (n_neighbors=5, weights=uniform) Imputation: F1 Score = 0.6446970296879813
KNN (n_neighbors=5, weights=distance) Imputation: F1 Score = 0.6672692154770643
KNN (n_neighbors=10, weights=uniform) Imputation: F1 Score = 0.6520448644556279
KNN (n_neighbors=10, weights=distance) Imputation: F1 Score = 0.6188444708521674
Iterative (initial_strategy=mean, max_iter=10, n_nearest_features=1) Imputation: F1 Score = 0.6316723090883708
Iterative (initial_strategy=mean, max_iter=10, n_nearest_features=3) Imputation: F1 Score = 0.6474046910279336
Iterative (initial_strategy=mean, max_iter=10, n_nearest_features=5) Imputation: F1 Score = 0.6273095002339423
Iterative (initial_strategy=mean, max_iter=10, n_nearest_features=7) Imputation: F1 Score = 0.6420969905089159
Iterative (initial_strategy=mean, max_iter=10, n_nearest_features=10) Imputat

In [ ]:
knn = KNNImputer(weights='distance')
knn.fit(X_train)

In [19]:
import os

output_dir = "../../data/imputed/"

if not os.path.exists(output_dir):
    os.makedirs(output_dir)

X_imputed = pd.DataFrame(knn_imputer.fit_transform(X), columns=X.columns)

X_imputed.to_csv(output_dir + "df.csv", index=False)
y.to_csv(output_dir + 'y.csv', index=False)

NameError: name 'knn_imputer' is not defined