In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.metrics import f1_score

from xgboost import XGBClassifier

import warnings
warnings.filterwarnings('ignore')

In [2]:
# X = pd.read_csv('../../data/start_dataset.csv')
# y = pd.read_csv('../../data/y.csv')

X = pd.read_csv('../../data/binned/df.csv')
y = pd.read_csv('../../data/binned/y.csv')

In [3]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [4]:
num_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
cat_cols = X_train.select_dtypes(include=['object', 'category']).columns

In [6]:
simple_imputer_mean = SimpleImputer(strategy='mean')
df_simple_mean_train = pd.DataFrame(simple_imputer_mean.fit_transform(X_train[num_cols]), columns=X_train[num_cols].columns)
df_simple_mean_val = pd.DataFrame(simple_imputer_mean.transform(X_val[num_cols]), columns=X_val[num_cols].columns)
# df_simple_mean_tst = pd.DataFrame(simple_imputer_mean.transform(df_tst_dropped), columns=df_tst_dropped.columns)

simple_imputer_median = SimpleImputer(strategy='median')
df_simple_median_train = pd.DataFrame(simple_imputer_median.fit_transform(X_train[num_cols]), columns=X_train[num_cols].columns)
df_simple_median_val = pd.DataFrame(simple_imputer_median.transform(X_val[num_cols]), columns=X_val[num_cols].columns)
# df_simple_median_tst = pd.DataFrame(simple_imputer_median.transform(df_tst_dropped), columns=df_tst_dropped.columns)

# 2. KNN Imputer
knn_imputer = KNNImputer(n_neighbors=5)
df_knn_imputed_train = pd.DataFrame(knn_imputer.fit_transform(X_train[num_cols]), columns=X_train[num_cols].columns)
df_knn_imputed_val = pd.DataFrame(knn_imputer.transform(X_val[num_cols]), columns=X_val[num_cols].columns)
# df_knn_imputed_tst = pd.DataFrame(knn_imputer.fit_transform(df_tst_dropped), columns=df_tst_dropped.columns)

# 4. Using pandas fillna (Mean, Median, Mode)
df_filled_mean_train = X_train[num_cols].fillna(X_train[num_cols].mean())
df_filled_median_train = X_train[num_cols].fillna(X_train[num_cols].median())
df_filled_mode_train = X_train[num_cols].fillna(X_train[num_cols].mode().iloc[0])

df_filled_mean_val = X_val[num_cols].fillna(X_val[num_cols].mean())
df_filled_median_val = X_val[num_cols].fillna(X_val[num_cols].median())
df_filled_mode_val = X_val.fillna(X_val[num_cols].mode().iloc[0])

# df_filled_mean_tst = df_tst_dropped.fillna(df_tst_dropped.mean())
# df_filled_median_tst = df_tst_dropped.fillna(df_tst_dropped.median())
# df_filled_mode_tst = df_tst_dropped.fillna(df_tst_dropped.mode().iloc[0])

iterative_imputer = IterativeImputer()
train_iterative_imputed = pd.DataFrame(iterative_imputer.fit_transform(X_train), columns=X_train.columns)

val_df_iterative_imputed = iterative_imputer.transform(X_val)
val_iterative_imputed = pd.DataFrame(iterative_imputer.fit_transform(X_val), columns=X_val.columns)

ValueError: could not convert string to float: 'Bin4'

In [None]:
dfs_to_test = [
    ('Mean', df_filled_mean_train, df_filled_mean_val),
    ('Median', df_filled_median_train, df_filled_median_val),
    ('Mode', df_filled_mode_train, df_filled_mode_val),
    ('Simple Imputer Mean', df_simple_mean_train, df_simple_mean_val),
    ('Simple Imputer Median', df_simple_median_train, df_simple_median_val),
    ('KNN Imputer', df_knn_imputed_train, df_knn_imputed_val),
    ('Iterative Imputer', train_iterative_imputed, val_iterative_imputed),
]

xgb = XGBClassifier(random_state=42, enable_categorical=True)

f1_scores = {}

for name, df_train, df_val in dfs_to_test:
    xgb = XGBClassifier(random_state=42, enable_categorical='True')
    xgb.fit(df_train, y_train)
    
    y_pred = xgb.predict(df_val)
    f1 = f1_score(y_val, y_pred, average='weighted')
    
    f1_scores[name] = f1

for name, score in f1_scores.items():
    print(f"{name} Imputation: F1 Score = {score}")

In [None]:
import os

output_dir = "../../data/imputed/"

if not os.path.exists(output_dir):
    os.makedirs(output_dir)

X_imputed = pd.DataFrame(knn_imputer.fit_transform(X), columns=X.columns)

X_imputed.to_csv(output_dir + "df.csv", index=False)
y.to_csv(output_dir + 'y.csv', index=False)