In [7]:
import pandas as pd
import numpy as np
from pandas.api.types import CategoricalDtype
from sklearn.impute import KNNImputer
from tqdm.notebook import tqdm

## Test

In [2]:
# ----- PARAMETERS -----
remove_percent = 99
keep_frac = 1 - remove_percent / 100
input_file = "Data/CTDC data/CTDC_default_filled_one_hot.csv"
output_file = "Data/CTDC data/CTDC_default_filled_one_hot_test.csv"

# ----- READ CSV -----
df_test = pd.read_csv(input_file)

# ----- GROUPING AND SAMPLING -----
# Group by 'yearOfRegistration'
df_reduced = df_test.groupby('yearOfRegistration', dropna=False, group_keys=False).apply(
    lambda group: group.sample(frac=keep_frac, random_state=42)
)

# ----- SAVE THE RESULT -----
df_reduced.to_csv(output_file, index=False)
print(f"Saved reduced CSV with {remove_percent}% rows removed per group to {output_file}")


Saved reduced CSV with 99% rows removed per group to Data/CTDC data/CTDC_default_filled_one_hot_test.csv


  df_reduced = df_test.groupby('yearOfRegistration', dropna=False, group_keys=False).apply(


## Main

In [None]:
input_file = "Data/CTDC data/CTDC_default_filled_one_hot.csv"
df = pd.read_csv(input_file)

# Define columns to impute (all except "yearOfRegistration")
cols_to_impute = [col for col in df.columns if col != "yearOfRegistration"]

In [None]:
# Number of subsamples & k-values
n_subsamples = 0  # set to 0 for no subsampling, > 0 for bootstrapping
k_values = [3, 5, 7]
k_values_str = ",".join(map(str, k_values))

# Templates for saving outputs
partial_subsample_template = (
    "Data/CTDC data/kNN temp/CTDC_kNNavg{n_subsamples}k{k_values_str}_impute_sample{i}.csv"
)
per_k_template_if_no_subsamples = (
    "Data/CTDC data/kNN temp/CTDC_kNNavgk{k_values_str}_impute_k{k}.csv"
)

# Final binary output file after binary argmax thresholding
overall_binary_output_file = f"Data/CTDC data/CTDC_kNNavg{n_subsamples}k{k_values_str}_impute_final.csv"

# ---------------------------
# KNN IMPUTATION
# ---------------------------
if n_subsamples == 0:
    # --- NO SUBSAMPLING ---
    print("No subsamples -> Imputing the full dataset for each k, saving separate CSVs.")
    
    # List to hold the imputed arrays for each k
    all_imputations = []
    
    for k in tqdm(k_values, desc="k-values"):
        imputer = KNNImputer(n_neighbors=k)
        imputer.fit(df[cols_to_impute])
        imputed_values = imputer.transform(df[cols_to_impute])
        all_imputations.append(imputed_values)
        
        # Merge imputed columns with the original 'yearOfRegistration'
        df_imputed = pd.concat(
            [df[["yearOfRegistration"]], 
             pd.DataFrame(imputed_values, columns=cols_to_impute)],
            axis=1
        )
        df_imputed = df_imputed.sort_values("yearOfRegistration", ascending=False)
        
        # Save per-k imputed dataset
        k_output_file = per_k_template_if_no_subsamples.format(
            k_values_str=k_values_str, k=k
        )
        df_imputed.to_csv(k_output_file, index=False)
        print(f"Saved imputed dataset for k={k} -> {k_output_file}")

    # Overall average across the k values
    avg_imputed = np.mean(np.stack(all_imputations, axis=0), axis=0)
    df_final = pd.concat(
        [df[["yearOfRegistration"]],
         pd.DataFrame(avg_imputed, columns=cols_to_impute)],
        axis=1
    )
    df_final = df_final.sort_values("yearOfRegistration", ascending=False)

else:
    # --- WITH SUBSAMPLING ---
    print(f"Performing {n_subsamples} subsamples (bootstrap) with k-values {k_values_str}.")
    subsample_imputed_list = []

    for i in tqdm(range(1, n_subsamples + 1), desc="Subsamples"):
        # Stratified subsampling by 'yearOfRegistration'
        subsample = df.groupby('yearOfRegistration', dropna=False, group_keys=False).apply(
            lambda x: x.sample(frac=1, replace=True)
        )
        
        # For each subsample, collect imputations for every k value
        k_imputations = []
        for k in tqdm(k_values, desc=f"Subsample {i} k-values", leave=False):
            imputer = KNNImputer(n_neighbors=k)
            imputer.fit(subsample[cols_to_impute])
            # Note: Imputation is applied on the full dataset even though imputer was fit on the subsample
            imputed_values = imputer.transform(df[cols_to_impute])
            k_imputations.append(imputed_values)
        
        # Average the imputed results for the current subsample across k values
        avg_imputed_subsample = np.mean(np.stack(k_imputations, axis=0), axis=0)
        df_sub_imputed = pd.concat(
            [df[["yearOfRegistration"]],
             pd.DataFrame(avg_imputed_subsample, columns=cols_to_impute)],
            axis=1
        )
        df_sub_imputed = df_sub_imputed.sort_values("yearOfRegistration", ascending=False)
        
        # Save the partial result for the subsample for inspection
        subsample_output_file = partial_subsample_template.format(
            n_subsamples=n_subsamples, k_values_str=k_values_str, i=i
        )
        df_sub_imputed.to_csv(subsample_output_file, index=False)
        print(f"Saved partial subsample {i} -> {subsample_output_file}")
        
        subsample_imputed_list.append(avg_imputed_subsample)
    
    # Final overall average across all subsamples
    final_avg_imputed_data = np.mean(np.stack(subsample_imputed_list, axis=0), axis=0)
    df_final = pd.concat(
        [df[["yearOfRegistration"]],
         pd.DataFrame(final_avg_imputed_data, columns=cols_to_impute)],
        axis=1
    )
    df_final = df_final.sort_values("yearOfRegistration", ascending=False)

# ---------------------------
# BINARY ARGMAX THRESHOLDING FOR SELECTED GROUPS
# ---------------------------
# Define the groups (columns starting with these prefixes will be processed using argmax logic)
groups = ["gender", "ageBroad", "traffickMonths", "citizenship", "CountryOfExploitation"]

for group in groups:
    # Identify columns for the current group (e.g. "gender_Man", "gender_Woman", etc.)
    group_cols = [col for col in df_final.columns if col.startswith(group + "_")]
    
    # Compute "prevalence" as the mode for each column in the group
    prevalence = {}
    for col in group_cols:
        mode_series = df_final[col].mode()
        prevalence[col] = mode_series.iloc[0] if not mode_series.empty else 0
    
    # Function to process one row: select the candidate with the maximum value, 
    # and if tied, choose the one with the highest (mode) prevalence
    def process_row(row):
        values = row[group_cols]
        if values.isnull().all():
            return values
        max_val = values.max()
        candidates = values[values == max_val].index.tolist()
        chosen = max(candidates, key=lambda col: prevalence[col]) if len(candidates) > 1 else candidates[0]
        # Create a new Series: set the chosen column to 1 and the rest to 0.
        new_vals = pd.Series(0, index=group_cols)
        new_vals[chosen] = 1
        return new_vals
    
    df_final[group_cols] = df_final.apply(lambda row: process_row(row), axis=1)

# ---------------------------
# BINARY THRESHOLDING FOR REMAINING COLUMNS
# ---------------------------
# Identify all remaining columns (excluding "yearOfRegistration" and the processed group columns)
remaining_cols = [col for col in df_final.columns 
                  if col != "yearOfRegistration" 
                  and not any(col.startswith(g + "_") for g in groups)]

# Process each remaining column individually
for col in remaining_cols:
    mode_series = df_final[col].mode()
    col_mode = mode_series.iloc[0] if not mode_series.empty else 0
    def threshold_func(x):
        if x < 0.5:
            return 0
        elif x > 0.5:
            return 1
        else:
            return col_mode
    df_final[col] = df_final[col].apply(threshold_func)

# ---------------------------
# SAVE FINAL BINARY OUTPUT
# ---------------------------
df_final.to_csv(overall_binary_output_file, index=False)
print(f"Saved final binary argmax thresholded imputed dataset -> {overall_binary_output_file}")

No subsamples -> Imputing the full dataset for each k, saving separate CSVs.


k-values:   0%|          | 0/3 [00:00<?, ?it/s]

Saved imputed dataset for k=3 -> Data/CTDC data/kNN temp/CTDC_kNNavgk3,5,7_impute_k3_test.csv
Saved imputed dataset for k=5 -> Data/CTDC data/kNN temp/CTDC_kNNavgk3,5,7_impute_k5_test.csv
Saved imputed dataset for k=7 -> Data/CTDC data/kNN temp/CTDC_kNNavgk3,5,7_impute_k7_test.csv
Saved final binary argmax thresholded imputed dataset -> Data/CTDC data/CTDC_kNNavg0k3,5,7_impute_final_test.csv
