In [None]:
import pandas as pd
import numpy as np
from pandas.api.types import CategoricalDtype
from sklearn.impute import KNNImputer
from tqdm.notebook import tqdm 

# One-hot encoding

In [3]:
# ----- INPUTS AND OUTPUTS -----
input_file = 'Data/CTDC data/CTDC_default_filled.csv'
output_file = 'Data/CTDC data/CTDC_default_filled_one_hot.csv'

# ----- READ THE DATA -----
df = pd.read_csv(input_file)

# ----- DEFINE THE CATEGORIES -----
gender_categories = ["Man", "Woman", "Trans/Transgender/NonConforming"]
ageBroad_categories = ["0--8", "9--17", "18--20", "21--23", "24--26", "27--29", "30--38", "39--47", "48+"]
traffickMonths_categories = ["0--12 (0-1 yr)", "13--24 (1-2 yrs)", "25+ (2+ yrs)"]

# Convert columns to categorical with the predefined categories
df['gender'] = df['gender'].astype(CategoricalDtype(categories=gender_categories, ordered=False))
df['ageBroad'] = df['ageBroad'].astype(CategoricalDtype(categories=ageBroad_categories, ordered=False))
df['traffickMonths'] = df['traffickMonths'].astype(CategoricalDtype(categories=traffickMonths_categories, ordered=False))

# ----- UPDATED ONE-HOT ENCODING FUNCTION -----
def one_hot_encode(series, prefix, full_categories=None):
    """
    One-hot encode a pandas Series. If full_categories is provided, reindex to include
    all expected dummy columns. For rows where the original series is NaN, set all dummy
    values to NaN. Convert non-missing values to a nullable integer type.
    """
    dummies = pd.get_dummies(series, prefix=prefix)
    if full_categories is not None:
        expected_cols = [f"{prefix}_{cat}" for cat in full_categories]
        dummies = dummies.reindex(columns=expected_cols, fill_value=0)
    # For rows where the original series is NaN, set all dummy columns to np.nan.
    dummies.loc[series.isna(), :] = np.nan
    # Convert to a nullable integer type so that non-missing values are integers and missing remain NaN.
    dummies = dummies.astype("Int64")
    return dummies

# ----- GET UNIQUE CATEGORIES FOR 'citizenship' & 'CountryOfExploitation' -----
# Sorted list of only those that actually occur in the dataset.
citizenship_categories = sorted(df['citizenship'].dropna().unique())
coe_categories = sorted(df['CountryOfExploitation'].dropna().unique())

# ----- ONE-HOT ENCODE THE SPECIFIED COLUMNS -----
gender_dummies = one_hot_encode(df['gender'], 'gender', full_categories=gender_categories)
ageBroad_dummies = one_hot_encode(df['ageBroad'], 'ageBroad', full_categories=ageBroad_categories)
traffickMonths_dummies = one_hot_encode(df['traffickMonths'], 'traffickMonths', full_categories=traffickMonths_categories)
citizenship_dummies = one_hot_encode(df['citizenship'], 'citizenship', full_categories=citizenship_categories)
coe_dummies = one_hot_encode(df['CountryOfExploitation'], 'CountryOfExploitation', full_categories=coe_categories)

# ----- DROP ORIGINAL COLUMNS THAT WERE ONE-HOT ENCODED -----
df = df.drop(columns=['gender', 'ageBroad', 'traffickMonths', 'citizenship', 'CountryOfExploitation'])

# ----- CONCATENATE THE DUMMY COLUMNS -----
df_encoded = pd.concat([
    df,
    gender_dummies,
    ageBroad_dummies,
    traffickMonths_dummies,
    citizenship_dummies,
    coe_dummies
], axis=1)

# ----- PRINT DF SHAPE AND VARIABLE NAMING CONVENTIONS -----
print(f"Original df shape: {df.shape}")
print(f"One-hot encoded df shape: {df_encoded.shape}")

print("Variable Naming Conventions Used:")
print("New columns follow the format: <originalVariable>_<value>")
print("   - For 'gender':")
print("       'gender_Man', 'gender_Woman', 'gender_Trans/Transgender/NonConforming'")
print("   - For 'ageBroad':")
print("       'ageBroad_0--8', 'ageBroad_9--17', 'ageBroad_18--20', 'ageBroad_21--23',")
print("       'ageBroad_24--26', 'ageBroad_27--29', 'ageBroad_30--38', 'ageBroad_39--47', 'ageBroad_48+'")
print("   - For 'traffickMonths':")
print("       'traffickMonths_0--12 (0-1 yr)', 'traffickMonths_13--24 (1-2 yrs)', 'traffickMonths_25+ (2+ yrs)'")
print("   - For 'citizenship':")
print("       Columns are named 'citizenship_<ObservedCountryCode>'")
print("   - For 'CountryOfExploitation':")
print("       Columns are named 'CountryOfExploitation_<ObservedCountryCode>'")
print("\n*Note:* For any cell that is empty in the original data, all corresponding one-hot columns are set to NaN.")

num_zeros = (df_encoded == 0).sum().sum()
num_ones = (df_encoded == 1).sum().sum()
num_nans = df_encoded.isna().sum().sum()

print("\nCell Counts Across the Entire Sheet:")
print(f"Number of cells with 0: {num_zeros}")
print(f"Number of cells with 1: {num_ones}")
print(f"Number of cells with NaN: {num_nans}")

# ----- SAVE THE NEW DATAFRAME TO A CSV FILE -----
df_encoded.to_csv(output_file, index=False)
print(f"\nData saved successfully to '{output_file}'.")

  df = pd.read_csv(input_file)
  dummies.loc[series.isna(), :] = np.nan
  dummies.loc[series.isna(), :] = np.nan
  dummies.loc[series.isna(), :] = np.nan
  dummies.loc[series.isna(), :] = np.nan
  dummies.loc[series.isna(), :] = np.nan
  dummies.loc[series.isna(), :] = np.nan
  dummies.loc[series.isna(), :] = np.nan
  dummies.loc[series.isna(), :] = np.nan
  dummies.loc[series.isna(), :] = np.nan
  dummies.loc[series.isna(), :] = np.nan
  dummies.loc[series.isna(), :] = np.nan
  dummies.loc[series.isna(), :] = np.nan
  dummies.loc[series.isna(), :] = np.nan
  dummies.loc[series.isna(), :] = np.nan
  dummies.loc[series.isna(), :] = np.nan
  dummies.loc[series.isna(), :] = np.nan
  dummies.loc[series.isna(), :] = np.nan
  dummies.loc[series.isna(), :] = np.nan
  dummies.loc[series.isna(), :] = np.nan
  dummies.loc[series.isna(), :] = np.nan
  dummies.loc[series.isna(), :] = np.nan
  dummies.loc[series.isna(), :] = np.nan
  dummies.loc[series.isna(), :] = np.nan
  dummies.loc[series.isna(

Original df shape: (238619, 22)
One-hot encoded df shape: (238619, 159)
Variable Naming Conventions Used:
New columns follow the format: <originalVariable>_<value>
   - For 'gender':
       'gender_Man', 'gender_Woman', 'gender_Trans/Transgender/NonConforming'
   - For 'ageBroad':
       'ageBroad_0--8', 'ageBroad_9--17', 'ageBroad_18--20', 'ageBroad_21--23',
       'ageBroad_24--26', 'ageBroad_27--29', 'ageBroad_30--38', 'ageBroad_39--47', 'ageBroad_48+'
   - For 'traffickMonths':
       'traffickMonths_0--12 (0-1 yr)', 'traffickMonths_13--24 (1-2 yrs)', 'traffickMonths_25+ (2+ yrs)'
   - For 'citizenship':
       Columns are named 'citizenship_<ObservedCountryCode>'
   - For 'CountryOfExploitation':
       Columns are named 'CountryOfExploitation_<ObservedCountryCode>'

*Note:* For any cell that is empty in the original data, all corresponding one-hot columns are set to NaN.

Cell Counts Across the Entire Sheet:
Number of cells with 0: 21206451
Number of cells with 1: 1158915
Number 

In [4]:
# Print the distribution of 'yearOfRegistration' including NaN
print("\nNumber of each year in 'yearOfRegistration' (including NaN):")
print(df_encoded['yearOfRegistration'].value_counts(dropna=False))


Number of each year in 'yearOfRegistration' (including NaN):
yearOfRegistration
NaN       32037
2016.0    32025
2019.0    29850
2020.0    25070
2018.0    20011
2021.0    18591
2017.0    16534
2022.0    15338
2015.0     8934
2014.0     5986
2011.0     4116
2013.0     4090
2005.0     3980
2010.0     3730
2006.0     3304
2007.0     3124
2012.0     2903
2008.0     2385
2009.0     2178
2002.0     1838
2004.0     1630
2003.0      965
Name: count, dtype: int64


# Imputation (kNN, k = 2 to 6, 7 subsamples)

In [6]:
input_file = "Data/CTDC data/CTDC_default_filled_one_hot.csv"
df = pd.read_csv(input_file)

# Define columns to impute (all except "yearOfRegistration")
cols_to_impute = [col for col in df.columns if col != "yearOfRegistration"]

['meansDebtBondageEarnings', 'meansThreats', 'meansAbusePsyPhySex', 'meansFalsePromises', 'meansDrugsAlcohol', 'meansDenyBasicNeeds', 'meansExcessiveWorkHours', 'meansWithholdDocs', 'isForcedLabour', 'isSexualExploit', 'isOtherExploit', 'typeOfLabourAgriculture', 'typeOfLabourConstruction', 'typeOfLabourDomesticWork', 'typeOfLabourHospitality', 'typeOfSexProstitution', 'typeOfSexPornography', 'recruiterRelationIntimatePartner', 'recruiterRelationFriend', 'recruiterRelationFamily', 'recruiterRelationOther', 'gender_Man', 'gender_Woman', 'gender_Trans/Transgender/NonConforming', 'ageBroad_0--8', 'ageBroad_9--17', 'ageBroad_18--20', 'ageBroad_21--23', 'ageBroad_24--26', 'ageBroad_27--29', 'ageBroad_30--38', 'ageBroad_39--47', 'ageBroad_48+', 'traffickMonths_0--12 (0-1 yr)', 'traffickMonths_13--24 (1-2 yrs)', 'traffickMonths_25+ (2+ yrs)', 'citizenship_AFG', 'citizenship_ALB', 'citizenship_BDI', 'citizenship_BGD', 'citizenship_BGR', 'citizenship_BLR', 'citizenship_BOL', 'citizenship_BRA', 

In [None]:
overall_output_file = "Data/CTDC data/CTDC_kNNavg7k26_impute.csv"

# Number of subsamples and range of k-values
n_subsamples = 7
k_values = range(2, 7)

# List to hold the averaged imputed data (only for imputed columns) for each subsample
subsample_imputed_list = []

# Outer loop: iterate over subsamples
for i in tqdm(range(n_subsamples), desc="Processing subsamples"):
    # Stratified sampling by 'yearOfRegistration' (including NaN as its own category)
    subsample = df.groupby('yearOfRegistration', dropna=False, group_keys=False).apply(
        lambda x: x.sample(frac=1, replace=True)
    )
    
    # List to hold imputed data for each k within the current subsample
    k_imputations = []
    
    # Inner loop: iterate over each k-value
    for k in tqdm(k_values, desc="Processing k-values", leave=False):
        imputer = KNNImputer(n_neighbors=k)
        # Fit the imputer on the subsample
        imputer.fit(subsample[cols_to_impute])
        # Impute the entire dataset
        imputed_values = imputer.transform(df[cols_to_impute])
        k_imputations.append(imputed_values)
    
    # Average imputed data for the current subsample (averaging over k-values)
    avg_imputed_subsample = np.mean(np.stack(k_imputations, axis=0), axis=0)
    
    # Combine the imputed subset with the unchanged "yearOfRegistration" column
    df_imputed_avg = pd.concat(
        [df[["yearOfRegistration"]], pd.DataFrame(avg_imputed_subsample, columns=cols_to_impute)],
        axis=1
    )
    
    # Save this subsample average imputed data as a CSV file (for reference)
    subsample_output_file = f"Data/CTDC data/kNN temp/CTDC_kNNavg7k26_impute_sample{i+1}.csv"
    df_imputed_avg.to_csv(subsample_output_file, index=False)
    print(f"Saved subsample {i+1} imputed dataset to {subsample_output_file}")
    
    # Append the imputed (subset) array to our list
    subsample_imputed_list.append(avg_imputed_subsample)

# After processing all subsamples, average the 7 subsample averages (for imputed columns)
final_avg_imputed_data = np.mean(np.stack(subsample_imputed_list, axis=0), axis=0)
df_final_avg = pd.concat(
    [df[["yearOfRegistration"]], pd.DataFrame(final_avg_imputed_data, columns=cols_to_impute)],
    axis=1
)

# Sort the final averaged DataFrame by "yearOfRegistration" in decreasing order
df_final_avg = df_final_avg.sort_values("yearOfRegistration", ascending=False)

# Save the overall averaged imputed dataset (without binary thresholding)
df_final_avg.to_csv(overall_output_file, index=False)
print(f"Saved overall averaged imputed dataset to {overall_output_file}")

  subsample = df.groupby('yearOfRegistration', dropna=False, group_keys=False).apply(


KeyboardInterrupt: 

In [None]:
# ----- INPUTS AND OUTPUTS FOR BINARY ARGMAX THRESHOLDING -----
input_file = "Data/CTDC data/CTDC_kNNavg7k26_impute.csv"
output_file = "Data/CTDC data/CTDC_kNNavg7k26_impute_binary.csv"

# Read the previously saved imputed dataset
df_imputed = pd.read_csv(input_file)

# Define the variable groups (each group corresponds to columns that start with these prefixes)
groups = ["gender", "ageBroad", "traffickMonths", "citizenship", "CountryOfExploitation"]

# Process each group separately
for group in groups:
    # Identify the columns for the current group (e.g., "gender_Man", "gender_Woman", etc.)
    group_cols = [col for col in df_imputed.columns if col.startswith(group + "_")]
    
    # Compute the prevalence for each column in the group from the imputed data
    prevalence = {col: df_imputed[col].mean() for col in group_cols}
    
    # Define a function to process a row for the current group
    def process_row(row):
        values = row[group_cols]
        # If all values in the group are NaN, then leave them unchanged
        if values.isnull().all():
            return values
        # Find the maximum value in this row for the group
        max_val = values.max()
        # Find all candidate columns that equal the maximum value
        candidates = values[values == max_val].index.tolist()
        # Break tie: if multiple candidates, pick the one with the highest prevalence
        if len(candidates) > 1:
            chosen = max(candidates, key=lambda col: prevalence[col])
        else:
            chosen = candidates[0]
        # Create a new Series for this group with all zeros and set the chosen column to 1
        new_vals = pd.Series(0, index=group_cols)
        new_vals[chosen] = 1
        return new_vals
    
    # Apply the processing row-wise for this group and assign back to the dataframe
    df_imputed[group_cols] = df_imputed.apply(lambda row: process_row(row), axis=1)

# Save the final binary argmax-thresholded dataset
df_imputed.to_csv(output_file, index=False)
print(f"Saved binary argmax thresholded imputed dataset to {output_file}")