In [9]:
import pandas as pd
import numpy as np
from pandas.api.types import CategoricalDtype

In [10]:
# ----- INPUTS AND OUTPUTS -----
input_file = 'Data/CTDC data/CTDC_default_filled.csv'
output_file = 'Data/CTDC data/CTDC_default_filled_one_hot.csv'

# ----- READ THE DATA -----
df = pd.read_csv(input_file)

# ----- DEFINE THE CATEGORIES -----
gender_categories = ["Man", "Woman", "Trans/Transgender/NonConforming"]
ageBroad_categories = ["0--8", "9--17", "18--20", "21--23", "24--26", "27--29", "30--38", "39--47", "48+"]
traffickMonths_categories = ["0--12 (0-1 yr)", "13--24 (1-2 yrs)", "25+ (2+ yrs)"]

# Convert columns to categorical with the predefined categories
df['gender'] = df['gender'].astype(CategoricalDtype(categories=gender_categories, ordered=False))
df['ageBroad'] = df['ageBroad'].astype(CategoricalDtype(categories=ageBroad_categories, ordered=False))
df['traffickMonths'] = df['traffickMonths'].astype(CategoricalDtype(categories=traffickMonths_categories, ordered=False))

# ----- ONE-HOT ENCODING FUNCTION -----
def one_hot_encode(series, prefix, full_categories=None):
    """
    One-hot encode a pandas Series. If full_categories is provided, reindex to include
    all expected dummy columns. Then fill missing values with 0 and convert to int.
    """
    dummies = pd.get_dummies(series, prefix=prefix)
    if full_categories is not None:
        expected_cols = [f"{prefix}_{cat}" for cat in full_categories]
        dummies = dummies.reindex(columns=expected_cols, fill_value=0)
    dummies = dummies.fillna(0).astype(int)
    return dummies

# ----- GET UNIQUE CATEGORIES FOR 'citizenship' & 'CountryOfExploitation' -----
# Sorted list of only those that actually occur in the dataset.
citizenship_categories = sorted(df['citizenship'].dropna().unique())
coe_categories = sorted(df['CountryOfExploitation'].dropna().unique())

# ----- ONE-HOT ENCODE THE SPECIFIED COLUMNS -----
gender_dummies = one_hot_encode(df['gender'], 'gender', full_categories=gender_categories)
ageBroad_dummies = one_hot_encode(df['ageBroad'], 'ageBroad', full_categories=ageBroad_categories)
traffickMonths_dummies = one_hot_encode(df['traffickMonths'], 'traffickMonths', full_categories=traffickMonths_categories)
citizenship_dummies = one_hot_encode(df['citizenship'], 'citizenship', full_categories=citizenship_categories)
coe_dummies = one_hot_encode(df['CountryOfExploitation'], 'CountryOfExploitation', full_categories=coe_categories)

# ----- DROP ORIGINAL COLUMNS THAT WERE ONE-HOT ENCODED -----
df = df.drop(columns=['gender', 'ageBroad', 'traffickMonths', 'citizenship', 'CountryOfExploitation'])

# ----- CONCATENATE THE DUMMY COLUMNS -----
df_encoded = pd.concat([
    df,
    gender_dummies,
    ageBroad_dummies,
    traffickMonths_dummies,
    citizenship_dummies,
    coe_dummies
], axis=1)

# ----- PRINT DF SHAPE AND VARIABLE NAMING CONVENTIONS -----
print(f"Original df shape: {df.shape}")
print(f"One-hot encoded df shape: {df_encoded.shape}")

print("Variable Naming Conventions Used:")
print("New columns follow the format: <originalVariable>_<value>")
print("   - For 'gender':")
print("       'gender_Man', 'gender_Woman', 'gender_Trans/Transgender/NonConforming'")
print("   - For 'ageBroad':")
print("       'ageBroad_0--8', 'ageBroad_9--17', 'ageBroad_18--20', 'ageBroad_21--23',")
print("       'ageBroad_24--26', 'ageBroad_27--29', 'ageBroad_30--38', 'ageBroad_39--47', 'ageBroad_48+'")
print("   - For 'traffickMonths':")
print("       'traffickMonths_0--12 (0-1 yr)', 'traffickMonths_13--24 (1-2 yrs)', 'traffickMonths_25+ (2+ yrs)'")
print("   - For 'citizenship':")
print("       Columns are named 'citizenship_<ObservedCountryCode>'")
print("   - For 'CountryOfExploitation':")
print("       Columns are named 'CountryOfExploitation_<ObservedCountryCode>'")
print("\n*Note:* For any cell that is empty in the original data, all corresponding one-hot columns remain 0.")

num_zeros = (df_encoded == 0).sum().sum()
num_ones = (df_encoded == 1).sum().sum()
num_nans = df_encoded.isna().sum().sum()

print("\nCell Counts Across the Entire Sheet:")
print(f"Number of cells with 0: {num_zeros}")
print(f"Number of cells with 1: {num_ones}")
print(f"Number of cells with NaN: {num_nans}")

# ----- SAVE THE NEW DATAFRAME TO A CSV FILE -----
df_encoded.to_csv(output_file, index=False)
print(f"\nData saved successfully to '{output_file}'.")

  df = pd.read_csv(input_file)


Original df shape: (238619, 22)
One-hot encoded df shape: (238619, 159)
Variable Naming Conventions Used:
New columns follow the format: <originalVariable>_<value>
   - For 'gender':
       'gender_Man', 'gender_Woman', 'gender_Trans/Transgender/NonConforming'
   - For 'ageBroad':
       'ageBroad_0--8', 'ageBroad_9--17', 'ageBroad_18--20', 'ageBroad_21--23',
       'ageBroad_24--26', 'ageBroad_27--29', 'ageBroad_30--38', 'ageBroad_39--47', 'ageBroad_48+'
   - For 'traffickMonths':
       'traffickMonths_0--12 (0-1 yr)', 'traffickMonths_13--24 (1-2 yrs)', 'traffickMonths_25+ (2+ yrs)'
   - For 'citizenship':
       Columns are named 'citizenship_<ObservedCountryCode>'
   - For 'CountryOfExploitation':
       Columns are named 'CountryOfExploitation_<ObservedCountryCode>'

*Note:* For any cell that is empty in the original data, all corresponding one-hot columns remain 0.

Cell Counts Across the Entire Sheet:
Number of cells with 0: 33384349
Number of cells with 1: 1158915
Number of cel

In [11]:
# Print the distribution of 'yearOfRegistration' including NaN
print("\nNumber of each year in 'yearOfRegistration' (including NaN):")
print(df_encoded['yearOfRegistration'].value_counts(dropna=False))


Number of each year in 'yearOfRegistration' (including NaN):
yearOfRegistration
NaN       32037
2016.0    32025
2019.0    29850
2020.0    25070
2018.0    20011
2021.0    18591
2017.0    16534
2022.0    15338
2015.0     8934
2014.0     5986
2011.0     4116
2013.0     4090
2005.0     3980
2010.0     3730
2006.0     3304
2007.0     3124
2012.0     2903
2008.0     2385
2009.0     2178
2002.0     1838
2004.0     1630
2003.0      965
Name: count, dtype: int64
