In [1]:
import pandas as pd
import numpy as np
import random

In [3]:
# Step 1: measure how much data is missing in the CTDC synthetic dataset,
# both per individual variable and per group of related variables.
ctdc_workbook = "CTDC_global_synthetic_data_v2024.xlsx"
ctda_df = pd.read_excel(ctdc_workbook)

# Columns whose missingness is reported one column at a time.
single_vars = ["gender", "ageBroad", "traffickMonths"]

# Columns whose missingness is reported jointly: each key is a group label and
# each value lists the member columns of that group. Key order is the order in
# which the groups are reported.
group_vars = {
    # Country-related columns.
    "countryGroup": ["citizenship", "CountryOfExploitation"],
    # Means-of-control indicator columns.
    "meansGroup": [
        "meansDebtBondageEarnings", "meansThreats",
        "meansAbusePsyPhySex", "meansFalsePromises",
        "meansDrugsAlcohol", "meansDenyBasicNeeds",
        "meansExcessiveWorkHours", "meansWithholdDocs",
    ],
    # Exploitation-type indicator columns.
    "exploitGroup": [
        "isForcedLabour", "isSexualExploit", "isOtherExploit",
        "typeOfLabourAgriculture", "typeOfLabourConstruction",
        "typeOfLabourDomesticWork", "typeOfLabourHospitality",
        "typeOfSexProstitution", "typeOfSexPornography",
    ],
    # Recruiter-relationship indicator columns.
    "recruiterGroup": [
        "recruiterRelationIntimatePartner", "recruiterRelationFriend",
        "recruiterRelationFamily", "recruiterRelationOther",
    ],
}

def percent_missing_single(col_name, df=None):
    """Return the percentage (0-100) of missing values in a single column.

    Parameters
    ----------
    col_name : str
        Label of the column to inspect.
    df : pandas.DataFrame, optional
        Frame to inspect. When omitted, the module-level ``ctda_df`` is used,
        which keeps existing one-argument calls working unchanged.

    Returns
    -------
    float
        Share of rows where ``col_name`` is NaN/None, multiplied by 100.

    Raises
    ------
    KeyError
        If ``col_name`` is not a column of the frame.
    """
    # Resolve the global lazily so the function can also be used (and tested)
    # on any other frame without relying on notebook state.
    frame = ctda_df if df is None else df
    return frame[col_name].isna().mean() * 100

def percent_missing_group(col_names, df=None):
    """Return the percentage (0-100) of rows missing every column of a group.

    A row counts as missing only when *all* of the listed columns are NaN/None
    in that row; a row with at least one observed value is not counted.

    Parameters
    ----------
    col_names : sequence of str
        Labels of the columns that make up the group (list, tuple, ...).
    df : pandas.DataFrame, optional
        Frame to inspect. When omitted, the module-level ``ctda_df`` is used,
        which keeps existing one-argument calls working unchanged.

    Returns
    -------
    float
        Share of rows where every listed column is missing, multiplied by 100.

    Raises
    ------
    KeyError
        If any label in ``col_names`` is not a column of the frame.
    """
    # Resolve the global lazily so the function does not depend on notebook state.
    frame = ctda_df if df is None else df
    # list(...) so that a tuple of labels selects several columns instead of
    # being interpreted by pandas as one (multi-level) key.
    group_frame = frame[list(col_names)]
    return group_frame.isna().all(axis=1).mean() * 100

# Report missingness column by column for the stand-alone variables.
print("=== Single Variables ===")
for column in single_vars:
    print(f"{column}: {percent_missing_single(column):.2f}% missing")

# Report missingness per group: a row counts only if the whole group is empty.
print("\n=== Variable Groups ===")
for label in group_vars:
    group_pct = percent_missing_group(group_vars[label])
    print(f"{label}: {group_pct:.2f}% missing")

=== Single Variables ===
gender: 20.15% missing
ageBroad: 47.49% missing
traffickMonths: 90.72% missing

=== Variable Groups ===
countryGroup: 6.33% missing
meansGroup: 61.85% missing
exploitGroup: 31.66% missing
recruiterGroup: 74.14% missing


In [5]:
# Inject missing values into the simulated dataset: each of the seven observed
# missing rates is assigned to one simulated column (in shuffled order) and
# that fraction of the column's cells is blanked out at random.
input_file = "Simulated Data/simple_gen_data_new.csv"
output_file = "Simulated Data/simple_gen_missing_data_new.csv"

# Fixed seed so that both the rate-to-column assignment and the masked cells
# are identical on every run (Restart & Run All reproduces the output file).
SEED = 42
shuffle_rng = random.Random(SEED)
mask_rng = np.random.default_rng(SEED)

df = pd.read_csv(input_file)

# Missing fractions taken from the percentages reported for the CTDC data, in
# the order gender, ageBroad, traffickMonths, countryGroup, meansGroup,
# exploitGroup, recruiterGroup (e.g. 20.15% -> 0.2015).
prob_list = [0.2015, 0.4749, 0.9072, 0.0633, 0.6185, 0.3166, 0.7414]
shuffle_rng.shuffle(prob_list)

columns = ["X1", "X2", "X3", "X4", "X5", "X6", "X7"]

# zip() would silently drop the surplus entries if the two lists ever diverge.
if len(columns) != len(prob_list):
    raise ValueError(
        f"expected one probability per column, got {len(prob_list)} "
        f"probabilities for {len(columns)} columns"
    )

for col, p in zip(columns, prob_list):
    # One uniform draw in [0, 1) per row; a cell is blanked when its draw is < p.
    random_values = mask_rng.random(len(df))
    # Series.mask returns a new, properly upcast column. Assigning NaN through
    # .loc into an integer-typed column is an implicit upcast that pandas has
    # deprecated, and .loc would also silently create a misspelled column;
    # df[col] raises KeyError instead.
    df[col] = df[col].mask(random_values < p)
    print(f'Variable {col} has probability {p} of missing values')

# Missing cells are written as empty fields.
df.to_csv(output_file, index=False, na_rep='')

Variable X1 has probability 0.6185 of missing values
Variable X2 has probability 0.4749 of missing values
Variable X3 has probability 0.7414 of missing values
Variable X4 has probability 0.3166 of missing values
Variable X5 has probability 0.0633 of missing values
Variable X6 has probability 0.2015 of missing values
Variable X7 has probability 0.9072 of missing values


In [6]:
# Summarise the simulated data (before missing values are injected): for every
# variable, list its distinct non-missing values and their relative frequencies.
input_file = "Simulated Data/simple_gen_data_new.csv"
df = pd.read_csv(input_file)

# Every column except the first one is summarised.
# NOTE(review): `vars` shadows the Python builtin of the same name; the name is
# left unchanged here in case other cells refer to it.
vars = df.columns[1:]

for var in vars:
    # Counts per distinct value, NaN excluded, most frequent value first.
    counts = df[var].value_counts(dropna=True)
    total = counts.sum()

    # Values and their shares are kept in the same (frequency) order.
    distinct_vals = counts.index.to_list()
    probabilities = counts.div(total).to_list()

    print(f'Variable {var}: values {distinct_vals} probability {probabilities}')

Variable X1: values [0, 1] probability [0.8017142857142857, 0.1982857142857143]
Variable X2: values [0, 1] probability [0.8992857142857142, 0.10071428571428571]
Variable X3: values [0, 1] probability [0.8210761904761905, 0.1789238095238095]
Variable X4: values [0, 1] probability [0.8909238095238096, 0.10907619047619048]
Variable X5: values [0, 1] probability [0.9106952380952381, 0.0893047619047619]
Variable X6: values [0, 1] probability [0.9067714285714286, 0.09322857142857142]
Variable X7: values [0, 1] probability [0.8985809523809524, 0.10141904761904762]
