In [9]:
from pathlib import Path

import numpy as np
import pandas as pd

# Determine dimensions of one-hot encoded CTDC data

In [2]:
# Load the CTDC synthetic dataset workbook; the path is resolved relative to
# the notebook's working directory.
df = pd.read_excel(io="CTDC_global_synthetic_data_v2024.xlsx")

In [None]:
# Number of distinct registration years (nunique() skips missing values by default).
unique_years = df['yearOfRegistration'].nunique()
print(f"Number of unique years: {unique_years}")

# One-hot encode the categorical attributes; every other column is carried over as-is.
categorical_cols = [
    'gender',
    'ageBroad',
    'citizenship',
    'CountryOfExploitation',
    'traffickMonths',
]
df_encoded = pd.get_dummies(df, columns=categorical_cols)

# Count the encoded variables without the year column. errors='ignore' makes the
# drop a no-op when the column is absent, so the count is then simply the full width.
total_num_columns = df_encoded.drop(columns='yearOfRegistration', errors='ignore').shape[1]
print(f"Total number of variables after one-hot encoding (exc. year): {total_num_columns}")

# Total cell count (rows x variables) spread evenly over the distinct years.
# This is an average per year, which is why the result can be fractional.
total_num_samples = len(df_encoded) * total_num_columns
num_samples_per_year = total_num_samples / unique_years
print(f"Total number of samples (values) per year: {num_samples_per_year}")

Number of unique years: 21
Total number of variables after one-hot encoding (exc. year): 158
Total number of samples (values) per year: 1795323.9047619049


# Generating simulated data (Bernoulli draws, according to the paper)

In [10]:
def generate_data(n_samples=5000, seed=None):
    """
    Generate simulated binary attributes X1..X7 for time steps t = 1, ..., 21.

    For every time step, n_samples independent rows are drawn. X1 is a
    Bernoulli(0.2) baseline and X2 an independent Bernoulli(0.1). X3..X6 are
    X1 multiplied by a second Bernoulli draw, so they can only be 1 where X1
    is 1. The success probability of that second draw is constant (X3),
    increasing (X4), decreasing (X5) or V-shaped (X6) over time. X7 is an
    independent Bernoulli(0.1) until the last time step, where it becomes
    X1 * Bernoulli(0.9).

    Args:
        n_samples: Number of rows generated per time step.
        seed: Optional seed for reproducible output. When None (the default),
              draws come from NumPy's global random state, so a prior
              np.random.seed(...) call controls the result exactly as before.
              Otherwise a dedicated np.random.default_rng(seed) generator is
              used and the global random state is left untouched.

    Returns:
        data: A dictionary where each key maps to a list of length
              21 * n_samples holding the attribute values over all time steps
              (rows are ordered by time step).
              The dictionary contains keys: "t", "X1", "X2", "X3", "X4", "X5", "X6", "X7".
    """
    # Both the np.random module and a Generator expose binomial(n, p, size),
    # so the sampling code below is the same for either source.
    rng = np.random if seed is None else np.random.default_rng(seed)

    # Define the time steps (t = 1, 2, ..., 21)
    time_steps = np.arange(1, 22)
    data = {
        # Each time step repeated n_samples times, in order: 1,1,...,2,2,...
        "t": list(np.repeat(time_steps, n_samples)),
        "X1": [], "X2": [], "X3": [],
        "X4": [], "X5": [], "X6": [], "X7": []
    }

    # NOTE: the order of the random draws inside the loop is kept fixed so
    # that output under a given global seed does not change.
    for t in time_steps:
        # Normalize time: u runs from 0 (t=1) to 1 (t=21)
        u = (t - 1) / 20.0

        # --- X1t: Baseline attribute (Bernoulli(0.2)) ---
        X1 = rng.binomial(1, 0.2, n_samples)

        # --- X2t: Independent of X1 (Bernoulli(0.1)) ---
        X2 = rng.binomial(1, 0.1, n_samples)

        # --- X3t: Highly dependent on X1 ---
        X3 = X1 * rng.binomial(1, 0.9, n_samples)

        # --- X4t: Increasing dependence on X1 ---
        # p4 goes from 0.2 at t=1 (u=0) to 0.9 at t=21 (u=1)
        p4 = 0.2 + 0.7 * u
        X4 = X1 * rng.binomial(1, p4, n_samples)

        # --- X5t: Decreasing dependence on X1 ---
        # p5 goes from 0.8 at t=1 (u=0) to 0.1 at t=21 (u=1)
        p5 = 0.8 - 0.7 * u
        X5 = X1 * rng.binomial(1, p5, n_samples)

        # --- X6t: Shifting dependence on X1 ---
        # For t <= 10: linearly decrease from 0.7 (t=1) to 0.1 (t=10)
        # For t > 10: linearly increase from 0.1 (t=10) to 0.9 (t=21)
        if t <= 10:
            p6 = 0.7 - (t - 1) * ((0.7 - 0.1) / (10 - 1))
        else:
            p6 = 0.1 + (t - 10) * ((0.9 - 0.1) / (21 - 10))
        X6 = X1 * rng.binomial(1, p6, n_samples)

        # --- X7t: Sudden surge at the final time step ---
        # For t < 21, X7 is independent (Bernoulli(0.1))
        # At t == 21, X7 is dependent on X1 (Bernoulli(0.9))
        if t < 21:
            X7 = rng.binomial(1, 0.1, n_samples)
        else:  # t == 21
            X7 = X1 * rng.binomial(1, 0.9, n_samples)

        # Append this time step's draws to the per-attribute lists
        for key, values in (
            ("X1", X1), ("X2", X2), ("X3", X3), ("X4", X4),
            ("X5", X5), ("X6", X6), ("X7", X7),
        ):
            data[key].extend(values)

    return data

In [None]:
# Seed NumPy's global random state so the exported dataset is reproducible on
# a fresh "Restart & Run All"; generate_data draws from that global state when
# called as below.
np.random.seed(0)
data = generate_data(n_samples=5000)
df = pd.DataFrame.from_dict(data)

# Create the output folder if needed: to_csv does not create missing
# directories and would otherwise fail on a fresh checkout.
output_path = Path("Simulated Data") / "simple_gen_data_new.csv"
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)