In [1]:
import os
import sys
import numpy as np
import pandas as pd
import itertools


In [2]:
# Resolve the kernel's working directory once; the rest of the notebook
# builds its paths (e.g. the `Data` output folder) relative to it.
project_root = os.path.abspath(os.getcwd())

# Keep the project root at the front of the import path. The guard makes the
# cell idempotent: re-running it no longer stacks duplicate sys.path entries.
if sys.path[:1] != [project_root]:
    sys.path.insert(0, project_root)

# Make sure you are in the package's main directory
print(project_root)

/root/Q3/DeepHalo-tf/Final/DeepHalo


# Synthetic Data Generation

In [3]:
# Seed for NumPy's global RNG. The data generators in this notebook draw from
# np.random, so fixing the seed makes the synthetic datasets reproducible.
SEED = 20
np.random.seed(SEED)

In [4]:
# Generate synthetic data based on experimental setup in Section 5.1 of Zhang(25)
def generate_probability_list(binary_subset):
    """Draw random choice probabilities over the items flagged in a 0/1 mask.

    Parameters
    ----------
    binary_subset : sequence
        One entry per item; entries equal to 1 mark the items that are offered.

    Returns
    -------
    list
        Same length as ``binary_subset``. Positions whose entry is not 1 hold
        0.0. The flagged positions hold a single draw from a Dirichlet
        distribution with all concentration parameters equal to 1, so they
        sum to 1. If no position is flagged, every entry is 0.0.
    """
    active_positions = [pos for pos, flag in enumerate(binary_subset) if flag == 1]
    result = [0.0] * len(binary_subset)
    if len(active_positions) == 0:
        # Nothing is offered: return the all-zero list without touching the RNG.
        return result

    # One Dirichlet draw sized to the number of offered items.
    weights = np.random.dirichlet(np.ones(len(active_positions)))
    for pos, weight in zip(active_positions, weights):
        result[pos] = weight
    return result

def generate_one_hot_batch(probabilities, num_samples):
    """Sample categorical outcomes and encode each one as a one-hot row.

    Parameters
    ----------
    probabilities : array-like
        Probability of each category; passed to ``np.random.choice`` as ``p``.
    num_samples : int
        Number of independent draws.

    Returns
    -------
    np.ndarray
        Float array of shape ``(num_samples, len(probabilities))`` with a
        single 1 per row, in the column of the category drawn for that row.
    """
    weights = np.array(probabilities)
    num_categories = len(weights)
    drawn = np.random.choice(num_categories, size=num_samples, p=weights)
    # Row k of the identity matrix is the one-hot code of category k.
    return np.eye(num_categories)[drawn]

def generate_data(p_offerset, p_max_size, p_min_size, N_train, N_test):
    """Build synthetic choice data for every offer set of the requested sizes.

    For each subset of ``p_offerset`` whose size lies in
    ``[p_min_size, p_max_size]``, one probability vector is drawn with
    ``generate_probability_list`` and then ``N_train`` training choices and
    ``N_test`` test choices are sampled from it with
    ``generate_one_hot_batch``.

    Parameters
    ----------
    p_offerset : sequence
        The universe of items.
    p_max_size : int
        Largest offer-set size to enumerate (inclusive).
    p_min_size : int
        Smallest offer-set size to enumerate (inclusive).
    N_train : int
        Training samples drawn per offer set.
    N_test : int
        Test samples drawn per offer set.

    Returns
    -------
    X_train, Y_train, X_test, Y_test : np.ndarray
        ``X_*`` rows are the 0/1 availability mask of an offer set, repeated
        once per sample; ``Y_*`` rows are the matching one-hot choices. Rows
        are grouped by offer set, in enumeration order.

    Raises
    ------
    ValueError
        If the size range selects no offer set at all.
    """
    binary_subsets = []
    probability_lists = []
    for r in range(p_min_size, p_max_size + 1):
        for subset in itertools.combinations(p_offerset, r):
            binary_subset = [1 if x in subset else 0 for x in p_offerset]
            binary_subsets.append(binary_subset)
            probability_lists.append(generate_probability_list(binary_subset))

    if not binary_subsets:
        # Fail with an explicit message instead of the opaque error that
        # np.concatenate raises on an empty list.
        raise ValueError(
            f"No offer sets of size {p_min_size}..{p_max_size} can be formed "
            f"from {len(p_offerset)} items"
        )

    # Draw order is kept on purpose (all training batches, then all test
    # batches) so that results under a fixed seed stay the same.
    Y_train = np.concatenate(
        [generate_one_hot_batch(p, N_train) for p in probability_lists], axis=0
    )
    Y_test = np.concatenate(
        [generate_one_hot_batch(p, N_test) for p in probability_lists], axis=0
    )

    # Repeat each mask once per sample. np.repeat keeps the result 2-D even
    # when the sample count is 0, so X and Y can always be stacked side by side.
    masks = np.array(binary_subsets)
    X_train = np.repeat(masks, N_train, axis=0)
    X_test = np.repeat(masks, N_test, axis=0)
    return X_train, Y_train, X_test, Y_test



In [5]:
# Each pair is (J, S): J items in the universe, offer sets of exactly S items.
J_S_pairs = [(18, 15), (12, 4), (12, 8), (19, 16), (34, 32)]

# Number of sampled choices per offer set in each split.
N_train = 40
N_test = 10

# The output folder does not depend on (J, S), so create it once up front
# instead of on every loop iteration.
file_save_path = os.path.join(project_root, 'Data')
os.makedirs(file_save_path, exist_ok=True)

for J, S in J_S_pairs:

    universal_set = list(range(J))

    # Max size == min size == S: only offer sets with exactly S items.
    X_train, Y_train, X_test, Y_test = generate_data(universal_set, S, S, N_train, N_test)

    # One row per sample: the X columns followed by the Y columns.
    dataset_train = np.hstack((X_train, Y_train))
    dataset_test = np.hstack((X_test, Y_test))

    columns = ['X' + str(i) for i in universal_set] + ['Y' + str(i) for i in universal_set]

    df_train = pd.DataFrame(dataset_train, columns=columns)
    df_test = pd.DataFrame(dataset_test, columns=columns)

    # Save data
    df_train.to_csv(os.path.join(file_save_path, f'Synthetic_{J}-{S}-{N_train}_Train.csv'), index=False)
    df_test.to_csv(os.path.join(file_save_path, f'Synthetic_{J}-{S}-{N_test}_Test.csv'), index=False)

    print(f'==== Dataset (J,S)=({J},{S}): ====')
    print(X_train.shape)
    print(Y_train.shape)
    print(X_test.shape)
    print(Y_test.shape)

==== Dataset (J,S)=(18,15): ====
(32640, 18)
(32640, 18)
(8160, 18)
(8160, 18)
==== Dataset (J,S)=(12,4): ====
(19800, 12)
(19800, 12)
(4950, 12)
(4950, 12)
==== Dataset (J,S)=(12,8): ====
(19800, 12)
(19800, 12)
(4950, 12)
(4950, 12)
==== Dataset (J,S)=(19,16): ====
(38760, 19)
(38760, 19)
(9690, 19)
(9690, 19)
==== Dataset (J,S)=(34,32): ====
(22440, 34)
(22440, 34)
(5610, 34)
(5610, 34)
