In [2]:
import numpy as np

In [40]:
def create_dataset(n, S, K, cat, cat_idx, nb_of_cat_per_var):
    """Initialisation
        Args:
            n (int): Sample size.
            S (int): Underlying dimension.
            K (list of ints) : K[s] = number of times the variable s (s in {1,...,S}) is duplicated in the dataset 
            cat(int) : Number of categorical variables
            cat_idx (list of ints) : Indexes of the categorical variables 
            nb_of_cat_per_var (list of ints) : Number of categories for each categorical variable
    """

    #create S independant variables, drawn from a standard gaussian distribution
    data = np.random.normal(size = (S,n))

    nb_variables = S + sum(K) #number of variables expected in the final dataset
    data_shape = (nb_variables,n)

    #replicate each variable s (s in {1,...,S}) K_s times , to create correlated covariables:
    for s in range(S):
        for k in range(K[s]):
            data = np.vstack((data, data[s]))

    #add noise:
    mean_noise = 0
    std_noise = 0.05
    gaussian_noise = np.random.normal(mean_noise, std_noise, data_shape)
    data = data + gaussian_noise

    #create categorical variables :
    for i in range(cat): 
        idx_var = cat_idx[i] #index de la ième variable categorielle
        nb_of_cat = nb_of_cat_per_var[i] #nombre de categories différentes dans la ième variable categorielle
        data_i = data[idx_var] #selectionne la 

        #diviser data_i en nb_of_cat catégories différentes :
        #la méthode de division en catégories peut différer d'un datset a l'autre
        indices = np.arange(n)
        np.random.shuffle(indices)
        indices_cat = np.array_split(indices, nb_of_cat)

        for j in range(nb_of_cat):
            for ind in indices_cat[j]:
                data_i[ind] = j

        data[idx_var] = data_i
    
    return data

In [47]:
#Exemple de dataset créé :

n = 10     #sample size
S = 2      # underlying dimension
K = [1,3]  #K[s] = number of times the variable s (s in {1,...,S}) is duplicated in the dataset
cat = 2 #number of categorical variables
cat_idx = [1,2] #index of the categorical variables
nb_of_cat_per_var = [2,4] #number of categories for each categorical variable

create_dataset(n, S, K, cat, cat_idx, nb_of_cat_per_var)

array([[-0.07479953, -0.22979304,  1.10500241, -0.7108198 , -0.18892163,
         0.65679121,  0.52724767, -0.06101519,  0.89301813, -1.11017719],
       [ 1.        ,  0.        ,  0.        ,  0.        ,  1.        ,
         1.        ,  1.        ,  0.        ,  0.        ,  1.        ],
       [ 1.        ,  1.        ,  2.        ,  0.        ,  2.        ,
         0.        ,  3.        ,  3.        ,  1.        ,  0.        ],
       [ 0.90762068,  1.74632323,  0.94586346, -1.60094748, -0.19271921,
        -0.10991529, -0.20221766,  0.97544696, -0.18841348, -1.48358551],
       [ 0.92680497,  1.71921508,  1.01001975, -1.65606633, -0.15967713,
        -0.1112321 , -0.09532762,  1.02824   , -0.20394305, -1.47834362],
       [ 0.92671725,  1.73979733,  0.84963376, -1.58409169, -0.043857  ,
        -0.06129569, -0.19635403,  1.02114926, -0.18343495, -1.47747779]])

In [45]:
n = 100

In [46]:
#Paramètres du premier dataset créé dans le papier : (3.1 Relationships between continuous and categorical variables)

S = 2   
K = [1,3]  #K[s] = number of times the variable s (s in {1,...,S}) is duplicated in the dataset

cat = 2 #number of categorical variables
cat_idx = [1,2] #index of the categorical variables
nb_of_cat_per_var = [4,4] #number of categories for each categorical variable

In [None]:
#Paramètres du deuxième dataset créé dans le papier : (3.2.1 Linear and nonlinear relationships)

S = 1   
K = [5]  #K[s] = number of times the variable s (s in {1,...,S}) is duplicated in the dataset

cat = 3 #number of categorical variables
cat_idx = [3,4,5] #index of the categorical variables
nb_of_cat_per_var = [4,4] #number of categories for each categorical variable