In [11]:
import numpy as np
import pandas as pd

In [27]:
def create_dataset(n, S, K, cat, cat_idx, nb_of_cat_per_var,
                   mean_noise=0, std_noise=0.05):
    """Simulate a dataset of correlated continuous and categorical variables.

    ``S`` independent standard-normal variables are drawn, variable ``s`` is
    duplicated ``K[s]`` times, Gaussian noise is added to every entry, and the
    variables listed in ``cat_idx`` are finally overwritten with category
    labels.

    The category labels are assigned through a random, (near) equal-sized
    partition of the ``n`` observations; they do not depend on the continuous
    values previously stored in the variable.

    Args:
        n (int): Sample size.
        S (int): Underlying dimension (number of independent variables).
        K (list of ints): K[s] = number of times the variable s
            (s in {0,...,S-1}) is duplicated in the dataset.
        cat (int): Number of categorical variables.
        cat_idx (list of ints): Indexes of the categorical variables. Only the
            first ``cat`` entries are used.
        nb_of_cat_per_var (list of ints): Number of categories for each
            categorical variable. Only the first ``cat`` entries are used.
        mean_noise (float): Mean of the additive Gaussian noise. Defaults to 0.
        std_noise (float): Standard deviation of the additive Gaussian noise.
            Defaults to 0.05.

    Returns:
        pandas.DataFrame: ``n`` rows and ``S + sum(K)`` columns. Columns
        ``0..S-1`` are the noisy original variables, followed by the ``K[0]``
        duplicates of variable 0, then the ``K[1]`` duplicates of variable 1,
        and so on. Categorical columns hold the labels ``0..nb_of_cat-1``
        stored as floats.

    Raises:
        ValueError: If ``len(K) != S``, or if ``cat_idx`` or
            ``nb_of_cat_per_var`` has fewer than ``cat`` entries.
    """
    if len(K) != S:
        raise ValueError(
            "K must contain exactly one entry per underlying variable "
            "(len(K) = %d, S = %d)" % (len(K), S)
        )
    if len(cat_idx) < cat or len(nb_of_cat_per_var) < cat:
        raise ValueError(
            "cat_idx and nb_of_cat_per_var must each contain at least cat = %d "
            "entries (got %d and %d)" % (cat, len(cat_idx), len(nb_of_cat_per_var))
        )

    # Create S independent variables drawn from a standard Gaussian distribution
    # (one variable per row, one observation per column).
    base = np.random.normal(size=(S, n))

    # Replicate each variable s K[s] times to create correlated covariates.
    # A single np.repeat replaces growing the array one row at a time; the
    # explicit int dtype keeps an empty K valid.
    duplicates = np.repeat(base, np.asarray(K, dtype=int), axis=0)
    data = np.vstack((base, duplicates))

    # Add independent Gaussian noise to every entry (originals and duplicates).
    data = data + np.random.normal(mean_noise, std_noise, data.shape)

    # Create the categorical variables.
    for i in range(cat):
        idx_var = cat_idx[i]            # index of the i-th categorical variable
        nb_of_cat = nb_of_cat_per_var[i]  # number of categories of that variable

        # Randomly partition the observations into nb_of_cat groups of (near)
        # equal size; the splitting rule may differ from one dataset to another.
        indices = np.arange(n)
        np.random.shuffle(indices)
        for label, members in enumerate(np.array_split(indices, nb_of_cat)):
            data[idx_var, members] = label

    # Transpose so that rows are observations and columns are variables.
    return pd.DataFrame(data.T)

In [28]:
# Example of a generated dataset: 2 underlying variables, duplicated 1 and 3
# times respectively (6 columns in total), with columns 1 and 2 turned into
# categorical variables.

n = 10     #sample size
S = 2      # underlying dimension
K = [1,3]  #K[s] = number of times the variable s (s in {1,...,S}) is duplicated in the dataset
cat = 2 #number of categorical variables
cat_idx = [1,2] #index of the categorical variables
nb_of_cat_per_var = [2,4] #number of categories for each categorical variable

# Last expression of the cell: the resulting DataFrame is displayed as the cell output.
create_dataset(n, S, K, cat, cat_idx, nb_of_cat_per_var)

Unnamed: 0,0,1,2,3,4,5
0,1.578489,1.0,0.0,0.761421,0.725775,0.756339
1,0.802322,1.0,2.0,-1.933007,-2.001259,-1.981379
2,-0.691929,0.0,3.0,-0.440502,-0.446958,-0.437577
3,0.670569,1.0,1.0,0.208185,0.206388,0.193348
4,0.124458,0.0,3.0,-1.428874,-1.449524,-1.385826
5,0.97386,1.0,2.0,1.883727,1.764825,1.773853
6,0.50795,0.0,1.0,-0.599775,-0.61306,-0.641193
7,0.724019,1.0,1.0,0.952758,1.015587,0.961233
8,0.177341,0.0,0.0,-2.616292,-2.557722,-2.736096
9,-1.648606,0.0,0.0,-1.510652,-1.567617,-1.542056


In [16]:
# Sample size; overrides the n = 10 of the example cell.
# Presumably shared by the paper datasets configured in the cells below - TODO confirm.
n = 100

In [17]:
#Parameters of the first dataset created in the paper : (3.1 Relationships between continuous and categorical variables)

S = 2   # underlying dimension
K = [1,3]  #K[s] = number of times the variable s (s in {1,...,S}) is duplicated in the dataset

cat = 2 #number of categorical variables
cat_idx = [1,2] #index of the categorical variables
nb_of_cat_per_var = [4,4] #number of categories for each categorical variable

# Generation and CSV export are left commented out.
# NOTE(review): presumably to avoid overwriting an already saved df_3_1.csv with a new random draw - confirm.
#df_3_1 = create_dataset(n, S, K, cat, cat_idx, nb_of_cat_per_var)
#df_3_1.to_csv("df_3_1.csv", index=False)

In [26]:
#df_3_1

Unnamed: 0,0,1,2,3,4,5
0,0.093337,0.0,0.0,0.135113,0.167152,0.197447
1,-1.072932,1.0,0.0,1.046237,1.167125,1.134287
2,0.177369,0.0,0.0,0.414308,0.415502,0.471683
3,-0.608092,2.0,2.0,0.327232,0.439984,0.327426
4,1.775482,3.0,3.0,-0.293381,-0.434007,-0.254972
...,...,...,...,...,...,...
95,0.289194,3.0,0.0,-0.401571,-0.320624,-0.361769
96,1.227742,1.0,3.0,0.083648,-0.155891,-0.145247
97,-1.009023,0.0,0.0,-0.235815,-0.186147,-0.259537
98,0.324053,3.0,1.0,-1.140274,-1.182114,-1.254136


In [None]:
#Parameters of the second dataset created in the paper : (3.2.1 Linear and nonlinear relationships)

S = 1   # underlying dimension
K = [5]  #K[s] = number of times the variable s (s in {1,...,S}) is duplicated in the dataset

cat = 3 #number of categorical variables
cat_idx = [3,4,5] #index of the categorical variables
# One entry is required per categorical variable (cat = 3); the list previously
# held only two entries, which makes create_dataset fail on the third variable.
# NOTE(review): the third value is assumed to be 4 like the others - confirm against the paper.
nb_of_cat_per_var = [4,4,4] #number of categories for each categorical variable