Let's generate synthetic datasets!

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import csv

from src.utils import *
from src.algorithms import * 
from src.metrics_FAMD import *

In [5]:
# Sample size passed to the create_dataset calls of this notebook.
n = 100

# Seed NumPy's global random generator so that a fresh
# "Restart & Run All" yields reproducible results.
np.random.seed(seed=21032024)

3.1 Relationships between continuous and categorical variables

In [6]:
# First dataset of the paper
# (section 3.1, "Relationships between continuous and categorical variables").

S = 2
K = [1, 3]                  # K[s]: how many times variable s (s in {1,...,S}) is duplicated in the dataset
cat = 2                     # number of categorical variables
cat_idx = [1, 2]            # indices of the categorical variables
nb_of_cat_per_var = [4, 4]  # number of categories of each categorical variable

# The same design is generated at two signal-to-noise ratios.
# NB: a high SNR implies that the variables in each group are strongly linked.
design_3_1 = (n, S, K, cat, cat_idx, nb_of_cat_per_var)
df_3_1_snr1 = create_dataset(*design_3_1, SNR=1)
df_3_1_snr3 = create_dataset(*design_3_1, SNR=3)

# CSV export of these two frames is currently disabled:
#df_3_1_snr1.to_csv("datasets/df_3_1_snr1.csv", index=False, quoting=csv.QUOTE_NONNUMERIC)
#df_3_1_snr3.to_csv("datasets/df_3_1_snr3.csv", index=False, quoting=csv.QUOTE_NONNUMERIC)

3.2.1 Linear and nonlinear relationships

In [23]:
# Second dataset of the paper, linear version
# (section 3.2.1, "Linear and nonlinear relationships").

S = 1
K = [4]                   # K[s]: how many times variable s (s in {1,...,S}) is duplicated in the dataset

cat = 1                   # number of categorical variables
cat_idx = [4]             # indices of the categorical variables
nb_of_cat_per_var = [10]  # number of categories of each categorical variable
SNR = 5

df_3_2_linear = create_dataset(n, S, K, cat, cat_idx, nb_of_cat_per_var, SNR=SNR)
df_3_2_linear.to_csv("datasets/df_3_2_linear.csv", index=False, quoting=csv.QUOTE_NONNUMERIC)

# Nonlinear version: same frame, except that column "1" is squared and
# column "2" is replaced by its cosine. `assign` returns a new frame, so
# df_3_2_linear itself is left untouched.
df_3_2_nonlinear = df_3_2_linear.assign(**{
    "1": lambda frame: frame["1"] ** 2,
    "2": lambda frame: np.cos(frame["2"]),
})
df_3_2_nonlinear.to_csv("datasets/df_3_2_nonlinear.csv", index=False, quoting=csv.QUOTE_NONNUMERIC)

3.3 Imputation of rare categories

In [24]:
# Settings of the rare-categories dataset of the paper
# (section 3.3, "Imputation of rare categories").
S, K = 1, [4]  # K[s]: how many times variable s (s in {1,...,S}) is duplicated
SNR = 5
f = 0.1        # frequency of the rare category

def create_rare_df(f, n, S=1, K=None, SNR=5):
    """Create a dataset with rare categories, as described in section 3.3 of the paper.

    A frame without categorical variables is first obtained from
    ``create_dataset``; its columns "2", "3" and "4" are then overwritten
    with category codes:

    * "2": codes 0, 1, 2 given to three random groups of near-equal size;
    * "3" and "4": code 0 is the rare category and is carried by the same
      ``round(f * n)`` randomly chosen rows in both columns (this is what
      links the two variables); the remaining rows are split at random,
      independently for each column, between codes 1 and 2.

    The codes are written into the existing columns, so they keep the dtype
    those columns already have.

    Parameters
    ----------
    f : float
        Frequency of the rare category, between 0 and 1.
    n : int
        Sample size.
    S, K, SNR : optional
        Forwarded to ``create_dataset``. They default to the section 3.3
        settings (``S=1``, ``K=[4]``, ``SNR=5``) and are taken as arguments
        rather than read from notebook-level variables, so the result does
        not depend on which cells were run before the call.

    Returns
    -------
    The frame returned by ``create_dataset`` with columns "2", "3" and "4"
    replaced by the category codes.

    Raises
    ------
    ValueError
        If ``f`` is not between 0 and 1.
    """
    if not 0 <= f <= 1:
        raise ValueError("f must be a frequency between 0 and 1, got {!r}".format(f))
    if K is None:
        K = [4]

    # Continuous variables only: no categorical variable is requested here.
    df_rare = create_dataset(n, S, K, 0, [], [], SNR=SNR)

    # Columns turned into categorical variables, following the paper's method.
    cat_idx = ["2", "3", "4"]       # names of the columns to overwrite
    nb_of_cat_per_var = [3, 3, 3]   # number of categories of each of them

    def set_codes(column, groups):
        """Write code j on the rows listed in groups[j], then store the column."""
        codes = df_rare[column].to_numpy(copy=True)
        for code, rows in enumerate(groups):
            codes[rows] = code
        df_rare[column] = codes

    # First categorical variable: random groups of near-equal size.
    indices = np.arange(n)
    np.random.shuffle(indices)
    set_codes(cat_idx[0], np.array_split(indices, nb_of_cat_per_var[0]))

    # The two linked variables share the rows of their rare category (code 0).
    nb_of_cat = nb_of_cat_per_var[1]
    np.random.shuffle(indices)
    # round() instead of a bare int(): f * n can fall just below the intended
    # integer (0.29 * 100 == 28.999999999999996), which truncation would turn
    # into one rare row too few.
    nb_rare = int(round(f * n))
    indices_rare = indices[:nb_rare]
    indices_non_rare = np.setdiff1d(indices, indices_rare)

    # The non-rare rows are reshuffled before each column is filled, so the
    # two variables only agree on the rare rows.
    for column in cat_idx[1:]:
        np.random.shuffle(indices_non_rare)
        set_codes(column, [indices_rare] + list(np.array_split(indices_non_rare, nb_of_cat - 1)))

    return df_rare





In [25]:
# Rare-category datasets for several (frequency, sample size) pairs.
# Keep the calls in this order: create_rare_df shuffles with NumPy's global
# random generator, so each call changes what the following ones draw.
df_rare_f10_n100 = create_rare_df(f=0.1, n=100)
df_rare_f4_n100 = create_rare_df(f=0.04, n=100)
df_rare_f10_n1000 = create_rare_df(f=0.1, n=1000)
df_rare_f4_n1000 = create_rare_df(f=0.04, n=1000)
df_rare_f1_n1000 = create_rare_df(f=0.01, n=1000)
df_rare_f04_n1000 = create_rare_df(f=0.004, n=1000)


In these rare-category DataFrames, the rare category is the value "0.0" in the last two columns.

In [26]:
# Destination file of each rare-category dataset, written in this order.
# NOTE(review): the first file name has no "_n100" suffix, unlike the others;
# it is kept as is because the code reading these files is not visible here.
rare_exports = {
    "datasets/df_rare_f10.csv": df_rare_f10_n100,
    "datasets/df_rare_f4_n100.csv": df_rare_f4_n100,
    "datasets/df_rare_f10_n1000.csv": df_rare_f10_n1000,
    "datasets/df_rare_f4_n1000.csv": df_rare_f4_n1000,
    "datasets/df_rare_f1_n1000.csv": df_rare_f1_n1000,
    "datasets/df_rare_f04_n1000.csv": df_rare_f04_n1000,
}
for csv_path, rare_df in rare_exports.items():
    rare_df.to_csv(csv_path, index=False, quoting=csv.QUOTE_NONNUMERIC)

3.4 Choice of the number of dimensions

In [27]:
# Dataset of section 3.4 ("Choice of the number of dimensions").

S = 2
K = [7, 3]                        # K[s]: how many times variable s (s in {1,...,S}) is duplicated in the dataset
cat = 6                           # number of categorical variables
# NOTE(review): K sums to 10 while the indices below go up to 11; confirm
# the indexing convention create_dataset expects.
cat_idx = [5, 6, 7, 8, 10, 11]    # indices of the categorical variables
nb_of_cat_per_var = np.full(cat, 3)  # three categories for every categorical variable

# Same design at two signal-to-noise ratios, each exported to its own CSV file.
design_3_4 = (n, S, K, cat, cat_idx, nb_of_cat_per_var)
df_3_4_snr1 = create_dataset(*design_3_4, SNR=1)
df_3_4_snr3 = create_dataset(*design_3_4, SNR=3)
df_3_4_snr1.to_csv("datasets/df_3_4_snr1.csv", index=False, quoting=csv.QUOTE_NONNUMERIC)
df_3_4_snr3.to_csv("datasets/df_3_4_snr3.csv", index=False, quoting=csv.QUOTE_NONNUMERIC)
