In this notebook, we first introduce the technique for generating synthetic datasets, and then explain how to generate the synthetic dataframes that are used later for property testing.

### 0- Library imports

In [2]:
import csv
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from src.utils import *
from src.algorithms import *
from src.metrics_FAMD import *

### 1- Let's create our first synthetic dataset!  

In [3]:
# Fix the seed of NumPy's global random generator so that results are reproducible
np.random.seed(seed=21032024)

In [5]:
# Parameters used to create the synthetic dataset

n = 100     # Sample count
S = 2       # Number of underlying dimensions
K = [1, 3]  # K[s] = how many times variable s (s in {1,...,S}) is duplicated in the dataset

cat = 2                     # How many variables are categorical
cat_idx = [1, 2]            # Indices of the categorical variables
nb_of_cat_per_var = [4, 4]  # Category count of each categorical variable

SNR = 3  # Signal-to-noise ratio

In [11]:
# Build a first synthetic dataset from the parameters defined in the previous cell,
# then show it as the cell output. The next cell checks that it contains no missing values.
df_ref = create_dataset(
    n,
    S,
    K,
    cat,
    cat_idx,
    nb_of_cat_per_var,
    SNR,
)
df_ref

Unnamed: 0,0,1,2,3,4,5
0,-0.355704,2.0,1.0,-0.134909,-1.067431,-0.620024
1,1.130074,2.0,0.0,-0.072465,1.014914,0.714236
2,0.955670,1.0,1.0,-1.061916,-0.474874,-0.707344
3,0.546514,3.0,3.0,1.645276,1.017216,1.346527
4,-1.267910,3.0,3.0,0.293491,0.304471,1.312991
...,...,...,...,...,...,...
95,1.636741,3.0,1.0,0.272213,0.598163,0.549963
96,-0.789136,0.0,0.0,1.389421,0.402969,0.116653
97,0.522099,3.0,2.0,0.721871,1.239337,0.818064
98,-0.310221,1.0,1.0,-0.942637,-0.277487,-0.838604


*Remark 1: Variables 1 and 2 are categorical variables with more than one category. Before proceeding with iFAMD imputation, we need to encode them with dummy variables.*

In [12]:
# Sanity check: np.where lists the (row, column) positions of NaN cells,
# so two empty arrays mean the dataset is complete.
print("No missing values generated: ", np.where(df_ref.isna()), "\n")

No missing values generated:  (array([], dtype=int64), array([], dtype=int64)) 



*Remark 2: To fit the framework of the study, we need to create missingness artificially by masking some values. Because the masked values are known, this gives us a ground truth against which the performance of the imputation algorithm can be compared.*

---

### 2- Synthetic dataframe generation for property testing

#### 2.1 Relationships between continuous and categorical variables

In [None]:
# Parameters of the first dataset created in the paper
# (3.1 Relationships between continuous and categorical variables)

S = 2       # Number of underlying dimensions
K = [1, 3]  # K[s] = how many times variable s (s in {1,...,S}) is duplicated in the dataset

cat = 2                     # How many variables are categorical
cat_idx = [1, 2]            # Indices of the categorical variables
nb_of_cat_per_var = [4, 4]  # Category count of each categorical variable

Generation of dataframes with varying SNR

In [None]:
# One dataframe per noise level. The generator runs the calls in order
# (SNR = 1 first, then SNR = 3), just like two consecutive statements.
df_3_1_snr1, df_3_1_snr3 = (
    create_dataset(n, S, K, cat, cat_idx, nb_of_cat_per_var, SNR = snr_level)
    for snr_level in (1, 3)
)

#### 2.2 Linear and nonlinear relationships

In [None]:
# Parameters of the second (linear) dataset created in the paper
# (3.2.1 Linear and nonlinear relationships)

S = 1    # Number of underlying dimensions
K = [4]  # K[s] = how many times variable s (s in {1,...,S}) is duplicated in the dataset

cat = 1                   # How many variables are categorical
cat_idx = [4]             # Indices of the categorical variables
nb_of_cat_per_var = [10]  # Category count of each categorical variable

SNR = 5  # Signal-to-noise ratio

Generation of dataframes with linear and nonlinear relationships

In [None]:
# Linear dataset: generated directly from the parameters of the previous cell
df_3_2_linear = create_dataset(n, S, K, cat, cat_idx, nb_of_cat_per_var, SNR = SNR)

# Nonlinear dataset: a copy of the linear one in which column "1" is squared
# and column "2" is replaced by its cosine.
# NOTE(review): assumes the column labels are the strings "1" and "2";
# confirm against what create_dataset returns.
df_3_2_nonlinear = df_3_2_linear.copy()
df_3_2_nonlinear["1"] = df_3_2_linear["1"].pow(2)
df_3_2_nonlinear["2"] = np.cos(df_3_2_linear["2"])

#### 2.3 Imputation of rare categories

Generation of dataframes with rare categories

In [None]:
# The first argument f is the frequency of a (rare) category; the second is n.
# Each generator runs its calls from left to right, so the dataframes are
# created in the same order as their names appear on the left-hand side.

# n = 100, with f = 0.1 and f = 0.04
df_rare_f10_n100, df_rare_f4_n100 = (
    create_rare_df(f, 100) for f in (0.1, 0.04)
)

# n = 1000, with f = 0.1, 0.04, 0.01 and 0.004
df_rare_f10_n1000, df_rare_f4_n1000, df_rare_f1_n1000, df_rare_f04_n1000 = (
    create_rare_df(f, 1000) for f in (0.1, 0.04, 0.01, 0.004)
)

#### 2.4 Choice of the number of dimensions

In [None]:
# Parameters of the dataset used to study the choice of the number of dimensions

S = 2       # Number of underlying dimensions
K = [7, 3]  # K[s] = how many times variable s (s in {1,...,S}) is duplicated in the dataset

cat = 6                            # How many variables are categorical
cat_idx = [5, 6, 7, 8, 10, 11]     # Indices of the categorical variables
nb_of_cat_per_var = np.full(cat, 3)  # Category count of each categorical variable (3 for all of them)

Generation of dataframes with varying number of dimensions

In [None]:
# One dataframe per noise level. The generator runs the calls in order
# (SNR = 1 first, then SNR = 3), just like two consecutive statements.
df_3_4_snr1, df_3_4_snr3 = (
    create_dataset(n, S, K, cat, cat_idx, nb_of_cat_per_var, SNR = snr_level)
    for snr_level in (1, 3)
)

### 3- Save dataframes

To save the generated dataframes as CSV files in the "datasets/" directory, set `decision` to `True` (it defaults to `False`).

In [None]:
decision = False  # Set to True to write every generated dataframe to disk

if decision:

    # DataFrame.to_csv does not create missing folders, so make sure the
    # output directory exists before the first write.
    os.makedirs("datasets", exist_ok=True)

    # Output path -> dataframe, listed in the order in which files are written.
    frames_to_save = {
        # Relationship between continuous and categorical variables
        "datasets/df_3_1_snr1.csv": df_3_1_snr1,
        "datasets/df_3_1_snr3.csv": df_3_1_snr3,

        # Datasets: linear and nonlinear relationship
        "datasets/df_3_2_linear.csv": df_3_2_linear,
        "datasets/df_3_2_nonlinear.csv": df_3_2_nonlinear,

        # Rare categories
        # NB: in those rare dataframes, the rare values are the "0.0" values
        # in the last two columns.
        # NOTE(review): the first file name below lacks the "_n100" suffix that
        # its siblings use; it is kept unchanged in case other code reads it.
        "datasets/df_rare_f10.csv": df_rare_f10_n100,
        "datasets/df_rare_f4_n100.csv": df_rare_f4_n100,
        "datasets/df_rare_f10_n1000.csv": df_rare_f10_n1000,
        "datasets/df_rare_f4_n1000.csv": df_rare_f4_n1000,
        "datasets/df_rare_f1_n1000.csv": df_rare_f1_n1000,
        "datasets/df_rare_f04_n1000.csv": df_rare_f04_n1000,

        # Choice of the number of dimensions
        "datasets/df_3_4_snr1.csv": df_3_4_snr1,
        "datasets/df_3_4_snr3.csv": df_3_4_snr3,
    }

    # Same CSV options for every file: no index column, and every
    # non-numeric field quoted.
    for csv_path, frame in frames_to_save.items():
        frame.to_csv(csv_path, index=False, quoting=csv.QUOTE_NONNUMERIC)