### Data generation notebook for data and graphs used for experiments concerning the CSL

In [None]:
### imports
import itertools
import os
import networkx as nx
from manm_cs.graph import GraphBuilder
from manm_cs.utils import write_single_csv


In [None]:
### parameters
# All values below are read by the generation cell of this notebook.
# half-open range of cgm ids: one pass over all parameter combinations is run
# for every id in range(cgmIdRange[0], cgmIdRange[1])
cgmIdRange = (0,10)
# number of samples (observations) generated per graph
### the smaller sample sizes [50,100,250,500,1000] are taken from these in a
### separate step (see the transformation part of this notebook)
nSamples = 10000
# number of vars (graph nodes) per cgm; every listed value is used
nVars = [30,40,50]
# (min, max) number of discrete value classes; only the first entry is used
discreteValueRanges = [(2,4)]
# discrete signal to noise ratio (0-1)
discreteSignalToNoiseRatio = 0.85
# edge densities; every listed value is used
eDensities = [0.05, 0.1, 0.15, 0.2]
# discreteNodeRatio; every listed value is used
dnrs = [0.0, 0.25, 0.5, 0.75, 1.0]
# betaLowerLimit (first argument of GraphBuilder.with_betas)
betaLower = 1.0
# betaUpperLimit (second argument of GraphBuilder.with_betas)
betaUpper = 1.0
# continuous noise (passed to GraphBuilder.with_continuous_noise_std)
contNoise = 1.0
# conditional gaussian (passed to GraphBuilder.with_conditional_gaussian)
cg = False
# scale_parents (passed to GraphBuilder.with_scaled_parent_influence)
scale_parents = True


# functions
# use "quadratic, linear, cos --> equal distribution"
def linear(value):
    """Identity relationship: hand ``value`` back untouched."""
    unchanged = value
    return unchanged

def quadratic(value):
    """Return the square of ``value`` as a float (computed with ``math.pow``)."""
    from math import pow as power

    squared = power(value, 2)
    return squared

def cos(value):
    """Return the cosine of ``value`` (``math.cos``, argument in radians)."""
    from math import cos as cosine

    return cosine(value)

# (weight, function) pairs handed to GraphBuilder.with_functions;
# the weights add up to 1.0 and are (almost) evenly split over the three functions
functions = [
    (0.34, linear),
    (0.33, quadratic),
    (0.33, cos),
]


# Create the output directories for the sampled data and the graphs.
# exist_ok=True keeps re-running the notebook harmless, while genuine
# failures (e.g. missing permissions) are reported instead of being hidden
# by a bare `except: pass`.
os.makedirs('./csl_data', exist_ok=True)
os.makedirs('./csl_graph', exist_ok=True)

In [None]:
### generation
# For every cgm id and every (nvar, dnr, ed) combination: build a graph,
# sample nSamples observations from it and store the data as CSV and the
# graph as GML under a common file stem.
for cgmId in range(cgmIdRange[0], cgmIdRange[1]):

    iterParams = [nVars, dnrs, eDensities]
    for nvar, dnr, ed in itertools.product(*iterParams):
        graph = (
            GraphBuilder()
            .with_num_nodes(nvar)
            .with_edge_density(ed)
            .with_discrete_node_ratio(dnr)
            .with_discrete_signal_to_noise_ratio(discreteSignalToNoiseRatio)
            .with_min_discrete_value_classes(discreteValueRanges[0][0])
            .with_max_discrete_value_classes(discreteValueRanges[0][1])
            .with_continuous_noise_std(contNoise)
            .with_functions(functions)
            .with_conditional_gaussian(cg)
            .with_betas(betaLower, betaUpper)
            .with_scaled_parent_influence(scale_parents)
            .build()
        )
        dfs = graph.sample(num_observations=nSamples, num_processes=1)

        # File stem "<cgmId>_<nvar>_<ed>_<dnr>"; decimal points become 'c'
        # so the stem itself contains no dots.
        output_file_name = f"{cgmId}_{nvar}_{ed}_{dnr}".replace('.', 'c')
        print(output_file_name)

        write_single_csv(dataframes=dfs, target_path=f"./csl_data/{output_file_name}.csv")
        nx_graph = graph.to_networkx_graph()
        nx.write_gml(nx_graph, f"./csl_graph/{output_file_name}.gml")
    
    

### Transformation

In [None]:
### imports
import pandas as pd
import numpy as np
import os


In [None]:
### parameters
# number of samples: for each value n, the first n rows of every generated
# data set are normalized and written to their own file
samples = [50,100,250,500,1000]
# Create the output directory; exist_ok=True tolerates an existing directory
# without hiding other errors the way a bare `except: pass` would.
os.makedirs('./csl_data_normalized', exist_ok=True)

In [None]:
### function
def normal_transform(x, dis):
    """Min-max scale ``x`` to [0, 1] if it has more than ``dis`` distinct values.

    Data with at most ``dis`` distinct entries (distinct rows for 2-D input)
    is returned unchanged. Minimum and maximum are taken along axis 0.

    Args:
        x: Numeric array-like, e.g. a pandas Series or a NumPy array.
        dis: Threshold; scaling is applied only when the number of distinct
            entries exceeds it.

    Returns:
        ``(x - min) / (max - min)`` when scaling is applied, otherwise ``x``
        itself. Constant data (max == min) maps to 0 instead of NaN.
    """
    if np.unique(x, axis=0).shape[0] <= dis:
        return x
    x_min = np.min(x, axis=0)
    span = np.max(x, axis=0) - x_min
    # A zero span would turn the division into 0/0 = NaN; dividing by 1
    # instead sends constant data to 0, the lower end of the target range.
    span = np.where(span == 0, 1, span)
    return (x - x_min) / span


In [None]:
### transformation
# For every generated CSV file, write one normalized file per sample size.
# Each output holds the first `sample` rows, with every column passed through
# normal_transform (threshold of 10 distinct values).
directory = './csl_data/'
for filename in os.listdir(directory):
    # Only real CSV files: a suffix test matches the suffix strip below,
    # unlike a substring test, which also accepts names such as "a.csv.bak".
    if not filename.endswith('.csv'):
        continue
    # Read each file once; it does not depend on the sample size.
    df = pd.read_csv(directory + filename)
    stem = filename[:-len('.csv')]
    for sample in samples:
        # head() returns a slice of df; copy it so the column assignments
        # below act on an independent frame (no SettingWithCopyWarning).
        sdf = df.head(sample).copy()
        for col in sdf:
            sdf[col] = normal_transform(sdf[col], 10)
        sdf.to_csv('csl_data_normalized/' + stem + '_' + str(sample) + '.csv', index=False)
        