### Data generation notebook for the data used in the experiments concerning the CI test (i.e., Calibration, Robustness, Comparison)

In [None]:
### imports
import multiprocessing
from multiprocessing import Pool
import os
import itertools
import random
import csv
import math

## manm_cs imports
import manm_cs.graph as manmg
import manm_cs.variables as manmv
import manm_cs.noise as manmn
import manm_cs.prob_distributions as manmp


In [None]:
### parameters
# worker processes for the parallel generation: all cores but one, never fewer than one
processes = max(multiprocessing.cpu_count() - 1, 1)

# how many generation runs are executed (each run writes one file per parameter combination)
dataGenerationRuns = 100
# rows sampled per generated dataset
nsamples = 10000
# sizes of the separation set Z that are generated
nsep = [1, 3, 5, 7]

# (min, max) bounds for the number of classes of discrete variables
discreteValueRanges = [(2, 4)]
# signal-to-noise ratio of discrete variables (0-1)
discreteSignalToNoiseRatios = 0.85
# probabilities with which a node is generated as discrete
discreteNodeRatio = [0.0, 0.25, 0.5, 0.75, 1.0]
# bounds of the uniform draw for the beta of each continuous parent
betaLower = 1.0
betaUpper = 1.0
# sigma of the Gaussian noise of continuous variables
contNoise = 1.0

# transformation functions
# one of linear, quadratic, cos is drawn uniformly at random for each continuous parent
def linear(value):
    """Identity transformation: hand ``value`` back untouched."""
    return value

def quadratic(value):
    """Square ``value``; the result is always a float because ``math.pow`` is used."""
    # the import stays inside the function so that the function is self-contained
    import math
    squared = math.pow(value, 2)
    return squared

def cos(value):
    """Cosine of ``value`` (interpreted in radians by ``math.cos``)."""
    # the import stays inside the function so that the function is self-contained
    from math import cos as cosine
    return cosine(value)

### scaling of parent option
# forwarded to GraphBuilder.with_scaled_parent_influence in dataGenerationRun
withScaledParents = True

# candidate transformations; dataGenerationRun picks one at random per continuous parent
functions = [linear, quadratic, cos]

### function to write sample csv
def write_single_csv(dataframes, target_path):
    """Write all ``dataframes`` below each other into the single CSV ``target_path``.

    The first frame creates (or overwrites) the file and contributes the
    header row; every following frame is appended without a header. Row
    indices are never written. ``dataframes`` must be indexable and
    non-empty (an empty sequence raises ``IndexError``).
    """
    head, tail = dataframes[0], dataframes[1:]
    head.to_csv(target_path, index=False)
    for chunk in tail:
        chunk.to_csv(target_path, mode='a', header=False, index=False)

# all generation settings bundled into one dict that dataGenerationRun reads from
conf = {
    'func': functions,
    'discreteSignalToNoiseRatios': discreteSignalToNoiseRatios,
    'nsamples': nsamples,
    'nsep': nsep,
    'discreteValueRanges': discreteValueRanges,
    'discreteNodeRatios': discreteNodeRatio,
    'betaL': betaLower,
    'betaU': betaUpper,
    'contNoise': contNoise,
    'withScaledParents': withScaledParents,
}

# Make sure the output directory for the generated CSV files exists.
# An already existing directory is accepted; any other failure (e.g. missing
# permissions) is raised here instead of being silently swallowed.
os.makedirs('./ci_data', exist_ok=True)


In [None]:
### function to call for data generation

def _continuous_variable(idx, **dependencies):
    """Build a continuous variable with Gaussian noise of sigma ``conf['contNoise']``.

    ``dependencies`` is forwarded unchanged to ``ContinuousVariable``
    (``parents``, ``functions``, ``betas``); it is left empty for root nodes.
    Parent scaling follows ``conf['withScaledParents']``.
    """
    noise = manmn.GaussianNoiseBuilder().with_sigma(sigma=conf['contNoise']).build()
    return manmv.continuous_variable.ContinuousVariable(
        idx=idx, noise=noise, scale_parents=conf['withScaledParents'], **dependencies)


def _random_function_and_beta():
    """Draw one transformation from ``conf['func']`` and one beta from [betaL, betaU]."""
    func = random.choice(conf['func'])
    beta = random.uniform(conf['betaL'], conf['betaU'])
    return func, beta


def dataGenerationRun(dataRun):
    """Generate and store one batch of CI-test datasets for run index ``dataRun``.

    For every combination of discrete node ratio, separation set size and edge
    mode ('withEdge' / 'noEdge') a graph is built from

    * ``sepSize`` parentless nodes (the separation set), and
    * two further nodes that depend on the separation set. In 'withEdge' mode
      the last node additionally has the second-to-last node as a parent; if
      that last node is discrete it is generated with the second-to-last node
      as its only parent.

    Each node is discrete with probability equal to the discrete node ratio,
    otherwise continuous. Continuous nodes receive one randomly drawn function
    and beta per continuous parent. The graph is printed, sampled with
    ``conf['nsamples']`` rows and written to
    ``./ci_data/<dataRun>_dnr_<ratio>_sepsize_<size>_<mode>.csv``.

    Args:
        dataRun: run index; only used as prefix of the output file names.

    Returns:
        None. The results are the CSV files written to ``./ci_data``.

    NOTE(review): if manm_cs samples via numpy's global RNG, forked pool
    workers may start from identical numpy random states -- confirm and seed
    per worker if necessary.
    """
    valueRangeMin, valueRangeMax = conf['discreteValueRanges'][0]
    paramList = [conf['discreteNodeRatios'], conf['nsep'], ['withEdge', 'noEdge']]
    for discNodeRatio, sepSize, edgeMode in itertools.product(*paramList):
        # draw the class-count bounds of the discrete variables for this graph
        discreteValueRangeLow = random.randint(valueRangeMin, valueRangeMax)
        discreteValueRangeUp = random.randint(discreteValueRangeLow, valueRangeMax)

        ### build graph
        graphBuilder = (
            manmg.GraphBuilder()
            .with_discrete_signal_to_noise_ratio(conf['discreteSignalToNoiseRatios'])
            .with_min_discrete_value_classes(discreteValueRangeLow)
            .with_max_discrete_value_classes(discreteValueRangeUp)
            .with_scaled_parent_influence(conf['withScaledParents'])
        )

        ### construct parents (the separation set) first
        parents = []
        # funcs/betas hold one entry per *continuous* parent only
        funcs = []
        betas = []
        for parentIdx in range(sepSize):
            if random.random() < discNodeRatio:
                parents.append(graphBuilder.generate_discrete_variable([], parentIdx))
            else:
                parents.append(_continuous_variable(parentIdx))
                func, beta = _random_function_and_beta()
                funcs.append(func)
                betas.append(beta)
        # first free variable index after the separation set
        idx = sepSize

        variables = list(parents)
        if edgeMode == 'noEdge':
            ### add two child nodes that share the separation set as their only parents
            for offset in range(2):
                if random.random() < discNodeRatio:
                    variables.append(graphBuilder.generate_discrete_variable(parents, idx + offset))
                else:
                    variables.append(_continuous_variable(
                        idx + offset, parents=parents, functions=funcs, betas=betas))
        elif edgeMode == 'withEdge':
            ### first child depends on the separation set
            # function/beta of the edge first child -> second child; stays empty
            # when the first child is discrete
            func2 = []
            beta2 = []
            if random.random() < discNodeRatio:
                parent2 = graphBuilder.generate_discrete_variable(parents, idx)
            else:
                parent2 = _continuous_variable(
                    idx, parents=parents, functions=funcs, betas=betas)
                func, beta = _random_function_and_beta()
                func2.append(func)
                beta2.append(beta)
            variables.append(parent2)
            ### second child depends on the first child
            if random.random() < discNodeRatio:
                # NOTE(review): the discrete child only gets parent2 as parent, whereas
                # the continuous child below also gets the separation set -- confirm
                # that this asymmetry is intended.
                variables.append(graphBuilder.generate_discrete_variable([parent2], idx + 1))
            else:
                variables.append(_continuous_variable(
                    idx + 1, parents=[parent2] + parents,
                    functions=func2 + funcs, betas=beta2 + betas))

        ### build graph
        g = manmg.Graph(variables=variables)
        # print the graph to check that everything is set up correctly
        print(g)
        ### sample data
        # NOTE(review): the second argument is presumably the number of sampling
        # processes -- confirm against the manm_cs API.
        dfs = g.sample(conf['nsamples'], 1)
        ### store data
        ratioTag = str(discNodeRatio).replace('.', '_')
        targetPath = f'./ci_data/{dataRun}_dnr_{ratioTag}_sepsize_{sepSize}_{edgeMode}.csv'
        write_single_csv(dataframes=dfs, target_path=targetPath)


In [None]:
### parallel data generation
# one task per run index; pool.map hands every index to dataGenerationRun
configs = list(range(dataGenerationRuns))

with Pool(processes=processes) as pool:
    result = pool.map(dataGenerationRun, configs)

### Transformation

In [None]:
### imports
import pandas as pd
import numpy as np
import os

In [None]:
### parameters
# number of samples: how many leading rows of each dataset go into a normalized file
samples = [50, 100, 250, 500, 1000]
# Make sure the output directory for the normalized files exists.
# An already existing directory is accepted; any other failure (e.g. missing
# permissions) is raised here instead of being silently swallowed.
os.makedirs('./ci_data_normalized', exist_ok=True)

In [None]:
### function
def normal_transform(x, dis):
    """Min-max scale ``x`` along axis 0 unless it has at most ``dis`` distinct entries.

    The number of distinct entries is counted along axis 0. If it does not
    exceed ``dis`` the input is treated as discrete and returned unchanged
    (the same object). Otherwise ``(x - min) / (max - min)`` is returned,
    with min and max taken along axis 0.
    """
    distinct = np.unique(x, axis=0).shape[0]
    if distinct <= dis:
        return x
    lowest = np.min(x, axis=0)
    span = np.max(x, axis=0) - lowest
    return (x - lowest) / span

In [None]:
### transformation
# For every generated CSV in ./ci_data write one normalized file per sample size:
# the first `sample` rows are taken and each column is passed through
# normal_transform with a threshold of 10 distinct values.
for filename in os.listdir('./ci_data/'):
    stem, extension = os.path.splitext(filename)
    # skip anything that is not a CSV file (e.g. notebook checkpoint folders)
    if extension.lower() != '.csv':
        continue
    # read each dataset once instead of once per sample size
    df = pd.read_csv('./ci_data/' + filename)
    for sample in samples:
        # copy so that the column assignments below neither touch df nor
        # trigger pandas' SettingWithCopyWarning
        sdf = df.head(sample).copy()
        for col in sdf:
            sdf[col] = normal_transform(sdf[col], 10)
        sdf.to_csv('ci_data_normalized/' + stem + '_' + str(sample) + '.csv', index=False)
