# Creating datasets from synthetically generated models

In [1]:
import tellurium as te
import teUtils as tu
import os
import re
import csv
import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
# Root folder of the model set. Paths below are built by plain string
# concatenation, so the trailing '/' is required.
FOLDER_NAME = '20sp_w-Allo/'
# Folder that receives the generated perturbation CSV files.
DATA_FOLDER = FOLDER_NAME + 'generated_data/'
# Each level p yields two perturbation factors: 1 - p/100 and 1 + p/100.
PERTURBATION_LEVELS = [10, 50] # in percent
# Each level n draws multiplicative noise uniformly from [1 - n/100, 1 + n/100].
NOISE_LEVEL = [10, 50] # in percent

# Index of the model number among the pieces obtained by splitting a model
# file name on '_', '|' and '.'.
passN = 2 # what position is the passNumber? e.g. 2 for "mass_action_152.xml" and 1 for "test_152.xml"

## Generating perturbation datasets 

In [3]:
# Create the directory that holds the generated data. An already-existing
# directory is fine; any other failure (e.g. permissions) is raised instead
# of being silently ignored.
os.makedirs(DATA_FOLDER, exist_ok=True)

In [4]:
# Read the pass list: one entry per line, trailing whitespace removed.
passlist_path = FOLDER_NAME + "passlist.txt"
with open(passlist_path) as handle:
    passlist = list(map(str.rstrip, handle))

Generate a perturbation dataset for every model in `FOLDER_NAME/teVer/`, once for each level in `PERTURBATION_LEVELS`.

In [5]:
def _steady_state_row(r, e_list, exMet, enable_moiety_analysis=False):
    """Return one CSV data row for the model's current parameter values.

    The row is: enzyme parameter values + boundary-species values + the last
    point of a long simulation + reaction rates read after ``steadyState()``.

    Args:
        r: loaded model object (as returned by ``te.loada``).
        e_list: ids of the enzyme parameters to record.
        exMet: ids of the boundary species to record.
        enable_moiety_analysis: when True, switch on conserved-moiety
            analysis after the simulation and before the steady-state solve.
    """
    # Last simulated row with its first column dropped (assumed to be time
    # under the default simulation selections -- TODO confirm).
    spConc = list(r.simulate(0, 1000000)[-1])[1:]
    if enable_moiety_analysis:
        r.conservedMoietyAnalysis = True
    r.steadyState()

    enzymes = [r.getValue(e) for e in e_list]
    exMet_values = [r.getValue(m) for m in exMet]
    fluxes = list(r.getReactionRates())
    return enzymes + exMet_values + spConc + fluxes


for pl in PERTURBATION_LEVELS:
    # Multiplicative factors applied to one parameter at a time.
    pertLevel = pl/100
    perturbation_level = [1 - pertLevel, 1 + pertLevel]

    for modelPath in tqdm(os.listdir(FOLDER_NAME + 'teVer/')):
        r = te.loada(FOLDER_NAME + 'teVer/' + modelPath)

        exMet = r.getBoundarySpeciesIds()
        inMet = r.getFloatingSpeciesIds()
        fluxnums = range(len(r.getReactionRates()))
        fluxIDs = ['flux_' + str(num) for num in fluxnums]
        # Enzyme parameters are recognised by an 'E' anywhere in their id.
        e_list = [i for i in r.getGlobalParameterIds() if 'E' in i]

        header = e_list + exMet + inMet + fluxIDs

        modelNo = re.split(r'[_|.]', modelPath)[passN]

        with open(DATA_FOLDER + f'data_{modelNo}_pt{pl}.csv', 'w', encoding='UTF8', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(header)

            try:
                # Base case: unperturbed model.
                writer.writerow(
                    _steady_state_row(r, e_list, exMet, enable_moiety_analysis=True)
                )

                # Perturb every enzyme first, then every boundary species,
                # one parameter at a time, starting from the original model.
                for params in e_list + exMet:
                    for level in perturbation_level:
                        r.resetToOrigin()
                        r.setValue(params, level*r.getValue(params))
                        writer.writerow(_steady_state_row(r, e_list, exMet))
            except Exception as err:
                # Best effort: a failing model is skipped, but say so. Rows
                # written before the failure stay in the (truncated) CSV.
                tqdm.write(
                    f'{modelPath} (pt{pl}): remaining rows skipped - '
                    f'{type(err).__name__}: {err}'
                )


100%|██████████| 13/13 [00:03<00:00,  3.44it/s]
100%|██████████| 13/13 [00:03<00:00,  3.48it/s]


## Adding Noise

In [8]:
NOISY_DATA_FOLDER = FOLDER_NAME + 'noisy_generated_data/'
# Create the directory that holds the noisy data. An already-existing
# directory is fine; any other failure (e.g. permissions) is raised instead
# of being silently ignored.
os.makedirs(NOISY_DATA_FOLDER, exist_ok=True)

In [9]:
# Only CSV files are datasets. Other directory entries (e.g. hidden checkpoint
# folders) would either fail to parse or, lacking '.csv' in their name, map
# every noise level onto the same output file.
datafiles = [f for f in os.listdir(DATA_FOLDER) if f.endswith('.csv')]

for f in datafiles:
    df = pd.read_csv(DATA_FOLDER + f)

    for nl in NOISE_LEVEL:
        noiseLevel = nl/100
        # One independent multiplicative factor per cell, drawn uniformly
        # from [1 - noiseLevel, 1 + noiseLevel]; applied to every column.
        noise = np.random.uniform(low=1-noiseLevel, high=1+noiseLevel, size=df.shape)
        noised_data = df.multiply(noise)
        fileName = f.replace('.csv', f'_n{nl}.csv')
        # NOTE: to_csv also writes the row index as an unnamed first column
        # (pandas default), so readers of these files should account for it.
        noised_data.to_csv(NOISY_DATA_FOLDER + fileName)

## Plotting analysis of models