In [52]:
from modules import data 
from sklearn.preprocessing import LabelEncoder
import pandas as pd

sig_list = pd.read_excel('./data/sigs.xlsx').iloc[:, 0].to_list()
records = data.from_json('./data/json/ind.json')
label_encoder = LabelEncoder()

for record in records:
    record.min_quality(90)
    record.sig_filter(sig_list)
    
records_df = data.to_df(records)
labels = records_df['Label']
labels = label_encoder.fit_transform(labels)
records_df = data.standard_scaler(records_df.drop('Label', axis=1))
records_df = records_df.sort_index(axis=1)
records_df['Label'] = labels

records_df

Compound,"2,4-Dimethyl-1-Heptene",2-Octanone,"2-Propenoic Acid, 2-Ethylhexyl Ester","2H-Pyran, 2-Ethoxy-3,4-Dihydro","Benzene, 1,3-Bis(1,1-Dimethylethyl)","Benzenemethanol, .Alpha.-Methyl-,Acetate",Decane,"Heptane, 3-Methylene","Octane, 4-Methyl","Pyrazine, 2,5-Dimethyl","Pyrazine, 2-Ethyl-6-Methyl","Undecane, 2-Methyl",Label
Ae LB BS 0h 1,-0.261246,-0.098141,-0.10028,-0.108069,-0.263297,-0.135291,-0.250930,-0.242499,-0.266833,-0.385571,-0.216286,-0.318946,0
Ae LB BS 24h 1,-0.261246,-0.098141,-0.10028,-0.108069,-0.263297,-0.135291,-0.400475,-0.242499,-0.414593,-0.359123,-0.216286,-0.318946,0
Ae LB BS 3h 1,0.122919,-0.098141,-0.10028,-0.108069,-0.200083,-0.135291,-0.137516,-0.242499,-0.183342,-0.257642,-0.216286,-0.318946,0
Ae LB BS 9h 1,-0.261246,-0.098141,-0.10028,-0.108069,-0.263297,-0.135291,-0.400475,-0.242499,-0.414593,-0.306654,-0.216286,-0.318946,0
Ae LB Ctrl 0h 1,-0.261246,-0.098141,-0.10028,-0.108069,-0.263297,-0.135291,-0.400475,-0.242499,-0.370298,-0.420724,-0.216286,-0.318946,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
An TSB EC 9h 2,-0.261246,-0.098141,-0.10028,-0.108069,-0.263297,-0.135291,1.112307,-0.242499,-0.414593,1.570522,-0.216286,-0.318946,30
An TSB SA 0h 2,-0.261246,-0.098141,-0.10028,-0.108069,-0.263297,-0.135291,-0.400475,-0.242499,-0.414593,-0.420724,-0.216286,-0.318946,31
An TSB SA 24h 2,-0.261246,-0.098141,-0.10028,-0.108069,0.106905,-0.135291,1.679154,-0.242499,-0.414593,2.021104,-0.216286,-0.318946,31
An TSB SA 3h 2,-0.261246,-0.098141,-0.10028,-0.108069,-0.263297,-0.135291,0.671535,-0.242499,-0.414593,2.392083,-0.216286,-0.318946,31


In [53]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras import Sequential
import numpy as np

def wasserstein_loss(y_true, y_pred):
    return tf.reduce_mean(y_true * y_pred)

def make_generator():
    model = Sequential([
        Input(shape=(100,)),
        Dense(64, activation='relu'),
        Dense(128, activation='relu'),
        Dense(12),
    ])
    
    return model

def make_discriminator():
    model = Sequential([
        Input(shape=(12,)),
        Dense(128, activation='relu'),
        Dense(64, activation='relu'),
        Dense(1),
    ])
    return model
    

In [54]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.data import Dataset

latent_dim = 100
batch_size = 8
epochs = 100

loss = BinaryCrossentropy(from_logits=True)
generator = make_generator()
discriminator = make_discriminator()

generator_optimizer = Adam(1e-4)
discriminator_optimizer = Adam(1e-4)

def discriminator_loss(real_output, fake_output):
    real_loss = loss(tf.ones_like(real_output), real_output)
    fake_loss = loss(tf.zeros_like(fake_output), fake_output)
    total_loss = real_loss + fake_loss
    return total_loss

def generator_loss(fake_output):
    return loss(tf.ones_like(fake_output), fake_output)

@tf.function
def train_step(input):
    noise = tf.random.normal([batch_size, latent_dim])
    with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
        generated_output = generator(noise, training=True)
        
        real_output = discriminator(input, training=True)
        fake_output = discriminator(generated_output, training=True)
        
        gen_loss = generator_loss(fake_output)
        disc_loss = discriminator_loss(real_output, fake_output)
        
    generator_grad = gen_tape.gradient(gen_loss, generator.trainable_variables)
    discrim_grad = disc_tape.gradient(disc_loss, discriminator.trainable_variables)
    
    generator_optimizer.apply_gradients(zip(generator_grad, generator.trainable_variables))
    discriminator_optimizer.apply_gradients(zip(discrim_grad, discriminator.trainable_variables))
    
    return gen_loss, disc_loss
    
def train(data, epochs):
    volatolomics_data = Dataset.from_tensor_slices(data).shuffle(1000).batch(32)
    volatolomics_data = volatolomics_data.repeat()
    iterator = iter(volatolomics_data)
    for epoch in range(epochs):
        num_batches_per_epoch = 5
        for _ in range(num_batches_per_epoch):
            batch = next(iterator)
            gen_loss, disc_loss = train_step(batch)

        if (epoch + 1) % 10 == 0:
            print(gen_loss.numpy(), disc_loss.numpy())


In [55]:
train(records_df.drop('Label', axis=1).values.astype('float32'), 1000)

0.62162113 1.4379536
0.60653317 1.4579382
0.7470155 1.3325738
0.69890904 1.3063258
0.5737492 1.5023197
0.86746514 1.2840385
0.71593106 1.3297937
0.731467 1.2871959
0.7234854 1.3163625
0.69285893 1.344765
0.7046281 1.3007693
0.7315173 1.2664223
0.7029525 1.32797
0.7511776 1.3492517
0.72054315 1.2845613
0.6868619 1.320889
0.7319204 1.2873018
0.73887163 1.2927701
0.66415083 1.3921492
0.675035 1.3440104
0.7052111 1.3447908
0.72586584 1.3568139
0.7197219 1.3085854
0.7495538 1.2873755
0.74063206 1.3264241
0.73256934 1.3229357
0.78584063 1.2542126
0.7109412 1.3427963
0.70131904 1.2794025
0.7488361 1.27394
0.7565284 1.3164554
0.73515254 1.258148
0.7452307 1.2261543
0.7439856 1.3160605
0.73800004 1.3100076
0.72740656 1.3510536
0.72267723 1.3422124
0.7941972 1.2534839
0.77855957 1.2812952
0.756616 1.2833537
0.7343583 1.3038495
0.8140504 1.3451359
0.74322534 1.4012839
0.75039744 1.3783002
0.74703765 1.3161157
0.7647114 1.2982569
0.7305584 1.2890458
0.7295272 1.3464413
0.76688814 1.3193254
0.76102

KeyboardInterrupt: 

In [None]:
# volatolomics_data = records_df.drop('Label', axis=1).values.astype('float32')
# padding_amount = (epochs * batch_size) - len(volatolomics_data)

# train_dataset = tf.data.Dataset.from_tensor_slices(volatolomics_data).shuffle(1000).batch(batch_size)
# padded_dataset = train_dataset.repeat((padding_amount // len(volatolomics_data)) + 1)
# # padded_dataset = padded_dataset.reshuffle(len(volatolomics_data) * (padding_amount // len(volatolomics_data)) + len(volatolomics_data))

# for epoch in range(epochs):
#   iterator = iter(padded_dataset)
#   while True:
#       try:
#           real_samples = next(iterator)
#           gen_loss, disc_loss = train_step(generator, discriminator, real_samples, batch_size, latent_dim)
#       except StopIteration:
#           break
#   if epoch % 10 == 0:
#       print(f"Epoch {epoch}, Generator Loss: {gen_loss.numpy()}, Discriminator Loss: {disc_loss.numpy()}")
