## Preparation

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

: 

### Loading the Dataset

In [2]:
df = pd.read_csv("datasets/creditcard_balanced.csv")

df.head()

Unnamed: 0,V4,V10,V11,V12,V14,V17,Class
0,1.378155,0.090794,-0.5516,-0.617801,-0.311169,0.207971,0
1,0.448154,-0.166974,1.612727,1.065235,-0.143772,-0.114805,0
2,0.37978,0.207643,0.624501,0.066084,-0.165946,1.109969,0
3,-0.863291,-0.054952,-0.226487,0.178228,-0.287924,-0.684093,0
4,0.403034,0.753074,-0.822843,0.538196,-1.11967,-0.237033,0


### Selecting Features and Target

In [3]:
x = df.drop("Class", axis=1)
y = df["Class"]

### Normalize the PCA-treated features

In [4]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(-1, 1))
x_scaled = scaler.fit_transform(x)

### Convert to numpy arrays

In [5]:
x_scaled = np.array(x_scaled)
y = np.array(y)

## cGAN (conditional GAN)

In [6]:
import tensorflow as tf
from tensorflow.keras.layers import (
    Input,
    Concatenate,
    Dense,
    LeakyReLU,
    BatchNormalization,
    Dropout,
)
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy

2024-10-17 01:39:38.450948: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Define the generator model

In [7]:
def build_generator(noise_dim, label_dim, output_dim):
    noise_input = Input(shape=(noise_dim,))
    label_input = Input(shape=(label_dim,))
    merged_input = Concatenate()([noise_input, label_input])

    x = Dense(128)(merged_input)
    x = LeakyReLU(alpha=0.2)(x)
    x = BatchNormalization()(x)
    x = Dense(256)(x)
    x = LeakyReLU(alpha=0.2)(x)
    x = BatchNormalization()(x)
    x = Dense(512)(x)
    x = LeakyReLU(alpha=0.2)(x)
    x = BatchNormalization()(x)
    output = Dense(output_dim, activation="tanh")(x)

    model = Model([noise_input, label_input], output)
    return model

### Define the discriminator model

In [8]:
def build_discriminator(input_dim, label_dim):
    data_input = Input(shape=(input_dim,))
    label_input = Input(shape=(label_dim,))
    merged_input = Concatenate()([data_input, label_input])

    x = Dense(512)(merged_input)
    x = LeakyReLU(alpha=0.2)(x)
    x = Dropout(0.3)(x)
    x = Dense(256)(x)
    x = LeakyReLU(alpha=0.2)(x)
    x = Dropout(0.3)(x)
    x = Dense(128)(x)
    x = LeakyReLU(alpha=0.2)(x)
    output = Dense(1, activation="sigmoid")(x)

    model = Model([data_input, label_input], output)
    return model

### Set dimensions

In [9]:
noise_dim = 100  # Dimension of the noise vector
label_dim = 1  # Binary label (0 or 1)
output_dim = x_scaled.shape[1]  # Number of PCA components

### Define loss function and optimizers

In [10]:
cross_entropy = BinaryCrossentropy(from_logits=False)
generator_optimizer = Adam(1e-4)
discriminator_optimizer = Adam(1e-4)

2024-10-17 01:39:41.984387: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


### Instantiate models

In [11]:
generator = build_generator(noise_dim, label_dim, output_dim)
discriminator = build_discriminator(output_dim, label_dim)

### Define the training step

In [12]:
@tf.function
def train_step(real_data, real_labels):
    noise = tf.random.normal([BATCH_SIZE, noise_dim])
    fake_labels = tf.random.uniform([BATCH_SIZE, 1], minval=0, maxval=2, dtype=tf.int32)
    fake_labels = tf.cast(fake_labels, tf.float32)

    with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
        generated_data = generator([noise, fake_labels], training=True)

        real_output = discriminator([real_data, real_labels], training=True)
        fake_output = discriminator([generated_data, fake_labels], training=True)

        gen_loss = cross_entropy(tf.ones_like(fake_output), fake_output)
        real_loss = cross_entropy(tf.ones_like(real_output), real_output)
        fake_loss = cross_entropy(tf.zeros_like(fake_output), fake_output)
        disc_loss = real_loss + fake_loss

        gradients_of_generator = gen_tape.gradient(
            gen_loss, generator.trainable_variables
        )
        gradients_of_discriminator = disc_tape.gradient(
            disc_loss, discriminator.trainable_variables
        )

        generator_optimizer.apply_gradients(
            zip(gradients_of_generator, generator.trainable_variables)
        )
        discriminator_optimizer.apply_gradients(
            zip(gradients_of_discriminator, discriminator.trainable_variables)
        )

    return gen_loss, disc_loss

### Training loop

In [13]:
EPOCHS = 100
BATCH_SIZE = 256
BUFFER_SIZE = x_scaled.shape[0]

# Prepare the dataset
train_dataset = (
    tf.data.Dataset.from_tensor_slices((x_scaled, y))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
)

for epoch in range(EPOCHS):
    for real_data, real_labels in train_dataset:
        g_loss, d_loss = train_step(real_data, real_labels)

    print(f"Epoch {epoch}, Gen Loss: {g_loss.numpy()}, Disc Loss: {d_loss.numpy()}")

2024-10-17 01:40:17.405835: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_1' with dtype int64 and shape [474592]
	 [[{{node Placeholder/_1}}]]
2024-10-17 01:40:17.406179: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_1' with dtype int64 and shape [474592]
	 [[{{node Placeholder/_1}}]]


Epoch 0, Gen Loss: 0.9166416525840759, Disc Loss: 1.2155795097351074
Epoch 1, Gen Loss: 1.0170763731002808, Disc Loss: 1.2108755111694336
Epoch 2, Gen Loss: 0.9701282978057861, Disc Loss: 1.2651163339614868
Epoch 3, Gen Loss: 0.853053629398346, Disc Loss: 1.265821099281311
Epoch 4, Gen Loss: 0.837211012840271, Disc Loss: 1.2979774475097656
Epoch 5, Gen Loss: 0.9124760627746582, Disc Loss: 1.2265633344650269
Epoch 6, Gen Loss: 0.8193730711936951, Disc Loss: 1.2691962718963623
Epoch 7, Gen Loss: 0.8139760494232178, Disc Loss: 1.333698034286499
Epoch 8, Gen Loss: 0.8408277630805969, Disc Loss: 1.2767391204833984
Epoch 9, Gen Loss: 0.7733614444732666, Disc Loss: 1.3379660844802856
Epoch 10, Gen Loss: 0.8587377667427063, Disc Loss: 1.2894034385681152
Epoch 11, Gen Loss: 0.8073910474777222, Disc Loss: 1.3393070697784424
Epoch 12, Gen Loss: 0.8257250189781189, Disc Loss: 1.31252121925354
Epoch 13, Gen Loss: 0.7807550430297852, Disc Loss: 1.3085529804229736
Epoch 14, Gen Loss: 0.81974643468856

### Generate synthetic data

In [15]:
def generate_synthetic_data(generator, num_samples, noise_dim, label_dim, class_label):
    noise = tf.random.normal([num_samples, noise_dim])
    labels = np.full((num_samples, label_dim), class_label)
    generated_data = generator([noise, labels], training=False)
    return generated_data

### Generate 1000 synthetic samples for each class

In [16]:
class_labels = [0, 1]  # Binary labels
synthetic_data = []
synthetic_labels = []

for class_label in class_labels:
    synthetic_samples = generate_synthetic_data(
        generator, 1000, noise_dim, label_dim, class_label
    )
    synthetic_data.append(synthetic_samples)
    synthetic_labels.extend([class_label] * 1000)

synthetic_data = np.vstack(synthetic_data)
synthetic_labels = np.array(synthetic_labels)

### Inverse transform to original scale

In [17]:
synthetic_data = scaler.inverse_transform(synthetic_data)

### Convert to DataFrame

In [18]:
synthetic_data = scaler.inverse_transform(synthetic_data)
synthetic_df = pd.DataFrame(synthetic_data, columns=x.columns)
synthetic_df["Class"] = synthetic_labels

...

In [19]:
synthetic_df.shape

(2000, 7)

In [20]:
synthetic_df.head()

Unnamed: 0,V4,V10,V11,V12,V14,V17,Class
0,-3.638477,1.337374,-5.469562,0.017342,-1.322709,-0.913055,0
1,-3.064563,1.893935,4.957232,-0.327365,-0.216767,0.309035,0
2,4.499691,-0.157214,3.028811,3.47527,-0.382252,-0.964402,0
3,-0.838434,-0.573273,-3.976091,-0.4938,0.118743,-1.246866,0
4,-4.337239,-1.622592,-2.131279,0.611594,0.408417,-2.078907,0


In [21]:
synthetic_df["Class"].value_counts()

Class
0    1000
1    1000
Name: count, dtype: int64

### Export Synthetic Dataset

In [22]:
synthetic_df.to_csv("datasets/creditcard_synthetic.csv", index=False)