In [8]:
from pathlib import Path

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.layers import Dense, LeakyReLU
from tensorflow.keras.models import Sequential

In [16]:
def load_data(file_path):
    """Read a CSV file and numerically encode its ``Diagnosis`` column.

    Args:
        file_path: Path (or anything ``pd.read_csv`` accepts) of the CSV file.
            The file must contain a ``Diagnosis`` column.

    Returns:
        A DataFrame with the file's columns in their original order, where
        ``Diagnosis`` values ``'M'`` and ``'B'`` have been replaced by ``0``
        and ``1`` respectively. Any other value is left as it was.
    """
    label_codes = {'M': 0, 'B': 1}
    frame = pd.read_csv(file_path)
    # assign() on an existing column overwrites it in place of the old one,
    # so the column order of the file is preserved.
    return frame.assign(Diagnosis=frame['Diagnosis'].replace(label_codes))

def normalize_data(data, feature_range=(-1, 1)):
    """Min-max scale every column of ``data`` into ``feature_range``.

    The default range is (-1, 1) instead of MinMaxScaler's own (0, 1) default
    because ``build_generator`` ends in a ``tanh`` layer, whose outputs lie in
    (-1, 1). Scaling the real rows to the same interval means:

    * real and generated samples shown to the discriminator share one range;
    * ``scaler.inverse_transform`` on generator output stays inside the
      per-column min/max observed in ``data``.

    Args:
        data: 2-D array-like / DataFrame of numeric columns.
        feature_range: ``(low, high)`` target interval passed to
            ``MinMaxScaler``. Pass ``(0, 1)`` to get the library default.

    Returns:
        ``(data_normalized, scaler)``: the scaled values as returned by
        ``fit_transform`` and the fitted scaler, which is needed later to map
        generated rows back to original units.
    """
    scaler = MinMaxScaler(feature_range=feature_range)
    data_normalized = scaler.fit_transform(data)
    return data_normalized, scaler

In [None]:
def build_generator(input_dim, output_dim):
    """Create the (uncompiled) generator network.

    Architecture: Dense(128) -> LeakyReLU(0.01) -> Dense(output_dim, tanh).
    Because the last layer is ``tanh``, every generated value lies in (-1, 1).

    Args:
        input_dim: Length of the noise vector fed to the network.
        output_dim: Number of values produced per sample.

    Returns:
        A ``Sequential`` model; it is not compiled here.
    """
    layers = [
        Dense(128, input_dim=input_dim),
        LeakyReLU(alpha=0.01),
        Dense(output_dim, activation='tanh'),
    ]
    return Sequential(layers)

def build_discriminator(input_dim):
    """Create the (uncompiled) discriminator network.

    Architecture: Dense(128) -> LeakyReLU(0.01) -> Dense(1, sigmoid), i.e. a
    single probability-like score in (0, 1) per input row.

    Args:
        input_dim: Number of values in each sample to be classified.

    Returns:
        A ``Sequential`` model; it is not compiled here.
    """
    layers = [
        Dense(128, input_dim=input_dim),
        LeakyReLU(alpha=0.01),
        Dense(1, activation='sigmoid'),
    ]
    return Sequential(layers)

def build_gan(generator, discriminator):
    """Stack ``generator`` and ``discriminator`` into one end-to-end model.

    The returned model maps noise to the discriminator's score for the
    generated sample. It reuses the two given models as its layers (no copies
    are made) and is NOT compiled here; the caller must call ``compile``.

    Args:
        generator: Model mapping noise to samples.
        discriminator: Model mapping samples to a score.

    Returns:
        A ``Sequential`` model: generator followed by discriminator.
    """
    return Sequential([generator, discriminator])

def train_gan(gan, generator, discriminator, data, epochs=10000, batch_size=32, noise_dim=100):
    """Alternate discriminator and generator updates.

    One loop iteration (called an "epoch" here, but it is a single batch):

    1. Discriminator step: half a batch of rows drawn at random (with
       replacement) from ``data`` with target 1, then half a batch of
       generator outputs with target 0, as two ``train_on_batch`` calls.
    2. Generator step: a full batch of noise through ``gan`` with target 1.

    ``gan`` is expected to be the generator stacked on the same
    ``discriminator`` object (as ``build_gan`` produces). Because the layers
    are shared, the discriminator is frozen while ``gan`` is updated;
    otherwise the generator step would also push the discriminator towards
    labelling fakes as real.

    NOTE(review): Keras versions that snapshot the trainable state at
    ``compile()`` time ignore flag changes made afterwards. With those, the
    discriminator must additionally be frozen before ``gan.compile`` is
    called. Confirm against the installed Keras version.

    Args:
        gan: Compiled stacked model (noise -> discriminator score).
        generator: Generator model; only ``predict`` is used here.
        discriminator: Compiled discriminator whose ``train_on_batch`` returns
            ``[loss, accuracy]``.
        data: 2-D array-like of real samples (a DataFrame is accepted).
        epochs: Number of loop iterations.
        batch_size: Batch size for the generator step; the discriminator sees
            ``batch_size // 2`` real and as many fake rows (at least one).
        noise_dim: Length of each noise vector.

    Returns:
        None. Losses are printed every 100 iterations.
    """
    # Positional fancy indexing below needs an ndarray, not a DataFrame.
    data = np.asarray(data)
    # Guard against batch_size == 1, which would give empty half batches.
    half_batch = max(1, batch_size // 2)

    # Targets never change, so build them once. All use shape (n, 1) to match
    # the discriminator's single output unit.
    real_labels = np.ones((half_batch, 1))
    fake_labels = np.zeros((half_batch, 1))
    valid_y = np.ones((batch_size, 1))

    for epoch in range(epochs):
        # Train discriminator
        discriminator.trainable = True
        idx = np.random.randint(0, data.shape[0], half_batch)
        real_data = data[idx]

        noise = np.random.normal(0, 1, (half_batch, noise_dim))
        # verbose=0: avoid printing a progress bar on every iteration.
        fake_data = generator.predict(noise, verbose=0)

        d_loss_real = discriminator.train_on_batch(real_data, real_labels)
        d_loss_fake = discriminator.train_on_batch(fake_data, fake_labels)
        d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

        # Train generator: freeze the shared discriminator so only the
        # generator's weights are meant to receive this update.
        discriminator.trainable = False
        noise = np.random.normal(0, 1, (batch_size, noise_dim))

        g_loss = gan.train_on_batch(noise, valid_y)

        if epoch % 100 == 0:
            print(f"{epoch} [D loss: {d_loss[0]}, acc.: {100*d_loss[1]}%] [G loss: {g_loss}]")

    # Leave the discriminator in its default, trainable state for the caller.
    discriminator.trainable = True

def generate_synthetic_data(generator, scaler, num_samples, noise_dim=100):
    """Sample new rows from ``generator`` and map them back to original units.

    Args:
        generator: Model whose ``predict`` turns noise into scaled samples.
        scaler: Fitted scaler providing ``inverse_transform``.
        num_samples: Number of rows to generate.
        noise_dim: Length of each noise vector; must match the generator's
            input size.

    Returns:
        Whatever ``scaler.inverse_transform`` returns for the generated batch
        (one row per requested sample).
    """
    # Standard-normal latent vectors, one per requested sample.
    latent = np.random.normal(0, 1, (num_samples, noise_dim))
    scaled_samples = generator.predict(latent)
    return scaler.inverse_transform(scaled_samples)

In [None]:
# Location of the source CSV, given relative to the current working directory.
file_path = 'datasets/breast.csv'
# load_data reads the file and recodes the 'Diagnosis' column ('M' -> 0, 'B' -> 1).
data = load_data(file_path)

In [None]:
# Scale every column with a MinMaxScaler; the fitted scaler is kept so that
# generated rows can be mapped back with inverse_transform later.
data_normalized, scaler = normalize_data(data)
# One generator output / discriminator input per column of the scaled data.
input_dim = data_normalized.shape[1]
# Length of the random noise vector fed to the generator.
noise_dim = 100

generator = build_generator(noise_dim, input_dim)
discriminator = build_discriminator(input_dim)

# The discriminator is compiled on its own (with accuracy tracked) because
# train_gan also trains it directly via train_on_batch.
discriminator.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# NOTE(review): discriminator.trainable is not changed in this cell; confirm
# the training loop freezes it during generator updates.
gan = build_gan(generator, discriminator)
# The stacked model gets its own optimizer instance and reports loss only.
gan.compile(loss='binary_crossentropy', optimizer='adam')

In [None]:
# Run the adversarial training loop on the scaled data; train_gan prints the
# discriminator and generator losses every 100 iterations.
train_gan(gan, generator, discriminator, data_normalized, epochs=10000, batch_size=32, noise_dim=noise_dim)

In [19]:
num_samples = 1000  # Number of synthetic samples you want to generate
# Pass noise_dim explicitly so sampling always matches the input width the
# generator was built with, instead of relying on the function's default.
synthetic_data = generate_synthetic_data(generator, scaler, num_samples, noise_dim=noise_dim)
# Reuse the original column names; the scaler was fitted on all columns of `data`.
synthetic_data_df = pd.DataFrame(synthetic_data, columns=data.columns)

# Save synthetic data to CSV
save_to_file = 'synthetic/manual_ganbc.csv'
# to_csv does not create missing directories, so make sure the target exists.
Path(save_to_file).parent.mkdir(parents=True, exist_ok=True)
synthetic_data_df.to_csv(save_to_file, index=False)
print(f"Synthetic data generated and saved to {save_to_file}")

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
Synthetic data generated and saved to synthetic/manual_ganbc.csv
