In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import sys
import os

# Check Python version
print(f"Python Version: {sys.version}")

# Check TensorFlow version
print(f"TensorFlow Version: {tf.__version__}")

# Check if TensorFlow can access the GPU
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    print(f"Number of GPUs detected: {len(gpus)}")
    for gpu in gpus:
        print(f"GPU: {gpu}")
else:
    print("No GPU detected. TensorFlow is using the CPU.")

# Check CUDA and cuDNN versions
cuda_version = tf.sysconfig.get_build_info().get("cuda_version", "Not found")
cudnn_version = tf.sysconfig.get_build_info().get("cudnn_version", "Not found")
print(f"CUDA Version: {cuda_version}")
print(f"cuDNN Version: {cudnn_version}")

# Check if Pandas is installed and its version
try:
    print(f"Pandas Version: {pd.__version__}")
except ImportError:
    print("Pandas is not installed.")

# Check if NumPy is installed and its version
try:
    print(f"NumPy Version: {np.__version__}")
except ImportError:
    print("NumPy is not installed.")

# Check TensorFlow Privacy 
try:
    from tensorflow_privacy import DPKerasAdamOptimizer
    print("TensorFlow Privacy is installed.")
except ImportError:
    print("TensorFlow Privacy is not installed.")


2024-09-09 22:49:19.412333: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-09 22:49:19.412394: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-09 22:49:19.412449: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-09 22:49:19.435846: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Python Version: 3.10.12 (main, Jul 29 2024, 16:56:48) [GCC 11.4.0]
TensorFlow Version: 2.14.0
Number of GPUs detected: 8
GPU: PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')
GPU: PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU')
GPU: PhysicalDevice(name='/physical_device:GPU:2', device_type='GPU')
GPU: PhysicalDevice(name='/physical_device:GPU:3', device_type='GPU')
GPU: PhysicalDevice(name='/physical_device:GPU:4', device_type='GPU')
GPU: PhysicalDevice(name='/physical_device:GPU:5', device_type='GPU')
GPU: PhysicalDevice(name='/physical_device:GPU:6', device_type='GPU')
GPU: PhysicalDevice(name='/physical_device:GPU:7', device_type='GPU')
CUDA Version: 11.8
cuDNN Version: 8
Pandas Version: 2.2.2
NumPy Version: 1.26.4
TensorFlow Privacy is installed.


In [2]:
!nvcc --version
!nvidia-smi

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2021 NVIDIA Corporation
Built on Thu_Nov_18_09:45:30_PST_2021
Cuda compilation tools, release 11.5, V11.5.119
Build cuda_11.5.r11.5/compiler.30672275_0
Mon Sep  9 22:49:25 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03             Driver Version: 535.129.03   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 2080 Ti     Off | 00000000:1A:00.0 Off |                  N/A |
| 27%   27C    P8               9W / 250W |   9965MiB / 11264MiB |      0%      Default |
|                                      

In [3]:
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))


Num GPUs Available:  8


In [4]:
from tensorflow_privacy import DPKerasAdamOptimizer


In [6]:
import tensorflow as tf
import pandas as pd
import numpy as np
import os
import time

# Load the real data
root_path = '/srv/fs/my-notebooks'
file_path = os.path.join(root_path, 'user_skill_feature_mapping.csv')
real_data = pd.read_csv(file_path)

# Define the features used during GAN training
aggregated_features = ['cutDirDeviation_mean', 'cutDirDeviation_std', 
                       'cutDistanceToCenter_mean', 'cutDistanceToCenter_std', 
                       'saberSpeed_mean', 'saberSpeed_std', 
                       'cutAngle_mean', 'cutAngle_std']
X = real_data[aggregated_features].values.astype(np.float32)

# WGAN-GP Parameters
BUFFER_SIZE = X.shape[0]
BATCH_SIZE = 64  # Reduced batch size
LATENT_DIM = 128
EPOCHS = 500
CRITIC_ITERATIONS = 5
GP_WEIGHT = 10.0
LEARNING_RATE = 1e-4

# Generator Model
def build_generator():
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(128, activation='relu', input_dim=LATENT_DIM),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.LeakyReLU(alpha=0.2),
        tf.keras.layers.Dense(256, activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.LeakyReLU(alpha=0.2),
        tf.keras.layers.Dense(X.shape[1], activation='linear')
    ])
    return model

# Critic Model (Discriminator equivalent in WGAN)
def build_critic():
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(256, input_shape=(X.shape[1],)),
        tf.keras.layers.LeakyReLU(alpha=0.2),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(128),
        tf.keras.layers.LeakyReLU(alpha=0.2),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(1)  # No activation for WGAN
    ])
    return model

# Gradient Penalty Calculation with matched batch sizes
def gradient_penalty(critic, real_data, fake_data):
    # Ensure batch sizes are equal by resizing if necessary
    batch_size = min(real_data.shape[0], fake_data.shape[0])
    real_data = real_data[:batch_size]
    fake_data = fake_data[:batch_size]
    
    real_data = tf.cast(real_data, tf.float32)
    fake_data = tf.cast(fake_data, tf.float32)
    
    epsilon = tf.random.uniform([batch_size, 1], 0.0, 1.0)
    interpolated = epsilon * real_data + (1 - epsilon) * fake_data
    with tf.GradientTape() as tape:
        tape.watch(interpolated)
        pred = critic(interpolated)
    grads = tape.gradient(pred, interpolated)
    grads_l2 = tf.sqrt(tf.reduce_sum(tf.square(grads), axis=[1]))
    gradient_penalty = tf.reduce_mean((grads_l2 - 1.0) ** 2)
    return gradient_penalty

# Loss functions
def critic_loss(real_output, fake_output, gp):
    return tf.reduce_mean(fake_output) - tf.reduce_mean(real_output) + GP_WEIGHT * gp

def generator_loss(fake_output):
    return -tf.reduce_mean(fake_output)

# Optimizers
generator_optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE, beta_1=0.5, beta_2=0.9)
critic_optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE, beta_1=0.5, beta_2=0.9)

# Custom training step for WGAN-GP
@tf.function
def distributed_train_step(real_data):
    def train_step(real_data):
        noise = tf.random.normal([BATCH_SIZE, LATENT_DIM])

        # Train Critic
        for _ in range(CRITIC_ITERATIONS):
            with tf.GradientTape() as crit_tape:
                fake_data = generator(noise)
                real_output = critic(real_data)
                fake_output = critic(fake_data)
                gp = gradient_penalty(critic, real_data, fake_data)
                crit_loss = critic_loss(real_output, fake_output, gp)
            
            critic_gradients = crit_tape.gradient(crit_loss, critic.trainable_variables)
            critic_optimizer.apply_gradients(zip(critic_gradients, critic.trainable_variables))

        # Train Generator
        with tf.GradientTape() as gen_tape:
            fake_data = generator(noise)
            fake_output = critic(fake_data)
            gen_loss = generator_loss(fake_output)

        generator_gradients = gen_tape.gradient(gen_loss, generator.trainable_variables)
        generator_optimizer.apply_gradients(zip(generator_gradients, generator.trainable_variables))

        return gen_loss, crit_loss

    # Run the step using the single GPU strategy
    per_replica_gen_loss, per_replica_crit_loss = strategy.run(train_step, args=(real_data,))
    
    # Aggregate losses across replicas
    total_gen_loss = strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_gen_loss, axis=None)
    total_crit_loss = strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_crit_loss, axis=None)

    return total_gen_loss, total_crit_loss

# Training loop for WGAN-GP
def train_wgan_gp(generator, critic, dataset, epochs):
    start_time = time.time()
    for epoch in range(epochs):
        for real_data in dataset:
            gen_loss, crit_loss = distributed_train_step(real_data)

        print(f"Epoch {epoch + 1}/{epochs}, Generator Loss: {gen_loss:.4f}, Critic Loss: {crit_loss:.4f}")

    total_time = time.time() - start_time
    print(f"Training completed in {total_time // 60} minutes")

# Dataset preparation
def prepare_dataset():
    dataset = tf.data.Dataset.from_tensor_slices(X).shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
    return dataset

# Main execution
strategy = tf.distribute.OneDeviceStrategy(device="/gpu:0")  # Single GPU for stability
with strategy.scope():
    generator = build_generator()
    critic = build_critic()

    dataset = prepare_dataset()

    # Train the WGAN-GP
    train_wgan_gp(generator, critic, dataset, EPOCHS)


2024-09-09 22:49:47.642841: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1886] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 510 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 Ti, pci bus id: 0000:1a:00.0, compute capability: 7.5
2024-09-09 22:49:47.644597: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1886] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 512 MB memory:  -> device: 1, name: NVIDIA GeForce RTX 2080 Ti, pci bus id: 0000:1b:00.0, compute capability: 7.5
2024-09-09 22:49:47.646025: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1886] Created device /job:localhost/replica:0/task:0/device:GPU:2 with 512 MB memory:  -> device: 2, name: NVIDIA GeForce RTX 2080 Ti, pci bus id: 0000:1d:00.0, compute capability: 7.5
2024-09-09 22:49:47.647341: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1886] Created device /job:localhost/replica:0/task:0/device:GPU:3 with 512 MB memory:  -> device: 3, name: NVIDIA GeForce RTX 208

Epoch 1/500, Generator Loss: -0.7078, Critic Loss: -0.9357
Epoch 2/500, Generator Loss: 1.3317, Critic Loss: -0.2023
Epoch 3/500, Generator Loss: 0.5531, Critic Loss: -0.8348
Epoch 4/500, Generator Loss: 0.6499, Critic Loss: -0.7195
Epoch 5/500, Generator Loss: 0.5593, Critic Loss: -0.6168
Epoch 6/500, Generator Loss: 0.4094, Critic Loss: -0.3883
Epoch 7/500, Generator Loss: 0.3663, Critic Loss: -0.4992
Epoch 8/500, Generator Loss: 0.4315, Critic Loss: -0.4226
Epoch 9/500, Generator Loss: 0.2106, Critic Loss: -0.3854
Epoch 10/500, Generator Loss: 0.1852, Critic Loss: -0.3380
Epoch 11/500, Generator Loss: 0.4944, Critic Loss: -0.3707
Epoch 12/500, Generator Loss: 0.1245, Critic Loss: -0.3192
Epoch 13/500, Generator Loss: 0.2529, Critic Loss: -0.1146
Epoch 14/500, Generator Loss: 0.3874, Critic Loss: -0.2754
Epoch 15/500, Generator Loss: 0.2980, Critic Loss: -0.3155
Epoch 16/500, Generator Loss: 0.1011, Critic Loss: -0.2233
Epoch 17/500, Generator Loss: 0.1482, Critic Loss: -0.2404
Epoch

In [7]:
# Generate synthetic data
def generate_synthetic_data(generator, num_samples):
    noise = tf.random.normal([num_samples, LATENT_DIM])
    synthetic_data = generator(noise)
    return synthetic_data.numpy()

# Example: Generate 1000 synthetic data points
num_samples = 1000
synthetic_data = generate_synthetic_data(generator, num_samples)

# Convert to DataFrame for easier analysis
synthetic_df = pd.DataFrame(synthetic_data, columns=aggregated_features)
print(synthetic_df.head())


   cutDirDeviation_mean  cutDirDeviation_std  cutDistanceToCenter_mean  \
0             -0.001712             0.999435                 -0.006841   
1             -0.003251             0.992640                 -0.005330   
2             -0.002491             0.984297                 -0.004461   
3             -0.001528             0.995437                 -0.006087   
4             -0.000257             1.004976                 -0.009085   

   cutDistanceToCenter_std  saberSpeed_mean  saberSpeed_std  cutAngle_mean  \
0                 1.017986         0.007647        1.015966       0.018947   
1                 1.008172         0.009244        1.007786       0.017057   
2                 1.000382         0.007055        1.001602       0.017351   
3                 1.014217         0.008096        1.011679       0.019237   
4                 1.025519         0.008893        1.019437       0.020453   

   cutAngle_std  
0      1.025492  
1      1.017061  
2      1.004769  
3      1.02405

In [8]:
# Real data statistics
real_stats = real_data[aggregated_features].describe()

# Synthetic data statistics
synthetic_stats = synthetic_df.describe()

print("Real Data Statistics:\n", real_stats)
print("\nSynthetic Data Statistics:\n", synthetic_stats)


Real Data Statistics:
        cutDirDeviation_mean  cutDirDeviation_std  cutDistanceToCenter_mean  \
count          2.598000e+03          2598.000000              2.598000e+03   
mean           1.424928e-06             1.000007              8.794410e-07   
std            8.784777e-05             0.000131              4.585554e-05   
min           -6.338356e-04             0.994173             -9.708187e-05   
25%           -9.605894e-18             1.000001             -8.821337e-18   
50%            1.725709e-18             1.000003              5.768350e-18   
75%            1.523262e-17             1.000009              2.327524e-17   
max            4.430973e-03             1.001131              2.335010e-03   

       cutDistanceToCenter_std  saberSpeed_mean  saberSpeed_std  \
count              2598.000000     2.598000e+03     2598.000000   
mean                  0.999143     1.009144e-07        0.999791   
std                   0.027839     1.056026e-05        0.008837   
min   

In [9]:
from scipy import stats

for feature in aggregated_features:
    real_values = real_data[feature].values
    synthetic_values = synthetic_df[feature].values
    
    ks_stat, p_value = stats.ks_2samp(real_values, synthetic_values)
    print(f"KS Test for {feature}: Stat = {ks_stat}, P-value = {p_value}")


KS Test for cutDirDeviation_mean: Stat = 0.9342301770592764, P-value = 0.0
KS Test for cutDirDeviation_std: Stat = 0.7228452655889146, P-value = 1.18e-321
KS Test for cutDistanceToCenter_mean: Stat = 0.998, P-value = 0.0
KS Test for cutDistanceToCenter_std: Stat = 0.9888452655889145, P-value = 0.0
KS Test for saberSpeed_mean: Stat = 1.0, P-value = 0.0
KS Test for saberSpeed_std: Stat = 0.9948452655889145, P-value = 0.0
KS Test for cutAngle_mean: Stat = 1.0, P-value = 0.0
KS Test for cutAngle_std: Stat = 0.9996150885296382, P-value = 0.0


In [10]:
# Save synthetic data to CSV
synthetic_df.to_csv('/srv/fs/my-notebooks/synthetic_data.csv', index=False)
