# GAN

Dit notebook bevat de GAN die m.b.v. optiedata synthetische optiedata maakt. Input is optiedata, output is synthetische data.

Bevat ook nog wat analyse van Jesper.

In [1]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from matplotlib import rc
import numpy as np
from IPython.display import display, clear_output
import seaborn as sns





In [2]:
# Apply seaborn's "whitegrid" style to every figure created in this notebook.
sns.set(style="whitegrid")

# Optional (currently disabled): render plot text through LaTeX with a serif font,
# using the `rc` helper imported from matplotlib.
# rc('text', usetex=True)
# rc('font', family='serif')

# Preprocess Data

In [3]:
# Read the raw futures-option data.
# NOTE(review): absolute, user-specific path; consider a configurable data directory.
Gold = pd.read_excel(r"C:\Users\koens\OneDrive\Bureaublad\Research-Practicum\Data\Real Data\FUT_Option.xlsx")

# Convert the date columns to datetime. Unparseable option expiries become NaT
# (errors='coerce') instead of raising.
Gold['date'] = pd.to_datetime(Gold['date'])
Gold['futures_expiration_date'] = pd.to_datetime(Gold['futures_expiration_date'])
Gold['options_expiration_date'] = pd.to_datetime(Gold['options_expiration_date'], errors='coerce')

# Keep options expiring on or after 2019-10-18 (NaT rows compare False and drop out).
# .copy() gives an independent frame, so the column assignments below do not write
# into a slice of the original (avoids SettingWithCopyWarning).
Gold = Gold[Gold['options_expiration_date'] >= '2019-10-18'].copy()

# Time to maturity in years: calendar days until option expiry / 365.25
Gold['TTM'] = (Gold['options_expiration_date'] - Gold['date']).dt.days / 365.25

# Rescale the price-like columns by 1e6 in one statement
rescale_columns = ['futures_close', 'strike', 'bid', 'ask', 'settlement', 'vega']
Gold[rescale_columns] = Gold[rescale_columns] / 1000000

# Isolate calls and puts, each sorted by date (new frames, no in-place mutation)
Gold_call = Gold[Gold['call_put'] == 'C'].copy().sort_values('date')
Gold_put = Gold[Gold['call_put'] == 'P'].copy().sort_values('date')

# Training features: numeric columns of the calls, without the Greeks
data = Gold_call.select_dtypes(include=[np.number])
data = data.drop(['delta', 'vega', 'gamma', 'theta'], axis=1)

# Filter out rows with IV > 1000.
# NOTE(review): rows with a missing IV are kept by this mask (NaN > 1000 is False);
# confirm the data has no NaNs before it reaches the scaler.
mask = data['iv'] > 1000
data = data[~mask]

# Normalize every feature to [-1, 1], the same range as the generator's tanh output
scaler = MinMaxScaler(feature_range=(-1, 1))
data_scaled = scaler.fit_transform(data)


In [None]:
# Display the filtered (unscaled) feature frame that the GAN is trained on.
data

# Build the Market Data Simulator

### Parameters

In [4]:

# Seed Python's `random`, NumPy and TensorFlow in one call so that shuffling,
# weight initialisation and noise sampling are repeatable after a kernel restart.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# Set hyperparameters
BATCH_SIZE = 128        # rows per training batch; also passed to the model builders as first-layer width
NOISE_DIM = 75          # length of the latent noise vector fed to the generator
BUFFER_SIZE = 5389      # shuffle buffer; NOTE(review): presumably the row count of data_scaled - confirm it still matches
EPOCHS = 7500           # full passes over the training set
random_samples = 6000   # number of synthetic rows generated after training
NEURONS_GEN = 256       # width of the generator's second hidden layer
NEURONS_DISC = 64       # width of the discriminator's second hidden layer
DROPOUT = 0.3           # dropout rate used in the discriminator

# Define the training dataset: shuffle the scaled rows, then cut them into batches
train_dataset = tf.data.Dataset.from_tensor_slices(data_scaled).shuffle(BUFFER_SIZE).batch(BATCH_SIZE)

# Other parameters
accuracy = 5                   # strikes are rounded to multiples of this value
step_daily = 0.00273785078     # 1 / 365.25, i.e. one day expressed in years
step_weekly = 0.14285714285    # NOTE(review): this is 1/7, not 7/365.25 - confirm the intended unit


### Define the GAN's networks

In [5]:

def make_generator_model(BATCH_SIZE, NOISE_DIM, NEURONS_GEN, data_scaled):
    '''Build the generator: maps a noise vector to one synthetic feature row.

    Args:
        BATCH_SIZE: width of the first hidden Dense layer.
            NOTE(review): the batch size doubles as a layer width here - confirm intended.
        NOISE_DIM: length of the input noise vector.
        NEURONS_GEN: width of the second hidden Dense layer.
        data_scaled: 2-D array of training data; only its column count is used,
            to size the output layer.

    Returns:
        An uncompiled tf.keras.Sequential model with a tanh output layer.
    '''
    n_features = data_scaled.shape[1]

    model = tf.keras.Sequential()
    # Hidden block 1: dense + batch norm, fed by the noise vector
    model.add(layers.Dense(BATCH_SIZE, activation='relu', input_shape=(NOISE_DIM,)))
    model.add(layers.BatchNormalization())
    # Hidden block 2
    model.add(layers.Dense(NEURONS_GEN, activation='relu'))
    model.add(layers.BatchNormalization())
    # Output: one value per feature, squashed to (-1, 1) by tanh
    model.add(layers.Dense(n_features, activation='tanh'))
    return model

def make_discriminator_model(BATCH_SIZE, NEURONS_DISC, DROPOUT, data_scaled):
    '''Build the discriminator: scores a feature row with a value in (0, 1).

    Args:
        BATCH_SIZE: width of the first hidden Dense layer.
            NOTE(review): the batch size doubles as a layer width here - confirm intended.
        NEURONS_DISC: width of the second hidden Dense layer.
        DROPOUT: dropout rate applied after each hidden layer.
        data_scaled: 2-D array of training data; only its column count is used,
            to size the input.

    Returns:
        An uncompiled tf.keras.Sequential model with a single sigmoid output.
    '''
    n_features = data_scaled.shape[1]

    model = tf.keras.Sequential()
    # Hidden block 1: dense + dropout, fed by one feature row
    model.add(layers.Dense(BATCH_SIZE, activation='relu', input_shape=(n_features,)))
    model.add(layers.Dropout(DROPOUT))
    # Hidden block 2
    model.add(layers.Dense(NEURONS_DISC, activation='relu'))
    model.add(layers.Dropout(DROPOUT))
    # Output: a single sigmoid unit
    model.add(layers.Dense(1, activation='sigmoid'))
    return model



In [8]:
# Instantiate the two networks from the hyperparameters defined above
generator = make_generator_model(BATCH_SIZE, NOISE_DIM, NEURONS_GEN, data_scaled)
discriminator = make_discriminator_model(BATCH_SIZE, NEURONS_DISC, DROPOUT, data_scaled)

# Binary cross-entropy on probabilities: the discriminator ends in a sigmoid,
# so the loss is constructed with its default arguments (no logits).
cross_entropy = tf.keras.losses.BinaryCrossentropy()
# One Adam optimizer per network (default settings), so each keeps its own state
generator_optimizer = tf.keras.optimizers.Adam()
discriminator_optimizer = tf.keras.optimizers.Adam()

### Train the model

In [10]:
# One adversarial update of both networks on a single batch
@tf.function
def train_step(real_data):
    '''Run one generator update and one discriminator update.

    Args:
        real_data: one batch of scaled real rows from the training dataset.

    The noise batch always has BATCH_SIZE rows, regardless of how many rows
    `real_data` contains. Returns nothing; the weights of the global
    `generator` and `discriminator` are updated in place.
    '''
    # Latent input for the generator
    latent_batch = tf.random.normal([BATCH_SIZE, NOISE_DIM])

    # Two tapes: one per network, so each loss is differentiated separately
    with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
        synthetic_batch = generator(latent_batch, training=True)

        # Discriminator scores for real and generated rows
        real_scores = discriminator(real_data, training=True)
        fake_scores = discriminator(synthetic_batch, training=True)

        # Generator wants its samples labelled 1
        gen_loss = cross_entropy(tf.ones_like(fake_scores), fake_scores)
        # Discriminator wants real rows labelled 1 and generated rows labelled 0
        loss_on_real = cross_entropy(tf.ones_like(real_scores), real_scores)
        loss_on_fake = cross_entropy(tf.zeros_like(fake_scores), fake_scores)
        disc_loss = loss_on_real + loss_on_fake

    # Gradients of each loss with respect to its own network's weights
    gen_vars = generator.trainable_variables
    disc_vars = discriminator.trainable_variables
    gen_grads = gen_tape.gradient(gen_loss, gen_vars)
    disc_grads = disc_tape.gradient(disc_loss, disc_vars)

    # Apply the updates
    generator_optimizer.apply_gradients(zip(gen_grads, gen_vars))
    discriminator_optimizer.apply_gradients(zip(disc_grads, disc_vars))

# Epoch loop around train_step
def train(dataset, epochs):
    '''Train the GAN for `epochs` full passes over `dataset`.

    Args:
        dataset: iterable of batches; each batch is handed to train_step.
        epochs: number of passes over the dataset.

    After each epoch the cell output is replaced by a progress message.
    '''
    for epoch_number in range(1, epochs + 1):
        for data_batch in dataset:
            train_step(data_batch)

        # Overwrite the previous progress line instead of appending a new one
        clear_output(wait=True)
        display(f'Epoch {epoch_number}/{epochs} completed')

# Start training with the configured dataset and epoch count
train(train_dataset, EPOCHS)

'Epoch 20/7500 completed'

KeyboardInterrupt: 

### Get the synthetic data

In [None]:
# Draw `random_samples` noise vectors of length NOISE_DIM
random_noise = tf.random.normal([random_samples, NOISE_DIM])

# Run the generator in inference mode (training=False) to produce scaled synthetic rows
simulated_data = generator(random_noise, training=False)

# Map the generator output back to the original feature scale with the fitted scaler.
# NOTE(review): `simulated_data` is a TensorFlow tensor here - presumably converted to
# a NumPy array implicitly by scikit-learn; confirm.
simulated_data_rescaled = scaler.inverse_transform(simulated_data)

# Wrap the result in a DataFrame that reuses the column names of the real data
simulated_data_df = pd.DataFrame(simulated_data_rescaled, columns=data.columns)

# Save the synthetic data to disk.
# NOTE(review): absolute, user-specific output path.
simulated_data_df.to_excel(r"C:\Users\koens\OneDrive\Bureaublad\Research-Practicum\Data\Synthetic Data\synthetic_gold_lowsamples_7500.xlsx", index=False)

### Hyperparameters meaning

BATCH_SIZE (128): Number of training examples used in one iteration of model training. The dataset is divided into batches, and each batch is fed into the network one at a time. A batch size of 128 means that 128 data points from the dataset are used for each training step. Note that in this notebook BATCH_SIZE is also passed to the model builders, where it sets the width of the first dense layer of both networks.

NOISE_DIM (75): Refers to the dimensionality of the random noise vector that is input into the generator network. A noise dimension of 75 means that the generator takes in a random vector of size 75 to generate data.

BUFFER_SIZE (5389): Defines the size of the buffer used to shuffle the dataset. A larger buffer size ensures better randomization of data. In TensorFlow, Dataset.shuffle(buffer_size) randomly shuffles the elements of the dataset using a buffer of that size; the shuffle is only fully uniform when the buffer is at least as large as the dataset. A buffer size of 5389 means the dataset is shuffled using a buffer that can hold 5389 elements.

# Analyze the Synthetic Data

In [None]:
# Show the last 50 synthetic rows
simulated_data_df.tail(50)

### Discretize the Time to Maturity (TTM)

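Replaces each TTM value in the generated data with the nearest TTM value from the original dataset. Note: the code for this step in the next cell is currently commented out, so the generated TTM (and futures_close) values are left as produced by the generator.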

In [None]:
# NOTE(review): only this import executes; every discretization step below is
# commented out, so TTM and futures_close keep the raw values produced by the generator.
from scipy.spatial import cKDTree

# Option A (disabled): quantize TTM values to the nearest multiple of the daily step
# simulated_data_df['TTM'] = (np.round(simulated_data_df['TTM'] / step_daily) * step_daily).astype(float)

# Option B (disabled): build a KD-tree over the original TTM values for nearest-neighbour lookup
# original_TTM_tree = cKDTree(np.array(data['TTM']).reshape(-1, 1))

# ...and replace each generated TTM with the nearest original TTM
# generated_TTM_values = simulated_data_df['TTM']
# nearest_indices = original_TTM_tree.query(generated_TTM_values.values.reshape(-1, 1))[1]
# simulated_data_df['TTM'] = np.array(data['TTM'])[nearest_indices]

# Same idea for futures_close (disabled): KD-tree over the original futures_close values.
# The variable names below are reused from the TTM step although they hold futures_close data.
# original_TTM_tree = cKDTree(np.array(data['futures_close']).reshape(-1, 1))

# ...and replace each generated futures_close with the nearest original futures_close
# generated_TTM_values = simulated_data_df['futures_close']
# nearest_indices = original_TTM_tree.query(generated_TTM_values.values.reshape(-1, 1))[1]
# simulated_data_df['futures_close'] = np.array(data['futures_close'])[nearest_indices]

### Round the prices

In [None]:
# Snap every strike to the nearest multiple of `accuracy`
strike_in_steps = simulated_data_df['strike'] / accuracy
simulated_data_df['strike'] = strike_in_steps.round() * accuracy

# Moneyness = rounded strike divided by the futures close
simulated_data_df['Moneyness'] = simulated_data_df['strike'].div(simulated_data_df['futures_close'])

# Report the number of synthetic rows
print(f'Amount of rows in simulated data: {len(simulated_data_df)}')

In [None]:
# Summary statistics of the real training features
data.describe()

In [None]:
# Summary statistics of the synthetic data, for comparison with the real data
simulated_data_df.describe()

### Plot features

In [None]:
# Overlay real and synthetic points in the (IV, TTM) plane on one set of axes
fig, ax = plt.subplots()
for frame, series_label in ((data, "Real"), (simulated_data_df, "GAN")):
    ax.scatter(frame['iv'], frame['TTM'], alpha=0.75, s=5, label=series_label)
ax.set_title("Volatility distribution of real and synthetic data")
ax.set_xlabel("Implied volatility")
ax.set_ylabel("TTM")
ax.legend()

In [None]:
# Display the synthetic frame, which now includes the rounded strike and Moneyness columns
simulated_data_df

In [None]:
# How many synthetic rows share each distinct TTM value
simulated_data_df['TTM'].value_counts()

In [None]:
# Distinct TTM values present in the synthetic data
unique_ttms = simulated_data_df['TTM'].unique()

# Reference grid: 0, 0.5, 1.0, ... up to (excluding) the largest TTM rounded to an integer
target_ttms = np.arange(0, max(unique_ttms).round(), 0.5)

# For every grid point, pick the generated TTM with the smallest absolute distance.
# Rows of the distance matrix are grid points, columns are generated TTMs.
distance_to_target = np.abs(unique_ttms[np.newaxis, :] - target_ttms[:, np.newaxis])
closest_ttms = unique_ttms[distance_to_target.argmin(axis=1)]

In [None]:

# Plot 'iv' against 'Moneyness' separately for each distinct generated 'TTM'
for ttm in unique_ttms:
    # Restrict to the rows of this TTM before sorting, so each figure shows
    # only its own maturity rather than the whole synthetic data set
    subset = simulated_data_df[simulated_data_df['TTM'] == ttm]
    sorted_data = subset.sort_values(by='Moneyness')

    plt.figure()
    plt.plot(sorted_data['Moneyness'], sorted_data['iv'], alpha = 0.75)
    # Call round() so the title shows the value, not the bound-method repr
    plt.title(f'TTM = {ttm.round(4)}')
    plt.xlabel('Moneyness')
    plt.ylabel('IV')
    # tight_layout acts on the current figure only, so apply it per figure
    plt.tight_layout()

plt.show()

In [None]:

# One IV-versus-moneyness figure per distinct generated TTM
for ttm in unique_ttms:
    # Rows belonging to this TTM, ordered by moneyness so the line is drawn left to right
    subset = simulated_data_df.loc[simulated_data_df['TTM'] == ttm]
    subset_sorted = subset.sort_values(by='Moneyness')

    # Draw on an explicit Axes and show the figure immediately
    fig, ax = plt.subplots()
    ax.plot(subset_sorted['Moneyness'], subset_sorted['iv'], alpha=0.75)
    ax.set_title(f'TTM = {ttm.round(4)}')
    ax.set_xlabel('Moneyness')
    ax.set_ylabel('IV')
    ax.grid(True)
    plt.show()

# No tight_layout call here: each figure is shown on its own inside the loop

In [None]:
# How many synthetic rows share each distinct futures_close value
simulated_data_df['futures_close'].value_counts()

In [None]:
def _plot_real_vs_synthetic_hist(column, xlabel, suptitle, bins=30):
    """Draw side-by-side histograms of one column: synthetic (left) and real (right).

    Args:
        column: column name present in both `simulated_data_df` and `data`.
        xlabel: x-axis label used on both panels.
        suptitle: title placed above the pair of panels.
        bins: number of histogram bins for both panels.
    """
    fig, (ax_gan, ax_real) = plt.subplots(1, 2, figsize=(12, 4))
    panels = (
        (ax_gan, simulated_data_df[column], 'GAN'),
        (ax_real, data[column], 'Real'),
    )
    for ax, values, panel_title in panels:
        ax.hist(values, bins=bins, alpha=0.7)
        ax.set_title(panel_title)
        ax.set_xlabel(xlabel)
        ax.set_ylabel('Frequency')

    fig.suptitle(suptitle, fontsize=16)
    fig.tight_layout()
    plt.show()


# Marginal distributions: synthetic versus real
_plot_real_vs_synthetic_hist('strike', 'Strike Price',
                             'Strike Price Distribution of Real and Synthetic data')
_plot_real_vs_synthetic_hist('TTM', 'TTM',
                             'Time to Maturity Distribution of Real and Synthetic Data', bins=60)
_plot_real_vs_synthetic_hist('iv', 'Implied Volatility',
                             'Implied Volatility Distribution of Synthetic and Real Option Data')

# Distinct (TTM, futures_close) pairs in the synthetic data
unique_combinations = simulated_data_df[['TTM', 'futures_close']].drop_duplicates()
kurtosis = []
skewness = []
labels = []

# Walk the data once per (TTM, futures_close) group. groupby hands over each group's
# rows directly, replacing a boolean filter over the whole frame per combination;
# sort=False keeps groups in order of first appearance.
grouped = simulated_data_df.groupby(['TTM', 'futures_close'], sort=False)
for (specific_expiration_date, specific_future_close), filtered_data in grouped:
    sorted_data = filtered_data.sort_values(by='Moneyness')

    # Record shape statistics of the IV values in this group. Groups whose kurtosis
    # is NaN (too few rows) or >= 20 fail the comparison and are not recorded.
    kurtosis_value = filtered_data['iv'].kurtosis()
    if kurtosis_value < 20:
        kurtosis.append(kurtosis_value)
        labels.append(f'{specific_expiration_date} - {specific_future_close}')
        skewness.append(filtered_data['iv'].skew())

    # IV smile for this group
    plt.figure()
    plt.plot(sorted_data['Moneyness'], sorted_data['iv'])
    plt.title(f'Implied Volatility vs Moneyness\nExpiration Date: {specific_expiration_date}, Future Close: {specific_future_close}')
    plt.xlabel('Moneyness')
    plt.ylabel('Implied Volatility')
    plt.grid(True)

In [None]:
# Degree of the polynomial fitted to each smile (n=3 is a cubic)
n = 3

# Number of TTM groups that had too few points for a degree-n fit
skipped_ttms = 0

for ttm in unique_ttms:
    subset = simulated_data_df[simulated_data_df['TTM'] == ttm]
    subset_sorted = subset.sort_values(by='Moneyness')

    # A degree-n polynomial has n + 1 coefficients; with fewer points the
    # least-squares problem is under-determined and the fitted curve is
    # meaningless, so such groups are skipped.
    if len(subset_sorted) < n + 1:
        skipped_ttms += 1
        continue

    coefficients = np.polyfit(subset_sorted['Moneyness'], subset_sorted['iv'], n)

    # Create a polynomial function with the obtained coefficients
    polynomial = np.poly1d(coefficients)

    # Generate x values spanning the observed moneyness range for the fitted curve
    x_values = np.linspace(subset_sorted['Moneyness'].min(), subset_sorted['Moneyness'].max(), 100)

    # Plot the synthetic points and the fitted polynomial curve
    plt.figure()
    plt.scatter(subset_sorted['Moneyness'], subset_sorted['iv'], label='Original Data', s = 5)
    plt.plot(x_values, polynomial(x_values), color='red', label=f'{n}-degree Polynomial Fit')
    plt.title(f'Volatility Smile with Polynomial Fit of TTM = {ttm}')
    plt.xlabel('Moneyness')
    plt.ylabel('IV')
    plt.legend()
    plt.show()

if skipped_ttms:
    print(f'Skipped {skipped_ttms} TTM value(s) with fewer than {n + 1} points')