## Data Loading

Load original dataset and preprocess the loaded data.

- data_name: stock, energy, or sine
- seq_len: sequence length of the time-series data

In [None]:
from data_loading import real_data_loading
from data_loading import real_data_processing
from data_loading import label_data

In [None]:
# --- Data loading ---
# Arguments forwarded to real_data_loading: the dataset name and date range.
data_name = "AAPL"
start_date = "2005-01-01"
end_date = "2024-11-10"

# Sequence length handed to real_data_processing in a later cell.
seq_len = 60

ori_data = real_data_loading(data_name, start_date, end_date)

# ori_data is still the raw table at this point (later cells call .head() and
# .columns on it); sequences are only built by real_data_processing, so the
# length reported here is a row count, not a sequence count.
print(f"{data_name} dataset is ready. Number of rows: {len(ori_data)}")

In [None]:
# Preview the first rows of the loaded data.
print(ori_data.head())

# Take level 0 of the column index as the list of feature names.
# NOTE(review): feature_names is reassigned to a hard-coded list in the
# labeling cell further down, so this value is only used for the print below.
all_names = ori_data.columns.get_level_values(0)
feature_names= all_names.tolist()
print(feature_names)

In [None]:
# --- Data Processing ---
# Turn the loaded data into a collection of sequences using seq_len.
# NOTE(review): presumably each item is one window of seq_len time steps -
# confirm against real_data_processing.
ori_data_x = real_data_processing(ori_data, seq_len)
# Number of items produced, followed by the shape of the first item.
print(len(ori_data_x))
print(ori_data_x[0].shape)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


# -----------------------------------------------------------------------
# Regime labeling: main execution
# -----------------------------------------------------------------------

# Feature names, in the order handed to label_data.
feature_names = [
    'Adj Close',
    'Close',
    'High',
    'Low',
    'Open',
    'Volume',
    'Log_Return',
    'ATR',
    'BBW',
    'MACD',
    'MACD_Signal',
    'RSI',
]

# Label the processed sequences in ori_data_x. label_data returns the labels
# (ori_data_s) together with a metrics table (metrics_df) that is plotted below.
ori_data_s, metrics_df = label_data(ori_data_x, feature_names)

# --- Inspect the result (skipped entirely when no labels came back) ---
if ori_data_s:
    first_window, last_window = ori_data_x[0], ori_data_x[-1]
    first_label, last_label = ori_data_s[0], ori_data_s[-1]

    print(f"\nExample 'ori_data' item shape: {first_window.shape}")
    print(f"Example 'ori_data_s' item: {first_label}")

    print(f"\nExample 'ori_data' item (another): {last_window.shape}")
    print(f"Example 'ori_data_s' item (another): {last_label}")

    print(f"\nTotal length of 'ori_data': {len(ori_data_x)}")
    print(f"Total length of 'ori_data_s': {len(ori_data_s)}")

    # --- Optional visualization ---
    # Scatter of volatility against maximum drawdown, colored by label, as a
    # visual sanity check of the labeling.
    print("\nGenerating visualization of labeled clusters...")
    scatter_options = dict(
        x='volatility',
        y='mdd',
        hue='label',
        palette={'Normal': 'g', 'Volatile': 'b', 'Crisis': 'r'},
        alpha=0.7,
        s=30,
    )
    plt.figure(figsize=(12, 8))
    sns.scatterplot(data=metrics_df, **scatter_options)
    plt.title('Market Regime Clusters (Labeled)', fontsize=16)
    plt.xlabel('Volatility (Std. Dev. of Log Returns)', fontsize=12)
    plt.ylabel('Maximum Drawdown (MDD)', fontsize=12)
    plt.legend(title='Regime')
    plt.grid(True, linestyle='--', alpha=0.5)
    plt.savefig("market_regime_clusters.png")
    print("Saved cluster visualization to 'market_regime_clusters.png'")

In [1]:
## Necessary packages
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import warnings
warnings.filterwarnings("ignore")

# 1. TimeGAN model
from timegan import timegan
# 3. Metrics
from metrics.discriminative_metrics import discriminative_score_metrics
from metrics.predictive_metrics import predictive_score_metrics
from metrics.visualization_metrics import visualization

2025-11-11 11:29:04.652949: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-11-11 11:29:05.099808: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-11-11 11:29:08.510265: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.


## Set network parameters

TimeGAN network parameters should be optimized for different datasets.

- module: gru, lstm, or lstmLN
- hidden_dim: hidden dimensions
- num_layer: number of layers
- iterations: number of training iterations
- batch_size: the number of samples in each batch

In [None]:
## Network parameters
# Settings passed to timegan() in the next code cell.
parameters = {
    'module': 'gru',
    'hidden_dim': 24,
    'num_layer': 3,
    'iterations': 10000,
    'batch_size': 64,
}

## Run TimeGAN for synthetic time-series data generation

TimeGAN uses the original data and network parameters to return the generated synthetic data.

In [None]:
# Run TimeGAN
# Inputs: the labels (ori_data_s), the processed sequences (ori_data_x) and
# the network parameters defined above.
# Outputs: generated labels and generated sequences; the next cell pairs them
# up by position (generated_data_s[i] with generated_data_x[i]).
generated_data_s, generated_data_x = timegan(ori_data_s, ori_data_x, parameters)
print('Finish Synthetic Data Generation')

In [None]:
# Split the generated and the original sequences into three regime groups.
# Regime codes used by the helpers below: 0 = normal, 1 = crisis, 2 = volatile.


def _regime_of_generated(label):
    """Return the regime code of a generated label from its arg-max position.

    Position 0 maps to normal and position 1 to crisis; every other position
    is treated as volatile.
    """
    position = int(np.argmax(label))
    return position if position in (0, 1) else 2


def _regime_of_original(label):
    """Return the regime code of an original label.

    ``[1, 0, 0]`` is normal, ``[0, 1, 0]`` is crisis and anything else is
    volatile. ``np.array_equal`` is used instead of ``==`` so the comparison
    yields a single bool whether the label is a list or a NumPy array
    (``==`` on an array is element-wise and cannot be used in an ``if``).
    """
    if np.array_equal(label, [1.0, 0.0, 0.0]):
        return 0
    if np.array_equal(label, [0.0, 1.0, 0.0]):
        return 1
    return 2


def _indices_by_regime(labels, regime_of):
    """Return three index lists (normal, crisis, volatile) for ``labels``."""
    buckets = ([], [], [])
    for position, label in enumerate(labels):
        buckets[regime_of(label)].append(position)
    return buckets


def _take(sequences, indices):
    """Return the items of ``sequences`` at ``indices``, in index order."""
    return [sequences[i] for i in indices]


# --- Generated data: regime decided by the arg-max of each generated label ---
normal_idx, crisis_idx, vol_idx = _indices_by_regime(
    generated_data_s, _regime_of_generated
)
generated_data_x_n = _take(generated_data_x, normal_idx)
generated_data_x_c = _take(generated_data_x, crisis_idx)
generated_data_x_v = _take(generated_data_x, vol_idx)

# --- Original data: regime decided by exact match of the one-hot label ---
nor_idx, cri_idx, vo_idx = _indices_by_regime(ori_data_s, _regime_of_original)
ori_data_x_n = _take(ori_data_x, nor_idx)
ori_data_x_c = _take(ori_data_x, cri_idx)
ori_data_x_v = _take(ori_data_x, vo_idx)

# Report generated vs. original group sizes: normal, then crisis, then volatile.
for generated_group, original_group in (
    (generated_data_x_n, ori_data_x_n),
    (generated_data_x_c, ori_data_x_c),
    (generated_data_x_v, ori_data_x_v),
):
    print(len(generated_group))
    print(len(original_group))
    print("/\n")


# print(ori_data_s[0])


In [None]:
import pickle

# Persist every regime group to its own pickle file in the working directory,
# so the evaluation cells can be run without re-training.
for pickle_path, payload in (
    ('generated_data_x_n.pkl', generated_data_x_n),
    ('generated_data_x_c.pkl', generated_data_x_c),
    ('generated_data_x_v.pkl', generated_data_x_v),
    ('ori_data_x_n.pkl', ori_data_x_n),
    ('ori_data_x_c.pkl', ori_data_x_c),
    ('ori_data_x_v.pkl', ori_data_x_v),
):
    with open(pickle_path, 'wb') as f:
        pickle.dump(payload, f)

In [3]:
import pickle


def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # NOTE: unpickling can execute arbitrary code; only load files that were
    # written by this notebook or another trusted source.
    with open(path, 'rb') as f:
        return pickle.load(f)


# Reload the regime groups saved by the previous cell.
generated_data_x_loaded_n = _load_pickle('generated_data_x_n.pkl')
generated_data_x_loaded_c = _load_pickle('generated_data_x_c.pkl')
generated_data_x_loaded_v = _load_pickle('generated_data_x_v.pkl')
ori_data_x_loaded_n = _load_pickle('ori_data_x_n.pkl')
ori_data_x_loaded_c = _load_pickle('ori_data_x_c.pkl')
ori_data_x_loaded_v = _load_pickle('ori_data_x_v.pkl')

# Evaluate the generated data

## Normal

In [4]:
# Sizes of the original and generated "normal" groups; the cells below use
# them to trim the larger group to the size of the smaller one.
len_ori_data_n, len_gen_data_n = (
    len(ori_data_x_loaded_n),
    len(generated_data_x_loaded_n),
)

### 1. Discriminative score

In [5]:
# Number of repeated metric evaluations whose scores are averaged; the
# evaluation cells further down reuse this value.
metric_iteration = 5

# Trim the larger group once so both inputs hold the same number of sequences.
if len_ori_data_n > len_gen_data_n:
    real_n = ori_data_x_loaded_n[:len_gen_data_n]
    synth_n = generated_data_x_loaded_n
else:
    real_n = ori_data_x_loaded_n
    synth_n = generated_data_x_loaded_n[:len_ori_data_n]

# One score per repetition.
discriminative_score = [
    discriminative_score_metrics(real_n, synth_n)
    for _ in range(metric_iteration)
]

mean_score = np.round(np.mean(discriminative_score), 4)
print('Discriminative score: ' + str(mean_score))

2025-11-11 11:29:58.538114: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


TypeError: batch_generator() missing 1 required positional argument: 'batch_size'

### 2. Predictive score

In [None]:
# Trim the larger "normal" group once so both inputs hold the same number of
# sequences.
if len_ori_data_n > len_gen_data_n:
    real_n = ori_data_x_loaded_n[:len_gen_data_n]
    synth_n = generated_data_x_loaded_n
else:
    real_n = ori_data_x_loaded_n
    synth_n = generated_data_x_loaded_n[:len_ori_data_n]

# One predictive score per repetition; the mean is reported.
predictive_score = [
    predictive_score_metrics(real_n, synth_n)
    for _ in range(metric_iteration)
]

mean_score = np.round(np.mean(predictive_score), 4)
print('predictive score: ' + str(mean_score))

### 3. Visualisation

In [None]:
# Trim the larger "normal" group so both inputs hold the same number of
# sequences, then draw one plot per analysis method.
if len_ori_data_n < len_gen_data_n:
    vis_real = ori_data_x_loaded_n
    vis_synth = generated_data_x_loaded_n[:len(ori_data_x_loaded_n)]
else:
    vis_real = ori_data_x_loaded_n[:len(generated_data_x_loaded_n)]
    vis_synth = generated_data_x_loaded_n

for method in ('pca', 'tsne'):
    visualization(vis_real, vis_synth, method)

## Crisis

In [None]:
# Sizes of the original and generated "crisis" groups; the cells below use
# them to trim the larger group to the size of the smaller one.
len_ori_data_c, len_gen_data_c = (
    len(ori_data_x_loaded_c),
    len(generated_data_x_loaded_c),
)

### 1. Discriminative score

In [None]:
# Trim the larger "crisis" group once so both inputs hold the same number of
# sequences.
if len_ori_data_c > len_gen_data_c:
    real_c = ori_data_x_loaded_c[:len_gen_data_c]
    synth_c = generated_data_x_loaded_c
else:
    real_c = ori_data_x_loaded_c
    synth_c = generated_data_x_loaded_c[:len_ori_data_c]

# One discriminative score per repetition; the mean is reported.
discriminative_score = [
    discriminative_score_metrics(real_c, synth_c)
    for _ in range(metric_iteration)
]

mean_score = np.round(np.mean(discriminative_score), 4)
print('Discriminative score: ' + str(mean_score))

### 2. Predictive score

In [None]:
# Trim the larger "crisis" group once so both inputs hold the same number of
# sequences.
if len_ori_data_c > len_gen_data_c:
    real_c = ori_data_x_loaded_c[:len_gen_data_c]
    synth_c = generated_data_x_loaded_c
else:
    real_c = ori_data_x_loaded_c
    synth_c = generated_data_x_loaded_c[:len_ori_data_c]

# One predictive score per repetition; the mean is reported.
predictive_score = [
    predictive_score_metrics(real_c, synth_c)
    for _ in range(metric_iteration)
]

mean_score = np.round(np.mean(predictive_score), 4)
print('predictive score: ' + str(mean_score))

### 3. Visualisation

# Trim the larger "crisis" group so both inputs hold the same number of
# sequences, then draw one plot per analysis method.
if len_ori_data_c < len_gen_data_c:
    vis_real = ori_data_x_loaded_c
    vis_synth = generated_data_x_loaded_c[:len(ori_data_x_loaded_c)]
else:
    vis_real = ori_data_x_loaded_c[:len(generated_data_x_loaded_c)]
    vis_synth = generated_data_x_loaded_c

for method in ('pca', 'tsne'):
    visualization(vis_real, vis_synth, method)

## Volatile

In [None]:
# Sizes of the original and generated "volatile" groups; the cells below use
# them to trim the larger group to the size of the smaller one.
len_ori_data_v, len_gen_data_v = (
    len(ori_data_x_loaded_v),
    len(generated_data_x_loaded_v),
)

### 1. Discriminative Score

In [None]:
# Trim the larger "volatile" group once so both inputs hold the same number
# of sequences.
if len_ori_data_v > len_gen_data_v:
    real_v = ori_data_x_loaded_v[:len_gen_data_v]
    synth_v = generated_data_x_loaded_v
else:
    real_v = ori_data_x_loaded_v
    synth_v = generated_data_x_loaded_v[:len_ori_data_v]

# One discriminative score per repetition; the mean is reported.
discriminative_score = [
    discriminative_score_metrics(real_v, synth_v)
    for _ in range(metric_iteration)
]

mean_score = np.round(np.mean(discriminative_score), 4)
print('Discriminative score: ' + str(mean_score))

### 2. Predictive Score

In [None]:
# Trim the larger "volatile" group once so both inputs hold the same number
# of sequences.
if len_ori_data_v > len_gen_data_v:
    real_v = ori_data_x_loaded_v[:len_gen_data_v]
    synth_v = generated_data_x_loaded_v
else:
    real_v = ori_data_x_loaded_v
    synth_v = generated_data_x_loaded_v[:len_ori_data_v]

# One predictive score per repetition; the mean is reported.
predictive_score = [
    predictive_score_metrics(real_v, synth_v)
    for _ in range(metric_iteration)
]

mean_score = np.round(np.mean(predictive_score), 4)
print('predictive score: ' + str(mean_score))

### 3. Visualisation

In [None]:
# Trim the larger "volatile" group so both inputs hold the same number of
# sequences, then draw one plot per analysis method.
if len_ori_data_v < len_gen_data_v:
    vis_real = ori_data_x_loaded_v
    vis_synth = generated_data_x_loaded_v[:len(ori_data_x_loaded_v)]
else:
    vis_real = ori_data_x_loaded_v[:len(generated_data_x_loaded_v)]
    vis_synth = generated_data_x_loaded_v

for method in ('pca', 'tsne'):
    visualization(vis_real, vis_synth, method)