## Data Loading

Load original dataset and preprocess the loaded data.

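- data_name: name of the dataset to load (stock, energy, or sine in the original setup; this notebook passes a stock ticker symbol such as "AAPL")
- seq_len: sequence length of the time-series data
- start_date, end_date: date range ("YYYY-MM-DD" strings) passed to the loader together with data_name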

In [None]:
from data_loading import real_data_loading
from data_loading import real_data_processing
from data_loading import label_data

In [None]:
# --- Data loading ---
# Loader settings: the dataset identifier (presumably a stock ticker symbol),
# the sequence length used later by real_data_processing, and the date range
# handed to the loader.
data_name = "AAPL"
seq_len = 60
start_date = "2005-01-01"
end_date = "2024-11-10"

# real_data_loading is a project helper. The following cells call .head() and
# .columns on its result, so it is presumably a pandas DataFrame.
ori_data = real_data_loading(data_name, start_date, end_date)

# At this point ori_data is still the raw table; sequences are only built by
# real_data_processing in a later cell, so report rows rather than sequences.
print(f"{data_name} dataset is ready. Number of rows: {len(ori_data)}")

In [None]:
# Preview the first rows of the loaded data.
preview = ori_data.head()
print(preview)

# Take the level-0 column labels, convert them to a plain list, and show it.
all_names = ori_data.columns.get_level_values(0)
feature_names = all_names.tolist()
print(feature_names)

In [None]:
# --- Data Processing ---
# Build the processed data from the raw data using the configured seq_len.
ori_data_x = real_data_processing(ori_data, seq_len)

# Report the number of processed items, then the shape of the first item.
num_items = len(ori_data_x)
print(num_items)
first_item = ori_data_x[0]
print(first_item.shape)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


#########################################################################
# --- MAIN EXECUTION ---
#########################################################################

# Feature names in the order expected by label_data.
# NOTE(review): this hard-coded list replaces any feature_names value computed
# earlier in the notebook -- confirm it matches the column order of the items
# in ori_data_x.
feature_names = [
    'Adj Close', 'Close', 'High', 'Low', 'Open', 'Volume',
    'Log_Return', 'ATR', 'BBW', 'MACD', 'MACD_Signal', 'RSI'
]

# Run the labeling function.
# 'ori_data_x' is the processed data produced by real_data_processing in the
# earlier cell (not the raw 'ori_data').
ori_data_s, metrics_df = label_data(ori_data_x, feature_names)

# --- Check the results ---
# Explicit None/length check instead of bare truthiness: `if ori_data_s:`
# raises ValueError when ori_data_s is a multi-element NumPy array, while this
# form behaves the same for lists, arrays, and None.
if ori_data_s is not None and len(ori_data_s) > 0:
    # First and last items: shape of the data next to its assigned label.
    print(f"\nExample 'ori_data_x' item shape: {ori_data_x[0].shape}")
    print(f"Example 'ori_data_s' item: {ori_data_s[0]}")

    print(f"\nExample 'ori_data_x' item (another): {ori_data_x[-1].shape}")
    print(f"Example 'ori_data_s' item (another): {ori_data_s[-1]}")

    # The two collections are expected to line up one-to-one.
    print(f"\nTotal length of 'ori_data_x': {len(ori_data_x)}")
    print(f"Total length of 'ori_data_s': {len(ori_data_s)}")

    # --- Optional Visualization ---
    # This plot helps you confirm the labels make sense: one point per row of
    # metrics_df, positioned by its 'volatility' and 'mdd' columns and colored
    # by its 'label' column.
    print("\nGenerating visualization of labeled clusters...")
    plt.figure(figsize=(12, 8))
    sns.scatterplot(
        data=metrics_df,
        x='volatility',
        y='mdd',
        hue='label',
        palette={'Normal': 'g', 'Volatile': 'b', 'Crisis': 'r'},
        alpha=0.7,
        s=30
    )
    plt.title('Market Regime Clusters (Labeled)', fontsize=16)
    plt.xlabel('Volatility (Std. Dev. of Log Returns)', fontsize=12)
    plt.ylabel('Maximum Drawdown (MDD)', fontsize=12)
    plt.legend(title='Regime')
    plt.grid(True, linestyle='--', alpha=0.5)
    # Save the figure to the current working directory.
    plt.savefig("market_regime_clusters.png")
    print("Saved cluster visualization to 'market_regime_clusters.png'")

In [None]:
## Necessary packages
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import warnings
warnings.filterwarnings("ignore")

# 1. TimeGAN model
from timegan import timegan
# 3. Metrics
from metrics.discriminative_metrics import discriminative_score_metrics
from metrics.predictive_metrics import predictive_score_metrics
from metrics.visualization_metrics import visualization

## Set network parameters

TimeGAN network parameters should be optimized for different datasets.

- module: gru, lstm, or lstmLN
- hidden_dim: hidden dimensions
- num_layer: number of layers
- iterations: number of training iterations
- batch_size: the number of samples in each batch

In [None]:
## Network parameters
# Settings dictionary passed to timegan() in the cell that runs the model.
parameters = {
    'module': 'gru',
    'hidden_dim': 24,
    'num_layer': 3,
    'iterations': 22,
    'batch_size': 128,
}

## Run TimeGAN for synthetic time-series data generation

TimeGAN uses the original data and network parameters to return the generated synthetic data.

In [None]:
# Run TimeGAN
# Call timegan with the labels, the processed data, and the network
# parameters; it returns two values, unpacked here in order.
timegan_outputs = timegan(ori_data_s, ori_data_x, parameters)
generated_data_s, generated_data_x = timegan_outputs
print('Finish Synthetic Data Generation')

In [None]:
# Print every entry of generated_data_s on its own line, then the entry count.
for entry in generated_data_s:
    print(entry)
print(len(generated_data_s))

## Evaluate the generated data

### 1. Discriminative score

This metric evaluates the classification accuracy between original and synthetic data using a post-hoc RNN network. The output is |classification accuracy - 0.5|.

- metric_iteration: the number of iterations for metric computation.

In [None]:
metric_iteration = 5

# Compute the discriminative metric metric_iteration times on the processed
# original data and the generated data, collecting one score per run.
discriminative_score = [
    discriminative_score_metrics(ori_data_x, generated_data_x)
    for _ in range(metric_iteration)
]

# Report the mean over all runs, rounded to four decimals.
mean_discriminative = np.round(np.mean(discriminative_score), 4)
print('Discriminative score: ' + str(mean_discriminative))

## Evaluate the generated data

### 2. Predictive score

This metric evaluates prediction performance in the train-on-synthetic, test-on-real setting. More specifically, we use a post-hoc RNN architecture to predict one step ahead and report the performance in terms of MAE.

In [None]:
# Compute the predictive metric metric_iteration times and average the runs.
# The original data argument must be the processed data (ori_data_x), the same
# input the discriminative-score and visualization cells use and the one
# TimeGAN was trained on. ori_data is the raw output of real_data_loading,
# before real_data_processing was applied.
predictive_score = list()
for tt in range(metric_iteration):
  temp_pred = predictive_score_metrics(ori_data_x, generated_data_x)
  predictive_score.append(temp_pred)

# Report the mean over all runs, rounded to four decimals.
print('Predictive score: ' + str(np.round(np.mean(predictive_score), 4)))

## Evaluate the generated data

### 3. Visualization

We visualize the original and synthetic data distributions using PCA and t-SNE analysis.

In [None]:
# Run the visualization helper once per analysis type, PCA first, then t-SNE,
# each time comparing the processed original data with the generated data.
for analysis in ('pca', 'tsne'):
    visualization(ori_data_x, generated_data_x, analysis)