# In [1]:
# generate_dataset.py
# Generates a synthetic multivariate time-series CSV: lstm_timeseries_dataset.csv
# Usage: python generate_dataset.py

import numpy as np
import pandas as pd
from numpy.random import default_rng

def generate_csv(out_path='lstm_timeseries_dataset.csv', n=5000, seed=42):
    """Write a synthetic daily multivariate time series to a CSV file.

    Three indicator columns are derived from one shared linear ramp and one
    shared sinusoid with a 365-sample period. Each indicator has its own
    offset, mixing weights and independent Gaussian noise. The result is
    saved with a daily ``date`` column starting at 2000-01-01, and a one-line
    summary is printed.

    Args:
        out_path: Destination path of the CSV file.
        n: Number of daily rows to generate.
        seed: Seed passed to ``numpy.random.default_rng``; the same seed
            yields the same noise values.
    """
    generator = default_rng(seed)

    # Noise is drawn in a fixed order (std 5, then 4, then 3); changing the
    # order would change the values produced for a given seed.
    noise_1 = generator.normal(0, 5, n)
    noise_2 = generator.normal(0, 4, n)
    noise_3 = generator.normal(0, 3, n)

    # Deterministic components: a non-stationary ramp plus a yearly cycle.
    day_number = np.arange(n)
    ramp = np.linspace(0, 500, n)
    annual_cycle = 40 * np.sin(2 * np.pi * day_number / 365)

    # Each indicator mixes the shared components with different weights,
    # which keeps the three series correlated but not identical.
    columns = {
        "date": pd.date_range(start="2000-01-01", periods=n, freq="D"),
        "economic_indicator_1": 100 + ramp + annual_cycle + noise_1,
        "economic_indicator_2": 80 + 0.5 * ramp + 0.3 * annual_cycle + noise_2,
        "economic_indicator_3": 60 + 0.2 * ramp + 0.8 * annual_cycle + noise_3,
    }
    frame = pd.DataFrame(columns)

    frame.to_csv(out_path, index=False)
    print(f"Saved {len(frame)} rows to: {out_path}")

# Script entry point: generate the dataset with the default path, size and
# seed only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    generate_csv()


# Captured cell output:
#   Saved dataset with 5000 rows to lstm_timeseries_dataset.csv


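# Captured warning residue (appears to be the tail of a pandas warning from
# another cell; this statement is not part of the script above):
#   df = df.fillna(method='bfill').reset_index(drop=True)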
