In [1]:
import numpy as np
import pandas as pd
from statsmodels.tsa.arima_process import ArmaProcess

# Build small synthetic train/test "log return" series: an ARMA(1, 1) signal
# whose amplitude is scaled by a noisy volatility level containing one
# high-volatility window.

# Set the random seed for reproducibility
np.random.seed(42)

# Lag polynomials in the statsmodels convention: each array starts with the
# zero-lag coefficient 1, and the AR coefficients carry a negated sign.
# [1, -0.5] therefore means phi = 0.5, and [1, 0.25] means theta = 0.25.
ar = np.array([1, -0.5])
ma = np.array([1, 0.25])

# NOTE: despite the variable name (kept so later cells can still refer to it),
# this is an ARMA(1, 1) process. ArmaProcess applies no differencing, so there
# is no integrated ("I") component; the samples are used directly as returns.
ARIMA_process = ArmaProcess(ar, ma)


def _simulate_returns(process, nsample, burst):
    """Sample `process` and scale it by a volatility series with one burst.

    Parameters
    ----------
    process : object exposing ``generate_sample(nsample=...)``
        The process to draw the base series from.
    nsample : int
        Number of observations to generate.
    burst : slice
        Positions whose volatility is redrawn around 3 instead of 1.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        The volatility-scaled series and the volatility series itself.
    """
    returns = process.generate_sample(nsample=nsample)
    # Baseline volatility is noise around 1 ...
    volatility = np.random.normal(1, 0.2, nsample)
    # ... with one contiguous window redrawn around 3. This is a crude
    # stand-in for volatility clustering (high-volatility days following
    # high-volatility days); it is not a fitted or simulated GARCH model.
    volatility[burst] = np.random.normal(3, 0.2, volatility[burst].size)
    returns *= volatility
    return returns, volatility


# Training set: 100 observations, high-volatility window at positions 40..59
y_train, volatility_train = _simulate_returns(ARIMA_process, 100, slice(40, 60))

# Test set: 10 observations, high-volatility window at positions 3..6
y_test, volatility_test = _simulate_returns(ARIMA_process, 10, slice(3, 7))

# Put synthetic data into DataFrames for easier handling
df_train = pd.DataFrame(y_train, columns=['Log_Return'])
df_test = pd.DataFrame(y_test, columns=['Log_Return'])

# Last expression of the cell: the notebook displays both previews as a tuple
df_train.head(), df_test.head()

(   Log_Return
 0    0.356107
 1    0.214562
 2    0.680204
 3    1.721134
 4    1.133851,
    Log_Return
 0    1.976549
 1   -0.136954
 2    0.155210
 3   -3.876501
 4   -4.635968)