In [16]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error


1️⃣ [FOLDS] Cross-Validation in Time Series

Starting from this single Time Series:
- We will create FOLDS
-Train/Evaluate our LSTM  on each of these different FOLDS to conclude about the robustness of the model.
(It is very common to create hundreds of folds in Time Series forecasting, in order to cover all types of external conditions: crash market periods, bull markets, atone markets, etc...)


In [17]:
# Load the cleaned and merged dataset
data_path = '../raw_data/cleaned_merge_df_top10.csv'  
data = pd.read_csv(data_path)
data['date'] = pd.to_datetime(data['date'])  # Ensure 'date' is a datetime object
data.set_index('date', inplace=True)  # Set 'date' as index

In [18]:
# Define the number of splits
n_splits = 10  # we can increase this depending on the length and granularity of your data

# Create time series cross-validator
tscv = TimeSeriesSplit(n_splits=n_splits)

# Generate the indices to split data into training and test set
for train_index, test_index in tscv.split(data):
    train, test = data.iloc[train_index], data.iloc[test_index]
    print(f"Train shape: {train.shape}, Test shape: {test.shape}")
    # Here, we could fit our LSTM model on 'train' and evaluate it on 'test'


Train shape: (1740, 17), Test shape: (1739, 17)
Train shape: (3479, 17), Test shape: (1739, 17)
Train shape: (5218, 17), Test shape: (1739, 17)
Train shape: (6957, 17), Test shape: (1739, 17)
Train shape: (8696, 17), Test shape: (1739, 17)
Train shape: (10435, 17), Test shape: (1739, 17)
Train shape: (12174, 17), Test shape: (1739, 17)
Train shape: (13913, 17), Test shape: (1739, 17)
Train shape: (15652, 17), Test shape: (1739, 17)
Train shape: (17391, 17), Test shape: (1739, 17)


In [21]:
# Function to create and train the LSTM model
def train_lstm(train_data, test_data):
    # 'sales' is the column to predict
    X_train, y_train = train_data.drop('sales', axis=1), train_data['sales']
    X_test, y_test = test_data.drop('sales', axis=1), test_data['sales']
    
    # Reshape input to be [samples, time steps, features] which is required for LSTM
    X_train = X_train.values.reshape((X_train.shape[0], 1, X_train.shape[1]))
    X_test = X_test.values.reshape((X_test.shape[0], 1, X_test.shape[1]))
    
    # Build LSTM Model
    model = Sequential()
    model.add(LSTM(50, input_shape=(X_train.shape[1], X_train.shape[2])))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    
    # Fit model
    model.fit(X_train, y_train, epochs=50, batch_size=72, validation_data=(X_test, y_test), verbose=2, shuffle=False)
    
    # Evaluate the model
    test_predict = model.predict(X_test)
    # Compute error metrics, e.g., RMSE
    return model, test_predict

        # Evaluate the model on the test set
    test_loss = model.evaluate(X_test, y_test, verbose=0)
    print(f'Test Loss for the fold: {test_loss}')

# Apply this function in your cross-validation loop


2️⃣ [TRAIN-TEST SPLIT] Holdout method

For each FOLD, we will do a TRAIN-TEST SPLIT to:
fit the model on the train set
evaluate it on the test set
(Always split the train set **chronologically** before the test set!)

In [23]:
# Updated function to prepare data for LSTM
def prepare_data(data):
    # Assuming 'sales' is the column to predict and others are features
    X = data.drop('sales', axis=1)
    y = data['sales']
    X = X.reshape((X.shape[0], 1, X.shape[1]))
    return X, y

# Updated function to create the LSTM model
def build_lstm(input_shape):
    model = Sequential()
    model.add(Input(shape=input_shape))         # Define input shape with Input layer
    model.add(LSTM(50))
    model.add(Dense(1))
    model.compile(loss='mean_absolute_error', optimizer='adam')
    return model

# Loop through each fold (assuming tscv and data are defined)
for train_index, test_index in tscv.split(data):
    # Split data into train and test sets
    train, test = data.iloc[train_index], data.iloc[test_index]
    
    # Train-test split inside the train data for validation
    train_size = int(len(train) * 0.80)  # Adjusted comment to match 80%
    train_data, validate_data = train[:train_size], train[train_size:]
    
    # Prepare data for LSTM
    X_train, y_train = prepare_data(train_data)
    X_validate, y_validate = prepare_data(validate_data)
    X_test, y_test = prepare_data(test)
    
    # Build and fit the LSTM model
    model = build_lstm((1, X_train.shape[2]))  # Updated to pass the correct shape
    model.fit(X_train, y_train, epochs=10, batch_size=72, validation_data=(X_validate, y_validate), verbose=2, shuffle=False)
    
    # Evaluate the model on the test set
    test_loss = model.evaluate(X_test, y_test, verbose=0)
    print(f'Test Loss for the fold: {test_loss}')


AttributeError: 'DataFrame' object has no attribute 'reshape'

3️⃣ [SEQUENCES] Sampling/Extracting sequences
After splitting each fold into a train set and a test set, it is time to:
- 🏋 sample lots of sequences on which the model will be trained
- 👩🏻‍🏫 sample lots of sequences on which the model will be evaluated

👉 All these sequences in the train set and the test set will have a common shape `(input_length, n_features)`
👉 Each sequence has a target, the shape of which will be `(output_length, n_targets)` 

In [24]:
def sample_sequences(data, input_length, output_length):
    """
    Samples sequences for LSTM training and evaluation.

    Args:
    data (DataFrame): The dataset containing features and target.
    input_length (int): The number of timesteps in each input sequence.
    output_length (int): The number of timesteps in each output sequence.

    Returns:
    X, y (np.array): Arrays of input sequences and corresponding target sequences.
    """
    X, y = [], []
    for i in range(len(data) - input_length - output_length + 1):
        # Extract input sequence and corresponding targets
        X.append(data.iloc[i:(i + input_length)].values)
        y.append(data.iloc[(i + input_length):(i + input_length + output_length)]['sales'].values)
    return np.array(X), np.array(y)


In [25]:
# Define parameters
input_length = 112  # Number of days in each input sequence
output_length = 28  # Number of days to predict

# Initialize the TimeSeriesSplit object
tscv = TimeSeriesSplit(n_splits=5)

for train_index, test_index in tscv.split(data):
    # Split data into train and test sets
    train, test = data.iloc[train_index], data.iloc[test_index]
    
    # Sample sequences from train and test data
    X_train, y_train = sample_sequences(train, input_length, output_length)
    X_test, y_test = sample_sequences(test, input_length, output_length)
    
    # Assuming the function to build and train the model is already defined
    # Build the LSTM model and fit it
    model = build_lstm((X_train.shape[1], X_train.shape[2]))  # Ensure this matches the model definition
    model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=2)
    
    # Evaluate the model on the test set
    test_loss = model.evaluate(X_test, y_test, verbose=0)
    print(f'Test Loss for the fold: {test_loss}')


NameError: name 'Input' is not defined