### Fixed conditions
- input : 'commodity_code', 'Year', 'volume', 'close'
- look_back (window size) : 8
- Layer composition and number of nodes
- dropout
- optimizer
- activation function

### param_grid
- batch size
- early stopping patience

In [1]:
import pandas as pd
import numpy as np
import random
import datetime
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error 
from sklearn.model_selection import ParameterGrid
from math import sqrt
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.utils import Sequence
from tensorflow.keras.layers import LSTM, Dense, RepeatVector, Dropout, TimeDistributed
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping

# Turn off pandas' chained-assignment warning for the whole notebook.
pd.set_option('mode.chained_assignment', None)

# Plot styling: 'Malgun Gothic' as the font family, and no Unicode minus glyph
# on the axes.
plt.rcParams.update({
    'font.family': 'Malgun Gothic',
    'axes.unicode_minus': False,
})

# Display floats in DataFrame output with two decimal places.
pd.options.display.float_format = '{:.2f}'.format

### Read file
- The working dataset was downsampled from the raw data to weekly frequency.
- raw data name : Corn, Oat, Cereals & Grains Futures Data    
  (Historical data on Cereals and Grains Futures from Yahoo Finance)
- raw data source : kaggle datasets  
  (https://www.kaggle.com/datasets/guillemservera/grains-and-cereals-futures)

In [2]:
# Weekly grain futures prices (comma-separated, UTF-8 encoded).
weekly_prices_path = '../data/grain_prices/all_grains_data_week_2.csv'
df = pd.read_csv(weekly_prices_path, sep=',', encoding='utf-8')

# Column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7194 entries, 0 to 7193
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   commodity       7194 non-null   object 
 1   week_last_date  7194 non-null   object 
 2   open            7194 non-null   float64
 3   high            7194 non-null   float64
 4   low             7194 non-null   float64
 5   close           7194 non-null   float64
 6   volume          7194 non-null   int64  
dtypes: float64(4), int64(1), object(2)
memory usage: 393.5+ KB


In [3]:
# Parse the week-ending dates, then order the rows by commodity and, within
# each commodity, chronologically.

df = (
    df
    .assign(week_last_date=pd.to_datetime(df['week_last_date']))
    .sort_values(by=['commodity', 'week_last_date'])
)

In [4]:
# Create the year column from the week-ending date

df = df.assign(Year=df['week_last_date'].dt.year)

In [5]:
# Encode each commodity name as an integer category code

commodity_cat = df['commodity'].astype('category')
df['commodity_code'] = commodity_cat.cat.codes

# code -> commodity name lookup, printed for reference
mapping = {code: name for code, name in enumerate(commodity_cat.cat.categories)}
print(mapping)

{0: 'Corn', 1: 'KC HRW Wheat', 2: 'Oat', 3: 'Rough Rice', 4: 'Soybean', 5: 'Soybean Oil'}


In [6]:
# Keep the date (used later for the train/test split) plus the model input variables

input_columns = ['week_last_date', 'commodity_code', 'Year', 'volume', 'close']
df1 = df.loc[:, input_columns]

df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7194 entries, 0 to 7193
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   week_last_date  7194 non-null   datetime64[ns]
 1   commodity_code  7194 non-null   int8          
 2   Year            7194 non-null   int32         
 3   volume          7194 non-null   int64         
 4   close           7194 non-null   float64       
dtypes: datetime64[ns](1), float64(1), int32(1), int64(1), int8(1)
memory usage: 203.9 KB


In [7]:
# Min-max normalization of the numeric columns (commodity_code is left as-is)

df2 = df1.copy()

columns_to_scale = ['Year', 'volume', 'close']
sub_df = df2.loc[:, columns_to_scale]

# NOTE(review): the scaler is fitted on every row of df1, including the most
# recent weeks that are later held out as the test set, so the test period
# influences the scaling range. Consider fitting on the training rows only.
scaler = MinMaxScaler()
scaled_values = scaler.fit(sub_df).transform(sub_df)

df2[columns_to_scale] = scaled_values

df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7194 entries, 0 to 7193
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   week_last_date  7194 non-null   datetime64[ns]
 1   commodity_code  7194 non-null   int8          
 2   Year            7194 non-null   float64       
 3   volume          7194 non-null   float64       
 4   close           7194 non-null   float64       
dtypes: datetime64[ns](1), float64(3), int8(1)
memory usage: 232.0 KB


In [8]:
# Hold out the most recent weeks as the test dataset

look_back = 8
forecast_horizon = 4

dates = df2['week_last_date'].unique()
sorted_dates = sorted(dates)

# The last (look_back + forecast_horizon) distinct dates form the test period.
split_time = sorted_dates[-(look_back + forecast_horizon)]

in_train = df2['week_last_date'] < split_time
in_test = df2['week_last_date'] >= split_time

# Scaled model inputs; the date column is only needed for the split itself.
train_data = df2[in_train].drop(columns=['week_last_date'])
test_data = df2[in_test].drop(columns=['week_last_date'])

# Unscaled close prices (taken from df1) for the same test period.
test_label = df1.loc[
    df1['week_last_date'] >= split_time,
    ['commodity_code', 'week_last_date', 'close'],
]

In [9]:
# Function to convert input data format

def make_dataset(data, look_back, forecast_horizon):
    """Cut sliding (input, target) windows out of each commodity's rows.

    Rows are grouped by the 'commodity_code' column, so a window never mixes
    two commodities. Within a group, rows are used in the order they appear
    in ``data``; the caller is responsible for sorting them chronologically.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'commodity_code' column. Every column (including
        'commodity_code') is kept as a feature in the returned arrays.
    look_back : int
        Number of consecutive rows in each input window.
    forecast_horizon : int
        Number of rows immediately after the input window used as target.

    Returns
    -------
    X : numpy.ndarray, shape (n_windows, look_back, n_columns)
    y : numpy.ndarray, shape (n_windows, forecast_horizon, n_columns)
        When no group is long enough to produce a window, both arrays are
        empty but still three-dimensional, so slicing the feature axis
        downstream keeps working.
    """
    window = look_back + forecast_horizon
    n_columns = data.shape[1]
    X, y = [], []

    for _, commodity_group in data.groupby('commodity_code'):
        # Convert once per group; slicing the array is far cheaper than
        # slicing the DataFrame for every window.
        values = commodity_group.to_numpy()

        # The range bound guarantees every window fits inside the group.
        for start in range(len(values) - window + 1):
            lag_end = start + look_back
            X.append(values[start:lag_end])
            y.append(values[lag_end:start + window])

    if not X:
        return (np.empty((0, look_back, n_columns)),
                np.empty((0, forecast_horizon, n_columns)))

    return np.array(X), np.array(y)

In [10]:
# Window the train / test frames into (samples, timesteps, features) arrays.
# The targets keep only the last column of train_data ('close'), selected with
# a width-1 slice so the feature axis is preserved.

target_start = len(train_data.columns) - 1
target_stop = len(train_data.columns)

# train data - convert to input data format
trainX, trainY = make_dataset(train_data, look_back, forecast_horizon)
trainY = trainY[:, :, target_start:target_stop]

# test data - convert to input data format
testX, testY = make_dataset(test_data, look_back, forecast_horizon)
testY = testY[:, :, target_start:target_stop]

# test label - convert to input data format (all columns are kept)
testX_label, testY_label = make_dataset(test_label, look_back, forecast_horizon)

for features, targets in ((trainX, trainY), (testX, testY), (testX_label, testY_label)):
    print(features.shape, targets.shape)

(7056, 8, 4) (7056, 4, 1)
(6, 8, 4) (6, 4, 1)
(6, 8, 3) (6, 4, 3)


In [11]:
# Search space for the grid search: mini-batch size and early-stopping patience

param_grid = dict(
    batch_size=[8, 12, 16],
    patience=[10, 20],
)

In [12]:
# model building and learning - grid search
#
# For every combination in param_grid the same network is built and trained;
# the combination whose run reaches the lowest validation loss is remembered,
# together with the validation MAE of that same epoch.

best_params = {}
best_loss = float("inf")
best_mae = None

for params in ParameterGrid(param_grid):

    # Release the previous iteration's Keras state so memory does not pile up
    # while models are rebuilt inside the loop.
    tf.keras.backend.clear_session()

    # Seed Python, NumPy and TensorFlow so that every combination starts from
    # the same random state and the comparison is repeatable.
    random.seed(7)
    np.random.seed(7)
    tf.random.set_seed(7)

    model = Sequential()

    # Encoder: two stacked LSTMs reduce the look_back window to one vector.
    model.add(LSTM(128, activation='LeakyReLU', input_shape=(trainX.shape[1], trainX.shape[2]), return_sequences=True))
    model.add(LSTM(64, activation='LeakyReLU'))
    # Decoder: repeat the encoded vector once per forecast step.
    model.add(RepeatVector(trainY.shape[1]))
    model.add(Dropout(0.1))
    model.add(LSTM(32, activation='LeakyReLU', return_sequences=True))
    # One output value per forecast step.
    model.add(TimeDistributed(Dense(1)))

    adam1 = tf.keras.optimizers.Adam(learning_rate=0.001)
    model.compile(optimizer=adam1, loss='mse', metrics=['mae'])

    early_stop = EarlyStopping(monitor='val_loss', patience=params['patience'])
    # NOTE(review): validation_split takes the validation samples from the end
    # of trainX/trainY, and make_dataset appends windows commodity by
    # commodity, so the validation set comes only from the last commodity
    # code(s). Confirm this is intended.
    hist = model.fit(trainX, trainY, epochs=100,
                     batch_size=params['batch_size'], validation_split=0.1, callbacks=[early_stop])

    # Best epoch of this run: lowest validation loss and the MAE at that epoch.
    val_loss = min(hist.history['val_loss'])
    val_mae_at_best_loss = hist.history['val_mae'][hist.history['val_loss'].index(val_loss)]

    if val_loss < best_loss:
        best_loss = val_loss
        best_mae = val_mae_at_best_loss
        best_params = params

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoc

In [13]:
# Report the winning hyperparameter combination and its validation metrics.
for label, value in (
    ("Best params:", best_params),
    ("Best val_loss:", best_loss),
    ("val_mae at best val_loss:", best_mae),
):
    print(label, value)

Best params: {'batch_size': 8, 'patience': 20}
Best val_loss: 1.9058797988691367e-05
val_mae at best val_loss: 0.0037736922968178988
