In [1]:
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX

In [2]:
UTI_Gold_df=pd.read_csv('UTI_Gold_ETF_MF_USD_01012023_31122024_ALL.csv',parse_dates=True,index_col='All_Date')
UTI_Gold_df.tail()

Unnamed: 0_level_0,MF_NAV,ETF_Price,ETF_Open,ETF_High,ETF_Low,ETF_Vol.,ETF_Change %,USD_Price,USD_Open,USD_High,USD_Low,USD_Change %,Gold_Open,Gold_High,Gold_Low,Gold_Close,Gold_Volume
All_Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2024-12-24,14.7623,64.3,64.6,64.6,64.05,556.50K,-0.46%,85.176,85.1,85.218,85.075,0.10%,2613.0,2620.0,2609.5,2620.0,35.0
2024-12-26,14.8195,64.55,65.1,65.1,64.35,60.38K,0.39%,85.254,85.201,85.319,85.195,0.07%,2628.5,2638.800049,2627.899902,2638.800049,84.0
2024-12-27,14.8653,64.75,65.25,65.25,64.25,51.05K,0.31%,85.388,85.253,85.823,85.229,0.16%,2617.699951,2617.699951,2616.399902,2617.199951,642.0
2024-12-30,14.8192,64.55,64.95,67.0,64.45,106.25K,-0.31%,85.474,85.437,85.605,85.426,0.10%,2620.699951,2626.899902,2597.0,2606.100098,794.0
2024-12-31,14.8421,64.65,64.8,64.85,64.35,49.70K,0.15%,85.554,85.538,85.687,85.516,0.09%,2608.399902,2629.199951,2604.899902,2629.199951,401.0


In [3]:
UTI_Gold_df.index

DatetimeIndex(['2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05',
               '2023-01-06', '2023-01-09', '2023-01-10', '2023-01-11',
               '2023-01-12', '2023-01-13',
               ...
               '2024-12-17', '2024-12-18', '2024-12-19', '2024-12-20',
               '2024-12-23', '2024-12-24', '2024-12-26', '2024-12-27',
               '2024-12-30', '2024-12-31'],
              dtype='datetime64[ns]', name='All_Date', length=489, freq=None)

In [4]:
#Check for null values
UTI_Gold_df.isnull().sum()

MF_NAV           0
ETF_Price        0
ETF_Open         0
ETF_High         0
ETF_Low          0
ETF_Vol.         0
ETF_Change %     0
USD_Price        0
USD_Open         0
USD_High         0
USD_Low          0
USD_Change %     0
Gold_Open       16
Gold_High       16
Gold_Low        16
Gold_Close      16
Gold_Volume     16
dtype: int64

In [5]:
for column in UTI_Gold_df.columns:
    if UTI_Gold_df[column].isna().sum() > 0:
        UTI_Gold_df[column].bfill(inplace=True)
print(" Missing values after imputation:")
print(UTI_Gold_df.isna().sum())

 Missing values after imputation:
MF_NAV          0
ETF_Price       0
ETF_Open        0
ETF_High        0
ETF_Low         0
ETF_Vol.        0
ETF_Change %    0
USD_Price       0
USD_Open        0
USD_High        0
USD_Low         0
USD_Change %    0
Gold_Open       0
Gold_High       0
Gold_Low        0
Gold_Close      0
Gold_Volume     0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  UTI_Gold_df[column].bfill(inplace=True)


In [6]:
UTI_Gold_df.dtypes

MF_NAV          float64
ETF_Price       float64
ETF_Open        float64
ETF_High        float64
ETF_Low         float64
ETF_Vol.         object
ETF_Change %     object
USD_Price       float64
USD_Open        float64
USD_High        float64
USD_Low         float64
USD_Change %     object
Gold_Open       float64
Gold_High       float64
Gold_Low        float64
Gold_Close      float64
Gold_Volume     float64
dtype: object

In [7]:
UTI_Gold_df.index = pd.to_datetime(UTI_Gold_df.index, errors='coerce')
UTI_Gold_df.head()

Unnamed: 0_level_0,MF_NAV,ETF_Price,ETF_Open,ETF_High,ETF_Low,ETF_Vol.,ETF_Change %,USD_Price,USD_Open,USD_High,USD_Low,USD_Change %,Gold_Open,Gold_High,Gold_Low,Gold_Close,Gold_Volume
All_Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2023-01-02,10.8613,47.2,46.85,47.65,46.65,71.20K,0.53%,82.745,82.67,82.8,82.56,0.03%,1836.199951,1839.699951,1836.199951,1839.699951,29.0
2023-01-03,10.9525,47.6,47.85,47.85,47.35,133.23K,0.85%,82.774,82.698,83.038,82.654,0.04%,1836.199951,1839.699951,1836.199951,1839.699951,29.0
2023-01-04,11.0553,48.05,47.65,48.1,47.5,119.40K,0.95%,82.641,82.779,82.919,82.667,-0.16%,1845.599976,1859.099976,1845.599976,1852.800049,25.0
2023-01-05,10.991,47.8,48.0,48.05,47.7,53.18K,-0.52%,82.568,82.74,82.808,82.426,-0.09%,1855.199951,1855.199951,1834.800049,1834.800049,24.0
2023-01-06,10.9449,47.6,48.1,48.1,47.45,90.70K,-0.42%,82.27,82.604,82.785,82.275,-0.36%,1838.400024,1868.199951,1835.300049,1864.199951,26.0


In [8]:
#To use the data in time series analysis we should convert the object columns into numerical 
vol_str=UTI_Gold_df['ETF_Vol.']
def convert_volume(vol_str):
    vol_str = str(vol_str).upper().strip()
    if 'K' in vol_str:
        return float(vol_str.replace('K', '')) * 1_000
    else:
         return pd.to_numeric(vol_str, errors='coerce')

UTI_Gold_df['ETF_Vol.'] = UTI_Gold_df['ETF_Vol.'].apply(convert_volume)

In [9]:
UTI_Gold_df['ETF_Change %'] = UTI_Gold_df['ETF_Change %'].str.replace('%', '').astype(float)


In [10]:
UTI_Gold_df['USD_Change %'] = UTI_Gold_df['USD_Change %'].str.replace('%', '').astype(float)

In [11]:
UTI_Gold_df.head()

Unnamed: 0_level_0,MF_NAV,ETF_Price,ETF_Open,ETF_High,ETF_Low,ETF_Vol.,ETF_Change %,USD_Price,USD_Open,USD_High,USD_Low,USD_Change %,Gold_Open,Gold_High,Gold_Low,Gold_Close,Gold_Volume
All_Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2023-01-02,10.8613,47.2,46.85,47.65,46.65,71200.0,0.53,82.745,82.67,82.8,82.56,0.03,1836.199951,1839.699951,1836.199951,1839.699951,29.0
2023-01-03,10.9525,47.6,47.85,47.85,47.35,133230.0,0.85,82.774,82.698,83.038,82.654,0.04,1836.199951,1839.699951,1836.199951,1839.699951,29.0
2023-01-04,11.0553,48.05,47.65,48.1,47.5,119400.0,0.95,82.641,82.779,82.919,82.667,-0.16,1845.599976,1859.099976,1845.599976,1852.800049,25.0
2023-01-05,10.991,47.8,48.0,48.05,47.7,53180.0,-0.52,82.568,82.74,82.808,82.426,-0.09,1855.199951,1855.199951,1834.800049,1834.800049,24.0
2023-01-06,10.9449,47.6,48.1,48.1,47.45,90700.0,-0.42,82.27,82.604,82.785,82.275,-0.36,1838.400024,1868.199951,1835.300049,1864.199951,26.0


In [12]:
y=UTI_Gold_df['MF_NAV']
X=UTI_Gold_df.drop('MF_NAV',axis=1)

In [13]:
X.shape

(489, 16)

In [14]:
y.shape

(489,)

In [15]:
X.isnull().sum()

ETF_Price        0
ETF_Open         0
ETF_High         0
ETF_Low          0
ETF_Vol.        20
ETF_Change %     0
USD_Price        0
USD_Open         0
USD_High         0
USD_Low          0
USD_Change %     0
Gold_Open        0
Gold_High        0
Gold_Low         0
Gold_Close       0
Gold_Volume      0
dtype: int64

In [16]:
for column in X.columns:
    if X[column].isna().sum() > 0:
        X[column].bfill(inplace=True)
print(" Missing values after imputation:")
print(X.isna().sum())

 Missing values after imputation:
ETF_Price       0
ETF_Open        0
ETF_High        0
ETF_Low         0
ETF_Vol.        0
ETF_Change %    0
USD_Price       0
USD_Open        0
USD_High        0
USD_Low         0
USD_Change %    0
Gold_Open       0
Gold_High       0
Gold_Low        0
Gold_Close      0
Gold_Volume     0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[column].bfill(inplace=True)


## Evaluating ARIMA model for the data

In [17]:
from statsmodels.tsa.arima.model import ARIMA

model_arima = ARIMA(y,X, order=(1,1,1)).fit()
print("ARIMA AIC:", model_arima.aic)
print("ARIMA BIC:", model_arima.bic)

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


ARIMA AIC: -5634.482186609689
ARIMA BIC: -5554.86619389848




## Evaluating SARIMAX model for the data

In [18]:
from statsmodels.tsa.statespace.sarimax import SARIMAX

model_sarimax = SARIMAX(y,X, order=(1,1,1), seasonal_order=(1,1,1,28)).fit()
print("SARIMAX AIC:", model_sarimax.aic)
print("SARIMAX BIC:", model_sarimax.bic)

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


SARIMAX AIC: -5181.897635417108
SARIMAX BIC: -5095.141879137962


## Evaluating GARCH model for the data

The **GARCH (Generalized Autoregressive Conditional Heteroskedasticity)** model in Python is primarily found within the arch package. This package is specifically designed for modeling time series data with heteroskedasticity, a condition where the volatility of a series changes over time. To use GARCH models, it is first necessary to install the arch package

In [23]:
from arch import arch_model

model_garch = arch_model(y,X, vol='ARCH', p=1, q=1).fit()
print("ARCH AIC:", model_garch.aic)
print("ARCH BIC:", model_garch.bic)

Iteration:      1,   Func. Count:      5,   Neg. LLF: 6968954475819.691
Iteration:      2,   Func. Count:     12,   Neg. LLF: 282093113.9439594
Iteration:      3,   Func. Count:     17,   Neg. LLF: 36078.45150804447
Iteration:      4,   Func. Count:     22,   Neg. LLF: 1956.1443165535552
Iteration:      5,   Func. Count:     27,   Neg. LLF: 1606.712966479211
Iteration:      6,   Func. Count:     32,   Neg. LLF: 731.1108387428251
Iteration:      7,   Func. Count:     37,   Neg. LLF: 524.6380889931363
Iteration:      8,   Func. Count:     41,   Neg. LLF: 561.2113627201691
Iteration:      9,   Func. Count:     50,   Neg. LLF: 137191.68268803906
Iteration:     10,   Func. Count:     56,   Neg. LLF: 1300.4148923706568
Iteration:     11,   Func. Count:     62,   Neg. LLF: 489.9790509059339
Iteration:     12,   Func. Count:     66,   Neg. LLF: 489.9543085601906
Iteration:     13,   Func. Count:     70,   Neg. LLF: 489.9271756916358
Iteration:     14,   Func. Count:     74,   Neg. LLF: 489.916

In [20]:
from arch import arch_model

model_garch = arch_model(y,X, vol='GARCH', p=1, q=1).fit()
print("GARCH AIC:", model_garch.aic)
print("GARCH BIC:", model_garch.bic)

Iteration:      1,   Func. Count:      6,   Neg. LLF: 187945341103.52496
Iteration:      2,   Func. Count:     15,   Neg. LLF: 1448226090.2339172
Iteration:      3,   Func. Count:     21,   Neg. LLF: 769.9854698359719
Iteration:      4,   Func. Count:     27,   Neg. LLF: 3873.2643386108066
Iteration:      5,   Func. Count:     34,   Neg. LLF: 3531.7526231852517
Iteration:      6,   Func. Count:     40,   Neg. LLF: 522.8516018818497
Iteration:      7,   Func. Count:     46,   Neg. LLF: 520.4830784605891
Iteration:      8,   Func. Count:     52,   Neg. LLF: 576.1623431011423
Iteration:      9,   Func. Count:     58,   Neg. LLF: 575.6040512965284
Iteration:     10,   Func. Count:     64,   Neg. LLF: 562.3849323984737
Iteration:     11,   Func. Count:     70,   Neg. LLF: 576.4433678321523
Iteration:     12,   Func. Count:     76,   Neg. LLF: 493.7170004651451
Iteration:     13,   Func. Count:     82,   Neg. LLF: 752.2566986855779
Iteration:     14,   Func. Count:     88,   Neg. LLF: 489.64

In [19]:
!pip install arch



## Evaluating multivariate LSTM model for the data

In [42]:
y=UTI_Gold_df['MF_NAV']
X=UTI_Gold_df.drop('MF_NAV',axis=1)

In [62]:
import pandas as pd
import numpy as np

# Load the dataset
try:
    df = pd.read_csv('UTI_Gold_ETF_MF_USD_01012023_31122024_ALL.csv')

    # --- Initial Inspection ---
    print("--- Data Info ---")
    df.info()

    print("\n--- First 5 Rows ---")
    print(df.head())

    print("\n--- Last 5 Rows ---")
    print(df.tail())

    print("\n--- Missing Values (Initial) ---")
    print(df.isnull().sum())

    # Check for non-numeric placeholders like ' , , , ,' which read_csv might miss
    print("\n--- Unique values in Gold_Open (example) ---")
    # Replacing potential empty strings or spaces read as objects before checking isnull again
    df.replace(r'^\s*$', np.nan, regex=True, inplace=True)
    # Also replace specific non-numeric placeholders if suspected
    df.replace(',', np.nan, inplace=True) # Replace single commas if they exist alone in cells
    print(df['Gold_Open'].unique()[:20]) # Show some unique values

    print("\n--- Missing Values (After replacing placeholders) ---")
    # Re-check nulls after replacement
    print(df.isnull().sum())


except FileNotFoundError:
    print("Error: input_file_0.csv not found.")
except Exception as e:
    print(f"An error occurred: {e}")

--- Data Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 489 entries, 0 to 488
Data columns (total 18 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   All_Date      489 non-null    object 
 1   MF_NAV        489 non-null    float64
 2   ETF_Price     489 non-null    float64
 3   ETF_Open      489 non-null    float64
 4   ETF_High      489 non-null    float64
 5   ETF_Low       489 non-null    float64
 6   ETF_Vol.      489 non-null    object 
 7   ETF_Change %  489 non-null    object 
 8   USD_Price     489 non-null    float64
 9   USD_Open      489 non-null    float64
 10  USD_High      489 non-null    float64
 11  USD_Low       489 non-null    float64
 12  USD_Change %  489 non-null    object 
 13  Gold_Open     473 non-null    float64
 14  Gold_High     473 non-null    float64
 15  Gold_Low      473 non-null    float64
 16  Gold_Close    473 non-null    float64
 17  Gold_Volume   473 non-null    float64
dtypes: float64(1

In [63]:
# Function to clean volume strings (K, M)
def clean_volume(volume_str):
    if isinstance(volume_str, (int, float)):
        return volume_str
    volume_str = str(volume_str).strip()
    if volume_str.endswith('K'):
        return float(volume_str[:-1]) * 1000
    elif volume_str.endswith('M'):
        return float(volume_str[:-1]) * 1000000
    else:
        try:
            return float(volume_str)
        except ValueError:
            return np.nan # Return NaN if conversion fails

# Function to clean percentage strings (%)
def clean_percentage(perc_str):
    if isinstance(perc_str, (int, float)):
        return perc_str
    perc_str = str(perc_str).strip()
    if perc_str.endswith('%'):
        try:
            # Remove '%' and convert to float.
            # No need to divide by 100 here as scaling will handle magnitudes
            return float(perc_str[:-1])
        except ValueError:
            return np.nan
    else:
        try:
            return float(perc_str)
        except ValueError:
            return np.nan # Return NaN if conversion fails

# --- Preprocessing ---

# 1. Parse Date and Set Index
try:
    df['All_Date'] = pd.to_datetime(df['All_Date'], format='%m-%d-%Y') # Adjust format if needed
    df.set_index('All_Date', inplace=True)
    df.sort_index(inplace=True) # Ensure chronological order
    print("Date parsing and indexing successful.")
except ValueError as e:
    print(f"Date parsing error: {e}. Trying alternative formats...")
    # Add alternative formats if the first one fails
    try:
        df['All_Date'] = pd.to_datetime(df['All_Date']) # Try letting pandas infer
        df.set_index('All_Date', inplace=True)
        df.sort_index(inplace=True)
        print("Date parsing (inferred format) and indexing successful.")
    except Exception as e_inner:
        print(f"Could not parse date column: {e_inner}")
        # Stop execution if date parsing fails
        raise SystemExit("Stopping due to date parsing failure.")


# 2. Clean Object Columns
df['ETF_Vol.'] = df['ETF_Vol.'].apply(clean_volume)
df['ETF_Change %'] = df['ETF_Change %'].apply(clean_percentage)
df['USD_Change %'] = df['USD_Change %'].apply(clean_percentage)
print("Object columns cleaned.")

# 3. Handle Missing Values (Forward Fill)
# Check NaNs *after* cleaning, as cleaning might introduce NaNs
print("\n--- Missing Values Before Fill ---")
print(df.isnull().sum())

df.fillna(method='ffill', inplace=True)
# Check if any NaNs remain (especially at the beginning if the first row had NaNs)
if df.isnull().values.any():
     df.fillna(method='bfill', inplace=True) # Backfill if needed for leading NaNs

print("\n--- Missing Values After Fill ---")
print(df.isnull().sum())


# 4. Verify Data Types
print("\n--- Data Types After Cleaning ---")
print(df.dtypes)

print("\n--- Data Head After Preprocessing ---")
print(df.head())

Date parsing and indexing successful.
Object columns cleaned.

--- Missing Values Before Fill ---
MF_NAV           0
ETF_Price        0
ETF_Open         0
ETF_High         0
ETF_Low          0
ETF_Vol.         0
ETF_Change %     0
USD_Price        0
USD_Open         0
USD_High         0
USD_Low          0
USD_Change %     0
Gold_Open       16
Gold_High       16
Gold_Low        16
Gold_Close      16
Gold_Volume     16
dtype: int64

--- Missing Values After Fill ---
MF_NAV          0
ETF_Price       0
ETF_Open        0
ETF_High        0
ETF_Low         0
ETF_Vol.        0
ETF_Change %    0
USD_Price       0
USD_Open        0
USD_High        0
USD_Low         0
USD_Change %    0
Gold_Open       0
Gold_High       0
Gold_Low        0
Gold_Close      0
Gold_Volume     0
dtype: int64

--- Data Types After Cleaning ---
MF_NAV          float64
ETF_Price       float64
ETF_Open        float64
ETF_High        float64
ETF_Low         float64
ETF_Vol.        float64
ETF_Change %    float64
USD_Price

  df.fillna(method='ffill', inplace=True)
  df.fillna(method='bfill', inplace=True) # Backfill if needed for leading NaNs


In [64]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split # Although we'll split sequentially

# --- Feature Selection, Scaling, Sequencing ---

# 1. Select Features (X) and Target (y)
target_col = 'MF_NAV'
features = df.drop(target_col, axis=1)
target = df[[target_col]] # Keep as DataFrame

# Use all other columns as features
feature_cols = features.columns.tolist()
print(f"Target Column: {target_col}")
print(f"Feature Columns: {feature_cols}")
print(f"Number of Features: {len(feature_cols)}")

# 2. Train/Test Split (Sequential)
split_ratio = 0.8
split_index = int(len(df) * split_ratio)

X_train_df = features[:split_index]
X_test_df = features[split_index:]
y_train_df = target[:split_index]
y_test_df = target[split_index:]

print(f"\nTraining set size: {len(X_train_df)}")
print(f"Test set size: {len(X_test_df)}")

# 3. Scaling
# Initialize separate scalers for features and target
feature_scaler = MinMaxScaler(feature_range=(0, 1))
target_scaler = MinMaxScaler(feature_range=(0, 1))

# Fit scalers ONLY on training data
feature_scaler.fit(X_train_df)
target_scaler.fit(y_train_df)

# Transform both training and test sets
X_train_scaled = feature_scaler.transform(X_train_df)
X_test_scaled = feature_scaler.transform(X_test_df)
y_train_scaled = target_scaler.transform(y_train_df)
y_test_scaled = target_scaler.transform(y_test_df) # Will be used for comparison later

# 4. Create Sequences
# Function to create sequences
def create_sequences(features, target, time_steps=60):
    Xs, ys = [], []
    for i in range(len(features) - time_steps):
        Xs.append(features[i:(i + time_steps)]) # Sequence of features
        ys.append(target[i + time_steps])       # Target value after the sequence
    return np.array(Xs), np.array(ys)

# Define sequence length (number of past days to use for prediction)
TIME_STEPS = 60

# Create sequences for training and testing sets
X_train, y_train = create_sequences(X_train_scaled, y_train_scaled, TIME_STEPS)
X_test, y_test = create_sequences(X_test_scaled, y_test_scaled, TIME_STEPS)

print(f"\nTraining sequences shape: X={X_train.shape}, y={y_train.shape}")
print(f"Testing sequences shape: X={X_test.shape}, y={y_test.shape}")

# Store original test values for later comparison (after inverse scaling)
# Need to align y_test_df with the sequences created
original_y_test = y_test_df.iloc[TIME_STEPS:].values

# Store test dates for plotting
test_dates = X_test_df.index[TIME_STEPS:]

Target Column: MF_NAV
Feature Columns: ['ETF_Price', 'ETF_Open', 'ETF_High', 'ETF_Low', 'ETF_Vol.', 'ETF_Change %', 'USD_Price', 'USD_Open', 'USD_High', 'USD_Low', 'USD_Change %', 'Gold_Open', 'Gold_High', 'Gold_Low', 'Gold_Close', 'Gold_Volume']
Number of Features: 16

Training set size: 391
Test set size: 98

Training sequences shape: X=(331, 60, 16), y=(331, 1)
Testing sequences shape: X=(38, 60, 16), y=(38, 1)


In [65]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# --- Build and Train LSTM Model ---

# 1. Define Model Architecture
model = Sequential()

# Input LSTM layer
model.add(LSTM(units=64, # Number of LSTM units/neurons
               return_sequences=True, # Return sequences for stacking LSTM layers
               input_shape=(X_train.shape[1], X_train.shape[2]))) # (timesteps, features)
model.add(Dropout(0.2)) # Dropout for regularization

# Second LSTM layer
model.add(LSTM(units=32,
               return_sequences=False)) # False as it's the last LSTM layer before Dense
model.add(Dropout(0.2))

# Output Layer
model.add(Dense(units=1)) # Predicting a single value (MF_NAV)

# 2. Compile Model
model.compile(optimizer='adam', loss='mean_squared_error') # MSE for regression

# Print model summary
model.summary()

# 3. Train Model
# Early stopping callback to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the model
# Use a portion of training data for validation during training
history = model.fit(
    X_train, y_train,
    epochs=100, # Max epochs; early stopping might halt sooner
    batch_size=32,
    validation_split=0.1, # Use last 10% of training data for validation
    callbacks=[early_stopping],
    shuffle=False, # Important for time series data
    verbose=1 # Show training progress
)

print("\nModel training finished.")

  super().__init__(**kwargs)


Epoch 1/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 83ms/step - loss: 0.0653 - val_loss: 0.0399
Epoch 2/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step - loss: 0.0493 - val_loss: 0.0216
Epoch 3/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step - loss: 0.0057 - val_loss: 0.0103
Epoch 4/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step - loss: 0.0095 - val_loss: 0.0088
Epoch 5/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - loss: 0.0102 - val_loss: 0.0076
Epoch 6/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - loss: 0.0048 - val_loss: 0.0091
Epoch 7/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - loss: 0.0040 - val_loss: 0.0108
Epoch 8/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 49ms/step - loss: 0.0035 - val_loss: 0.0079
Epoch 9/100
[1m10/10[0m [32m━━━━━━━━━