# LSTM Fine-Tuning for Stock Prediction (Dummy Data)

Notebook ini membangun baseline model LSTM dan skenario fine-tuning menggunakan data saham dummy yang realistis. Fokus: pipeline yang terstruktur, reproducible, dan evaluasi yang jelas.

In [None]:
import os, sys, warnings, json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

from sklearn.preprocessing import MinMaxScaler

# Local imports
sys.path.append(os.getcwd())
from src.data_generator import generate_stock_data, create_time_series_features
from src.utils import calculate_metrics, plot_stock_predictions, plot_training_history, save_model_summary

# Silence all Python warnings so the notebook output stays readable.
warnings.filterwarnings('ignore')
# Use the 'seaborn-v0_8' matplotlib style and a 12x6 default figure size for the plots below.
plt.style.use('seaborn-v0_8')
plt.rcParams['figure.figsize'] = (12, 6)

# Record which TensorFlow version produced this run.
print('TensorFlow:', tf.__version__)

In [None]:
# Config
# SEED is applied to NumPy and TensorFlow at the bottom of this cell; the two
# dataset seeds are passed to generate_stock_data for the source and target series.
SEED = 42
SOURCE_SEED = 101
TARGET_SEED = 202

# Window length passed to build_sequences_scaled, the batch size shared by every
# fit() call, and the epoch budget of each training phase
# (baseline, source pretraining, head-only fine-tune, full fine-tune).
SEQ_LEN = 60
BATCH_SIZE = 64
EPOCHS_BASELINE = 25
EPOCHS_PRETRAIN = 30
EPOCHS_FT_HEAD = 15
EPOCHS_FT_FULL = 10

# Learning rates: LR_BASE for the baseline and pretraining runs, LR_FT for the
# head-only fine-tune, LR_FT_FULL for the final fine-tune with all layers unfrozen.
LR_BASE = 1e-3
LR_FT = 1e-4
LR_FT_FULL = 5e-5

# Arguments forwarded to generate_stock_data (n_days, start_price, volatility)
# for the source and the target dataset.
N_DAYS_SOURCE = 1500
N_DAYS_TARGET = 1000
SOURCE_START_PRICE = 100.0
TARGET_START_PRICE = 60.0
SOURCE_VOL = 0.015
TARGET_VOL = 0.025

# Column the models are trained to predict.
TARGET_COL = 'Close'

# Output locations for the saved model, its text summary and the metrics table.
OUT_DIR = 'notebooks'
MODEL_PATH = os.path.join(OUT_DIR, 'best_lstm_finetune_model.keras')
SUMMARY_PATH = os.path.join(OUT_DIR, 'best_lstm_finetune_model.txt')
METRICS_PATH = os.path.join(OUT_DIR, 'lstm_finetune_metrics.csv')

# Seed both random number generators and make sure the output directory exists.
np.random.seed(SEED)
tf.random.set_seed(SEED)
os.makedirs(OUT_DIR, exist_ok=True)
print('Seeds set. Output dir:', OUT_DIR)

## Generate Dummy Data + Features

In [None]:
print('Generating source/target datasets...')
# One dummy series per domain, each with its own length, start price, volatility and seed.
src_df = generate_stock_data(n_days=N_DAYS_SOURCE, start_price=SOURCE_START_PRICE, volatility=SOURCE_VOL, seed=SOURCE_SEED)
tgt_df = generate_stock_data(n_days=N_DAYS_TARGET, start_price=TARGET_START_PRICE, volatility=TARGET_VOL, seed=TARGET_SEED)

# Pass both frames through the same feature helper from src.data_generator.
src_df = create_time_series_features(src_df)
tgt_df = create_time_series_features(tgt_df)

def clean_features(df):
    """Return a new frame holding only the numeric columns and complete rows.

    Args:
        df: Input DataFrame; it is not modified.

    Returns:
        A DataFrame restricted to the numeric-dtype columns of ``df`` with
        every row containing a missing value removed.
    """
    numeric_names = list(df.select_dtypes(include=[np.number]).columns)
    # Indexing with a list of column names already produces a new frame.
    numeric_only = df[numeric_names]
    return numeric_only.dropna()

# Keep only numeric columns and complete rows for both domains.
src_df_clean = clean_features(src_df)
tgt_df_clean = clean_features(tgt_df)

# Report how many rows remain and which columns will be used as model features.
print(f'Source rows: {len(src_df_clean)} | Target rows: {len(tgt_df_clean)}')
print('Feature columns:', list(src_df_clean.columns))

In [None]:
# Print the min/max of the Close column for each domain.
print(f"Source Close range: {src_df_clean['Close'].min():.2f} - {src_df_clean['Close'].max():.2f}")
print(f"Target Close range: {tgt_df_clean['Close'].min():.2f} - {tgt_df_clean['Close'].max():.2f}")
# Side-by-side line plots of the two Close series (x axis is the row position).
fig, ax = plt.subplots(1, 2, figsize=(14,4))
ax[0].plot(src_df_clean['Close'].values); ax[0].set_title('Source Close'); ax[0].grid(True)
ax[1].plot(tgt_df_clean['Close'].values); ax[1].set_title('Target Close'); ax[1].grid(True)
plt.tight_layout(); plt.show()

## Prepare Sequences (Train/Val/Test)

In [None]:
def build_sequences_scaled(df, target_col, seq_len, train_frac=0.7, val_frac=0.15):
    """Scale a feature frame and cut it into chronological train/val/test windows.

    The scaler is fitted only on the first ``int(len(df) * train_frac)`` rows
    and then applied to every row. Each sample is a window of ``seq_len``
    consecutive scaled rows; its label is the scaled ``target_col`` value of
    the row immediately after that window. Windows are split in order (no
    shuffling) into train, validation and test parts.

    Args:
        df: DataFrame whose values can all be cast to float32.
        target_col: Name of the column to predict; must be in ``df.columns``.
        seq_len: Number of rows in each input window.
        train_frac: Fraction of rows used to fit the scaler, and fraction of
            windows assigned to the training split.
        val_frac: Fraction of windows assigned to the validation split; the
            remaining windows form the test split.

    Returns:
        Tuple ``(X_train, y_train, X_val, y_val, X_test, y_test, scaler,
        target_idx, cols)``. Every ``X_*`` is a float32 array of shape
        ``(samples, seq_len, n_features)`` (also when ``samples`` is 0) and
        every ``y_*`` is a float32 array of shape ``(samples,)``.
        ``target_idx`` is the position of ``target_col`` in ``cols``, the
        list of column names.

    Raises:
        ValueError: If ``target_col`` is not one of the columns of ``df``.
    """
    arr = df.values.astype('float32')
    cols = df.columns.tolist()
    target_idx = cols.index(target_col)
    n = len(arr)

    # Fit on the leading rows only, then scale the full series with those statistics.
    train_end = int(n * train_frac)
    scaler = MinMaxScaler()
    scaler.fit(arr[:train_end])
    scaled = scaler.transform(arr)

    # Row j of window_idx is [j, j + 1, ..., j + seq_len - 1]; indexing with it
    # gathers all windows at once and keeps X three-dimensional even when the
    # series is too short to yield a single window.
    n_windows = max(n - seq_len, 0)
    window_idx = np.arange(n_windows)[:, None] + np.arange(seq_len)[None, :]
    X = scaled[window_idx].astype('float32', copy=False)
    # Label of window j is the target value of the row right after it.
    y = scaled[seq_len:, target_idx].astype('float32')

    N = len(X)
    t_train = int(N * train_frac)
    t_val = int(N * val_frac)
    val_end = t_train + t_val
    X_train, y_train = X[:t_train], y[:t_train]
    X_val, y_val = X[t_train:val_end], y[t_train:val_end]
    X_test, y_test = X[val_end:], y[val_end:]
    return (X_train, y_train, X_val, y_val, X_test, y_test, scaler, target_idx, cols)

def inverse_scale_target(scaler, y_scaled, target_idx, n_features):
    """Map scaled target values back to original units using a full-width scaler.

    The scaler expects ``n_features`` columns, so the target values are placed
    in column ``target_idx`` of an otherwise zero-filled matrix, the matrix is
    inverse-transformed, and only that column is returned.

    Args:
        scaler: Object exposing ``inverse_transform`` for ``n_features`` columns.
        y_scaled: Scaled target values; any array-like, flattened to one column.
        target_idx: Column position of the target inside the scaler's features.
        n_features: Number of columns the scaler was fitted on.

    Returns:
        1-D array with one inverse-scaled value per input value.
    """
    column = np.asarray(y_scaled).reshape(-1, 1)
    padded = np.zeros((column.shape[0], n_features), dtype='float32')
    padded[:, target_idx] = column[:, 0]
    restored = scaler.inverse_transform(padded)
    return restored[:, target_idx]

def direction_accuracy(y_true, y_pred):
    """Percentage of consecutive steps where prediction and truth move the same way.

    Both inputs are flattened, first differences are taken, and the signs of
    those differences (-1, 0 or +1) are compared element by element.

    Args:
        y_true: Actual values, array-like.
        y_pred: Predicted values, array-like of the same length.

    Returns:
        Share of matching step directions, expressed in percent (0-100).
    """
    actual_moves = np.sign(np.diff(np.asarray(y_true).reshape(-1)))
    predicted_moves = np.sign(np.diff(np.asarray(y_pred).reshape(-1)))
    same_direction = actual_moves == predicted_moves
    return np.mean(same_direction) * 100.0

# Build scaled train/val/test windows separately for each domain; each call
# fits its own scaler on that domain's leading rows.
src_splits = build_sequences_scaled(src_df_clean, TARGET_COL, SEQ_LEN)
tgt_splits = build_sequences_scaled(tgt_df_clean, TARGET_COL, SEQ_LEN)

# Unpack the 9-tuples: six arrays, the fitted scaler, the target column index and the column names.
(X_src_tr, y_src_tr, X_src_val, y_src_val, X_src_te, y_src_te, scaler_src, tgt_idx_src, cols_src) = src_splits
(X_tgt_tr, y_tgt_tr, X_tgt_val, y_tgt_val, X_tgt_te, y_tgt_te, scaler_tgt, tgt_idx_tgt, cols_tgt) = tgt_splits

# Model input dimensions are read off the target training array (axis 1 = window length, axis 2 = features).
n_features = X_tgt_tr.shape[2]; seq_len = X_tgt_tr.shape[1]
# Bare expression so the notebook cell displays both values.
n_features, seq_len

## Build LSTM Model

In [None]:
def build_lstm(n_features, seq_len, lstm_units=(128, 64), dense_units=64, dropout=0.2, lr=1e-3):
    """Build and compile the two-layer LSTM regressor used throughout the notebook.

    Layer stack, in order: ``lstm_1`` (returns sequences) -> ``bn_1`` ->
    ``lstm_2`` -> ``dense_1`` (ReLU) -> ``dropout_1`` -> ``output`` (one linear
    unit). The layer names are referenced later when layers are frozen.

    Args:
        n_features: Number of features per time step.
        seq_len: Number of time steps per input window.
        lstm_units: Unit counts of the first and second LSTM layer.
        dense_units: Width of the hidden dense layer.
        dropout: Rate used for LSTM input dropout, LSTM recurrent dropout and
            the standalone dropout layer.
        lr: Learning rate of the Adam optimizer.

    Returns:
        A compiled ``Sequential`` model with MSE loss and an MAE metric.
    """
    stack = [
        LSTM(
            lstm_units[0],
            return_sequences=True,
            dropout=dropout,
            recurrent_dropout=dropout,
            input_shape=(seq_len, n_features),
            name='lstm_1',
        ),
        BatchNormalization(name='bn_1'),
        LSTM(lstm_units[1], dropout=dropout, recurrent_dropout=dropout, name='lstm_2'),
        Dense(dense_units, activation='relu', name='dense_1'),
        Dropout(dropout, name='dropout_1'),
        Dense(1, activation='linear', name='output'),
    ]
    network = Sequential(stack)
    optimizer = Adam(learning_rate=lr)
    network.compile(optimizer=optimizer, loss='mse', metrics=['mae'])
    return network

def compile_callbacks():
    """Create a fresh list of training callbacks for one ``fit`` call.

    Returns:
        A two-element list: an ``EarlyStopping`` callback watching ``val_loss``
        (patience 10, restoring the best weights) followed by a
        ``ReduceLROnPlateau`` callback that halves the learning rate after
        5 epochs without ``val_loss`` improvement, down to a floor of 1e-6.
    """
    early_stop = EarlyStopping(
        monitor='val_loss',
        patience=10,
        restore_best_weights=True,
        verbose=1,
    )
    lr_schedule = ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=5,
        min_lr=1e-6,
        verbose=1,
    )
    return [early_stop, lr_schedule]

print('Model builder ready')

## Baseline: Train-from-scratch on Target

In [None]:
# Baseline: a newly built network trained only on the target training split.
model_base = build_lstm(n_features, seq_len, lr=LR_BASE)
history_base = model_base.fit(
    X_tgt_tr, y_tgt_tr,
    validation_data=(X_tgt_val, y_tgt_val),
    epochs=EPOCHS_BASELINE,
    batch_size=BATCH_SIZE,
    callbacks=compile_callbacks(),
    verbose=1
)
plot_training_history(history_base)

# Predict on the held-out target test windows, then map predictions and labels
# from scaled space back to the units of the target column.
y_pred_base_scaled = model_base.predict(X_tgt_te, verbose=0).reshape(-1)
y_test_scaled = y_tgt_te.reshape(-1)
y_pred_base = inverse_scale_target(scaler_tgt, y_pred_base_scaled, tgt_idx_tgt, n_features)
# y_test is reused by the fine-tuning evaluation further down.
y_test = inverse_scale_target(scaler_tgt, y_test_scaled, tgt_idx_tgt, n_features)

# Metrics from the project helper, extended with the direction accuracy computed in this notebook.
metrics_base = calculate_metrics(y_test, y_pred_base)
metrics_base['Direction_Accuracy'] = direction_accuracy(y_test, y_pred_base)
print('Baseline metrics:', metrics_base)
plot_stock_predictions(y_test, y_pred_base, title='Baseline: Actual vs Predicted (Target)')

## Pretrain on Source + Fine-Tune on Target

In [None]:
# Pretraining: same architecture as the baseline, trained on the source-domain
# windows (scaled with the source scaler) before any target data is seen.
model_ft = build_lstm(n_features, seq_len, lr=LR_BASE)
history_pre = model_ft.fit(
    X_src_tr, y_src_tr,
    validation_data=(X_src_val, y_src_val),
    epochs=EPOCHS_PRETRAIN,
    batch_size=BATCH_SIZE,
    callbacks=compile_callbacks(),
    verbose=1
)
plot_training_history(history_pre)

# Fine-tune phase 1: freeze the first LSTM layer and its batch norm
# ('lstm_1', 'bn_1'); every other layer stays trainable.
for layer in model_ft.layers:
    if layer.name in ('lstm_1', 'bn_1'):
        layer.trainable = False
    else:
        layer.trainable = True
# Re-compile with a new Adam optimizer at the smaller LR_FT learning rate.
model_ft.compile(optimizer=Adam(learning_rate=LR_FT), loss='mse', metrics=['mae'])

# Train the unfrozen layers on the target training split.
history_ft_head = model_ft.fit(
    X_tgt_tr, y_tgt_tr,
    validation_data=(X_tgt_val, y_tgt_val),
    epochs=EPOCHS_FT_HEAD,
    batch_size=BATCH_SIZE,
    callbacks=compile_callbacks(),
    verbose=1
)
plot_training_history(history_ft_head)

# Fine-tune phase 2: make every layer trainable again and continue on the
# target training split with a new Adam optimizer at the still smaller LR_FT_FULL.
for layer in model_ft.layers:
    layer.trainable = True
model_ft.compile(optimizer=Adam(learning_rate=LR_FT_FULL), loss='mse', metrics=['mae'])
history_ft_full = model_ft.fit(
    X_tgt_tr, y_tgt_tr,
    validation_data=(X_tgt_val, y_tgt_val),
    epochs=EPOCHS_FT_FULL,
    batch_size=BATCH_SIZE,
    callbacks=compile_callbacks(),
    verbose=1
)
plot_training_history(history_ft_full)

# Evaluate the fine-tuned model on the same target test windows as the baseline.
# y_test (unscaled test labels) was computed in the baseline cell, so that cell must run first.
y_pred_ft_scaled = model_ft.predict(X_tgt_te, verbose=0).reshape(-1)
y_pred_ft = inverse_scale_target(scaler_tgt, y_pred_ft_scaled, tgt_idx_tgt, n_features)
metrics_ft = calculate_metrics(y_test, y_pred_ft)
metrics_ft['Direction_Accuracy'] = direction_accuracy(y_test, y_pred_ft)
print('Fine-tune metrics:', metrics_ft)
plot_stock_predictions(y_test, y_pred_ft, title='Fine-tuned: Actual vs Predicted (Target)')

# Save artifacts: the fine-tuned model and its summary go to the paths set in the config cell.
model_ft.save(MODEL_PATH)
save_model_summary(model_ft, SUMMARY_PATH)
print('Saved model to:', MODEL_PATH)
print('Saved summary to:', SUMMARY_PATH)

# Compare metrics: one row per model, written to CSV and displayed as the cell output.
df_metrics = pd.DataFrame([
    {'model': 'baseline', **metrics_base},
    {'model': 'fine_tune', **metrics_ft},
])
df_metrics.to_csv(METRICS_PATH, index=False)
df_metrics

## Short Forecast (Next 5 Days, Target Domain)

In [None]:
def forecast_n_steps(model, last_seq, n_steps, target_idx):
    """Roll the model forward ``n_steps`` times, feeding each prediction back in.

    After every prediction the window is shifted by one row. The appended row
    is a copy of the previous last row with only the target column replaced by
    the prediction, so all other features are carried forward unchanged.

    Args:
        model: Object whose ``predict`` accepts a batch of one window and
            returns an array indexable as ``[0, 0]``.
        last_seq: 2-D array (time steps x features) used as the starting
            window; it is copied, not modified.
        n_steps: Number of future steps to predict.
        target_idx: Column of the window that holds the target value.

    Returns:
        1-D array with the ``n_steps`` predictions, in the model's (scaled) space.
    """
    window = last_seq.copy()
    forecasts = []
    for _ in range(n_steps):
        batch = np.expand_dims(window, axis=0)
        next_value = model.predict(batch, verbose=0)[0, 0]
        forecasts.append(next_value)
        # Synthesize the next row from the latest one, overriding only the target.
        appended = window[-1].copy()
        appended[target_idx] = next_value
        window = np.concatenate([window[1:], appended[np.newaxis, :]], axis=0)
    return np.array(forecasts)

# Start from the last target test window, which is still in scaled space.
last_seq = X_tgt_te[-1]  # scaled space
# Roll the fine-tuned model forward 5 steps, then convert the predictions back to target units.
preds_scaled = forecast_n_steps(model_ft, last_seq, n_steps=5, target_idx=tgt_idx_tgt)
preds_unscaled = inverse_scale_target(scaler_tgt, preds_scaled, tgt_idx_tgt, n_features)
print('Next 5-day forecast (target units):', np.round(preds_unscaled, 2))