In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import optuna

# ✅ Pick the compute device: CUDA when PyTorch reports one is available, otherwise CPU.
# Every model and tensor created below is moved to this device.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

In [None]:
""" MODEL STUFF """
# Define LSTM Model
class LSTMModel(nn.Module):
    """Stacked LSTM followed by a linear layer that emits one value per sequence.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied between stacked LSTM layers.
            Forced to 0 when ``num_layers`` is 1, because ``nn.LSTM`` only
            applies dropout between layers and warns about a non-zero value
            on a single-layer network.
    """

    def __init__(self, input_size, hidden_size=128, num_layers=2, dropout=0.2):
        super().__init__()
        # Dropout in nn.LSTM acts on the outputs of every layer except the last,
        # so it is meaningless (and triggers a UserWarning) with a single layer.
        inter_layer_dropout = dropout if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(
            input_size,
            hidden_size,
            num_layers,
            batch_first=True,
            dropout=inter_layer_dropout,
        )
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Map a batch-first input of shape (batch, seq, features) to (batch, 1)."""
        lstm_out, _ = self.lstm(x)
        # Only the output at the last time step feeds the regression head.
        return self.fc(lstm_out[:, -1, :])

# Helper: Create sequences for LSTM
def create_sequences(data, dates, seq_len=30):
    """Slice a 2-D array into overlapping windows with next-step targets.

    For every start position ``s`` in ``range(len(data) - seq_len)``:
      * the window is ``data[s:s + seq_len]`` (all columns),
      * the target is column 0 of the row right after the window,
      * the target date is ``dates[s + seq_len]``.

    Args:
        data: 2-D NumPy array indexed as ``data[row, column]``.
        dates: Sequence indexable by integer position, aligned with ``data`` rows.
        seq_len: Number of rows in each window.

    Returns:
        Tuple ``(windows, targets, target_dates)`` of NumPy arrays.
    """
    starts = range(len(data) - seq_len)
    windows = [data[s:s + seq_len] for s in starts]
    targets = [data[s + seq_len, 0] for s in starts]
    target_dates = [dates[s + seq_len] for s in starts]
    return np.array(windows), np.array(targets), np.array(target_dates)

# Helper: Inverse scale predictions for VN-INDEX (first column)
def inverse_scale_predictions(predictions, scaler):
    """Undo the scaling of first-column predictions.

    The scaler was fitted on several columns, so the predictions are placed in
    column 0 of a zero-filled matrix of the right width, inverse-transformed,
    and column 0 of the result is returned.

    Args:
        predictions: Array whose first dimension is the number of predictions.
        scaler: Fitted scaler exposing ``min_`` and ``inverse_transform``.

    Returns:
        1-D array of predictions in the original scale of the first column.
    """
    n_rows = predictions.shape[0]
    n_cols = scaler.min_.shape[0]
    padded = np.zeros((n_rows, n_cols))
    padded[:, 0] = predictions.flatten()
    restored = scaler.inverse_transform(padded)
    return restored[:, 0]

# Training and evaluation function (used for hyperparameter tuning)
def train_evaluate_model(params, X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor, epochs=20):
    """Train a fresh LSTMModel with ``params`` and return its mean test-batch MSE.

    Args:
        params: Dict with keys 'hidden_size', 'num_layers', 'dropout',
            'learning_rate' and 'batch_size'.
        X_train_tensor, y_train_tensor: Training inputs and targets.
        X_test_tensor, y_test_tensor: Evaluation inputs and targets.
        epochs: Number of passes over the training loader.

    Returns:
        Sum of per-batch MSE losses on the test loader divided by the number
        of test batches.
    """
    n_features = X_train_tensor.shape[2]
    net = LSTMModel(
        input_size=n_features,
        hidden_size=params['hidden_size'],
        num_layers=params['num_layers'],
        dropout=params['dropout'],
    ).to(device)
    adam = optim.Adam(net.parameters(), lr=params['learning_rate'])
    mse = nn.MSELoss()

    batch_size = params['batch_size']
    train_batches = DataLoader(
        TensorDataset(X_train_tensor, y_train_tensor), batch_size=batch_size, shuffle=True
    )
    eval_batches = DataLoader(
        TensorDataset(X_test_tensor, y_test_tensor), batch_size=batch_size, shuffle=False
    )

    # Optimisation: plain mini-batch gradient descent with Adam.
    for _ in range(epochs):
        net.train()
        for features, targets in train_batches:
            adam.zero_grad()
            batch_loss = mse(net(features), targets)
            batch_loss.backward()
            adam.step()

    # Evaluation: average the per-batch losses without tracking gradients.
    net.eval()
    with torch.no_grad():
        total_loss = sum(mse(net(features), targets).item() for features, targets in eval_batches)
    return total_loss / len(eval_batches)

# Optuna objective function
def objective(trial, X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor):
    """Optuna objective: sample one hyperparameter set and score it.

    The search space is three hidden sizes, 1-3 layers, dropout in [0.1, 0.5],
    a log-uniform learning rate in [1e-4, 1e-2], and three batch sizes.

    Returns:
        The value returned by ``train_evaluate_model`` for the sampled set.
    """
    hidden_size = trial.suggest_categorical('hidden_size', [64, 128, 256])
    num_layers = trial.suggest_int('num_layers', 1, 3)
    dropout = trial.suggest_float('dropout', 0.1, 0.5)
    learning_rate = trial.suggest_float('learning_rate', 1e-4, 1e-2, log=True)
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128])

    sampled = dict(
        hidden_size=hidden_size,
        num_layers=num_layers,
        dropout=dropout,
        learning_rate=learning_rate,
        batch_size=batch_size,
    )
    return train_evaluate_model(sampled, X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor)

# Main function: training, tuning, evaluation, and future prediction
def lstm_model_pipeline(data, seq_len=30, tuning=False, best_params={'hidden_size': 128, 'num_layers': 2,'dropout': 0.3, 
    'learning_rate': 1e-4, 'batch_size': 32}, epochs=50):
    """Scale the data, train an LSTM on the first 90%, and evaluate on the rest.

    Steps: fit a MinMaxScaler on the first 90% of rows, build sliding windows,
    optionally tune hyperparameters with Optuna, train the final model, plot
    the loss curves, and report RMSE / MAE / R² plus prediction plots on the
    held-out rows.

    Args:
        data: pandas Series or DataFrame indexed by date. Column 0 is the
            prediction target.
        seq_len: Number of time steps in each input window.
        tuning: When True, run a 10-trial Optuna study and use its best
            hyperparameters instead of ``best_params``.
        best_params: Dict with keys 'hidden_size', 'num_layers', 'dropout',
            'learning_rate' and 'batch_size'. The default dict is only read,
            never mutated.
        epochs: Number of training epochs for the final model.

    Returns:
        Tuple ``(final_model, X_test_tensor, scaler, y_pred)`` where ``y_pred``
        holds the test-set predictions in the original scale of column 0.

    Raises:
        ValueError: If the first 90% of rows cannot form one training window.
    """
    if isinstance(data, pd.Series):
        data = data.to_frame()

    # Normalize data: fit scaler on first 90% of data, then transform all
    train_size = int(0.9 * len(data))

    # Window i has its target at data row i + seq_len, so the windows whose
    # targets fall inside the first `train_size` rows are the first
    # `train_size - seq_len` windows. Splitting at `train_size` would push
    # `seq_len` rows the scaler never saw into the training set.
    split_idx = train_size - seq_len
    if split_idx <= 0:
        raise ValueError(
            f"Not enough rows ({len(data)}) for seq_len={seq_len}: the first 90% of the data "
            "must contain more than seq_len rows to form at least one training sequence."
        )

    scaler = MinMaxScaler()
    scaler.fit(data[:train_size])
    data_scaled = scaler.transform(data)

    # Create sequences and split into train/test based on original data indices
    X, y, y_dates = create_sequences(data_scaled, data.index, seq_len)
    X_train, X_test = X[:split_idx], X[split_idx:]
    y_train, y_test = y[:split_idx], y[split_idx:]
    y_dates_test = y_dates[split_idx:]

    # Convert to PyTorch tensors
    X_train_tensor = torch.tensor(X_train, dtype=torch.float32).to(device)
    y_train_tensor = torch.tensor(y_train, dtype=torch.float32).reshape(-1, 1).to(device)
    X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)
    y_test_tensor = torch.tensor(y_test, dtype=torch.float32).reshape(-1, 1).to(device)

    if tuning:
        # Hyperparameter tuning with Optuna (fewer trials for speed)
        # NOTE(review): trials are scored on the test tensors, so the final
        # test metrics are optimistic when tuning=True.
        study = optuna.create_study(direction='minimize')
        study.optimize(lambda trial: objective(trial, X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor),
                    n_trials=10)
        best_params = study.best_params
        print("Best Hyperparameters:", best_params)

    # Train final model using best hyperparameters
    final_model = LSTMModel(
        input_size=X_train_tensor.shape[2],
        hidden_size=best_params['hidden_size'],
        num_layers=best_params['num_layers'],
        dropout=best_params['dropout']
    ).to(device)
    optimizer = optim.Adam(final_model.parameters(), lr=best_params['learning_rate'])
    criterion = nn.MSELoss()
    batch_size = best_params['batch_size']
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    train_losses, val_losses = [], []
    for epoch in range(epochs):
        final_model.train()
        epoch_loss = 0
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            loss = criterion(final_model(batch_X), batch_y)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        train_losses.append(epoch_loss / len(train_loader))

        # Per-epoch loss on the held-out windows (no gradient tracking).
        final_model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch_X, batch_y in test_loader:
                val_loss += criterion(final_model(batch_X), batch_y).item()
        val_losses.append(val_loss / len(test_loader))
        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_losses[-1]:.4f}, Val Loss: {val_losses[-1]:.4f}")

    # Plot losses
    plt.figure(figsize=(10, 5))
    plt.plot(train_losses, label="Train Loss", marker='o')
    plt.plot(val_losses, label="Val Loss", marker='s')
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.legend()
    plt.title("Training and Validation Loss")
    plt.grid(True)
    plt.show()

    # Evaluate final model
    final_model.eval()
    with torch.no_grad():
        y_pred_tensor = final_model(X_test_tensor).cpu().numpy()
        y_true_tensor = y_test_tensor.cpu().numpy()

    # Metrics are computed in the original units of column 0.
    y_pred = inverse_scale_predictions(y_pred_tensor, scaler)
    y_true = inverse_scale_predictions(y_true_tensor, scaler)

    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    print(f"RMSE: {rmse:.4f}, MAE: {mae:.4f}, R²: {r2:.4f}")

    plt.figure(figsize=(12,6))
    plt.plot(y_dates_test, y_true, label="Actual VN-INDEX", marker='o', color="blue")
    plt.plot(y_dates_test, y_pred, label="Predicted VN-INDEX", marker='s', linestyle="dashed", color="red")
    plt.xlabel("Date")
    plt.ylabel("VN-INDEX")
    plt.xticks(rotation=45)
    plt.legend()
    plt.grid(True, linestyle="--", alpha=0.7)
    plt.title("LSTM Predictions vs. Actual VN-INDEX")
    plt.show()

    # Calculate percentage change between successive values for both actual and predicted series
    actual_pct_change = np.diff(y_true.flatten()) / y_true.flatten()[:-1] * 100
    pred_pct_change = np.diff(y_pred.flatten()) / y_pred.flatten()[:-1] * 100

    plt.figure(figsize=(12,6))
    plt.plot(actual_pct_change, label="Actual % Change", marker='s', color="blue")
    plt.plot(pred_pct_change, label="Predicted % Change", marker='s', color="red")
    plt.xlabel("Time Step")
    plt.ylabel("Percentage Change (%)")
    plt.title("Percentage Change Comparison: Actual vs. Predicted")
    plt.legend()
    plt.grid(True, linestyle="--", alpha=0.7)
    plt.show()

    # Print the table of test-set results (the full frame is passed to print)
    results_df = pd.DataFrame({
        "Date": y_dates_test,
        "Actual VN-INDEX": y_true,
        "Predicted VN-INDEX": y_pred
    })
    print("Predicted vs. Actual VN-INDEX (Test Set):")
    print(results_df)

    return final_model, X_test_tensor, scaler, y_pred

# Future Prediction Function
def future_prediction(X_test, y_pred, data, scaler, model, num_days=30):
    """Roll the model forward ``num_days`` steps and plot / print the forecast.

    Starting from the last window in ``X_test``, each prediction is fed back
    as the newest time step of the next input window. Predictions are
    converted back to the original scale of column 0, dated on consecutive
    weekdays after the last index entry of ``data``, plotted alone and
    overlaid on recent history, and printed as a table.

    Args:
        X_test: Tensor of test windows; the last window seeds the forecast.
        y_pred: Test-set predictions in the original scale. Plotted against
            the trailing rows of ``data``.
        data: pandas Series or DataFrame indexed by date, target in column 0.
        scaler: Fitted scaler passed to ``inverse_scale_predictions``.
        model: Trained model; called on (1, seq_len, features) tensors.
        num_days: Number of future steps to predict.

    Raises:
        ValueError: If ``X_test`` contains no windows.
    """
    if isinstance(data, pd.Series):
        data = data.to_frame()

    if len(X_test) == 0:
        raise ValueError("X_test is empty: at least one test window is needed to seed the forecast.")

    model.eval()
    input_seq = X_test[-1].cpu().numpy()
    future_preds = []
    for _ in range(num_days):
        input_tensor = torch.tensor(input_seq, dtype=torch.float32).unsqueeze(0).to(device)
        with torch.no_grad():
            pred = model(input_tensor).cpu().numpy()[0, 0]
        future_preds.append(pred)
        # Slide the window: drop the oldest step and append a new one.
        # Future values of the non-target features are unknown, so the latest
        # known row is carried forward and only the target (column 0) is
        # replaced. np.roll would instead wrap the *oldest* row into the
        # newest slot, leaving stale features from seq_len steps ago.
        next_row = input_seq[-1].copy()
        next_row[0] = pred  # update the VN-INDEX feature
        input_seq = np.vstack([input_seq[1:], next_row[np.newaxis, :]])
    future_preds = inverse_scale_predictions(np.array(future_preds).reshape(-1,1), scaler)

    # Date the forecast on weekdays only (Mon-Fri); holidays are not excluded.
    last_date = data.index[-1]
    future_dates = []
    while len(future_dates) < num_days:
        last_date += pd.Timedelta(days=1)
        if last_date.weekday() < 5:
            future_dates.append(last_date)

    plt.figure(figsize=(12,6))
    plt.plot(future_dates, future_preds, marker='o', linestyle="dashed", color="red", label="Predicted VN-INDEX")
    plt.xlabel("Date")
    plt.ylabel("VN-INDEX")
    plt.xticks(rotation=45)
    plt.legend()
    plt.grid(True, linestyle="--", alpha=0.7)
    plt.title(f"Predicted VN-INDEX for Next {num_days} Trading Days")
    plt.show()

    # Overlay historical data and future predictions
    n_hist = min(100, len(data))
    historical_dates = data.index[-n_hist:]
    historical_values = data.iloc[-n_hist:, 0].values  # assuming VN-INDEX is in the first column
    # The x and y series must have equal length, so plot only as many test
    # predictions as are available (at most the history window).
    n_pred = min(n_hist, len(y_pred))
    plt.figure(figsize=(12,6))
    plt.plot(historical_dates, historical_values, label="Historical VN-INDEX", color="blue")
    if n_pred > 0:
        plt.plot(historical_dates[-n_pred:], y_pred[-n_pred:], label="Test Predictions", color="red")
    plt.plot(future_dates, future_preds, color="green", label="Future Predicted VN-INDEX")
    plt.xlabel("Date")
    plt.ylabel("VN-INDEX")
    plt.xticks(rotation=45)
    plt.legend()
    plt.grid(True, linestyle="--", alpha=0.7)
    plt.title("Historical VN-INDEX with Future Predictions")
    plt.show()

    future_df = pd.DataFrame({"Date": future_dates, "Predicted VN-INDEX": future_preds})
    print(future_df)

# Example Usage:
# Assuming `data` is a pandas Series or DataFrame indexed by date, with VN-INDEX as the first column.
# final_model, X_test_tensor, scaler, y_pred = lstm_model_pipeline(data)
# future_prediction(X_test_tensor, y_pred, data, scaler, final_model, num_days=30)

In [None]:
""" DATA FEATURES """
def compute_RSI(series, window=14):
    """Relative Strength Index from simple rolling means of gains and losses.

    Args:
        series: pandas Series of values.
        window: Rolling window length for the average gain and average loss.

    Returns:
        Series ``100 - 100 / (1 + avg_gain / avg_loss)``; the first
        ``window - 1`` entries are NaN because the rolling mean is undefined.
    """
    change = series.diff(1)
    # Non-positive moves (and the leading NaN) count as a gain of 0; likewise
    # non-negative moves count as a loss of 0.
    ups = change.where(change > 0, 0)
    downs = -change.where(change < 0, 0)
    avg_gain = ups.rolling(window).mean()
    avg_loss = downs.rolling(window).mean()
    relative_strength = avg_gain / avg_loss
    return 100 - (100 / (1 + relative_strength))

def lag_features_indicators(df, numerical_columns):
    """Return a copy of ``df`` extended with lag and technical-indicator columns.

    For every column in ``numerical_columns`` the following are appended, in
    this order across four passes:
      1. lags of 1, 2, 3, 5 and 10 rows (``<col>_Lag<n>``),
      2. 10- and 20-row simple and exponential moving averages,
      3. a 14-row RSI (``<col>_RSI_14``),
      4. 12- and 26-span EMAs and their difference (``<col>_MACD``).
    Rows containing any NaN are dropped afterwards.

    Args:
        df: Source DataFrame; it is not modified.
        numerical_columns: Iterable of column names to derive features from.

    Returns:
        New DataFrame with the added columns and NaN rows removed.
    """
    lag_days = (1, 2, 3, 5, 10)
    moving_average_windows = (10, 20)

    enriched = df.copy()

    # 📌 Pass 1: lagged copies of every requested column
    for col in numerical_columns:
        for lag in lag_days:
            enriched[f'{col}_Lag{lag}'] = enriched[col].shift(lag)

    # 📌 Pass 2: simple, then exponential, moving averages
    for col in numerical_columns:
        for window in moving_average_windows:
            enriched[f'{col}_SMA_{window}'] = enriched[col].rolling(window=window).mean()
        for span in moving_average_windows:
            enriched[f'{col}_EMA_{span}'] = enriched[col].ewm(span=span, adjust=False).mean()

    # 📌 Pass 3: Relative Strength Index
    for col in numerical_columns:
        enriched[f'{col}_RSI_14'] = compute_RSI(enriched[col])

    # 📌 Pass 4: MACD as the difference between the fast and slow EMAs
    for col in numerical_columns:
        fast_ema = enriched[col].ewm(span=12, adjust=False).mean()
        slow_ema = enriched[col].ewm(span=26, adjust=False).mean()
        enriched[f'{col}_EMA_12'] = fast_ema
        enriched[f'{col}_EMA_26'] = slow_ema
        enriched[f'{col}_MACD'] = enriched[f'{col}_EMA_12'] - enriched[f'{col}_EMA_26']

    # 📌 Shifting and rolling leave NaNs at the start; discard those rows
    return enriched.dropna()

def quicky_data(df):
    """Return a date-indexed copy of ``df`` without the 'Index' column.

    The 'Date' column is parsed with ``pd.to_datetime`` and becomes the index.
    A column named 'Index' is dropped when present. The input frame is left
    untouched, so the cell that calls this can be re-run safely on the same
    object.

    Args:
        df: DataFrame containing a 'Date' column.

    Returns:
        New DataFrame indexed by the parsed dates.
    """
    # 🕒 Convert 'Date' to datetime and set as index (on a new frame, not in place)
    indexed = df.assign(Date=pd.to_datetime(df['Date'])).set_index('Date')

    # 📌 Drop unnecessary columns; errors='ignore' makes a missing 'Index' a no-op
    return indexed.drop(columns=['Index'], errors='ignore')

In [None]:
# 📂 Load the cleaned HOSE historical CSV, then let quicky_data parse 'Date',
# make it the index, and drop the 'Index' column if the file has one.
file_path_1 = "../ready_data/cleaned_hose_historical_data.csv"
df_1 = pd.read_csv(file_path_1)
df_1 = quicky_data(df_1)

In [None]:
# 📊 Univariate experiment: use only the VN_Index_Close series
# (the pipeline converts a Series into a one-column DataFrame).
data = df_1["VN_Index_Close"]

# 🚀 Train with the default hyperparameters (tuning=False); returns the model,
# the test windows, the fitted scaler and the test-set predictions.
model, X_test_tensor, scaler, y_pred = lstm_model_pipeline(data)

# 🔮 Forecast 30 steps ahead from the last test window and plot the result
future_prediction(X_test_tensor, y_pred, data, scaler, model, num_days=30)

In [None]:
# 📊 Multivariate experiment: feed every column of df_1 to the model.
# NOTE(review): the pipeline predicts column 0 — confirm VN_Index_Close is the
# first column of df_1.
data = df_1

# 🚀 Train with the default hyperparameters and evaluate on the held-out rows
model, X_test_tensor, scaler, y_pred = lstm_model_pipeline(data)

# 🔮 Forecast 30 steps ahead from the last test window and plot the result
future_prediction(X_test_tensor, y_pred, data, scaler, model, num_days=30)

In [None]:
# 📊 df_1 plus lags, SMA/EMA, RSI and MACD derived from VN_Index_Close only
# (rows with NaNs from shifting/rolling are dropped by the helper).
data = lag_features_indicators(df_1, ['VN_Index_Close'])

# 🚀 Train with the default hyperparameters and evaluate on the held-out rows
model, X_test_tensor, scaler, y_pred = lstm_model_pipeline(data)

# 🔮 Forecast 30 steps ahead from the last test window and plot the result
future_prediction(X_test_tensor, y_pred, data, scaler, model, num_days=30)

In [None]:
# 📊 df_1 plus lags, SMA/EMA, RSI and MACD derived from every column of df_1
data = lag_features_indicators(df_1, df_1.columns)

# 🚀 Train with the default hyperparameters and evaluate on the held-out rows
model, X_test_tensor, scaler, y_pred = lstm_model_pipeline(data)

# 🔮 Forecast 30 steps ahead from the last test window and plot the result
future_prediction(X_test_tensor, y_pred, data, scaler, model, num_days=30)

In [None]:
# 📂 Load the VN-INDEX external-data CSV, then let quicky_data parse 'Date',
# make it the index, and drop the 'Index' column if the file has one.
file_path_2 = "../ready_data/vn_index_external_data.csv"
df_2 = pd.read_csv(file_path_2)
df_2 = quicky_data(df_2)

In [None]:
# 📊 Multivariate experiment on the external-data frame, all columns as-is.
# NOTE(review): the pipeline predicts column 0 — confirm the target is the
# first column of df_2.
data = df_2

# 🚀 Train the model and get the test set
model, X_test_tensor, scaler, y_pred = lstm_model_pipeline(data)

# 🔮 Generate future predictions
future_prediction(X_test_tensor, y_pred, data, scaler, model, num_days=30)

In [None]:
# 📊 df_2 plus lags, SMA/EMA, RSI and MACD derived from VN_Index_Close only
data = lag_features_indicators(df_2, ['VN_Index_Close'])

# 🚀 Train the model and get the test set
model, X_test_tensor, scaler, y_pred = lstm_model_pipeline(data)

# 🔮 Generate future predictions
future_prediction(X_test_tensor, y_pred, data, scaler, model, num_days=30)

In [None]:
# 📊 df_2 plus lags, SMA/EMA, RSI and MACD derived from every column of df_2
data = lag_features_indicators(df_2, df_2.columns)

# 🚀 Train the model and get the test set
model, X_test_tensor, scaler, y_pred = lstm_model_pipeline(data)

# 🔮 Generate future predictions
future_prediction(X_test_tensor, y_pred, data, scaler, model, num_days=30)

In [None]:
# 📂 Load the merged CSV, then let quicky_data parse 'Date', make it the
# index, and drop the 'Index' column if the file has one.
file_path_3 = "../ready_data/merged_data.csv"
df_3 = pd.read_csv(file_path_3)
df_3 = quicky_data(df_3)

In [None]:
# 📊 Multivariate experiment on the merged frame, all columns as-is.
# NOTE(review): the pipeline predicts column 0 — confirm the target is the
# first column of df_3.
data = df_3

# 🚀 Train the model and get the test set
model, X_test_tensor, scaler, y_pred = lstm_model_pipeline(data)

# 🔮 Generate future predictions
future_prediction(X_test_tensor, y_pred, data, scaler, model, num_days=30)

In [None]:
# 📊 df_3 plus lags, SMA/EMA, RSI and MACD derived from VN_Index_Close only
data = lag_features_indicators(df_3, ['VN_Index_Close'])

# 🚀 Train the model and get the test set
model, X_test_tensor, scaler, y_pred = lstm_model_pipeline(data)

# 🔮 Generate future predictions
future_prediction(X_test_tensor, y_pred, data, scaler, model, num_days=30)

In [None]:
# 📊 df_3 plus lags, SMA/EMA, RSI and MACD derived from every column of df_3
data = lag_features_indicators(df_3, df_3.columns)

# 🚀 Train the model and get the test set
model, X_test_tensor, scaler, y_pred = lstm_model_pipeline(data)

# 🔮 Generate future predictions
future_prediction(X_test_tensor, y_pred, data, scaler, model, num_days=30)

In [None]:
# 📊 df_3 plus indicators derived only from the columns named in df_1.
# NOTE(review): this requires every df_1 column name to exist in df_3 — confirm.
data = lag_features_indicators(df_3, df_1.columns)

# 🚀 Train the model and get the test set
model, X_test_tensor, scaler, y_pred = lstm_model_pipeline(data)

# 🔮 Generate future predictions
future_prediction(X_test_tensor, y_pred, data, scaler, model, num_days=30)

In [None]:
# 📊 df_3 plus indicators derived only from the columns named in df_2.
# NOTE(review): this requires every df_2 column name to exist in df_3 — confirm.
data = lag_features_indicators(df_3, df_2.columns)

# 🚀 Train the model and get the test set
model, X_test_tensor, scaler, y_pred = lstm_model_pipeline(data)

# 🔮 Generate future predictions
future_prediction(X_test_tensor, y_pred, data, scaler, model, num_days=30)

In [None]:
# Display the most recently assigned `data` frame for inspection
data