## 1. Load Necessary Libraries

In [29]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import seaborn as sns
import os
import torch
import datetime as dt
from sklearn.preprocessing import StandardScaler, MinMaxScaler

## 2. Set seed for reproducibility

In [30]:
# -----------------------------
# Reproducibility
# -----------------------------
# One seed shared by every random number generator used in this notebook.
RANDOM_SEED = 42

# Plot appearance only; does not touch any random state.
sns.set_style("darkgrid")

# Seed the CPU-side generators of PyTorch and NumPy.
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Seed every visible GPU as well when CUDA can be used.
if torch.cuda.is_available():
    print("CUDA is available. Setting seed for all GPUs.")
    torch.cuda.manual_seed_all(RANDOM_SEED)


CUDA is available. Setting seed for all GPUs.


## 3. Set paths

In [31]:
# -----------------------------
# Project Root Resolution
# -----------------------------
# The project root is taken to be the parent of the current working directory.
# NOTE(review): this assumes the notebook is launched from a direct
# subdirectory of the project (e.g. notebooks/) — confirm.
PROJECT_ROOT = Path.cwd().resolve().parents[0]

DATA_RAW_DIR = PROJECT_ROOT / "data" / "raw"
DATA_INTERIM_DIR = PROJECT_ROOT / "data" / "interim"
DATA_PROCESSED_DIR = PROJECT_ROOT / "data" / "processed"
FIGURES_DIR = PROJECT_ROOT / "reports" / "figures"

# Create every directory this notebook writes to. The "splits" folder under
# data/processed receives the .npy arrays saved later; without it np.save
# raises FileNotFoundError on a fresh checkout. parents=True also creates
# data/processed itself.
for _output_dir in (DATA_INTERIM_DIR, DATA_PROCESSED_DIR / "splits", FIGURES_DIR):
    _output_dir.mkdir(parents=True, exist_ok=True)

DATA_RAW_DIR, DATA_INTERIM_DIR, FIGURES_DIR


(WindowsPath('C:/Users/Kinjal Mitra/Documents/stock-price-prediction-ff/data/raw'),
 WindowsPath('C:/Users/Kinjal Mitra/Documents/stock-price-prediction-ff/data/interim'),
 WindowsPath('C:/Users/Kinjal Mitra/Documents/stock-price-prediction-ff/reports/figures'))

## 4. Load Data from data/interim/

In [32]:
# Location of the feature table inside data/interim/.
path = DATA_INTERIM_DIR.joinpath("features_dataset.csv")

# Read the CSV, parsing the "Date" column into datetimes.
df = pd.read_csv(filepath_or_buffer=path, parse_dates=["Date"])

## 5. Separate Features and Target

In [33]:
# Name of the column the model is trained to predict.
TARGET_COL = "target_log_return"

# Target vector as a plain NumPy array.
y = df[TARGET_COL].to_numpy()

# Feature table: everything except the date, the price and the target.
# errors="ignore" lets the drop succeed when one of these columns is absent.
X_df = df.drop(columns=["Date", "StockPrice", TARGET_COL], errors="ignore")


## 6. Feature Scaling 

 Why scale?
 - Neural networks are sensitive to scale
 - Returns & volatility must be normalized
 - Scaling is done before sequence construction

In [34]:
# Fit the scaler on the leading (oldest) rows only, then apply it to the whole
# table. Fitting on every row would leak the mean/variance of the validation
# and test periods into the training features.
# SCALER_FIT_FRACTION must not exceed the training share of the time-aware
# split (0.7); with the 30-step windows used in this notebook, the first 70%
# of rows lie inside the rows the training windows draw from.
SCALER_FIT_FRACTION = 0.7
n_scaler_fit_rows = int(len(X_df) * SCALER_FIT_FRACTION)

scaler = StandardScaler()
scaler.fit(X_df.values[:n_scaler_fit_rows])

# Same fitted statistics are applied to train, validation and test rows.
X_scaled = scaler.transform(X_df.values)


## 7. Sequence Construction

In [35]:
def create_sequences(X, y, seq_len):
    """Build sliding-window samples for sequence models.

    For every index ``i`` in ``range(seq_len, len(X))`` the sample is the
    window ``X[i - seq_len:i]`` and its label is ``y[i]``, i.e. the window
    ends just before the row whose target is predicted.

    Parameters
    ----------
    X : array-like
        Feature rows; the first axis is the one that is windowed.
    y : array-like
        One target per row of ``X``.
    seq_len : int
        Number of consecutive rows in each window; must be positive.

    Returns
    -------
    tuple of np.ndarray
        ``(X_seq, y_seq)`` where ``X_seq`` has shape
        ``(len(X) - seq_len, seq_len, *X.shape[1:])`` and ``y_seq`` holds the
        matching targets. When there are not enough rows for a single
        window, both arrays are empty but keep those trailing dimensions.

    Raises
    ------
    ValueError
        If ``seq_len`` is not positive or ``X`` and ``y`` differ in length.
    """
    X = np.asarray(X)
    y = np.asarray(y)

    if seq_len <= 0:
        raise ValueError(f"seq_len must be a positive integer, got {seq_len}")
    if len(X) != len(y):
        raise ValueError(
            f"X and y must have the same length, got {len(X)} and {len(y)}"
        )

    # Not enough rows for even one window: return correctly shaped empties
    # instead of 1-D arrays so downstream shape checks and slicing still work.
    if len(X) <= seq_len:
        empty_X = np.empty((0, seq_len) + X.shape[1:], dtype=X.dtype)
        empty_y = np.empty((0,) + y.shape[1:], dtype=y.dtype)
        return empty_X, empty_y

    X_seq = np.stack([X[i - seq_len:i] for i in range(seq_len, len(X))])
    # Label i belongs to the window that ends just before row i.
    y_seq = y[seq_len:].copy()

    return X_seq, y_seq


In [36]:
# Window size: the past 30 days are used to predict the next day.
SEQUENCE_LENGTH = 30

X_seq, y_seq = create_sequences(X=X_scaled, y=y, seq_len=SEQUENCE_LENGTH)

# Display both shapes as the cell output.
(X_seq.shape, y_seq.shape)

((3721, 30, 31), (3721,))

## 8. Train / Validation / Test Split (Time-Aware)

In [37]:
# Chronological split: the first 70% of the sequences are used for training,
# the next 15% for validation and the final 15% for testing. No shuffling.
n_samples = len(X_seq)
train_end = int(n_samples * 0.7)
val_end = int(n_samples * 0.85)

# np.split cuts along the first axis at the two boundaries, yielding the
# three consecutive segments in order.
X_train, X_val, X_test = np.split(X_seq, [train_end, val_end])
y_train, y_val, y_test = np.split(y_seq, [train_end, val_end])


## 9. Save Prepared Datasets

In [38]:
# Folder that receives the split arrays. It is created here so that the cell
# works on a fresh checkout; np.save does not create missing directories.
splits_dir = DATA_PROCESSED_DIR / "splits"
splits_dir.mkdir(parents=True, exist_ok=True)

# File stem -> array; each entry is written to <splits_dir>/<stem>.npy.
split_arrays = {
    "X_train": X_train,
    "y_train": y_train,
    "X_val": X_val,
    "y_val": y_val,
    "X_test": X_test,
    "y_test": y_test,
}

for split_name, split_array in split_arrays.items():
    np.save(splits_dir / f"{split_name}.npy", split_array)

print("Sequence datasets saved successfully.")


Sequence datasets saved successfully.


## 10. Save Scaler (Inference Compatibility)

In [39]:
import joblib

# Persist the fitted scaler so the same transformation can be re-applied later.
scaler_path = PROJECT_ROOT / "references"/"feature_scaler.pkl"

# joblib.dump does not create missing folders; make sure the target exists.
scaler_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(scaler, scaler_path)

# NOTE: the file is a pickle; only load it back from a trusted location.
print(f"Scaler saved to {scaler_path}")


Scaler saved to C:\Users\Kinjal Mitra\Documents\stock-price-prediction-ff\references\feature_scaler.pkl


## 11. Save Feature Dataset

In [40]:
# Write the loaded feature table to data/processed/.
processed_path = DATA_PROCESSED_DIR / "processed_dataset.csv"

# to_csv does not create missing folders; make sure data/processed exists.
processed_path.parent.mkdir(parents=True, exist_ok=True)

# NOTE(review): the DataFrame index is written as an unnamed first column;
# readers need index_col=0 (or this call needs index=False) — confirm which
# form downstream consumers expect before changing it.
df.to_csv(processed_path)

processed_path

WindowsPath('C:/Users/Kinjal Mitra/Documents/stock-price-prediction-ff/data/processed/processed_dataset.csv')