## 1. Load Necessary Libraries

In [98]:
import datetime as dt
import os
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from sklearn.preprocessing import StandardScaler, MinMaxScaler

## 2. Set seed for reproducibility

In [99]:
# -----------------------------
# Reproducibility
# -----------------------------
# One seed value is fed to every random number generator seeded below.
RANDOM_SEED = 42

# Seed the NumPy global RNG and the PyTorch default generator.
for seed_rng in (np.random.seed, torch.manual_seed):
    seed_rng(RANDOM_SEED)

# Only touch the CUDA generators when a GPU is actually visible.
gpu_present = torch.cuda.is_available()
if gpu_present:
    print("CUDA is available. Setting seed for all GPUs.")
    torch.cuda.manual_seed_all(RANDOM_SEED)

# Plot theme used by seaborn/matplotlib figures.
sns.set_style("darkgrid")


CUDA is available. Setting seed for all GPUs.


## 3. Set paths

In [100]:
# -----------------------------
# Project Root Resolution
# -----------------------------
# The project root is taken to be the parent of the current working directory,
# i.e. the notebook is expected to be launched from a folder one level below it.
PROJECT_ROOT = Path.cwd().resolve().parents[0]

DATA_RAW_DIR = PROJECT_ROOT / "data" / "raw"
DATA_INTERIM_DIR = PROJECT_ROOT / "data" / "interim"
DATA_PROCESSED_DIR = PROJECT_ROOT / "data" / "processed"
FIGURES_DIR = PROJECT_ROOT / "reports" / "figures"

# Create every directory this notebook writes into. DATA_PROCESSED_DIR is the
# destination of the processed CSV and the .npy arrays saved later, so it must
# exist before those cells run (otherwise a fresh checkout fails on save).
for output_dir in (DATA_INTERIM_DIR, DATA_PROCESSED_DIR, FIGURES_DIR):
    output_dir.mkdir(parents=True, exist_ok=True)

DATA_RAW_DIR, DATA_INTERIM_DIR, FIGURES_DIR


(WindowsPath('C:/Users/Kinjal Mitra/Documents/stock-price-prediction-ff/data/raw'),
 WindowsPath('C:/Users/Kinjal Mitra/Documents/stock-price-prediction-ff/data/interim'),
 WindowsPath('C:/Users/Kinjal Mitra/Documents/stock-price-prediction-ff/reports/figures'))

## 4. Load Data from data/interim/

In [101]:
# Read the engineered-feature table from the interim folder; the "Date"
# column is parsed into datetimes while loading.
path = DATA_INTERIM_DIR.joinpath("features_dataset.csv")
df = pd.read_csv(filepath_or_buffer=path, parse_dates=["Date"])

## 5. Drop Rows with Missing Values (created by moving averages and other derived features)
- We do this to avoid forward/backward filling for price-derived features.

In [102]:
# Discard every row containing at least one NaN, then renumber the index from 0.
df = df.dropna(axis=0, how="any")
df = df.reset_index(drop=True)

## 6. Check Order of Date

In [103]:
# True here means the rows run from newest to oldest and need re-sorting.
df.loc[:, "Date"].is_monotonic_decreasing

True

In [104]:
# Put rows in chronological order (oldest first) with a fresh 0..n-1 index,
# then display the Date column to confirm the ordering.
df = df.sort_values(by="Date", ignore_index=True)
df["Date"]

0      2010-01-04
1      2010-01-05
2      2010-01-06
3      2010-01-07
4      2010-01-08
          ...    
3748   2025-01-08
3749   2025-01-09
3750   2025-01-10
3751   2025-01-13
3752   2025-01-14
Name: Date, Length: 3753, dtype: datetime64[ns]

## 7. Drop Index Column and Set Target Column

In [105]:
# "Unnamed: 0" is the unnamed index column that came along with the CSV;
# it carries no information for modelling.
col_to_drop = "Unnamed: 0"

# errors="ignore" keeps this cell idempotent: re-running it after the column
# has already been removed (or loading a CSV that never had it) is a no-op
# instead of raising KeyError.
df = df.drop(columns=[col_to_drop], errors="ignore")
df.shape

(3753, 28)

In [106]:
# Everything except the timestamp and the prediction target is a model input.
NON_FEATURE_COLS = ("Date", "log_return")
FEATURE_COLS = [name for name in df.columns if name not in NON_FEATURE_COLS]

# Target vector first, then the feature matrix; both as plain NumPy arrays.
y = df["log_return"].values
X = df[FEATURE_COLS].values


In [107]:
# Sanity check: X and y must have the same number of rows.
(X.shape, y.shape)

((3753, 26), (3753,))

## 8. Scale Features

In [108]:
# Rescale each feature column to the [0, 1] range.
# NOTE(review): the scaler is fit on every row of X. If a chronological
# train/test split is applied downstream, the min/max of the test period
# leak into the training features — confirm, and consider fitting on the
# training window only.
scaler = MinMaxScaler(feature_range=(0, 1))
X_scaled = scaler.fit(X).transform(X)

In [109]:
# Verify scaling
print(X_scaled.min(), X_scaled.max())

0.0 1.0000000000000002


In [110]:
# Save scaler for future use
import joblib
scaler_path = PROJECT_ROOT / "references" / "feature_scaler.pkl"
joblib.dump(scaler, scaler_path)

['C:\\Users\\Kinjal Mitra\\Documents\\stock-price-prediction-ff\\references\\feature_scaler.pkl']

## 9. Save Processed Dataset

In [111]:
# Persist the cleaned, chronologically sorted frame (unscaled values).
# NOTE(review): to_csv writes the row index as an unnamed first column by
# default, so readers of this file will see an extra "Unnamed: 0" column —
# confirm whether downstream notebooks rely on that before passing index=False.
processed_path = DATA_PROCESSED_DIR.joinpath("processed_dataset.csv")
df.to_csv(path_or_buf=processed_path)

processed_path

WindowsPath('C:/Users/Kinjal Mitra/Documents/stock-price-prediction-ff/data/processed/processed_dataset.csv')

## 10. Save X_scaled and y

In [112]:
# Write the scaled feature matrix and the target vector as .npy files
# in the processed-data folder.
for npy_name, npy_array in (("X_features.npy", X_scaled), ("y_target.npy", y)):
    np.save(DATA_PROCESSED_DIR / npy_name, npy_array)