## 1. Load Necessary Libraries

In [18]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import seaborn as sns
import os
import torch
import datetime as dt
from sklearn.preprocessing import StandardScaler

## 2. Set seed for reproducibility

In [19]:
# -----------------------------
# Reproducibility
# -----------------------------
# One seed value is shared by every random number generator seeded below.
RANDOM_SEED = 42

# Seed NumPy's global generator first, then PyTorch's.
for seed_fn in (np.random.seed, torch.manual_seed):
    seed_fn(RANDOM_SEED)

# When a CUDA device is visible, announce it and seed all GPUs explicitly.
cuda_ready = torch.cuda.is_available()
if cuda_ready:
    print("CUDA is available. Setting seed for all GPUs.")
    torch.cuda.manual_seed_all(RANDOM_SEED)

# Plot styling applied to every seaborn/matplotlib figure drawn afterwards.
sns.set_style("darkgrid")


CUDA is available. Setting seed for all GPUs.


## 3. Set paths

In [20]:
# -----------------------------
# Project Root Resolution
# -----------------------------
# The project root is taken to be the parent of the current working
# directory, i.e. the notebook is expected to run one level below the root.
PROJECT_ROOT = Path.cwd().resolve().parents[0]

# Input and output locations, all relative to the project root.
DATA_RAW_DIR = PROJECT_ROOT.joinpath("data", "raw")
DATA_INTERIM_DIR = PROJECT_ROOT.joinpath("data", "interim")
FIGURES_DIR = PROJECT_ROOT.joinpath("reports", "figures")

# Only the output folders are created; the raw-data folder is left untouched.
for output_dir in (DATA_INTERIM_DIR, FIGURES_DIR):
    output_dir.mkdir(parents=True, exist_ok=True)

# Echo the resolved locations as the cell's output.
(DATA_RAW_DIR, DATA_INTERIM_DIR, FIGURES_DIR)


(WindowsPath('C:/Users/Kinjal Mitra/Documents/stock-price-prediction-ff/data/raw'),
 WindowsPath('C:/Users/Kinjal Mitra/Documents/stock-price-prediction-ff/data/interim'),
 WindowsPath('C:/Users/Kinjal Mitra/Documents/stock-price-prediction-ff/reports/figures'))

## 4. Load Data from data/raw/

In [21]:
def _load_dated_csv(csv_path):
    """Read `csv_path` with pandas, parsing its "Date" column as datetimes."""
    return pd.read_csv(csv_path, parse_dates=["Date"])


# Locations of the two raw input files.
data_path = DATA_RAW_DIR.joinpath("Data.csv")
price_path = DATA_RAW_DIR.joinpath("StockPrice.csv")

# Load both files with identical date handling.
df_data = _load_dated_csv(data_path)
df_price = _load_dated_csv(price_path)


In [22]:
# Print the first five rows of each raw frame under a labelled banner.
previews = {
    "\n df_data: \n": df_data,
    "\n df_price: \n": df_price,
}
for banner, frame in previews.items():
    print(banner, frame.head(5))


 df_data: 
         Date   Data
0 2025-03-26  2.369
1 2025-03-25  2.365
2 2025-03-24  2.367
3 2025-03-21  2.386
4 2025-03-20  2.387

 df_price: 
         Date    Price
0 2025-03-26  5759.50
1 2025-03-25  5826.50
2 2025-03-24  5815.50
3 2025-03-21  5718.25
4 2025-03-20  5712.75


## 5. Merge Dataset

In [23]:
# Join the external data series and the price series on their shared date.
# The inner join keeps only dates that appear in both files.
#
# validate="one_to_one" makes pandas raise a MergeError if either file
# contains a repeated date. Without it, a duplicated date would silently
# multiply rows in the merged frame, and every later row-position-based
# feature (shift / rolling) would be computed over the wrong neighbours.
df = (
    pd.merge(df_data, df_price, on="Date", how="inner", validate="one_to_one")
    # Give both value columns unambiguous names for the feature steps.
    .rename(columns={"Data": "Data_Value", "Price": "StockPrice"})
    .reset_index(drop=True)
)
df.head()


Unnamed: 0,Date,Data_Value,StockPrice
0,2025-03-26,2.369,5759.5
1,2025-03-25,2.365,5826.5
2,2025-03-24,2.367,5815.5
3,2025-03-21,2.386,5718.25
4,2025-03-20,2.387,5712.75


## 6. Feature Engineering
- Create features from existing data

In [24]:
# Coerce "Date" to datetime, order the rows oldest-first, and renumber them
# 0..n-1 so that later shift/rolling operations run in chronological order.
df = (
    df.assign(Date=pd.to_datetime(df["Date"]))
    .sort_values(by="Date", ascending=True)
    .reset_index(drop=True)
)
df

Unnamed: 0,Date,Data_Value,StockPrice
0,2010-01-04,0.700,1178.00
1,2010-01-05,0.699,1181.50
2,2010-01-06,0.694,1182.25
3,2010-01-07,0.692,1186.75
4,2010-01-08,0.691,1190.75
...,...,...,...
3797,2025-03-20,2.387,5712.75
3798,2025-03-21,2.386,5718.25
3799,2025-03-24,2.367,5815.50
3800,2025-03-25,2.365,5826.50


#### 1. Log Returns (Target + Base Signal)

In [25]:
# Log return of each row: log of its price divided by the previous row's price.
# The first row has no previous price, so its value is NaN.
price_today = df["StockPrice"]
price_prev = price_today.shift(1)
df["log_return"] = np.log(price_today / price_prev)

# Prediction target: the log return of the following row (NaN on the last row).
df["target_log_return"] = df["log_return"].shift(-1)


#### 2. Lagged Returns

In [26]:
# Add log_return_lag_1 ... log_return_lag_10: the log return observed
# 1 to 10 rows earlier, appended in increasing lag order.
df = df.assign(
    **{f"log_return_lag_{k}": df["log_return"].shift(k) for k in range(1, 11)}
)


#### 3. Rolling Return Statistics (Temporal Context)

In [27]:
# Window lengths (in rows) for the rolling return statistics.
windows = [5, 10, 20]

for span in windows:
    # One rolling object per window feeds both statistics.
    rolled = df["log_return"].rolling(span)
    # shift(1) moves each statistic down one row, so a row only sees values
    # computed from the rows before it.
    df[f"return_mean_{span}"] = rolled.mean().shift(1)
    df[f"return_std_{span}"] = rolled.std().shift(1)


#### 4. Price Trend (Shifted)

In [28]:
# Previous row's price; it is the numerator of every ratio below.
lagged_price = df["StockPrice"].shift(1)

for span in (10, 20, 50):
    ma_col = f"price_ma_{span}"
    # Moving average over `span` rows, moved down one row so it ends at the
    # previous row's price.
    df[ma_col] = df["StockPrice"].rolling(span).mean().shift(1)
    # Previous price relative to that average (above 1 means price is higher).
    df[f"price_ma_ratio_{span}"] = lagged_price / df[ma_col]


#### 5. Trend Momentum (MACD)

In [29]:
def _ema(series, span):
    """Exponentially weighted mean of `series` for `span`, with adjust=False."""
    return series.ewm(span=span, adjust=False).mean()


# MACD line: 12-span EMA of price minus the 26-span EMA.
# NOTE(review): unlike the rolling price averages in this notebook, these EMAs
# are not shifted by one row, so each row's MACD includes that row's own
# StockPrice - confirm this is intended.
ema_12 = _ema(df["StockPrice"], 12)
ema_26 = _ema(df["StockPrice"], 26)
df["MACD"] = ema_12 - ema_26

# Signal line: 9-span EMA of the MACD line.
df["MACD_signal"] = _ema(df["MACD"], 9)


#### 6. External Signal (Lagged Only)

In [30]:
# Add data_lag_1/3/5/10: the external Data_Value observed that many rows
# earlier, appended in increasing lag order.
df = df.assign(
    **{f"data_lag_{k}": df["Data_Value"].shift(k) for k in (1, 3, 5, 10)}
)


## 9. Drop Empty Rows
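- Rows with any missing value are dropped: the warm-up rows created by the lag/rolling windows and the final row, which has no next-day target. We do this to avoid forward/backward filling for price-derived features.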

In [31]:
# Keep only rows where every column is populated, then renumber them 0..n-1.
df = (
    df
    .dropna(axis=0, how="any")
    .reset_index(drop=True)
)

## 10. Updated Feature Set

In [32]:
# Preview the first ten rows of the feature table as the cell's output.
df.head(n=10)

Unnamed: 0,Date,Data_Value,StockPrice,log_return,target_log_return,log_return_lag_1,log_return_lag_2,log_return_lag_3,log_return_lag_4,log_return_lag_5,...,price_ma_20,price_ma_ratio_20,price_ma_50,price_ma_ratio_50,MACD,MACD_signal,data_lag_1,data_lag_3,data_lag_5,data_lag_10
0,2010-03-17,0.644,1215.0,0.005157,0.000206,0.007682,-0.000833,0.000625,0.003967,0.004403,...,1172.7125,1.03073,1160.837,1.041275,15.7831,11.373365,0.646,0.649,0.651,0.655
1,2010-03-18,0.643,1215.25,0.000206,-0.004123,0.005157,0.007682,-0.000833,0.000625,0.003967,...,1176.025,1.033141,1161.577,1.045992,16.494346,12.397561,0.644,0.646,0.65,0.654
2,2010-03-19,0.642,1210.25,-0.004123,0.00474,0.000206,0.005157,0.007682,-0.000833,0.000625,...,1179.05,1.030703,1162.252,1.045599,16.464759,13.211001,0.643,0.646,0.649,0.653
3,2010-03-22,0.639,1216.0,0.00474,0.006149,-0.004123,0.000206,0.005157,0.007682,-0.000833,...,1181.7875,1.024084,1162.812,1.040796,16.712635,13.911328,0.642,0.644,0.646,0.652
4,2010-03-23,0.637,1223.5,0.006149,-0.004095,0.00474,-0.004123,0.000206,0.005157,0.007682,...,1184.75,1.026377,1163.397,1.045215,17.314674,14.591997,0.639,0.643,0.646,0.652
5,2010-03-24,0.636,1218.5,-0.004095,-0.001437,0.006149,0.00474,-0.004123,0.000206,0.005157,...,1188.6,1.029362,1164.052,1.05107,17.190178,15.111633,0.637,0.642,0.644,0.651
6,2010-03-25,0.635,1216.75,-0.001437,0.000616,-0.004095,0.006149,0.00474,-0.004123,0.000206,...,1191.8875,1.022328,1164.587,1.046294,16.757137,15.440734,0.636,0.639,0.643,0.65
7,2010-03-26,0.636,1217.5,0.000616,0.004303,-0.001437,-0.004095,0.006149,0.00474,-0.004123,...,1195.15,1.018073,1165.257,1.04419,16.286725,15.609932,0.635,0.637,0.642,0.649
8,2010-03-29,0.635,1222.75,0.004303,0.000613,0.000616,-0.001437,-0.004095,0.006149,0.00474,...,1198.3875,1.015949,1165.792,1.044354,16.151369,15.71822,0.636,0.636,0.639,0.646
9,2010-03-30,0.635,1223.5,0.000613,-0.00348,0.004303,0.000616,-0.001437,-0.004095,0.006149,...,1201.3375,1.017824,1166.357,1.04835,15.921088,15.758793,0.635,0.635,0.637,0.646


In [33]:
# Show the column labels of `df` as the cell's output.
df.columns

Index(['Date', 'Data_Value', 'StockPrice', 'log_return', 'target_log_return',
       'log_return_lag_1', 'log_return_lag_2', 'log_return_lag_3',
       'log_return_lag_4', 'log_return_lag_5', 'log_return_lag_6',
       'log_return_lag_7', 'log_return_lag_8', 'log_return_lag_9',
       'log_return_lag_10', 'return_mean_5', 'return_std_5', 'return_mean_10',
       'return_std_10', 'return_mean_20', 'return_std_20', 'price_ma_10',
       'price_ma_ratio_10', 'price_ma_20', 'price_ma_ratio_20', 'price_ma_50',
       'price_ma_ratio_50', 'MACD', 'MACD_signal', 'data_lag_1', 'data_lag_3',
       'data_lag_5', 'data_lag_10'],
      dtype='object')

## 11. Save Feature Dataset

In [34]:
# Destination of the engineered feature table inside the interim data folder.
features_path = DATA_INTERIM_DIR.joinpath("features_dataset.csv")

# index=True is pandas' default, spelled out here: the 0..n-1 row index is
# written as the first, unnamed column of the CSV.
# NOTE(review): a plain pd.read_csv of this file will show that column as
# "Unnamed: 0"; check how downstream readers load it before changing this.
df.to_csv(features_path, index=True)

# Echo the output location as the cell's output.
features_path


WindowsPath('C:/Users/Kinjal Mitra/Documents/stock-price-prediction-ff/data/interim/features_dataset.csv')