## 1. Load Necessary Libraries

In [65]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import seaborn as sns
import os
import torch
import datetime as dt
from sklearn.preprocessing import StandardScaler

## 2. Set seed for reproducibility

In [66]:
# -----------------------------
# Reproducibility
# -----------------------------
RANDOM_SEED = 42

np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

if torch.cuda.is_available():
    print("CUDA is available. Setting seed for all GPUs.")
    torch.cuda.manual_seed_all(RANDOM_SEED)

sns.set_style("darkgrid")


CUDA is available. Setting seed for all GPUs.


## 3. Set paths

In [67]:
# -----------------------------
# Project Root Resolution
# -----------------------------
PROJECT_ROOT = Path.cwd().resolve().parents[0]

DATA_RAW_DIR = PROJECT_ROOT / "data" / "raw"
DATA_INTERIM_DIR = PROJECT_ROOT / "data" / "interim"
FIGURES_DIR = PROJECT_ROOT / "reports" / "figures"

DATA_INTERIM_DIR.mkdir(parents=True, exist_ok=True)
FIGURES_DIR.mkdir(parents=True, exist_ok=True)

DATA_RAW_DIR, DATA_INTERIM_DIR, FIGURES_DIR


(WindowsPath('C:/Users/Kinjal Mitra/Documents/stock-price-prediction-ff/data/raw'),
 WindowsPath('C:/Users/Kinjal Mitra/Documents/stock-price-prediction-ff/data/interim'),
 WindowsPath('C:/Users/Kinjal Mitra/Documents/stock-price-prediction-ff/reports/figures'))

## 4. Load Data from data/raw/

In [68]:
data_path = DATA_RAW_DIR / "Data.csv"
price_path = DATA_RAW_DIR / "StockPrice.csv"

df_data = pd.read_csv(data_path, parse_dates=["Date"])
df_price = pd.read_csv(price_path, parse_dates=["Date"])


In [69]:
print("\n df_data: \n", df_data.head(5))
print("\n df_price: \n", df_price.head(5))


 df_data: 
         Date   Data
0 2025-03-26  2.369
1 2025-03-25  2.365
2 2025-03-24  2.367
3 2025-03-21  2.386
4 2025-03-20  2.387

 df_price: 
         Date    Price
0 2025-03-26  5759.50
1 2025-03-25  5826.50
2 2025-03-24  5815.50
3 2025-03-21  5718.25
4 2025-03-20  5712.75


## 5. Merge Dataset

In [70]:
df = pd.merge(df_data, df_price, on='Date', how='inner')
df=df.rename(columns={
    'Data': 'Data_Value',
    'Price': 'StockPrice'
})
df.head()


Unnamed: 0,Date,Data_Value,StockPrice
0,2025-03-26,2.369,5759.5
1,2025-03-25,2.365,5826.5
2,2025-03-24,2.367,5815.5
3,2025-03-21,2.386,5718.25
4,2025-03-20,2.387,5712.75


## 8. Feature Engineering
- Create features from existing data

In [71]:
# Ensure datetime and correct order
df["Date"] = pd.to_datetime(df["Date"])

#### 1. Price Based Features

In [72]:
# Daily return
df["daily_return"] = df["StockPrice"].pct_change()

# Log return
df["log_return"] = np.log(df["StockPrice"] / df["StockPrice"].shift(1))

# Absolute price change
df["price_change"] = df["StockPrice"] - df["StockPrice"].shift(1)

# 7-day rolling volatility (std of daily returns)
df["volatility_7d"] = df["daily_return"].rolling(window=7).std()

#### 2. Moving Averages

In [73]:
df["MA_7"]  = df["StockPrice"].rolling(window=7).mean()
df["MA_30"] = df["StockPrice"].rolling(window=30).mean()
df["MA_50"] = df["StockPrice"].rolling(window=50).mean()

# Price to MA7 ratio
df["price_to_MA7_ratio"] = df["StockPrice"] / df["MA_7"]

#### 3. Temporal Features

In [74]:
df["day_of_week"]  = df["Date"].dt.dayofweek      # 0 = Monday
df["day_of_month"] = df["Date"].dt.day
df["month"]        = df["Date"].dt.month

#### 4.Momentum Indicators
RSI (14-Day)

In [75]:
delta = df["StockPrice"].diff()

gain = delta.clip(lower=0)
loss = -delta.clip(upper=0)

avg_gain = gain.rolling(window=14).mean()
avg_loss = loss.rolling(window=14).mean()

rs = avg_gain / avg_loss
df["RSI_14"] = 100 - (100 / (1 + rs))


MACD (12-26 EMA) & Signal Line (9 EMA)

In [76]:
ema_12 = df["StockPrice"].ewm(span=12, adjust=False).mean()
ema_26 = df["StockPrice"].ewm(span=26, adjust=False).mean()

df["MACD"] = ema_12 - ema_26
df["MACD_signal"] = df["MACD"].ewm(span=9, adjust=False).mean()


#### 5. Volatility Indicators (Bollinger Bands)

In [77]:
MA_20 = df["StockPrice"].rolling(window=20).mean()
STD_20 = df["StockPrice"].rolling(window=20).std()

df["bollinger_upper"] = MA_20 + 2 * STD_20
df["bollinger_lower"] = MA_20 - 2 * STD_20

#### 6. Trend Indicators

In [78]:
df["momentum_5d"]  = (df["StockPrice"] - df["StockPrice"].shift(5))  / df["StockPrice"].shift(5)
df["momentum_20d"] = (df["StockPrice"] - df["StockPrice"].shift(20)) / df["StockPrice"].shift(20)


#### 7. Lag Features (Price Lags)

In [79]:
for lag in range(1, 6):
    df[f"price_lag_{lag}"] = df["StockPrice"].shift(lag)

#### 8. Rolling Statistics

In [80]:
df["rolling_max_20d"] = df["StockPrice"].rolling(window=20).max()
df["rolling_min_20d"] = df["StockPrice"].rolling(window=20).min()


## 9. Drop Empty Rows (Created due to moving averages and other features)
- We do this to avoid forward/backward filling for price-derived features.

In [82]:
df = df.dropna().reset_index(drop=True)

## 10. Updated Feature Set

In [83]:
df.head(10)

Unnamed: 0,Date,Data_Value,StockPrice,daily_return,log_return,price_change,volatility_7d,MA_7,MA_30,MA_50,...,bollinger_lower,momentum_5d,momentum_20d,price_lag_1,price_lag_2,price_lag_3,price_lag_4,price_lag_5,rolling_max_20d,rolling_min_20d
0,2025-01-14,2.761,5882.25,-0.017824,-0.017985,-106.75,0.00718,6033.857143,6067.1,5940.145,...,5939.847927,-0.038927,-0.03137,5989.0,5975.5,6033.5,6084.25,6120.5,6152.0,5882.25
1,2025-01-13,2.785,5874.5,-0.001318,-0.001318,-7.75,0.006418,5994.214286,6063.891667,5942.445,...,5903.923835,-0.034474,-0.035742,5882.25,5989.0,5975.5,6033.5,6084.25,6152.0,5874.5
2,2025-01-10,2.766,5866.25,-0.001404,-0.001405,-8.25,0.006703,5957.892857,6060.433333,5943.24,...,5872.40857,-0.02772,-0.036543,5874.5,5882.25,5989.0,5975.5,6033.5,6152.0,5866.25
3,2025-01-09,2.789,5944.75,0.013382,0.013293,78.5,0.009941,5937.964286,6058.566667,5945.825,...,5861.877525,-0.005146,-0.017315,5866.25,5874.5,5882.25,5989.0,5975.5,6152.0,5866.25
4,2025-01-08,2.782,5959.25,0.002439,0.002436,14.5,0.009858,5927.357143,6056.241667,5950.645,...,5854.670147,-0.004967,-0.024034,5944.75,5866.25,5874.5,5882.25,5989.0,6152.0,5866.25
5,2025-01-07,2.785,5954.25,-0.000839,-0.000839,-5.0,0.009226,5924.321429,6050.166667,5955.475,...,5847.330252,0.01224,-0.021728,5959.25,5944.75,5866.25,5874.5,5882.25,6152.0,5866.25
6,2025-01-06,2.757,6020.5,0.011127,0.011065,66.25,0.010219,5928.821429,6045.416667,5961.31,...,5846.297045,0.024853,-0.00701,5954.25,5959.25,5944.75,5866.25,5874.5,6152.0,5866.25
7,2025-01-03,2.731,5989.5,-0.005149,-0.005162,-31.0,0.006979,5944.142857,6040.175,5967.735,...,5844.150971,0.02101,-0.005438,6020.5,5954.25,5959.25,5944.75,5866.25,6152.0,5866.25
8,2025-01-02,2.736,5916.5,-0.012188,-0.012263,-73.0,0.008933,5950.142857,6032.991667,5971.455,...,5832.602441,-0.004752,-0.024847,5989.5,6020.5,5954.25,5959.25,5944.75,6152.0,5866.25
9,2024-12-31,2.714,5935.75,0.003254,0.003248,19.25,0.008893,5960.071429,6026.341667,5976.365,...,5826.624907,-0.003943,-0.026807,5916.5,5989.5,6020.5,5954.25,5959.25,6152.0,5866.25


In [84]:
df.columns

Index(['Date', 'Data_Value', 'StockPrice', 'daily_return', 'log_return',
       'price_change', 'volatility_7d', 'MA_7', 'MA_30', 'MA_50',
       'price_to_MA7_ratio', 'day_of_week', 'day_of_month', 'month', 'RSI_14',
       'MACD', 'MACD_signal', 'bollinger_upper', 'bollinger_lower',
       'momentum_5d', 'momentum_20d', 'price_lag_1', 'price_lag_2',
       'price_lag_3', 'price_lag_4', 'price_lag_5', 'rolling_max_20d',
       'rolling_min_20d'],
      dtype='object')

## 11. Save Feature Dataset

In [86]:
features_path = DATA_INTERIM_DIR / "features_dataset.csv"
df.to_csv(features_path)

features_path


WindowsPath('C:/Users/Kinjal Mitra/Documents/stock-price-prediction-ff/data/interim/features_dataset.csv')