In [1]:
import pandas as pd
from pathlib import Path

data_path = Path("../data/processed/IYR_cleaned_project.csv")
df = pd.read_csv(data_path)

# ensure Date is datetime and sorted
if "Date" in df.columns:
    df["Date"] = pd.to_datetime(df["Date"], errors="coerce")
    df = df.sort_values("Date").reset_index(drop=True)

df.head()

Unnamed: 0,Date,Close,High,Low,Open,Volume,daily_return
0,2022-01-04,104.979668,105.9834704567302,104.84278388218968,105.47244509923952,9363000,-0.001909
1,2022-01-05,101.867874,104.8610416475952,101.79487549400208,104.71503042018672,12110000,-0.029642
2,2022-01-06,101.849625,102.32414719106455,100.84582182871512,101.96825353255834,7920600,-0.000179
3,2022-01-07,101.165215,101.86787442636896,100.88232648743846,101.5667384528561,7883300,-0.00672
4,2022-01-10,100.65419,100.68156542838568,99.33098940788028,100.60855981382896,10109200,-0.005051


In [2]:
price_col = "Adj Close" if "Adj Close" in df.columns else ("Close" if "Close" in df.columns else None)
ret_col   = "daily_return" if "daily_return" in df.columns else None
(price_col, ret_col)

('Close', 'daily_return')

In [3]:
out = df.copy()

# lags (on returns)
if ret_col:
    out["ret_lag1"] = out[ret_col].shift(1)
    out["ret_lag5"] = out[ret_col].shift(5)

# rolling means of price
if price_col:
    for w in (7, 21, 63, 126):  # ~1w, 1m, 3m, 6m (trading days)
        out[f"ma_{w}"] = out[price_col].rolling(w, min_periods=1).mean()

# rolling volatility of returns
if ret_col:
    for w in (7, 21, 63):
        out[f"vol_{w}"] = out[ret_col].rolling(w, min_periods=2).std()

# momentum (price vs. N days ago)
if price_col:
    for w in (10, 20, 60):
        out[f"mom_{w}"] = out[price_col] / out[price_col].shift(w) - 1

# cumulative return from first non-NA ret
if ret_col:
    out["cum_return"] = (1 + out[ret_col].fillna(0)).cumprod() - 1

out.head()

Unnamed: 0,Date,Close,High,Low,Open,Volume,daily_return,ret_lag1,ret_lag5,ma_7,ma_21,ma_63,ma_126,vol_7,vol_21,vol_63,mom_10,mom_20,mom_60,cum_return
0,2022-01-04,104.979668,105.9834704567302,104.84278388218968,105.47244509923952,9363000,-0.001909,,,104.979668,104.979668,104.979668,104.979668,,,,,,,-0.001909
1,2022-01-05,101.867874,104.8610416475952,101.79487549400208,104.71503042018672,12110000,-0.029642,-0.001909,,103.423771,103.423771,103.423771,103.423771,0.01961,0.01961,0.01961,,,,-0.031494
2,2022-01-06,101.849625,102.32414719106455,100.84582182871512,101.96825353255834,7920600,-0.000179,-0.029642,,102.899055,102.899055,102.899055,102.899055,0.016534,0.016534,0.016534,,,,-0.031668
3,2022-01-07,101.165215,101.86787442636896,100.88232648743846,101.5667384528561,7883300,-0.00672,-0.000179,,102.465595,102.465595,102.465595,102.465595,0.013637,0.013637,0.013637,,,,-0.038175
4,2022-01-10,100.65419,100.68156542838568,99.33098940788028,100.60855981382896,10109200,-0.005051,-0.00672,,102.103314,102.103314,102.103314,102.103314,0.011985,0.011985,0.011985,,,,-0.043033


In [4]:
if price_col:
    out["ema_12"] = out[price_col].ewm(span=12, adjust=False).mean()
    out["ema_26"] = out[price_col].ewm(span=26, adjust=False).mean()

    delta = out[price_col].diff()
    gain = delta.clip(lower=0).rolling(14, min_periods=14).mean()
    loss = (-delta.clip(upper=0)).rolling(14, min_periods=14).mean()
    rs = gain / loss
    out["rsi_14"] = 100 - (100 / (1 + rs))

out.tail()

Unnamed: 0,Date,Close,High,Low,Open,Volume,daily_return,ret_lag1,ret_lag5,ma_7,...,vol_7,vol_21,vol_63,mom_10,mom_20,mom_60,cum_return,ema_12,ema_26,rsi_14
492,2023-12-26,88.390144,88.59307262125922,87.60741995577247,87.7813521562928,3094100,0.007601,0.003316,-0.001537,87.484281,...,0.009408,0.012048,0.01319,0.066511,0.113368,0.184419,-0.180328,86.280391,83.644497,74.142876
493,2023-12-27,88.853973,88.86363874566969,88.09057975351183,88.5544136743051,5522900,0.005248,0.007601,0.006706,87.705578,...,0.007725,0.012042,0.012942,0.071364,0.11263,0.186673,-0.176026,86.676327,84.030384,77.570097
494,2023-12-28,89.346802,89.35646711463478,88.59306612083176,88.65104351675117,4143700,0.005546,0.005248,-0.013027,88.016445,...,0.007564,0.012041,0.012895,0.038549,0.110672,0.215657,-0.171456,87.087169,84.424193,77.985995
495,2023-12-29,88.332161,89.12455064580271,88.24518748284522,88.82498356273108,4854400,-0.011356,0.005546,0.009484,88.098858,...,0.009211,0.012573,0.01297,-0.000845,0.089751,0.225662,-0.180865,87.278706,84.713672,72.703995
496,NaT,,IYR,IYR,IYR,IYR,0.0,-0.011356,0.003316,88.346654,...,0.006949,0.012589,0.012971,,,,-0.180865,87.278706,84.713672,


In [5]:
save_path = Path("../data/processed/IYR_features_project.csv")
out.to_csv(save_path, index=False)
save_path

PosixPath('../data/processed/IYR_features_project.csv')

In [6]:
import os
save_path, os.path.exists(save_path), out.shape, out.columns[-10:].tolist()

(PosixPath('../data/processed/IYR_features_project.csv'),
 True,
 (497, 23),
 ['vol_7',
  'vol_21',
  'vol_63',
  'mom_10',
  'mom_20',
  'mom_60',
  'cum_return',
  'ema_12',
  'ema_26',
  'rsi_14'])