In [2]:
import yfinance as yf
import pandas as pd
import numpy as np

# ---------- 1) Sector ETFs + SPY ----------
tickers = ["SPY","XLK","XLF","XLE","XLV","XLU","XLY","XLP","XLB","XLRE","XLC"]

data = yf.download(tickers, start="2015-01-01", end="2025-06-01")

# Flatten multi-index if needed
if isinstance(data.columns, pd.MultiIndex):
    data.columns = ['_'.join(col).strip() for col in data.columns.values]

# Prefer adjusted close if available
if any("Adj Close" in c for c in data.columns):
    prices = data[[c for c in data.columns if "Adj Close" in c]]
    prices.columns = [c.replace("Adj Close_", "") for c in prices.columns]
else:
    prices = data[[c for c in data.columns if "Close" in c]]
    prices.columns = [c.replace("Close_", "") for c in prices.columns]

print("Prices shape:", prices.shape)
returns = prices.pct_change().dropna()

# ---------- 2) Relative strength ratios ----------
ratios = pd.DataFrame({
    "XLK_SPY": prices["XLK"] / prices["SPY"],
    "XLU_SPY": prices["XLU"] / prices["SPY"],
    "XLF_SPY": prices["XLF"] / prices["SPY"]
})

# ---------- 3) Sector growth proxy ----------
sector_growth = returns.rolling(60).mean()

# ---------- 4) Seasonality ----------
monthly_returns = prices["SPY"].resample("M").last().pct_change()
seasonality = monthly_returns.groupby(monthly_returns.index.month).mean()

# ---------- 5) Breadth ----------
rel_perf = prices.div(prices["SPY"], axis=0)
breadth = (rel_perf.pct_change(60) > 0).sum(axis=1)

# ---------- 6) Volatility ----------
volatility = returns.rolling(20).std()
vol_ratio = (volatility["XLK"] / volatility["XLU"]).rename("Vol_Ratio_XLK_XLU")

# ---------- 7) Treasury yields via Yahoo proxies ----------
yields_raw = yf.download(["^TNX","^FVX"], start="2015-01-01", end="2025-06-01")

# Flatten if multi-index
if isinstance(yields_raw.columns, pd.MultiIndex):
    yields_raw.columns = ['_'.join(col).strip() for col in yields_raw.columns.values]

# Prefer Adj Close if present, else Close
if any("Adj Close" in c for c in yields_raw.columns):
    yields = yields_raw[[c for c in yields_raw.columns if "Adj Close" in c]]
    yields.columns = [c.replace("Adj Close_", "") for c in yields.columns]
else:
    yields = yields_raw[[c for c in yields_raw.columns if "Close" in c]]
    yields.columns = [c.replace("Close_", "") for c in yields.columns]

# Convert Yahoo yields (quoted ×10) to percent
yields = yields / 10.0
yields = yields.rename(columns={"^TNX":"DGS10_proxy","^FVX":"DGS5_proxy"})
yields["Spread"] = yields["DGS10_proxy"] - yields["DGS5_proxy"]

print("\nYield sample:")
print(yields.tail())

# ---------- 8) Commodities ----------
fut_prices = yf.download(["CL=F","HG=F"], start="2018-01-01", end="2023-12-31")

if isinstance(fut_prices.columns, pd.MultiIndex):
    fut_prices.columns = ['_'.join(col).strip() for col in fut_prices.columns.values]

if any("Adj Close" in c for c in fut_prices.columns):
    fut_prices = fut_prices[[c for c in fut_prices.columns if "Adj Close" in c]]
    fut_prices.columns = [c.replace("Adj Close_", "") for c in fut_prices.columns]
else:
    fut_prices = fut_prices[[c for c in fut_prices.columns if "Close" in c]]
    fut_prices.columns = [c.replace("Close_", "") for c in fut_prices.columns]

commodity_pressure = pd.DataFrame(index=fut_prices.index)
commodity_pressure["Crude_60dRet"] = fut_prices["CL=F"].pct_change(60)
commodity_pressure["Copper_60dRet"] = fut_prices["HG=F"].pct_change(60)

# ---------- 9) Defensive vs cyclical ratio ----------
defensive = prices["XLP"] + prices["XLU"] + prices["XLV"]
cyclical = prices["XLK"] + prices["XLF"] + prices["XLY"]
def_cyc_ratio = (defensive / cyclical).rolling(20).mean()

# ---------- 10) Cross-sector correlations ----------
avg_sector_corr = returns.rolling(60).corr().groupby(level=0).mean().mean(axis=1)

# ---------- Combine engineered features ----------
features = pd.DataFrame(index=prices.index)
features["SPY_Return"] = returns["SPY"]
features["GrowthMinusDef"] = (returns[["XLK","XLF","XLE","XLY"]].mean(axis=1) -
                              returns[["XLU","XLP","XLV"]].mean(axis=1))
features["XLK_SPY"] = ratios["XLK_SPY"]
features["XLU_SPY"] = ratios["XLU_SPY"]
features["XLF_SPY"] = ratios["XLF_SPY"]
features["Breadth"] = breadth
features["Vol_Ratio_XLK_XLU"] = vol_ratio
features["DGS10"] = yields["DGS10_proxy"].reindex(prices.index).ffill()
features["DGS5"] = yields["DGS5_proxy"].reindex(prices.index).ffill()
features["Spread"] = yields["Spread"].reindex(prices.index).ffill()
features["DefCyc_Ratio"] = def_cyc_ratio
features["Avg_Sector_Corr"] = avg_sector_corr.reindex(prices.index).ffill()
features["Crude_60dRet"] = commodity_pressure["Crude_60dRet"].reindex(prices.index).ffill()
features["Copper_60dRet"] = commodity_pressure["Copper_60dRet"].reindex(prices.index).ffill()

print("\n===== Engineered Features Sample =====")
print(features.tail())

# ---------- Z-scored features ----------
z = pd.DataFrame(index=features.index)
for col in features.columns:
    if features[col].isna().all():
        z[col] = np.nan
        continue
    mean = features[col].rolling(252).mean()
    std = features[col].rolling(252).std()
    mask = (std != 0) & ~std.isna()
    z[col] = np.nan
    z.loc[mask, col] = (features.loc[mask, col] - mean.loc[mask]) / std.loc[mask]

print("\n===== Z-scored Features Sample =====")
print(z.tail())

print("\n===== Seasonality Table =====")
print(seasonality)

# ---------- Combine everything into one master DataFrame ----------
all_data = pd.concat(
    {
        "Prices": prices,
        "Returns": returns,
        "Ratios": ratios,
        "Features": features,
        "ZScores": z
    },
    axis=1
)

print("\n===== Master Data Sample =====")
print(all_data.tail())

# ---------- Save to CSV ----------
all_data.to_csv("market_features_master.csv")


  data = yf.download(tickers, start="2015-01-01", end="2025-06-01")
[*********************100%***********************]  11 of 11 completed
  monthly_returns = prices["SPY"].resample("M").last().pct_change()
  return op(a, b)
  yields_raw = yf.download(["^TNX","^FVX"], start="2015-01-01", end="2025-06-01")


Prices shape: (2618, 11)


[*********************100%***********************]  2 of 2 completed
  fut_prices = yf.download(["CL=F","HG=F"], start="2018-01-01", end="2023-12-31")
[*********************100%***********************]  2 of 2 completed



Yield sample:
            DGS5_proxy  DGS10_proxy  Spread
Date                                       
2025-05-23      0.4078       0.4509  0.0431
2025-05-27      0.4023       0.4434  0.0411
2025-05-28      0.4065       0.4477  0.0412
2025-05-29      0.3998       0.4424  0.0426
2025-05-30      0.3979       0.4416  0.0437


  commodity_pressure["Crude_60dRet"] = fut_prices["CL=F"].pct_change(60)
  commodity_pressure["Copper_60dRet"] = fut_prices["HG=F"].pct_change(60)
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()



===== Engineered Features Sample =====
            SPY_Return  GrowthMinusDef   XLK_SPY   XLU_SPY   XLF_SPY  Breadth  \
Date                                                                            
2025-05-23   -0.006826       -0.009794  0.392743  0.138871  0.086340        5   
2025-05-27    0.020791        0.009698  0.393887  0.137066  0.086069        4   
2025-05-28   -0.005785        0.000464  0.394541  0.135940  0.085992        4   
2025-05-29    0.003947       -0.002215  0.393653  0.136347  0.086094        3   
2025-05-30   -0.001118       -0.010855  0.392869  0.137896  0.086377        4   

            Vol_Ratio_XLK_XLU   DGS10    DGS5  Spread  DefCyc_Ratio  \
Date                                                                  
2025-05-23           1.420650  0.4509  0.4078  0.0431      0.613411   
2025-05-27           1.474811  0.4434  0.4023  0.0411      0.610244   
2025-05-28           1.417354  0.4477  0.4065  0.0412      0.607026   
2025-05-29           1.420646  0.4424