In [41]:
import yfinance as yf
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score

import pandas_ta
from statsmodels.regression.rolling import RollingOLS
import statsmodels.api as sm

# 1) DATA INGESTION & BASE FEATURES
def fetch_monthly_spy(start, end):
    df = yf.download("SPY", start=start, end=end, progress=False)
    df = df.resample("ME").last()
    return df


# 2) TECHNICAL INDICATORS (no leakage)
def add_technical_features(df):
    df = df.copy()
    df["rsi"] = pandas_ta.rsi(df["Close"], length=20)
    df["atr"] = pandas_ta.atr(high=df["Close"], low=df["Close"], close=df["Close"], length=14)
    df["atr_z"] = (df["atr"] - df["atr"].rolling(100).mean())/df["atr"].rolling(100).std()
    df["macd"] = pandas_ta.macd(df["Close"], length=20).iloc[:,0]
    return df.dropna()

# 3) FACTOR BETAS (rolling, causal)
def add_rolling_betas(df, factors):
    # factors: DataFrame indexed monthly matching df.index
    merged = df.join(factors, how="inner")
    endog = merged["close"].pct_change().shift(-1)  # next-period return
    exog  = sm.add_constant(merged.drop("close", axis=1))
    rols = RollingOLS(endog=endog, exog=exog, window=24, min_nobs=exog.shape[1]+1)
    betas = rols.fit(params_only=True).params.drop("const", axis=1)
    betas = betas.shift(0).dropna()
    return df.join(betas, how="inner")

# 4) CLUSTERING (scaled, unsupervised)
def assign_clusters(df, n_clusters=4):
    features = df[["rsi","atr_z","macd"]]
    scaler   = StandardScaler()
    X_scaled = scaler.fit_transform(features)
    km       = KMeans(n_clusters=n_clusters, random_state=0)
    df["cluster"] = km.fit_predict(X_scaled)
    return df, scaler, km

# 5) LABEL GENERATION (buy if next‐month return > 0)
def generate_labels(df):
    df = df.copy()
    df["return_1m"] = df["close"].pct_change().shift(-1)
    df["signal"] = (df["return_1m"] > 0).astype(int)
    return df.dropna()

# 6) SUPERVISED MODEL (cluster + tech features → buy signal)
def train_classifier(df):
    X = df[["rsi","atr_z","macd","cluster"]]
    y = df["signal"]
    tscv = TimeSeriesSplit(n_splits=5)
    clf  = RandomForestClassifier(n_estimators=100, random_state=0)
    scores = cross_val_score(clf, X, y, cv=tscv, scoring="accuracy")
    print("CV Accuracy:", scores.mean())
    clf.fit(X, y)
    return clf

# 7) PUTTING IT ALL TOGETHER
start, end = "2017-01-01", "2025-05-01"
spy        = fetch_monthly_spy(start, end)
tech       = add_technical_features(spy)
# assume factor_df is your Fama-French 5 factors monthly, pre‐fetched
# factor_df = ...

df1        = add_rolling_betas(tech, factor_df)
df2        = generate_labels(df1)
df3, scaler, km = assign_clusters(df2)
clf        = train_classifier(df3)

# 8) USAGE: predict next signal
latest = df3.iloc[[-1]][["rsi","atr_z","macd","cluster"]]
signal = clf.predict(latest)[0]
print("Next‐month buy signal?" , "BUY" if signal else "NO BUY")


AttributeError: 'NoneType' object has no attribute 'shift'