In [1]:
import pandas as pd
import numpy as np
import pickle
from datetime import datetime, timedelta
from utils import prints
import qlib
from qlib.data import D


In [2]:
# initialize qlib
qlib.init(provider_uri="C:/Users/harve/.qlib/qlib_data/us_data", region="us")

# ‚úÖ Load trained model
with open("trained_model_2.pkl", "rb") as f:
    obj = pickle.load(f)
model = obj["model"]
training_columns = obj["columns"]


[32032:MainThread](2025-12-28 18:05:48,728) INFO - qlib.Initialization - [config.py:452] - default_conf: client.
[32032:MainThread](2025-12-28 18:05:49,503) INFO - qlib.Initialization - [__init__.py:79] - qlib successfully initialized based on client settings.
[32032:MainThread](2025-12-28 18:05:49,504) INFO - qlib.Initialization - [__init__.py:81] - data_path={'__DEFAULT_FREQ': WindowsPath('C:/Users/harve/.qlib/qlib_data/us_data')}


In [3]:

# ‚úÖ Define date range and instruments
START_DATE = (datetime.today() - timedelta(days=30)).strftime("%Y-%m-%d") #START_DATE = "2025-01-01"
END_DATE = (datetime.today() + timedelta(days=1)).strftime("%Y-%m-%d")    #END_DATE = "2030-10-18"
prints(f"Using date range: {START_DATE} to {END_DATE}", "trade_list_ensemble_log.txt")
prints(f"Using training columns: {training_columns}", "trade_list_ensemble_log.txt")


Using date range: 2025-11-28 to 2025-12-29
Using training columns: ['$open', '$high', '$low', '$close', '$vol_5d', '$rank_vol_5d', '$volume_log']


In [8]:
instrument_path = r"C:/Users/harve/.qlib/qlib_data/us_data/instruments/all.txt"
with open(instrument_path, "r") as f:
    instrumentx = [line.strip().split("\t")[0] for line in f if line.strip()]
# ‚úÖ Load features and realized returns
raw_fields = ["$open", "$high", "$low", "$close", "$volume",
            "$ret_5d", "$vol_5d", "$rank_vol_5d", #"$rank_ret_5d",
            "$ret_10d", "$vol_10d", "$rank_ret_10d", "$rank_vol_10d",
            "$ret_20d", "$vol_20d", "$rank_ret_20d", "$rank_vol_20d", "$days_since_ipo"]

features = D.features(instruments=instrumentx, fields=raw_fields,
                    start_time=START_DATE, end_time=END_DATE)

In [9]:
features2 = features.copy()

In [17]:
features = features2.copy()

In [None]:
features["$volume_log"] = np.log1p(features["$volume"])
features.drop(columns=["$volume"], inplace=True)

diagnostic_cols = ["$ret_5d", "$days_since_ipo"]  # add more if needed
cols_to_keep = training_columns + [c for c in diagnostic_cols if c in features.columns]

features = features[cols_to_keep]

# Slice only training columns for prediction
X_for_model = features[training_columns]

labels = D.features(instruments=instrumentx,
                    fields=["$ret_5d", "$ret_10d", "$ret_20d"],
                    start_time=START_DATE, end_time=END_DATE)


# ‚úÖ Score features
# features = features.dropna(subset=training_columns)
features["score"] = model.predict(X_for_model)

df_combined = pd.concat([features, labels], axis=1)
df_combined = df_combined.dropna(subset=["score"])

In [21]:
df_combined.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 39 entries, ('AAPL', Timestamp('2025-11-28 00:00:00')) to ('WMT', Timestamp('2025-11-28 00:00:00'))
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   $open            39 non-null     float32
 1   $high            39 non-null     float32
 2   $low             39 non-null     float32
 3   $close           39 non-null     float32
 4   $vol_5d          39 non-null     float32
 5   $rank_vol_5d     39 non-null     float32
 6   $volume_log      39 non-null     float32
 7   $ret_5d          0 non-null      float32
 8   $days_since_ipo  39 non-null     float32
 9   score            39 non-null     float64
 10  $ret_5d          0 non-null      float32
 11  $ret_10d         0 non-null      float32
 12  $ret_20d         0 non-null      float32
dtypes: float32(12), float64(1)
memory usage: 3.8+ KB


In [7]:
labels

Unnamed: 0_level_0,Unnamed: 1_level_0,$ret_5d,$ret_10d,$ret_20d
instrument,datetime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AAPL,2025-10-28,0.002894,0.023234,0.029628
AAPL,2025-10-29,0.008473,0.021894,0.037140
AAPL,2025-10-30,-0.008211,0.004453,
AAPL,2025-10-31,-0.007989,0.007545,
AAPL,2025-11-03,0.005899,-0.001456,
...,...,...,...,...
WMT,2025-11-20,,,
WMT,2025-11-21,,,
WMT,2025-11-24,,,
WMT,2025-11-25,,,


In [None]:
# ‚úÖ Attribution function
def attribution(cohort, horizon="5"):
    col = f"$ret_{horizon}d"
    rr = cohort[col].dropna().values.mean()
    vol = cohort[col].dropna().values.std()
    rr_ratio = rr / vol if vol > 0 else 0
    return rr, vol, rr_ratio



In [None]:
# ‚úÖ Trade list loop
latest_date = features.index.get_level_values("datetime").max()
trade_dates = df_combined.index.get_level_values("datetime").unique()

if latest_date not in trade_dates:
    trade_dates = trade_dates.append(pd.Index([latest_date]))


In [None]:
# LOOP
for date in trade_dates.sort_values():

    if date not in df_combined.index.get_level_values("datetime"):
        continue

    df_day = df_combined.xs(date, level="datetime", drop_level=False)
    if df_day.empty or df_day.shape[0] < 4:
        continue

    df_day_sorted = df_day.sort_values("score", ascending=False)
    top = df_day_sorted.head(2)
    bottom = df_day_sorted.tail(2)

    prints(f"\nüìÖ {date.date()} ‚Äî Trade List (Ensemble Model)", "trade_list_ensemble_log.txt")
    for horizon in ["5", "10", "20"]:
        col = f"$ret_{horizon}d"
        col_data = df_day[col]
        if isinstance(col_data, pd.DataFrame):
            col_data = col_data.iloc[:, 0]  # Take the first column explicitly
        if col_data.isna().all():
            continue
        buy_rr, buy_vol, buy_rrr = attribution(top, horizon)
        sell_rr, _, _ = attribution(bottom, horizon)
        spread = buy_rr - sell_rr
        prints(f"  ‚è± {horizon}d ‚Üí Buy: {buy_rr:.2%}, Sell: {sell_rr:.2%}, Spread: {spread:.2%}, Vol: {buy_vol:.2%}, R/R: {buy_rrr:.2f}", "trade_list_ensemble_log.txt")

    #prints(f"  Buys: {top.index.get_level_values('instrument').tolist()} ‚Äî Scores: {top['score'].tolist()}", "trade_list_ensemble_log.txt")
    #prints(f"  Sells: {bottom.index.get_level_values('instrument').tolist()} ‚Äî Scores: {bottom['score'].tolist()}", "trade_list_ensemble_log.txt")
    def icon_score(score, is_buy=True):
        if is_buy and score >= 0.2:
            return f"{score:.4f} ‚úÖ"  # Strong buy
        elif is_buy and score >= 0.15:
            return f"{score:.4f} ‚úîÔ∏è"  # buy
        elif not is_buy and score <= -0.2:
            return f"{score:.4f} ‚ùå"  # Strong sell
        else:
            return score     # Neutral

    # Print buys with ‚úÖ for strong buy
    buy_instruments = top.index.get_level_values("instrument").tolist()
    buy_scores = [icon_score(s, is_buy=True) for s in top["score"].tolist()]
    prints(f"  Buys: {buy_instruments} ‚Äî Scores: {buy_scores}", "trade_list_ensemble_log.txt")

    # Print sells with ‚ùå for strong sell
    sell_instruments = bottom.index.get_level_values("instrument").tolist()
    sell_scores = [icon_score(s, is_buy=False) for s in bottom["score"].tolist()]
    prints(f"  Sells: {sell_instruments} ‚Äî Scores: {sell_scores}", "trade_list_ensemble_log.txt")


In [23]:
#diagnostic:
labels_renamed = labels.rename(columns={
    "$ret_5d": "ret_5d_label",
    "$ret_10d": "ret_10d_label",
    "$ret_20d": "ret_20d_label",
})

df_combined = pd.concat([features, labels_renamed], axis=1)
# Step 1: Reset index for easier slicing
df_valid = df_combined.reset_index()

# Step 2: Filter for dates with valid realized returns
df_valid = df_valid[df_valid["$ret_5d"].notna()]
# Keep forward returns for evaluation diagnostics

# Step 3: Rename columns for clarity (optional)
df_valid = df_valid.rename(columns={"score": "score", "$ret_5d": "label"})

# Step 4: Create score buckets
df_valid["bucket"] = pd.qcut(df_valid["score"], q=5, labels=False)

# Step 5: Attribution by bucket
bucket_returns = df_valid.groupby("bucket")["label"].mean()
prints("üìä Average 5d return per score bucket:", "trade_list_ensemble_log.txt")
prints(bucket_returns, "trade_list_ensemble_log.txt")
hit_rate = df_valid.groupby("bucket")["label"].apply(lambda x: (x > 0).mean())
prints("‚úÖ Hit rate per bucket:", "trade_list_ensemble_log.txt")
prints(hit_rate, "trade_list_ensemble_log.txt")


IPO_CUTOFF = 600  # ~1 year of trading days

df_valid["ipo_cohort"] = (df_valid["$days_since_ipo"] < IPO_CUTOFF).astype(int)

# Attribution by cohort
for cohort, name in [(0, "Core (>= 600 days)"), (1, "IPO (< 600 days)")]:
    df_c = df_valid[df_valid["ipo_cohort"] == cohort]
    prints(f"\nüìä {name} Attribution", "trade_list_ensemble_log.txt")
    prints(df_c.groupby("bucket")["label"].mean(), "trade_list_ensemble_log.txt")
    prints(df_c.groupby("bucket")["label"].apply(lambda x: (x > 0).mean()), "trade_list_ensemble_log.txt")


üìä Average 5d return per score bucket:
bucket
0   -0.022294
1   -0.008937
2   -0.016779
3   -0.015058
4   -0.019760
Name: label, dtype: float32
‚úÖ Hit rate per bucket:
bucket
0    0.361702
1    0.414286
2    0.357143
3    0.385714
4    0.361702
Name: label, dtype: float64

üìä Core (>= 600 days) Attribution
bucket
0   -0.019079
1   -0.008937
2   -0.016825
3   -0.014767
4   -0.020121
Name: label, dtype: float32
bucket
0    0.377778
1    0.414286
2    0.355072
3    0.388489
4    0.358779
Name: label, dtype: float64

üìä IPO (< 600 days) Attribution
bucket
0   -0.094621
2   -0.013645
3   -0.055447
4   -0.015044
Name: label, dtype: float32
bucket
0    0.0
2    0.5
3    0.0
4    0.4
Name: label, dtype: float64
