In [17]:
import pandas as pd
import numpy as np
import os
import glob
import re
import plotly.express as px
import plotly.graph_objects as go
import plotly.subplots as sp

from lightgbm import LGBMRanker
from pathlib import Path   
from catboost import CatBoostRegressor, Pool

# Raise pandas' "display.max_columns" option to 100 so wide DataFrame previews
# in this notebook show up to 100 columns.
pd.set_option("display.max_columns", 100)

In [18]:
# Working directories, all relative to the notebook's current working directory.
DATA_DAILY = Path("data/daily")        # globbed below for hot_stocks_*.csv
HOT_SCORE_OUT = Path("data/hotscore")  # globbed below for hotscore_*.csv
OUTPUT_DIR = Path("output/trend")      # destination of the exported HTML charts

# Create each directory (and any missing parents) up front; an already
# existing directory is left as-is.
for directory in [DATA_DAILY, HOT_SCORE_OUT, OUTPUT_DIR]:
    directory.mkdir(parents=True, exist_ok=True)

In [19]:
def extract_timestamp(filename):
    """Parse the ``_YYYYmmddHHMMSS`` stamp embedded in a file name.

    The final path component is searched first, so a parent directory whose
    name happens to contain an underscore followed by 14 digits is not
    mistaken for the file's own stamp. If the final component carries no
    stamp, the full path is searched as a fallback.

    Parameters
    ----------
    filename : str or os.PathLike
        Path (or bare name) of a file such as ``hot_stocks_20251126204626.csv``.

    Returns
    -------
    pandas.Timestamp
        The stamp parsed with the format ``%Y%m%d%H%M%S``.

    Raises
    ------
    ValueError
        If no underscore-prefixed 14-digit run is present, or if the digits
        do not form a valid date/time.
    """
    stamp_pattern = r'_(\d{14})'
    path_text = os.fspath(filename)
    # Prefer the file's own name; only fall back to the whole path when needed.
    match = re.search(stamp_pattern, Path(path_text).name) or re.search(stamp_pattern, path_text)
    if match is None:
        raise ValueError(
            f"no '_YYYYmmddHHMMSS' timestamp found in file name: {path_text!r}"
        )
    return pd.to_datetime(match.group(1), format='%Y%m%d%H%M%S')

In [20]:
# Find every hot-stock snapshot CSV; sorted() makes the load order deterministic.
hot_pattern = str(DATA_DAILY / "hot_stocks_*.csv")
hot_files = sorted(glob.glob(hot_pattern))
if not hot_files:
    # Without this guard pd.concat([]) fails with an opaque
    # "No objects to concatenate" when the directory is empty.
    raise FileNotFoundError(f"No snapshot files match {hot_pattern!r}")

# Read each file and tag its rows with the timestamp parsed from the file name.
hot_dfs = [
    pd.read_csv(csv_path).assign(snapshot_time=extract_timestamp(csv_path))
    for csv_path in hot_files
]

# Stack all snapshots into one frame with a fresh 0..n-1 index.
hot_data = pd.concat(hot_dfs, ignore_index=True)

print(hot_data.shape)

(40172, 13)


In [21]:
# Find every hot-score CSV; sorted() makes the load order deterministic.
score_pattern = str(HOT_SCORE_OUT / "hotscore_*.csv")
score_files = sorted(glob.glob(score_pattern))
if not score_files:
    # Without this guard pd.concat([]) fails with an opaque
    # "No objects to concatenate" when the directory is empty.
    raise FileNotFoundError(f"No hot-score files match {score_pattern!r}")

# Read each file and tag its rows with the timestamp parsed from the file name.
score_dfs = [
    pd.read_csv(csv_path).assign(score_time=extract_timestamp(csv_path))
    for csv_path in score_files
]

# Stack all score files into one frame with a fresh 0..n-1 index.
score_data = pd.concat(score_dfs, ignore_index=True)

print(score_data.shape)

(12682868, 9)


In [15]:
# merge_asof requires both inputs to be ordered by their time key, so sort each
# frame by time (ties broken by symbol) and renumber the rows.
hot_data = hot_data.sort_values(by=['snapshot_time', 'symbol'], ignore_index=True)
score_data = score_data.sort_values(by=['score_time', 'symbol'], ignore_index=True)

# For every snapshot row, attach the same symbol's nearest score row looking
# forward in time. Column names present in both frames come back suffixed
# _x (snapshot side) and _y (score side).
# NOTE(review): in the displayed sample rows the matched score_time is only
# ~12 s after snapshot_time — confirm this is the intended "future" horizon.
merged = pd.merge_asof(
    left=hot_data,
    right=score_data,
    left_on='snapshot_time',
    right_on='score_time',
    by='symbol',
    direction='forward',
)

print(merged.shape)

(40172, 21)


In [22]:
# Columns removed in step 2: the snapshot-time HotScore, the score-side (_y)
# duplicates of the snapshot features, and 'date'. Any that are absent are skipped.
drop_cols = [
    'HotScore_x',
    'regularMarketPrice_y',
    'regularMarketChangePercent_y',
    'VolumeSpike_y',
    'averageDailyVolume3Month_y',
    'marketCap_y',
    'date',
]

merged = (
    merged
    # 1. Rename snapshot feature columns (_x → clean names); the score-side
    #    HotScore becomes the label column HotScore_future.
    .rename(columns={
        'regularMarketPrice_x': 'regularMarketPrice',
        'regularMarketChangePercent_x': 'regularMarketChangePercent',
        'VolumeSpike_x': 'VolumeSpike',
        'averageDailyVolume3Month_x': 'averageDailyVolume3Month',
        'marketCap_x': 'marketCap',
        'HotScore_y': 'HotScore_future',
    })
    # 2. Drop leaky / duplicated columns.
    .drop(columns=drop_cols, errors='ignore')
    # 3. Drop rows without a future label.
    .dropna(subset=['HotScore_future'])
)
merged.head(4)

Unnamed: 0,symbol,regularMarketPrice,regularMarketChangePercent,regularMarketVolume,averageDailyVolume3Month,marketCap,VolumeSpike,MomentumScore,VolumeScore,VolatilityScore,TrendScore,snapshot_time,HotScore_future,score_time
0,AA,41.845,6.74745,6326454.0,6727448.0,10836350000.0,0.940394,0.903646,0.802083,0.726562,0.520833,2025-11-26 20:46:26,0.794401,2025-11-26 20:46:38
1,AAUC,19.18,9.788214,360438.0,342331.0,2378320000.0,1.052893,0.96875,0.848958,0.622396,0.854167,2025-11-26 20:46:26,0.846094,2025-11-26 20:46:38
2,ALAB,156.16,7.860205,4000639.0,6266829.0,26375990000.0,0.638383,0.942708,0.518229,0.947917,0.723958,2025-11-26 20:46:26,0.773307,2025-11-26 20:46:38
3,ANF,96.205,6.610155,3899150.0,2070173.0,4583525000.0,1.88349,0.898438,0.958333,0.864583,0.828125,2025-11-26 20:46:26,0.905599,2025-11-26 20:46:38


In [23]:
# Model inputs, all taken from the snapshot side of the merge.
features = [
    "regularMarketPrice",
    "regularMarketChangePercent",
    "regularMarketVolume",
    "averageDailyVolume3Month",
    "marketCap",
    "VolumeSpike",
    "MomentumScore",
    "VolumeScore",
    "VolatilityScore",
    "TrendScore",
]

# Build the design matrix: treat +/-inf as missing, then fill every missing
# value with 0.
feature_frame = merged.loc[:, features]
finite_only = feature_frame.replace([np.inf, -np.inf], np.nan)
X = finite_only.fillna(0)

# Regression target: the forward-matched hot score.
y = merged["HotScore_future"]

# %%
# --- Train CatBoost Regressor ---
# Hyper-parameters gathered in one mapping so they are easy to scan and tweak.
catboost_params = {
    "iterations": 500,
    "learning_rate": 0.05,
    "depth": 6,
    "eval_metric": 'RMSE',
    "random_seed": 42,   # fixed seed
    "verbose": 50,       # log a progress line every 50 iterations
}
cat_model = CatBoostRegressor(**catboost_params)

# No eval_set is passed, so the logged "learn" RMSE is measured on the
# training rows themselves.
cat_model.fit(X, y)

0:	learn: 0.1513220	total: 163ms	remaining: 1m 21s
50:	learn: 0.1135353	total: 494ms	remaining: 4.35s
100:	learn: 0.1037218	total: 764ms	remaining: 3.02s
150:	learn: 0.0959973	total: 1.01s	remaining: 2.34s
200:	learn: 0.0898411	total: 1.28s	remaining: 1.91s
250:	learn: 0.0843120	total: 1.53s	remaining: 1.52s
300:	learn: 0.0799951	total: 1.79s	remaining: 1.19s
350:	learn: 0.0762538	total: 2.06s	remaining: 873ms
400:	learn: 0.0727134	total: 2.41s	remaining: 595ms
450:	learn: 0.0697064	total: 2.93s	remaining: 318ms
499:	learn: 0.0671511	total: 3.5s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x1d2b24b19a0>

In [24]:
# Pair each importance value from the fitted model with its feature name and
# rank them from most to least important.
importance_values = cat_model.get_feature_importance()
feature_importance = pd.Series(importance_values, index=features).sort_values(ascending=False)
feature_importance

averageDailyVolume3Month      34.269794
regularMarketPrice            23.370489
marketCap                     19.933917
VolatilityScore                4.659182
VolumeScore                    4.317265
regularMarketChangePercent     3.902355
VolumeSpike                    3.193683
TrendScore                     2.815228
MomentumScore                  2.172333
regularMarketVolume            1.365753
dtype: float64

In [None]:
# Pie chart with one slice per feature, sized by its importance value.
fig_pie = (
    px.pie(
        names=feature_importance.index,
        values=feature_importance.values,
        title="Feature Importance Contribution (Normalized)",
    )
    # Show label + percentage on every slice and pull each slice out by 0.05.
    .update_traces(textinfo='percent+label', pull=[0.05] * len(feature_importance))
    .update_layout(template="plotly_dark")
)

# Export as HTML; include_plotlyjs='cdn' is passed so plotly.js is loaded
# from the CDN.
chart_path = os.path.join(OUTPUT_DIR, "catboost_feature_importance_pie.html")
fig_pie.write_html(chart_path, include_plotlyjs='cdn')

In [None]:
# Line + marker plot of the feature importances, in the order stored in
# feature_importance.
fig_line = go.Figure()
fig_line.add_trace(go.Scatter(
    x=feature_importance.index,
    y=feature_importance.values,
    mode='lines+markers',
    line=dict(width=2, color='lime'),
    marker=dict(size=8, color='cyan')
))
fig_line.update_layout(
    title="Feature Importance Trend",
    xaxis_title="Feature",
    yaxis_title="Importance",
    template="plotly_dark"
)

chart_path = os.path.join(OUTPUT_DIR, "catboost_feature_importance_line.html")
# Save the line figure itself. The earlier code called fig_pie.write_html here,
# which wrote the pie chart into the line-chart file and never exported fig_line.
fig_line.write_html(chart_path, include_plotlyjs='cdn')