In [None]:
import pandas as pd
import numpy as np
import os
import glob
import re
import plotly.express as px
import plotly.graph_objects as go
import plotly.subplots as sp

from lightgbm import LGBMRanker
from pathlib import Path

# Let wide frames show up to 100 columns before pandas elides the middle ones.
pd.options.display.max_columns = 100

In [2]:
# Project-relative working folders.
DATA_DAILY = Path("data/daily")        # globbed below for hot_stocks_*.csv snapshots
HOT_SCORE_OUT = Path("data/hotscore")  # globbed below for hotscore_*.csv files
OUTPUT_DIR = Path("output/ranking")    # destination of the exported HTML charts

# Create any missing folder (including parents); existing ones are left alone.
for directory in [DATA_DAILY, HOT_SCORE_OUT, OUTPUT_DIR]:
    os.makedirs(directory, exist_ok=True)

In [6]:
def extract_timestamp(filename):
    """Parse the ``_YYYYMMDDHHMMSS`` stamp embedded in a file name.

    The stamp is looked up in the final path component first, so a directory
    whose name happens to contain ``_<14 digits>`` cannot shadow the file's
    own timestamp. If the file name carries no stamp, the full path is
    searched as a fallback.

    Parameters
    ----------
    filename : str or os.PathLike
        File name or path containing an underscore followed by 14 digits.

    Returns
    -------
    pandas.Timestamp
        The parsed timestamp.

    Raises
    ------
    ValueError
        If no ``_<14 digits>`` stamp is present, or the digits do not form a
        valid ``%Y%m%d%H%M%S`` date.
    """
    path_text = os.fspath(filename)
    stamp_pattern = r'_(\d{14})'

    match = re.search(stamp_pattern, os.path.basename(path_text))
    if match is None:
        # Fall back to the whole path for stamps stored in a parent folder name.
        match = re.search(stamp_pattern, path_text)
    if match is None:
        raise ValueError(
            f"No '_YYYYMMDDHHMMSS' timestamp found in file name: {path_text!r}"
        )

    return pd.to_datetime(match.group(1), format='%Y%m%d%H%M%S')

In [9]:
# Every hot-stock snapshot CSV in DATA_DAILY; sorting the paths keeps the
# load order deterministic.
hot_files = sorted(glob.glob(str(DATA_DAILY / "hot_stocks_*.csv")))
if not hot_files:
    # Without this guard pd.concat([]) fails with the much less helpful
    # "No objects to concatenate".
    raise FileNotFoundError(f"No hot_stocks_*.csv files found in {DATA_DAILY}")

# Tag each snapshot with the timestamp encoded in its file name.
hot_dfs = [
    pd.read_csv(csv_path).assign(snapshot_time=extract_timestamp(csv_path))
    for csv_path in hot_files
]

hot_data = pd.concat(hot_dfs, ignore_index=True)

print(hot_data.shape)

(40172, 13)


In [10]:
# Every hotscore CSV in HOT_SCORE_OUT; sorting the paths keeps the load
# order deterministic.
score_files = sorted(glob.glob(str(HOT_SCORE_OUT / "hotscore_*.csv")))
if not score_files:
    # Without this guard pd.concat([]) fails with the much less helpful
    # "No objects to concatenate".
    raise FileNotFoundError(f"No hotscore_*.csv files found in {HOT_SCORE_OUT}")

# Tag each score file with the timestamp encoded in its file name.
score_dfs = [
    pd.read_csv(csv_path).assign(score_time=extract_timestamp(csv_path))
    for csv_path in score_files
]

score_data = pd.concat(score_dfs, ignore_index=True)

print(score_data.shape)

(12682868, 9)


In [11]:
# merge_asof needs each frame ordered by its time key; the secondary sort on
# symbol only makes the row order deterministic.
hot_data = hot_data.sort_values(by=['snapshot_time', 'symbol'], ignore_index=True)
score_data = score_data.sort_values(by=['score_time', 'symbol'], ignore_index=True)

# For each snapshot row, attach the first score row of the same symbol whose
# score_time is at or after snapshot_time (direction='forward'). Column names
# present in both frames receive pandas' default "_x" (left) / "_y" (right)
# suffixes.
# NOTE(review): no `tolerance` is passed, so a snapshot can be paired with a
# score row arbitrarily far in the future — confirm this is intended.
merged = pd.merge_asof(
    left=hot_data,
    right=score_data,
    left_on='snapshot_time',
    right_on='score_time',
    by='symbol',
    direction='forward',
)

merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40172 entries, 0 to 40171
Data columns (total 21 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   symbol                        40172 non-null  object        
 1   regularMarketPrice_x          39973 non-null  float64       
 2   regularMarketChangePercent_x  39973 non-null  float64       
 3   regularMarketVolume           39973 non-null  float64       
 4   averageDailyVolume3Month_x    39973 non-null  float64       
 5   marketCap_x                   39973 non-null  float64       
 6   VolumeSpike_x                 39973 non-null  float64       
 7   MomentumScore                 39973 non-null  float64       
 8   VolumeScore                   39973 non-null  float64       
 9   VolatilityScore               39973 non-null  float64       
 10  TrendScore                    39973 non-null  float64       
 11  HotScore_x                  

In [13]:
# Step 1 — strip the "_x" suffix from the snapshot-side columns and give the
# score-side HotScore an explicit "future label" name.
column_renames = {
    'regularMarketPrice_x': 'regularMarketPrice',
    'regularMarketChangePercent_x': 'regularMarketChangePercent',
    'VolumeSpike_x': 'VolumeSpike',
    'averageDailyVolume3Month_x': 'averageDailyVolume3Month',
    'marketCap_x': 'marketCap',
    'HotScore_y': 'HotScore_future',
}
merged = merged.rename(columns=column_renames)

# Step 2 — remove the snapshot-time HotScore and the score-side duplicates
# (plus 'date'); names that are not present in the frame are simply skipped.
drop_cols = [
    'HotScore_x',
    'regularMarketPrice_y',
    'regularMarketChangePercent_y',
    'VolumeSpike_y',
    'averageDailyVolume3Month_y',
    'marketCap_y',
    'date',
]
merged = merged.drop(columns=drop_cols, errors='ignore')

# Step 3 — keep only the rows that received a future label from the merge.
has_future_label = merged['HotScore_future'].notna()
merged = merged.loc[has_future_label]
merged.head(4)

Unnamed: 0,symbol,regularMarketPrice,regularMarketChangePercent,regularMarketVolume,averageDailyVolume3Month,marketCap,VolumeSpike,MomentumScore,VolumeScore,VolatilityScore,TrendScore,snapshot_time,HotScore_future,score_time
0,AA,41.845,6.74745,6326454.0,6727448.0,10836350000.0,0.940394,0.903646,0.802083,0.726562,0.520833,2025-11-26 20:46:26,0.794401,2025-11-26 20:46:38
1,AAUC,19.18,9.788214,360438.0,342331.0,2378320000.0,1.052893,0.96875,0.848958,0.622396,0.854167,2025-11-26 20:46:26,0.846094,2025-11-26 20:46:38
2,ALAB,156.16,7.860205,4000639.0,6266829.0,26375990000.0,0.638383,0.942708,0.518229,0.947917,0.723958,2025-11-26 20:46:26,0.773307,2025-11-26 20:46:38
3,ANF,96.205,6.610155,3899150.0,2070173.0,4583525000.0,1.88349,0.898438,0.958333,0.864583,0.828125,2025-11-26 20:46:26,0.905599,2025-11-26 20:46:38


In [19]:
def _quantile_bucket(scores):
    """Bucket one snapshot's HotScore_future values into integer quantile bins.

    Requests 5 bins (codes 0-4). With duplicates="drop", repeated bin edges
    are merged, so a snapshot with many tied scores may yield fewer levels.
    """
    return pd.qcut(scores, q=5, labels=False, duplicates="drop")


# Work on an independent copy so the column assignment below targets a real
# frame rather than a slice of an earlier one.
merged = merged.copy()

# Relevance is computed within each snapshot, not across the whole data set.
future_score_by_snapshot = merged.groupby("snapshot_time")["HotScore_future"]
merged["relevance"] = future_score_by_snapshot.transform(_quantile_bucket)

# NOTE(review): the int cast presumably assumes every row received a bin code
# (it would fail on NaN) — confirm for very small snapshots.
merged["relevance"] = merged["relevance"].astype(int)
merged.info()

<class 'pandas.core.frame.DataFrame'>
Index: 39951 entries, 0 to 40171
Data columns (total 15 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   symbol                      39951 non-null  object        
 1   regularMarketPrice          39902 non-null  float64       
 2   regularMarketChangePercent  39902 non-null  float64       
 3   regularMarketVolume         39902 non-null  float64       
 4   averageDailyVolume3Month    39902 non-null  float64       
 5   marketCap                   39902 non-null  float64       
 6   VolumeSpike                 39902 non-null  float64       
 7   MomentumScore               39902 non-null  float64       
 8   VolumeScore                 39902 non-null  float64       
 9   VolatilityScore             39902 non-null  float64       
 10  TrendScore                  39902 non-null  float64       
 11  snapshot_time               39951 non-null  datetime64[ns]


In [20]:
# Snapshot-time columns fed to the ranker.
features = [
    "regularMarketPrice",
    "regularMarketChangePercent",
    "regularMarketVolume",
    "averageDailyVolume3Month",
    "marketCap",
    "VolumeSpike",
    "MomentumScore",
    "VolumeScore",
    "VolatilityScore",
    "TrendScore",
]

# Treat +/-inf as missing, then replace every missing value with 0.
feature_frame = merged[features]
finite_or_nan = feature_frame.replace([np.inf, -np.inf], np.nan)
X = finite_or_nan.fillna(0)

# Target: the per-snapshot relevance level.
y = merged["relevance"]
print(X.shape, y.shape)


(39951, 10) (39951,)


In [21]:
# The ranker consumes `group` as consecutive block sizes over the rows of X,
# so the sizes below are only meaningful if the rows of `merged` are ordered
# by snapshot_time (groupby returns the sizes in sorted-key order).
assert merged["snapshot_time"].is_monotonic_increasing, (
    "merged must be sorted by snapshot_time before building ranking groups"
)

# Number of rows (query size) per snapshot, in chronological order.
group = merged.groupby("snapshot_time").size().values

# Every row must belong to exactly one group.
assert group.sum() == len(merged), "group sizes do not cover all rows of merged"

print("Groups:", len(group))
print("Example group sizes:", group[:5])


Groups: 805
Example group sizes: [50 50 50 50 50]


In [22]:
# LambdaRank model: each snapshot is one query, `y` holds the relevance level.
# LGBMRanker is imported in the notebook's top import cell.
rank_model = LGBMRanker(
    objective="lambdarank",
    n_estimators=300,
    learning_rate=0.05,
    num_leaves=31,
    subsample=0.8,
    # Row subsampling is only applied when subsample_freq > 0; with the
    # default of 0 the subsample=0.8 setting above is silently ignored.
    subsample_freq=1,
    colsample_bytree=0.8,
    random_state=42,
)

# `group` gives the size of each consecutive query block in X / y.
rank_model.fit(
    X,
    y,
    group=group,
)

print("✅ LightGBM Ranker trained successfully")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002655 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2550
[LightGBM] [Info] Number of data points in the train set: 39951, number of used features: 10
✅ LightGBM Ranker trained successfully


In [37]:
# Score every row with the fitted ranker.
# NOTE(review): X is the same matrix the model was fitted on, so these are
# in-sample scores.
merged["rank_score"] = rank_model.predict(X)

# Within each snapshot, order rows from highest to lowest rank_score.
ranked = merged.sort_values(
    ["snapshot_time", "rank_score"],
    ascending=[True, False],
)

# Only the last expression of a cell is rendered, so print the shape and let
# the preview be the cell's output (it was previously discarded).
print(ranked.shape)
ranked[["snapshot_time", "symbol", "rank_score"]].head(10)

(39951, 16)

In [33]:
# plotly.express (px) is imported in the notebook's top import cell.

# Most recent snapshot present in the ranked frame.
latest_snapshot = ranked["snapshot_time"].max()

# Ascending sort + tail(20) keeps the 20 highest scores, with the largest
# last, so the best symbol is drawn at the top of the horizontal bar chart.
df_latest = (
    ranked[ranked["snapshot_time"] == latest_snapshot]
    .sort_values("rank_score", ascending=True)
    .tail(20)
)

fig = px.bar(
    df_latest,
    x="rank_score",
    y="symbol",
    orientation="h",
    color="rank_score",
    color_continuous_scale="inferno",
    title=f"AI Ranking — Top 20 Hot Stocks ({latest_snapshot})",
)

fig.update_layout(
    template="plotly_dark",
    height=800,
    yaxis=dict(title="", automargin=True),
)

# Export as a standalone HTML page that loads plotly.js from the CDN.
chart_path = str(OUTPUT_DIR / "ai_ranking_top20.html")
fig.write_html(chart_path, include_plotlyjs='cdn')


In [34]:
# Per-feature importance reported by the fitted ranker, labelled with the
# feature names and sorted ascending so the largest bar is drawn at the top.
importance = pd.Series(
    rank_model.feature_importances_,
    index=features,
).sort_values(ascending=True)

fig = px.bar(
    importance,
    orientation="h",
    color=importance.values,
    color_continuous_scale="plasma",
    title="AI Ranking — Feature Importance",
)

fig.update_layout(
    template="plotly_dark",
    height=700,
)

# Export as a standalone HTML page that loads plotly.js from the CDN.
chart_path = str(OUTPUT_DIR / "ai_ranking_feature_importance-6.0.html")
fig.write_html(chart_path, include_plotlyjs='cdn')
