In [2]:
import pandas as pd
import numpy as np
import os
import glob
import re
import plotly.express as px
import plotly.graph_objects as go
import plotly.subplots as sp

from lightgbm import LGBMRanker
from pathlib import Path

# Let pandas show up to 100 columns before it starts truncating wide frames.
pd.set_option("display.max_columns", 100)

In [3]:
# Directories used by this notebook, given relative to the working directory.
DATA_DAILY = Path("data/daily")
DATA_HOT_SCORE = Path("data/hotscore")
OUTPUT_DIR = Path("output/ranking")

# Create every directory (and any missing parents); existing ones are left as-is.
for folder in [DATA_DAILY, DATA_HOT_SCORE, OUTPUT_DIR]:
    folder.mkdir(parents=True, exist_ok=True)

In [4]:
def latest_file_in_directory(directory=DATA_HOT_SCORE):
    """Return the name of the latest ``hotscore_*.csv`` file in ``directory``.

    "Latest" is the lexicographically greatest file name, so the result is only
    chronologically correct if the part after ``hotscore_`` sorts by time
    (e.g. a zero-padded timestamp). NOTE(review): confirm against the naming
    scheme used by whatever writes these files.

    Parameters
    ----------
    directory : str or os.PathLike
        Folder to scan. Only its direct entries are considered.

    Returns
    -------
    str
        The bare file name (not a full path) of the selected file.

    Raises
    ------
    FileNotFoundError
        If ``directory`` contains no ``hotscore_*.csv`` entry. This replaces the
        opaque "empty sequence" ``ValueError`` that ``max()`` would raise.
    """
    candidates = [
        name
        for name in os.listdir(directory)
        if name.startswith("hotscore_") and name.endswith(".csv")
    ]
    if not candidates:
        raise FileNotFoundError(
            f"No 'hotscore_*.csv' file found in {os.fspath(directory)!r}"
        )
    return max(candidates)

In [5]:
# Load the file selected by latest_file_in_directory() into a DataFrame.
latest_file = latest_file_in_directory(DATA_HOT_SCORE)
score_data = pd.read_csv(DATA_HOT_SCORE / latest_file)

# Preview the first four rows.
# NOTE(review): the preview contains an "Unnamed: 0" column, which looks like a
# saved index — confirm where it is meant to be dropped.
display(score_data.head(4))

Unnamed: 0,symbol,date,HotScore,TrendScore,regularMarketPrice,regularMarketChangePercent,VolumeSpike,averageDailyVolume3Month,MomentumScore,VolumeScore,VolatilityScore,marketCap
0,AA,2025-11-26 20:46:26,0.794401,0.520833,41.845,6.74745,0.940394,6727448.0,0.903646,0.802083,0.726562,10836350000.0
1,AAUC,2025-11-26 20:46:26,0.846094,0.854167,19.18,9.788214,1.052893,342331.0,0.96875,0.848958,0.622396,2378320000.0
2,ALAB,2025-11-26 20:46:26,0.773307,0.723958,156.16,7.860205,0.638383,6266829.0,0.942708,0.518229,0.947917,26375990000.0
3,ANF,2025-11-26 20:46:26,0.905599,0.828125,96.205,6.610155,1.88349,2070173.0,0.898438,0.958333,0.864583,4583525000.0


In [7]:
# Within each `date` group, cut HotScore into up to five quantile bins and use
# the integer bin index as the relevance label. duplicates="drop" merges bins
# whose edges coincide, so a group may end up with fewer than five levels.
# .assign returns a new frame, so the previously loaded frame is not modified.
score_data = score_data.assign(
    relevance=score_data.groupby("date")["HotScore"].transform(
        lambda hot: pd.qcut(hot, q=5, labels=False, duplicates="drop")
    )
)

score_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45622 entries, 0 to 45621
Data columns (total 13 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   symbol                      45622 non-null  object 
 1   date                        45622 non-null  object 
 2   HotScore                    45423 non-null  float64
 3   TrendScore                  45423 non-null  float64
 4   regularMarketPrice          45423 non-null  float64
 5   regularMarketChangePercent  45423 non-null  float64
 6   VolumeSpike                 45423 non-null  float64
 7   averageDailyVolume3Month    45423 non-null  float64
 8   MomentumScore               45423 non-null  float64
 9   VolumeScore                 45423 non-null  float64
 10  VolatilityScore             45423 non-null  float64
 11  marketCap                   45423 non-null  float64
 12  relevance                   45423 non-null  float64
dtypes: float64(11), object(2)
memor

In [9]:
# Columns fed to the ranker. HotScore is not among them; the relevance label
# is derived from it.
features = [
    "regularMarketPrice",
    "regularMarketChangePercent",
    "averageDailyVolume3Month",
    "marketCap",
    "VolumeSpike",
    "MomentumScore",
    "VolumeScore",
    "VolatilityScore",
    "TrendScore",
]

# Turn +/-inf into NaN, then every NaN into 0, so the matrix is fully finite.
X = score_data[features].replace([np.inf, -np.inf], np.nan).fillna(0)

# `relevance` is NaN wherever HotScore is missing, but lambdarank expects
# non-negative integer labels. Treat unlabelled rows as the lowest level (0)
# instead of passing NaN to the ranker; rows are kept so X, y and the
# per-date group sizes stay aligned.
y = score_data["relevance"].fillna(0).astype(int)
print(X.shape, y.shape)


(45622, 9) (45622,)


In [10]:
# LightGBM reads `group` as the sizes of consecutive row blocks of X: the first
# group[0] rows form query 1, the next group[1] rows query 2, and so on.
# groupby("date").size() lists sizes in sorted-date order, which only matches
# the row layout when the rows themselves are sorted by date — so verify that
# instead of silently training on mis-assigned queries.
if not score_data["date"].is_monotonic_increasing:
    raise ValueError(
        "score_data must be sorted by 'date' so that per-date group sizes "
        "line up with the row order of X; sort it before building X, y and group."
    )

group = score_data.groupby("date").size().to_numpy()

print("Groups:", len(group))
print("Example group sizes:", group[:5])


Groups: 914
Example group sizes: [50 50 50 50 50]


In [11]:
# LGBMRanker is already imported in the notebook's import cell.
rank_model = LGBMRanker(
    objective="lambdarank",
    n_estimators=300,
    learning_rate=0.05,
    num_leaves=31,
    subsample=0.8,
    # Row subsampling is only active when subsample_freq > 0; with the default
    # of 0 the subsample=0.8 setting above would be ignored.
    subsample_freq=1,
    colsample_bytree=0.8,
    random_state=42,
)

# `group` gives the number of consecutive rows of X belonging to each query.
rank_model.fit(X, y, group=group)

print("✅ LightGBM Ranker trained successfully")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001350 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2295
[LightGBM] [Info] Number of data points in the train set: 45622, number of used features: 9
✅ LightGBM Ranker trained successfully


In [12]:
# predict() returns one score per row of X; assigning the array stores the
# scores positionally alongside the rows they were computed from.
score_data["rank_score"] = rank_model.predict(X)

# Order by date, and within each date from highest to lowest model score.
ranked = score_data.sort_values(
    ["date", "rank_score"],
    ascending=[True, False],
)

# Only the last expression of a cell is rendered automatically, so the preview
# has to be displayed explicitly or it is silently discarded.
display(ranked[["date", "symbol", "rank_score"]].head(10))
ranked.shape

(45622, 14)

In [13]:
# The snapshot to plot is the greatest value found in the `date` column.
latest_snapshot = ranked["date"].max()

# Rows of that snapshot, sorted ascending by rank_score; the last 20 rows are
# therefore the 20 highest-scoring ones.
df_latest = (
    ranked.loc[ranked["date"] == latest_snapshot]
    .sort_values("rank_score")
    .tail(20)
)

# Horizontal bar chart: one bar per symbol, length and colour both driven by rank_score.
fig = px.bar(
    df_latest,
    x="rank_score",
    y="symbol",
    orientation="h",
    color="rank_score",
    color_continuous_scale="inferno",
    title=f"AI Ranking — Top 20 Hot Stocks ({latest_snapshot})",
)
fig.update_layout(
    template="plotly_dark",
    height=800,
    yaxis={"title": "", "automargin": True},
)

# Save as a standalone HTML page that loads plotly.js from the CDN.
chart_path = os.path.join(OUTPUT_DIR, "ai_ranking_top20.html")
fig.write_html(chart_path, include_plotlyjs="cdn")


In [14]:
# Importance values reported by the fitted ranker, labelled by feature name
# and ordered from smallest to largest.
importance = pd.Series(rank_model.feature_importances_, index=features).sort_values()

# Horizontal bar chart: one bar per feature, coloured by its importance value.
fig = px.bar(
    importance,
    orientation="h",
    color=importance.values,
    color_continuous_scale="plasma",
    title="AI Ranking — Feature Importance",
)
fig.update_layout(template="plotly_dark", height=700)

# Save as a standalone HTML page that loads plotly.js from the CDN.
chart_path = os.path.join(OUTPUT_DIR, "ai_ranking_feature_importance-6.0.html")
fig.write_html(chart_path, include_plotlyjs="cdn")
