In [1]:
import pandas as pd
import numpy as np
import os
import glob
import re
import plotly.express as px
import plotly.graph_objects as go
import plotly.subplots as sp

from lightgbm import LGBMRanker
from pathlib import Path

# Raise pandas' display limit to 100 columns so the wide score tables rendered
# below are shown with all of their columns.
pd.set_option("display.max_columns", 100)

In [2]:
# Project folders, all relative to the notebook's working directory.
DATA_DAILY = Path("data/daily")
DATA_HOT_SCORE = Path("data/hotscore")
OUTPUT_DIR = Path("output/ranking")

# Make sure every folder (and its parents) exists before any cell reads from or
# writes to it; folders that already exist are left as they are.
for directory in [DATA_DAILY, DATA_HOT_SCORE, OUTPUT_DIR]:
    directory.mkdir(parents=True, exist_ok=True)

In [3]:
def latest_file_in_directory(directory=DATA_HOT_SCORE):
    """Return the name of the latest ``hotscore_*.csv`` file in ``directory``.

    "Latest" means the lexicographically greatest file name, not the most
    recently modified file. NOTE(review): this presumably relies on the file
    names embedding a sortable date/timestamp -- confirm against the exporter.

    Args:
        directory: Folder to scan. Defaults to ``DATA_HOT_SCORE``.

    Returns:
        The bare file name (not a full path) of the selected CSV.

    Raises:
        ValueError: If ``directory`` contains no ``hotscore_*.csv`` file.
    """
    candidates = [
        name for name in os.listdir(directory)
        if name.startswith("hotscore_") and name.endswith(".csv")
    ]
    if not candidates:
        # A bare max() over an empty sequence raises the opaque
        # "max() arg is an empty sequence"; say what is actually missing.
        raise ValueError(
            f"No 'hotscore_*.csv' file found in {os.fspath(directory)!r}"
        )
    return max(candidates)

In [4]:
# Pick the latest hot-score export and load it into a DataFrame.
latest_file = latest_file_in_directory(DATA_HOT_SCORE)
hotscore_csv_path = os.path.join(DATA_HOT_SCORE, latest_file)
score_data = pd.read_csv(hotscore_csv_path)

# Preview the first four rows as a quick check of the loaded columns.
display(score_data.head(n=4))

Unnamed: 0,symbol,HotScore,TrendScore,regularMarketPrice,regularMarketChangePercent,VolumeSpike,averageDailyVolume3Month,MomentumScore,VolumeScore,VolatilityScore,marketCap
0,AGI,0.425581,0.511628,43.46,3.010193,0.164952,3314950.0,0.313953,0.523256,0.406977,18245990000.0
1,MHK,0.43314,0.94186,138.105,3.488197,0.079435,1044685.0,0.395349,0.127907,0.77907,8580727000.0
2,LCID,0.446512,0.186047,10.4199,5.044347,0.146747,7922201.0,0.686047,0.430233,0.186047,3377803000.0
3,MTH,0.447093,0.906977,80.0,4.150947,0.062306,960236.0,0.55814,0.081395,0.662791,5632537000.0


In [13]:
# Summary statistics for every numeric column of the loaded scores.
# NOTE(review): the saved output of this cell includes a `relevance` column,
# which is only added by a cell further down -- the cell was executed out of
# order. On a fresh Restart & Run All the column will not appear here.
score_data.describe()

Unnamed: 0,HotScore,TrendScore,regularMarketPrice,regularMarketChangePercent,VolumeSpike,averageDailyVolume3Month,MomentumScore,VolumeScore,VolatilityScore,marketCap,relevance
count,650.0,650.0,650.0,650.0,650.0,650.0,650.0,650.0,650.0,650.0,650.0
mean,0.821184,0.730203,120.64127,9.090338,2.185287,3846497.0,0.852481,0.824646,0.805849,29963480000.0,2.0
std,0.088251,0.19635,115.769197,6.417463,3.05645,6296471.0,0.145528,0.138122,0.155414,52561040000.0,1.415303
min,0.425581,0.087838,8.3047,-20.961641,0.062306,176877.0,0.011628,0.081395,0.011628,2046591000.0,0.0
25%,0.785093,0.634825,42.31,5.748698,0.865634,976295.0,0.7838,0.758527,0.702808,4912676000.0,1.0
50%,0.824056,0.774579,81.495,7.34006,1.495169,1798745.0,0.895436,0.847583,0.842046,11882440000.0,2.0
75%,0.869341,0.872928,150.745,9.687537,2.442449,3827273.0,0.961647,0.930883,0.927854,26666920000.0,3.0
max,0.984375,1.0,957.87,72.287865,27.414272,37339140.0,1.0,1.0,1.0,446725900000.0,4.0


In [5]:
# Turn the continuous HotScore into integer relevance grades for the ranker by
# cutting it into 5 quantile bins (labels 0-4, where 4 = highest HotScore).
# pd.qcut already works on the whole Series, so it is called directly instead
# of being wrapped in Series.transform(lambda ...). `.assign` returns a new
# frame, which keeps this cell safe to re-run.
# duplicates="drop" merges bins whose edges coincide, so heavily tied scores
# can yield fewer than 5 distinct grades.
score_data = score_data.assign(
    relevance=pd.qcut(
        score_data["HotScore"],
        q=5,                 # 5 relevance levels (0–4)
        labels=False,
        duplicates="drop",
    )
)

score_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 650 entries, 0 to 649
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   symbol                      650 non-null    object 
 1   HotScore                    650 non-null    float64
 2   TrendScore                  650 non-null    float64
 3   regularMarketPrice          650 non-null    float64
 4   regularMarketChangePercent  650 non-null    float64
 5   VolumeSpike                 650 non-null    float64
 6   averageDailyVolume3Month    650 non-null    float64
 7   MomentumScore               650 non-null    float64
 8   VolumeScore                 650 non-null    float64
 9   VolatilityScore             650 non-null    float64
 10  marketCap                   650 non-null    float64
 11  relevance                   650 non-null    int64  
dtypes: float64(10), int64(1), object(1)
memory usage: 61.1+ KB


In [6]:
# Columns fed to the ranking model: raw market fields followed by the
# pre-computed score columns.
features = [
    "regularMarketPrice",
    "regularMarketChangePercent",
    "averageDailyVolume3Month",
    "marketCap",
    "VolumeSpike",
    "MomentumScore",
    "VolumeScore",
    "VolatilityScore",
    "TrendScore",
]

# Build the feature matrix: +/- infinity is first mapped to NaN, then every
# missing value is replaced by 0.
feature_frame = score_data[features]
X = feature_frame.replace([np.inf, -np.inf], np.nan).fillna(0)

# Target: the relevance column of the same rows.
y = score_data["relevance"]
print(X.shape, y.shape)


(650, 9) (650,)


In [14]:
# LGBMRanker is already imported in the notebook's first cell, so it is not
# re-imported here.

# LightGBM rankers need the size of each query group. All rows are placed in a
# single group, i.e. every row is ranked against every other row.
group = [len(X)]

rank_model = LGBMRanker(
    objective="lambdarank",
    n_estimators=300,
    learning_rate=0.05,
    num_leaves=31,
    subsample=0.8,
    # Row subsampling is only performed when subsample_freq > 0 (the default 0
    # disables it), so without this the subsample=0.8 above is silently ignored.
    subsample_freq=1,
    colsample_bytree=0.8,
    random_state=42,
)

# NOTE(review): the model is fit on the full table and later scored on the same
# rows, so there is no held-out evaluation of ranking quality.
rank_model.fit(
    X,
    y,
    group=group,
)

print("✅ LightGBM Ranker trained successfully")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000154 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1809
[LightGBM] [Info] Number of data points in the train set: 650, number of used features: 9
✅ LightGBM Ranker trained successfully


In [16]:
# Score every row with the trained ranker (these are the same rows it was fit
# on, so the scores are in-sample).
score_data["rank_score"] = rank_model.predict(X)

# Highest rank_score first.
ranked = score_data.sort_values("rank_score", ascending=False)

# Only a cell's last expression is rendered automatically; the top-10 preview
# was being silently discarded, so show it explicitly with display().
display(ranked[["symbol", "rank_score"]].head(10))
ranked.shape

(650, 13)

In [17]:
# plotly.express is already imported as `px` in the notebook's first cell.

# Number of best-ranked rows to chart. The title and file name advertise a
# "Top 20", so plot exactly that many rows instead of the whole table.
TOP_N = 20
top_ranked = ranked.head(TOP_N)

fig = px.bar(
    top_ranked,
    x="rank_score",
    y="symbol",
    orientation="h",
    color="rank_score",
    color_continuous_scale="inferno",
    title=f"AI Ranking — Top {TOP_N} Hot Stocks",
)

fig.update_layout(
    template="plotly_dark",
    height=800,
    # A categorical y-axis draws the first row at the bottom; reverse it so the
    # best-ranked symbol appears at the top of the chart.
    yaxis=dict(title="", automargin=True, autorange="reversed"),
)

# Save as a standalone HTML page that loads plotly.js from the CDN.
chart_path = os.path.join(OUTPUT_DIR, f"ai_ranking_top{TOP_N}.html")
fig.write_html(chart_path, include_plotlyjs="cdn")


In [18]:
# Importance value reported by the fitted ranker for each input feature,
# labelled with the feature name and ordered from smallest to largest.
importance = pd.Series(
    data=rank_model.feature_importances_,
    index=features,
).sort_values()

# Horizontal bar chart, one bar per feature, coloured by its importance value.
fig = px.bar(
    importance,
    orientation="h",
    color=importance.values,
    color_continuous_scale="plasma",
    title="AI Ranking — Feature Importance",
)
fig.update_layout(template="plotly_dark", height=700)

# Save as a standalone HTML page that loads plotly.js from the CDN.
chart_path = os.path.join(OUTPUT_DIR, "ai_ranking_feature_importance-6.0.html")
fig.write_html(chart_path, include_plotlyjs="cdn")
