In [1]:
import os
import pandas as pd
import numpy as np
import re
from pathlib import Path  

import plotly.express as px

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

In [None]:
# Project-relative locations: clustering artefacts, hot-score inputs, rendered charts.
DATA_KMEANS = Path("data", "kmeans")
DATA_HOT_SCORE = Path("data", "hotscore")
OUTPUT_DIR = Path("output", "kmeans")

# Create whichever directories are missing; existing ones are left untouched.
for p in [DATA_KMEANS, DATA_HOT_SCORE, OUTPUT_DIR]:
    p.mkdir(exist_ok=True, parents=True)

In [3]:
def latest_file_in_directory(directory=DATA_HOT_SCORE):
    """Return the name of the newest hot-score CSV found in ``directory``.

    Candidates are directory entries whose names start with ``hotscore_`` and
    end with ``.csv``. "Newest" is the lexicographically greatest name, so this
    presumably relies on a sortable timestamp embedded in the file name
    (TODO confirm against whatever writes these files).

    Args:
        directory: Folder to scan. Defaults to ``DATA_HOT_SCORE``.

    Returns:
        The bare file name (not a full path) of the selected CSV.

    Raises:
        ValueError: If no matching file exists. The exception type is the one
            ``max()`` raised on an empty sequence before; only the message is
            more explicit now.
    """
    candidates = [
        name for name in os.listdir(directory)
        if name.startswith("hotscore_") and name.endswith(".csv")
    ]
    if not candidates:
        raise ValueError(f"No 'hotscore_*.csv' files found in {directory}")
    return max(candidates)


In [4]:
# Pick the newest hot-score export and read it into the working frame.
latest_file = latest_file_in_directory(DATA_HOT_SCORE)
hotscore_csv = DATA_HOT_SCORE / latest_file
df = pd.read_csv(hotscore_csv)
df.shape

(45672, 12)

In [5]:
# Score columns fed to the clustering step.
cluster_features = [
    "MomentumScore", "VolumeScore", "VolumeSpike",
    "VolatilityScore", "TrendScore", "HotScore",
]

# Treat +/-inf like missing values, then fill every gap with 0.
features_finite = df[cluster_features].replace([np.inf, -np.inf], np.nan)
X = features_finite.fillna(0)

In [6]:
# Standardize the features; the fitted scaler is kept so cluster centres can be
# mapped back to the original units later.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [7]:
# Four clusters, 20 initialisations, fixed seed.
kmeans = KMeans(n_clusters=4, n_init=20, random_state=42)

# Fit on the standardized features and store each row's cluster id on df.
kmeans.fit(X_scaled)
df["cluster"] = kmeans.labels_


In [8]:
# Cluster centres live in standardized space; undo the scaling so the table
# reads in the original feature units.
centers_in_feature_units = scaler.inverse_transform(kmeans.cluster_centers_)
centroids = pd.DataFrame(centers_in_feature_units, columns=cluster_features)

# Row position is the cluster id; expose it as an explicit column.
centroids["cluster_id"] = centroids.index
display(centroids.round(3))


Unnamed: 0,MomentumScore,VolumeScore,VolumeSpike,VolatilityScore,TrendScore,HotScore,cluster_id
0,0.88,0.835,1.311,0.584,0.48,0.765,0
1,0.852,0.797,1.38,0.866,0.862,0.837,1
2,0.495,0.548,0.596,0.632,0.658,0.557,2
3,0.903,0.998,24.799,0.849,0.692,0.904,3


In [9]:
def assign_watch_label(
    row,
    *,
    hot_score_min=0.95,
    hot_momentum_min=0.95,
    overheated_score_min=0.9,
    overheated_volatility_min=0.8,
    watch_trend_min=0.85,
):
    """Map one row of scores to a watch label.

    Rules are evaluated in order and the first match wins; all comparisons are
    strict (``>``), so a value exactly on a threshold does not qualify.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing
            ``HotScore``, ``MomentumScore``, ``VolatilityScore`` and
            ``TrendScore``.
        hot_score_min: HotScore must exceed this for the HOT label.
        hot_momentum_min: MomentumScore must exceed this for the HOT label.
        overheated_score_min: HotScore must exceed this for OVERHEATED.
        overheated_volatility_min: VolatilityScore must exceed this for
            OVERHEATED.
        watch_trend_min: TrendScore must exceed this for WATCH.

    Returns:
        One of the four label strings: HOT, OVERHEATED, WATCH or IGNORE
        (each prefixed with its emoji).
    """
    if row["HotScore"] > hot_score_min and row["MomentumScore"] > hot_momentum_min:
        return "🔥 HOT"
    if row["HotScore"] > overheated_score_min and row["VolatilityScore"] > overheated_volatility_min:
        return "⚠️ OVERHEATED"
    if row["TrendScore"] > watch_trend_min:
        return "👀 WATCH"
    # Nothing matched (this also covers NaN scores, which fail every comparison).
    return "❌ IGNORE"

# Label every row; the function receives one row (as a Series) per call.
df["watch_label"] = df.apply(assign_watch_label, axis="columns")


In [10]:
# Conditions are checked in order; the first one that holds decides the signal.
is_strong_buy = ((df["HotScore"] > 0.95) & (df["watch_label"] == "🔥 HOT")).to_numpy()
is_watch = (df["watch_label"] == "👀 WATCH").to_numpy()

df["final_signal"] = np.select(
    [is_strong_buy, is_watch],
    ["STRONG BUY", "WATCH"],
    default="IGNORE",  # everything else, including OVERHEATED rows
)

preview_columns = ["symbol", "HotScore", "watch_label", "final_signal"]
display(df[preview_columns].head(10))


Unnamed: 0,symbol,HotScore,watch_label,final_signal
0,AA,0.794401,❌ IGNORE,IGNORE
1,AAUC,0.846094,👀 WATCH,WATCH
2,ALAB,0.773307,❌ IGNORE,IGNORE
3,ANF,0.905599,⚠️ OVERHEATED,IGNORE
4,ARWR,0.95638,🔥 HOT,STRONG BUY
5,ASML,0.80013,👀 WATCH,WATCH
6,ATGE,0.738542,👀 WATCH,WATCH
7,ATMU,0.902604,👀 WATCH,WATCH
8,BBAR,0.825521,❌ IGNORE,IGNORE
9,BE,0.801432,❌ IGNORE,IGNORE


In [11]:
# Project the standardized features onto their first two principal components
# so the rows can be drawn on a 2-D scatter plot.
# random_state is pinned for reproducibility: depending on the scikit-learn
# version, the default "auto" solver may choose a randomized SVD for a tall
# matrix with few requested components, and the coordinates would then differ
# between runs.
pca = PCA(n_components=2, random_state=42)
coords = pca.fit_transform(X_scaled)

df["pca_x"] = coords[:, 0]
df["pca_y"] = coords[:, 1]


In [12]:
# Columns shown in each point's tooltip.
hover_columns = ["symbol", "HotScore", "MomentumScore", "VolumeSpike", "TrendScore"]

# NOTE(review): points are coloured by watch_label rather than by the KMeans
# `cluster` column, although the title mentions KMeans — confirm this is intended.
fig = px.scatter(
    data_frame=df,
    x="pca_x",
    y="pca_y",
    color="watch_label",
    hover_data=hover_columns,
    title="Stock Behavior Clusters (KMeans + PCA)",
)
fig.update_layout(legend_title_text="Signal", template="plotly_dark")

fig.show()

# Save a standalone HTML copy; plotly.js is loaded from the CDN, not embedded.
chart_path = OUTPUT_DIR / "cluster_map.html"
fig.write_html(chart_path, include_plotlyjs="cdn")
print(f"Cluster map saved to: {chart_path}")


Cluster map saved to: output\kmeans\cluster_map.html


In [13]:
# Share of rows carrying each watch label.
fig = px.pie(data_frame=df, names="watch_label", title="Market Signal Distribution")
fig.update_layout(template="plotly_dark")
fig.show()

# Save a standalone HTML copy next to the other charts.
chart_path = OUTPUT_DIR / "kmeans_signal_distribution.html"
fig.write_html(chart_path, include_plotlyjs="cdn")
print(f"Signal distribution chart saved to: {chart_path}")

Signal distribution chart saved to: output\kmeans\kmeans_signal_distribution.html


In [None]:
metrics = [
    "HotScore", "MomentumScore", "VolumeScore",
    "VolumeSpike", "VolatilityScore", "TrendScore",
]

# Parse the date column so rows can be ordered chronologically.
df["date"] = pd.to_datetime(df["date"])

# One row per symbol: after sorting by date, the last row of each group is the
# most recent record for that symbol.
chronological = df.sort_values("date")
latest_df = chronological.groupby("symbol", as_index=False).tail(1)

# The 50 symbols with the highest HotScore, indexed by symbol.
hottest_first = latest_df.sort_values("HotScore", ascending=False)
top50 = hottest_first.head(50).set_index("symbol")


Unnamed: 0_level_0,date,HotScore,TrendScore,regularMarketPrice,regularMarketChangePercent,VolumeSpike,averageDailyVolume3Month,MomentumScore,VolumeScore,VolatilityScore,marketCap,cluster,watch_label,final_signal,pca_x,pca_y
symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
ULTA,2025-12-08 14:21:59,0.988658,0.99361,601.5,12.650995,5.513537,615858.0,0.98722,0.984026,0.996805,27034780000.0,1,🔥 HOT,STRONG BUY,3.012242,-0.087355
BLTE,2025-12-30 14:25:24,0.987793,0.995305,165.29,6.68689,1.781145,173361.0,0.985915,0.985915,0.99061,6200793000.0,1,🔥 HOT,STRONG BUY,2.673791,0.417819
PEN,2026-01-15 22:47:31,0.987116,0.940898,350.49,11.824,19.162019,504898.0,0.98818,1.0,0.985816,13725950000.0,3,🔥 HOT,STRONG BUY,4.141702,-2.183154
TFX,2025-12-10 14:22:08,0.980758,0.962099,131.25,9.53931,4.736953,632731.0,0.988338,0.982507,0.973761,5800516000.0,1,🔥 HOT,STRONG BUY,2.820305,-0.11073


In [56]:
import plotly.express as px

# Rescale HotScore to [0, 1] within the top 50 so the colour scale spans its
# full range. If every score is identical the span is zero; use 0.0 for all
# rows instead of dividing by zero and colouring with NaN.
hot_min = top50["HotScore"].min()
hot_span = top50["HotScore"].max() - hot_min
if hot_span:
    top50["score_norm"] = (top50["HotScore"] - hot_min) / hot_span
else:
    top50["score_norm"] = 0.0

# Colour scale applied to score_norm: red at 0 through blue at 1.
colorscale = [[0, "red"], [0.2, "orange"], [0.4, "yellow"], [0.6, "limegreen"], [0.8, "green"], [1, "blue"]]

# Sort ascending and expose `symbol` as a regular column so that x, y and the
# hover values are all read from the same rows. Passing `y=top50.index`
# alongside a separately sorted frame paired tickers with scores by position,
# which attached the wrong ticker to each bar.
bar_data = top50.sort_values("HotScore").reset_index()

fig = px.bar(
    bar_data,
    x="HotScore",
    y="symbol",
    orientation="h",
    color="score_norm",
    color_continuous_scale=colorscale,
    hover_data=["MomentumScore", "VolumeScore", "TrendScore", "VolumeSpike", "watch_label"],
    labels={"symbol": "Ticker"},
    title="Top 50 Hot Stocks – Horizontal Bar Chart"
)

# Dark theme with white tick labels.
fig.update_layout(
    template=None,
    paper_bgcolor="#0F1A11",      # very dark green around the plot area
    plot_bgcolor="#011B3E",       # dark navy inside the plot area
    xaxis_title="HotScore",
    yaxis_title="Ticker",
    yaxis=dict(tickfont=dict(color="white")),
    xaxis=dict(tickfont=dict(color="white")),
    title=dict(font=dict(color="white", size=22)),
    coloraxis_colorbar=dict(
        title="Hotness",
        tickfont=dict(color="white")
    ),
    height=1200
)

fig.show()

# Save a standalone HTML copy; plotly.js is loaded from the CDN, not embedded.
chart_path = OUTPUT_DIR / "kmeans_heatmap.html"
fig.write_html(chart_path, include_plotlyjs="cdn")


In [None]:
# Columns written to the watchlist file, in output order.
output_cols = [
    "symbol",
    "HotScore", "MomentumScore", "VolumeScore",
    "VolumeSpike", "VolatilityScore", "TrendScore",
    "cluster", "watch_label", "final_signal",
]

# Write the selected columns for every row of df, without the index.
out_path = DATA_KMEANS / "kmeans_watchlist.csv"
df.to_csv(out_path, columns=output_cols, index=False)

In [59]:
import plotly.graph_objects as go

# top50 holds one row per symbol (latest snapshot). Order it hottest-first so
# a row's position is also its rank.
top50 = top50.sort_values("HotScore", ascending=False)

# Rescale HotScore to [0, 1] for colouring; fall back to 0.0 when all scores
# are equal so a zero span does not produce NaN.
hot_min = top50["HotScore"].min()
hot_span = top50["HotScore"].max() - hot_min
if hot_span:
    top50["score_norm"] = (top50["HotScore"] - hot_min) / hot_span
else:
    top50["score_norm"] = 0.0

# Outline colour per watch label.
cluster_colors = {
    "🔥 HOT": "#FF4500",        # bright orange
    "👀 WATCH": "#FFD700",      # yellow
    "⚠️ OVERHEATED": "#FF0000", # red
    "❌ IGNORE": "#888888"       # gray
}
outline_colors = [cluster_colors.get(label, "white") for label in top50["watch_label"]]

# Pre-format one tooltip per bar.
hover_text = [
    (
        f"Symbol: {row.Index}<br>"
        f"HotScore: {row.HotScore:.3f}<br>"
        f"Momentum: {row.MomentumScore:.3f}<br>"
        f"VolumeScore: {row.VolumeScore:.3f}<br>"
        f"TrendScore: {row.TrendScore:.3f}<br>"
        f"VolumeSpike: {row.VolumeSpike:.2f}<br>"
        f"Label: {row.watch_label}<br>"
    )
    for row in top50.itertuples()
]

# One trace for all bars. Previously each bar was its own trace and received a
# single scalar colour value with no cmin/cmax, so the gray→black scale had no
# data range to map across. Passing the whole score_norm array with an
# explicit [0, 1] range makes the fill colour follow the score.
fig = go.Figure(
    go.Bar(
        x=top50["HotScore"].tolist(),
        y=top50.index.tolist(),  # symbols
        orientation="h",
        marker=dict(
            color=top50["score_norm"].tolist(),
            colorscale=[[0, "gray"], [1, "black"]],
            cmin=0,
            cmax=1,
            line=dict(color=outline_colors, width=2),
        ),
        hovertext=hover_text,
        hovertemplate="%{hovertext}<extra></extra>",
    )
)

# Rank number just to the right of each bar (1 = highest HotScore).
annotations = [
    dict(
        x=row.HotScore + 0.01,  # slight offset right
        y=row.Index,
        text=str(rank),
        xanchor="left",
        yanchor="middle",
        font=dict(color="white", size=12),
        showarrow=False,
    )
    for rank, row in enumerate(top50.itertuples(), start=1)
]

fig.update_layout(
    template=None,
    paper_bgcolor="#000000",
    plot_bgcolor="#001a00",  # dark green
    height=1200,
    xaxis=dict(
        title="HotScore",
        tickfont=dict(color="white"),
        showgrid=True,
        gridcolor="rgba(255,255,255,0.05)"
    ),
    yaxis=dict(
        title="Ticker",
        tickfont=dict(color="white"),
        categoryorder="total ascending"  # highest score ends up at the top
    ),
    title=dict(
        text="Top 50 Hot Stocks – Horizontal Clustered Bar",
        font=dict(color="white", size=22)
    ),
    annotations=annotations,
    showlegend=False
)

fig.show()
