In [37]:
import os
import pandas as pd
import numpy as np
import re
from pathlib import Path  

import yfinance as yf
import plotly.express as px
import plotly.graph_objects as go

from sklearn.metrics import classification_report
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

In [38]:
# Directory holding the k-means stage files that this notebook reads from
# (and later writes the ranked CSV into), and the directory for the HTML
# artifacts produced here.
DATA_KMEANS = Path("data/kmeans")
OUTPUT_DIR = Path("output/knn")

# Make sure both directories exist before anything is read or written;
# exist_ok=True keeps the cell safe to re-run.
DATA_KMEANS.mkdir(parents=True, exist_ok=True)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

In [39]:

# Load the k-means watchlist and discard every row that has a missing value
# in any column, renumbering the index from 0 afterwards.
watchlist_csv = os.path.join(DATA_KMEANS, "kmeans_watchlist.csv")
df = pd.read_csv(watchlist_csv).dropna().reset_index(drop=True)

# Sanity check shown as the cell output: remaining missing values per column.
df.isna().sum()

symbol             0
HotScore           0
MomentumScore      0
VolumeScore        0
VolumeSpike        0
VolatilityScore    0
TrendScore         0
cluster            0
watch_label        0
final_signal       0
dtype: int64

In [40]:
# Score columns that make up the feature space for the nearest-neighbour model.
FEATURES = [
    "HotScore", "MomentumScore", "VolumeScore",
    "VolumeSpike", "VolatilityScore", "TrendScore",
]

# Infinite values cannot be scaled, so turn them into NaN first ...
features_finite = df[FEATURES].replace([np.inf, -np.inf], np.nan)
df[FEATURES] = features_finite

# ... then drop any row that is missing a feature and renumber the index.
df = df.dropna(subset=FEATURES).reset_index(drop=True)


In [41]:
# Standardize the feature columns before computing distances: fit the scaler
# on the cleaned features, then apply it to the same rows.
scaler = StandardScaler()
scaler.fit(df[FEATURES])
X_scaled = scaler.transform(df[FEATURES])

In [42]:
# Number of neighbours requested per row. The fitted rows themselves are
# queried later, so this count includes each row's own zero-distance match.
# (NearestNeighbors is already imported in the notebook's import cell.)
N_NEIGHBORS = 15

knn = NearestNeighbors(
    # Never request more neighbours than there are rows: kneighbors() raises
    # ValueError when n_neighbors exceeds the number of fitted samples, which
    # would otherwise break the notebook on a short watchlist.
    n_neighbors=min(N_NEIGHBORS, len(X_scaled)),
    metric="euclidean",
)

# Last expression of the cell: fits the index and displays the estimator.
knn.fit(X_scaled)


In [43]:
# Query the fitted rows against themselves; each result row holds the
# distances to (and positions of) its nearest neighbours, closest first.
distances, indices = knn.kneighbors(X_scaled)

# Mean distance excluding self: column 0 is the closest match, which for a
# fitted row is the row itself (distance 0), so it is skipped.
df["knn_mean_distance"] = distances[:, 1:].mean(axis=1)

# Min-max normalize to 0–1 so that 0 is the most crowded row and 1 the most
# isolated one.
mean_dist = df["knn_mean_distance"]
dist_min = mean_dist.min()
dist_range = mean_dist.max() - dist_min

if dist_range == 0:
    # Every row is equally isolated, so the plain formula would compute 0/0
    # and fill the column with NaN. No row is more unique than another, so
    # use the bottom of the scale instead.
    df["knn_uniqueness"] = 0.0
else:
    df["knn_uniqueness"] = (mean_dist - dist_min) / dist_range


In [44]:
# Bucket the 0–1 uniqueness score into three labels:
#   [0.0, 0.4] -> CROWDED, (0.4, 0.7] -> INTERESTING, (0.7, 1.0] -> RARE
# include_lowest=True closes the first interval on the left. Without it the
# first bin is (0.0, 0.4], and a score of exactly 0.0 — which min-max scaling
# assigns to the most crowded row — falls outside every bin and becomes NaN.
df["knn_signal"] = pd.cut(
    df["knn_uniqueness"],
    bins=[0.0, 0.4, 0.7, 1.0],
    labels=["CROWDED", "INTERESTING", "RARE"],
    include_lowest=True,
)

In [45]:
# Composite ranking score: a fixed-weight blend of three existing scores and
# the kNN uniqueness (weights 0.4 / 0.2 / 0.2 / 0.2).
# NOTE(review): knn_uniqueness is min-max scaled to 0–1; confirm the three
# score columns are on a comparable scale, otherwise its 0.2 weight is not
# directly comparable with theirs.
hot_part = df["HotScore"] * 0.4
momentum_part = df["MomentumScore"] * 0.2
volume_part = df["VolumeScore"] * 0.2
uniqueness_part = df["knn_uniqueness"] * 0.2

df["final_rank"] = hot_part + momentum_part + volume_part + uniqueness_part

In [46]:
# Columns kept in the exported ranking table, in output order.
# NOTE(review): the name suggests a Streamlit app consumes this — confirm.
STREAMLIT_COLUMNS = [
    # identifier
    "symbol",
    # raw scores
    "HotScore", "MomentumScore", "VolumeScore",
    "VolumeSpike", "VolatilityScore", "TrendScore",
    # labels already present in the loaded watchlist
    "cluster", "watch_label", "final_signal",
    # values computed in this notebook
    "knn_uniqueness", "knn_signal", "final_rank",
]

# Keep only the export columns and order rows from best to worst final_rank.
export_view = df[STREAMLIT_COLUMNS]
df_outcome = export_view.sort_values(by="final_rank", ascending=False)


In [None]:
# Persist the ranked table next to the k-means inputs, without the index column.
rank_csv_path = DATA_KMEANS / "rank_stocks.csv"
df_outcome.to_csv(rank_csv_path, index=False)

In [49]:
# Heatmap of the best-ranked "STRONG BUY" / "HOT" stocks.
# (plotly.express is already imported as px in the notebook's import cell.)

# Maximum number of stocks shown; fewer appear when the filter matches fewer.
TOP_N = 50

# Keep only rows flagged STRONG BUY or HOT, best final_rank first (ties broken
# by knn_uniqueness), capped at TOP_N rows.
top50 = df[df["final_signal"].isin(["STRONG BUY", "HOT"])].sort_values(
    ["final_rank", "knn_uniqueness"], ascending=False
).head(TOP_N)

# Metrics displayed as heatmap rows.
metrics = [
    "HotScore", "MomentumScore", "VolumeScore",
    "VolumeSpike", "VolatilityScore", "TrendScore",
    "knn_uniqueness"
]

# Min-max normalize each metric within the selection so that every heatmap
# row uses the full colour range regardless of the metric's native scale.
top50_norm = top50.copy()
for col in metrics:
    col_min = top50_norm[col].min()
    col_span = top50_norm[col].max() - col_min
    if col_span == 0:
        # Constant column (e.g. a single selected stock): the plain formula
        # would compute 0/0 and render NaN cells, so pin it to 0 instead.
        top50_norm[col] = 0.0
    else:
        top50_norm[col] = (top50_norm[col] - col_min) / col_span

# Create heatmap
fig = px.imshow(
    top50_norm[metrics].T,                 # transpose so metrics are y-axis
    x=top50_norm["symbol"],
    y=metrics,
    color_continuous_scale="Blues",        # light blue = low, dark blue = high
    aspect="auto",
    text_auto=True
)

fig.update_layout(
    template="plotly_dark",
    # Report the number of stocks actually plotted; the title previously said
    # "Top 30" although up to 50 rows are selected.
    title=f"Top {len(top50)} Stocks — Metrics Heatmap",
    xaxis=dict(title="Symbol", tickangle=-45, tickfont=dict(color="white")),
    yaxis=dict(title="Metric", tickfont=dict(color="white")),
    coloraxis_colorbar=dict(title="Normalized Value"),
    paper_bgcolor="rgb(10,10,30)",
    plot_bgcolor="rgb(10,10,30)",
    height=700,
    margin=dict(l=100, r=40, t=80, b=150)
)
fig.show()

# Also save a standalone interactive copy of the figure.
output_path = os.path.join(OUTPUT_DIR, "knn_hot_stocks_heatmap.html")
fig.write_html(output_path)
