In [3]:
import os
import pandas as pd
import numpy as np
import re
from pathlib import Path  

import yfinance as yf
import plotly.express as px
import plotly.graph_objects as go

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

In [4]:
DATA_KMEANS = Path("data/kmeans")
OUTPUT_DIR = Path("output/random_forest")

for p in (DATA_KMEANS, OUTPUT_DIR):
    p.mkdir(parents=True, exist_ok=True)

In [5]:

df = pd.read_csv(os.path.join(DATA_KMEANS, "kmeans_watchlist.csv"))

interesting_signals = ["STRONG BUY", "BUY"]
df_filtered = df[df["final_signal"].isin(interesting_signals)]
df_filtered.head()

Unnamed: 0,symbol,HotScore,MomentumScore,VolumeScore,VolumeSpike,VolatilityScore,TrendScore,cluster,watch_label,final_signal
4,ARWR,0.95638,1.0,0.976562,2.501594,0.955729,0.734375,1,🔥 HOT,STRONG BUY
41,SYM,0.957292,0.989583,0.973958,2.340527,0.940104,0.820312,1,🔥 HOT,STRONG BUY
46,URBN,0.958464,0.986979,0.994792,4.050246,0.921875,0.804688,1,🔥 HOT,STRONG BUY
54,ARWR,0.95638,1.0,0.976562,2.501594,0.955729,0.734375,1,🔥 HOT,STRONG BUY
91,SYM,0.957292,0.989583,0.973958,2.340527,0.940104,0.820312,1,🔥 HOT,STRONG BUY


In [7]:
df = df.dropna().reset_index(drop=True)
df.isna().sum()

symbol             0
HotScore           0
MomentumScore      0
VolumeScore        0
VolumeSpike        0
VolatilityScore    0
TrendScore         0
cluster            0
watch_label        0
final_signal       0
dtype: int64

In [8]:
FEATURES = [
    "HotScore",
    "MomentumScore",
    "VolumeScore",
    "VolumeSpike",
    "VolatilityScore",
    "TrendScore",
    "cluster"
]

TARGET = "final_signal"

df[FEATURES] = df[FEATURES].replace([np.inf, -np.inf], np.nan)

# Drop rows with NaN in any feature or target
df_model = df.dropna(subset=FEATURES + [TARGET]).reset_index(drop=True)


In [56]:
le = LabelEncoder()
df_model["y"] = le.fit_transform(df_model[TARGET])
labels = le.classes_
print("Classes:", labels)


Classes: ['IGNORE' 'STRONG BUY' 'WATCH']


In [23]:
split_idx = int(0.8 * len(df_model))

X_train = df_model[FEATURES].iloc[:split_idx].copy()
X_test  = df_model[FEATURES].iloc[split_idx:].copy()

y_train = df_model["y"].iloc[:split_idx].copy()
y_test  = df_model["y"].iloc[split_idx:].copy()

In [24]:
for col in FEATURES:
    X_train[col] = X_train[col].clip(-1e6, 1e6)
    X_test[col]  = X_test[col].clip(-1e6, 1e6)

In [25]:
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=8,
    min_samples_leaf=20,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1
)

rf.fit(X_train, y_train)

In [62]:
y_pred = rf.predict(X_test)
report = classification_report(y_test, y_pred, target_names=le.classes_)
print(report)

              precision    recall  f1-score   support

      IGNORE       1.00      1.00      1.00      6377
  STRONG BUY       0.97      1.00      0.98       282
       WATCH       1.00      1.00      1.00      2482

    accuracy                           1.00      9141
   macro avg       0.99      1.00      0.99      9141
weighted avg       1.00      1.00      1.00      9141



In [64]:

# ========================
# 8️⃣ Add predictions back to dataframe for charts / analysis
# ========================
df_model["predicted_signal"] = le.inverse_transform(rf.predict(df_model[FEATURES]))
df_model["confidence"] = rf.predict_proba(df_model[FEATURES]).max(axis=1)

# Check final dataframe
df_model.head()

Unnamed: 0,symbol,HotScore,MomentumScore,VolumeScore,VolumeSpike,VolatilityScore,TrendScore,cluster,watch_label,final_signal,y,predicted_signal,confidence
0,AA,0.794401,0.903646,0.802083,0.940394,0.726562,0.520833,0,❌ IGNORE,IGNORE,0,IGNORE,0.998068
1,AAUC,0.846094,0.96875,0.848958,1.052893,0.622396,0.854167,1,👀 WATCH,WATCH,2,WATCH,0.904453
2,ALAB,0.773307,0.942708,0.518229,0.638383,0.947917,0.723958,1,❌ IGNORE,IGNORE,0,IGNORE,0.946646
3,ANF,0.905599,0.898438,0.958333,1.88349,0.864583,0.828125,1,⚠️ OVERHEATED,IGNORE,0,IGNORE,0.995107
4,ARWR,0.95638,1.0,0.976562,2.501594,0.955729,0.734375,1,🔥 HOT,STRONG BUY,1,STRONG BUY,0.994606


In [65]:
interesting_stocks = df_model[
    (df_model["predicted_signal"].isin(["STRONG BUY", "HOT"])) &
    (df_model["confidence"] > 0.9)
].sort_values("confidence", ascending=False)

# Show top 20
interesting_stocks.head()

Unnamed: 0,symbol,HotScore,MomentumScore,VolumeScore,VolumeSpike,VolatilityScore,TrendScore,cluster,watch_label,final_signal,y,predicted_signal,confidence
13852,GPCR,0.981925,0.995305,0.99061,14.346246,0.995305,0.877934,3,🔥 HOT,STRONG BUY,1,STRONG BUY,0.999728
14002,GPCR,0.983716,0.995413,0.990826,14.799765,0.990826,0.90367,3,🔥 HOT,STRONG BUY,1,STRONG BUY,0.999728
13953,GPCR,0.982558,0.995349,0.990698,14.420636,0.995349,0.883721,3,🔥 HOT,STRONG BUY,1,STRONG BUY,0.999728
13902,GPCR,0.981925,0.995305,0.99061,14.346246,0.995305,0.877934,3,🔥 HOT,STRONG BUY,1,STRONG BUY,0.999728
14052,GPCR,0.983716,0.995413,0.990826,14.799765,0.990826,0.90367,3,🔥 HOT,STRONG BUY,1,STRONG BUY,0.999728


In [None]:
interesting_stocks.to_csv(DATA_KMEANS / "interesting_stocks.csv", index=False) 

In [66]:
import plotly.graph_objects as go
import plotly.express as px

# Define a dark color palette
dark_colors = {
    "IGNORE": "#0c8e03",      # dark gray / greenish
    "WATCH": "#0404e0",       # dark blue
    "HOT": "#DE07FB",         # black
    "STRONG BUY": "#8b0000"   # dark red
}

fig = px.scatter(
    df_model,
    x="HotScore",
    y="VolumeSpike",
    color="predicted_signal",
    size="confidence",
    hover_data=["symbol", "MomentumScore", "TrendScore"],
    title="Stocks by HotScore vs VolumeSpike",
    color_discrete_map=dark_colors,
    size_max=20
)

# Update layout for dark theme
fig.update_layout(
    template="plotly_dark",
    paper_bgcolor="rgb(10,10,30)",   # dark paper background
    plot_bgcolor="rgb(10,10,30)",    # dark plot background
    title_font=dict(size=22, color="white"),
    xaxis=dict(title="HotScore", color="white", gridcolor="gray"),
    yaxis=dict(title="VolumeSpike", color="white", gridcolor="gray"),
    legend=dict(title="Predicted Signal", font=dict(color="white"))
)

fig.show()


In [67]:
import plotly.graph_objects as go

# Define colors
colors = {
    "STRONG BUY": "#ef0404"    # dark red
}

# Filter top STRONG BUY only (already done in interesting_stocks)
fig = go.Figure()

# Only one signal, STRONG BUY
df_sig = interesting_stocks.sort_values("confidence", ascending=False)
fig.add_trace(go.Bar(
    x=df_sig["symbol"],
    y=df_sig["confidence"],
    name="STRONG BUY",
    marker_color=colors["STRONG BUY"],
    text=df_sig.apply(lambda row: f"HotScore: {row.HotScore:.2f}<br>MomentumScore: {row.MomentumScore:.2f}", axis=1),
    hoverinfo="text"
))

# Update layout with new label/title
fig.update_layout(
    barmode="group",
    title="Top STRONG BUY Stocks by Confidence",
    template="plotly_dark",
    paper_bgcolor="rgb(10,10,30)",
    plot_bgcolor="rgb(10,10,30)",
    yaxis=dict(title="Confidence", color="white"),
    xaxis=dict(title="Symbol", color="white"),
    showlegend=True,
    legend=dict(title="Predicted Signal", font=dict(color="white"))
)

fig.show()


In [71]:
# ---------------------------
# 1️⃣ Compute confusion matrix
# ---------------------------
y_true = df_model["y"]
y_pred = le.transform(df_model["predicted_signal"])
cm = confusion_matrix(y_true, y_pred)

# Normalize to percentages
cm_percent = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

# ---------------------------
# 2️⃣ Classification metrics
# ---------------------------
report = classification_report(y_true, y_pred, target_names=le.classes_, output_dict=True)

# Use only the class labels present in report (skip "accuracy", etc.)
class_labels = [k for k in report.keys() if k not in ["accuracy", "macro avg", "weighted avg"]]

# Prepare hover text for each cell
hover_text = []
for i, actual in enumerate(class_labels):
    hover_text.append([])
    for j, pred in enumerate(class_labels):
        precision = report[pred]["precision"]
        recall = report[pred]["recall"]
        f1 = report[pred]["f1-score"]
        hover_text[-1].append(
            f"Actual: {actual}<br>Predicted: {pred}<br>"
            f"Count: {cm[i,j]}<br>"
            f"Precision: {precision:.2f}<br>"
            f"Recall: {recall:.2f}<br>"
            f"F1-score: {f1:.2f}"
        )

# ---------------------------
# 3️⃣ Create dark-themed heatmap
# ---------------------------
fig = go.Figure(data=go.Heatmap(
    z=cm_percent,
    x=class_labels,
    y=class_labels,
    text=hover_text,
    hoverinfo="text",
    colorscale=[[0, 'black'], [0.5, 'blue'], [1, 'white']],  # Black → Blue → White
    reversescale=False,
    colorbar=dict(title="Percent", tickfont=dict(color="white"))
))

# ---------------------------
# 4️⃣ Layout customization
# ---------------------------
fig.update_layout(
    title="Confusion Matrix (Predicted vs Actual) with Metrics",
    template="plotly_dark",
    paper_bgcolor="rgb(10,10,30)",
    plot_bgcolor="rgb(10,10,30)",
    xaxis_title="Predicted Signal",
    yaxis_title="Actual Signal",
    xaxis=dict(tickfont=dict(color="white")),
    yaxis=dict(tickfont=dict(color="white")),
)

fig.show()
