In [2]:
import os
import pandas as pd
import numpy as np
import re
from pathlib import Path  

import yfinance as yf
import plotly.express as px
import plotly.graph_objects as go

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

In [None]:
# Directory that holds the k-means watchlist CSV read below and the CSV this
# notebook exports at the end.
DATA_KMEANS = Path("data/kmeans")

# Create the directory (and any missing parents) before it is used.
# The trailing comma matters: `(DATA_KMEANS)` is just the Path itself, and a
# Path is not iterable, so looping over it raises TypeError.
for p in (DATA_KMEANS,):
    p.mkdir(parents=True, exist_ok=True)

In [4]:

# Read the watchlist CSV from the k-means data directory.
watchlist_csv = DATA_KMEANS / "kmeans_watchlist.csv"
df = pd.read_csv(watchlist_csv)

# Signals considered worth a closer look; preview the rows that carry one of them.
interesting_signals = ["STRONG BUY", "BUY"]
is_interesting = df["final_signal"].isin(interesting_signals)
df_filtered = df[is_interesting]
df_filtered.head()

Unnamed: 0,symbol,HotScore,MomentumScore,VolumeScore,VolumeSpike,VolatilityScore,TrendScore,cluster,watch_label,final_signal
4,ARWR,0.95638,1.0,0.976562,2.501594,0.955729,0.734375,1,üî• HOT,STRONG BUY
41,SYM,0.957292,0.989583,0.973958,2.340527,0.940104,0.820312,1,üî• HOT,STRONG BUY
46,URBN,0.958464,0.986979,0.994792,4.050246,0.921875,0.804688,1,üî• HOT,STRONG BUY
54,ARWR,0.95638,1.0,0.976562,2.501594,0.955729,0.734375,1,üî• HOT,STRONG BUY
91,SYM,0.957292,0.989583,0.973958,2.340527,0.940104,0.820312,1,üî• HOT,STRONG BUY


In [5]:
# Discard every row that has a missing value in any column, then renumber the
# index from 0 so positional and label indexing agree again.
df = df.dropna()
df = df.reset_index(drop=True)

# Sanity check: every per-column NaN count should now be zero.
df.isna().sum()

symbol             0
HotScore           0
MomentumScore      0
VolumeScore        0
VolumeSpike        0
VolatilityScore    0
TrendScore         0
cluster            0
watch_label        0
final_signal       0
dtype: int64

In [6]:
# Model inputs: the six score columns plus the cluster id column.
FEATURES = [
    "HotScore", "MomentumScore", "VolumeScore",
    "VolumeSpike", "VolatilityScore", "TrendScore",
    "cluster",
]

# Column the classifier learns to predict.
TARGET = "final_signal"

# Turn +/-infinity in the feature columns into NaN so the dropna below removes
# those rows as well. Note this writes back into `df`.
infinite_values = [np.inf, -np.inf]
df[FEATURES] = df[FEATURES].replace(infinite_values, np.nan)

# Modelling frame: only rows where every feature and the target are present,
# with a fresh 0..n-1 index.
required_columns = FEATURES + [TARGET]
df_model = df.dropna(subset=required_columns)
df_model = df_model.reset_index(drop=True)


In [7]:
# Learn the signal classes from the target column, then store their integer
# codes in "y". `le` is kept so codes can be mapped back to signal names later.
le = LabelEncoder()
le.fit(df_model[TARGET])
df_model["y"] = le.transform(df_model[TARGET])


In [8]:
# Positional (unshuffled) split: the first 80% of rows train the model and the
# remaining rows are held out for evaluation.
# NOTE(review): the watchlist preview shows identical rows repeated for the same
# symbol; with a positional split such repeats can land on both sides - confirm
# the held-out scores are not inflated by that.
split_idx = int(0.8 * len(df_model))

train_rows = df_model.iloc[:split_idx]
test_rows = df_model.iloc[split_idx:]

# Copies, so later edits to the splits never touch df_model.
X_train, y_train = train_rows[FEATURES].copy(), train_rows["y"].copy()
X_test, y_test = test_rows[FEATURES].copy(), test_rows["y"].copy()

In [9]:
# Cap every feature to the range [-1e6, 1e6] in both splits, column by column.
for col in FEATURES:
    for split in (X_train, X_test):
        split[col] = split[col].clip(lower=-1e6, upper=1e6)

In [10]:
# Forest hyperparameters gathered in one mapping so they are easy to scan and tune.
rf_params = {
    "n_estimators": 300,
    "max_depth": 8,
    "min_samples_leaf": 20,
    "class_weight": "balanced",
    "random_state": 42,  # fixed seed so repeated runs build the same forest
    "n_jobs": -1,
}

rf = RandomForestClassifier(**rf_params)

# Fit on the training split; the fitted estimator is the cell's displayed value.
rf.fit(X_train, y_train)

In [19]:
# Evaluate the fitted forest on the held-out rows.
y_pred = rf.predict(X_test)

# Pass the encoded labels explicitly so they always line up with
# `target_names`. Without `labels`, the report only covers classes that occur
# in y_test/y_pred, and a positional split that leaves a class out of the test
# rows would make the number of names and labels disagree.
report_dict = classification_report(
    y_test,
    y_pred,
    labels=np.arange(len(le.classes_)),
    target_names=le.classes_,
    output_dict=True,
)

# One row per class / average, one column per metric.
df_report = pd.DataFrame(report_dict).T

# Keep the Styler and make it the cell's final expression. As a bare statement
# in the middle of the cell it was built and thrown away, so the formatting,
# colour gradient and caption never appeared.
# background_gradient needs matplotlib when the table is rendered.
styled_report = (
    df_report.style
    .format("{:.2f}")
    .background_gradient(cmap="Blues")
    .set_caption("Classification Report")
)

styled_report

Unnamed: 0,precision,recall,f1-score,support
IGNORE,1.0,0.999378,0.999689,6427.0
STRONG BUY,0.986667,1.0,0.993289,296.0
WATCH,1.0,1.0,1.0,2618.0
accuracy,0.999572,0.999572,0.999572,0.999572
macro avg,0.995556,0.999793,0.997659,9341.0
weighted avg,0.999577,0.999572,0.999573,9341.0


In [None]:

# Score every row of df_model with ONE pass through the forest. Calling
# rf.predict and rf.predict_proba separately evaluates all trees twice over the
# full frame.
proba = rf.predict_proba(df_model[FEATURES])

# Columns of `proba` follow rf.classes_ (the encoded labels seen during fit).
# The most probable column gives the predicted code, which is mapped back to the
# signal name; its probability is the confidence.
df_model["predicted_signal"] = le.inverse_transform(rf.classes_[proba.argmax(axis=1)])
df_model["confidence"] = proba.max(axis=1)

# NOTE(review): df_model contains the rows rf was fitted on, so predictions and
# confidences for those rows are in-sample.
# Check final dataframe
df_model.head()

Unnamed: 0,symbol,HotScore,MomentumScore,VolumeScore,VolumeSpike,VolatilityScore,TrendScore,cluster,watch_label,final_signal,y,predicted_signal,confidence
0,AA,0.794401,0.903646,0.802083,0.940394,0.726562,0.520833,2,‚ùå IGNORE,IGNORE,0,IGNORE,0.996082
1,AAUC,0.846094,0.96875,0.848958,1.052893,0.622396,0.854167,1,üëÄ WATCH,WATCH,2,WATCH,0.907984
2,ALAB,0.773307,0.942708,0.518229,0.638383,0.947917,0.723958,1,‚ùå IGNORE,IGNORE,0,IGNORE,0.930574
3,ANF,0.905599,0.898438,0.958333,1.88349,0.864583,0.828125,1,‚ö†Ô∏è OVERHEATED,IGNORE,0,IGNORE,0.9872
4,ARWR,0.95638,1.0,0.976562,2.501594,0.955729,0.734375,1,üî• HOT,STRONG BUY,1,STRONG BUY,0.996962


In [None]:
# Shortlist rows whose predicted signal is one of `interesting_signals` (the
# list defined when the watchlist was loaded) and whose confidence exceeds 0.9,
# most confident first.
# The previous hard-coded list contained "HOT", which only appears inside
# watch_label values and is not one of the final_signal classes the model
# predicts, so it could never match.
interesting_stocks = df_model[
    df_model["predicted_signal"].isin(interesting_signals)
    & (df_model["confidence"] > 0.9)
].sort_values("confidence", ascending=False)

# Preview the highest-confidence rows (head() shows the first 5).
interesting_stocks.head()

Unnamed: 0,symbol,HotScore,MomentumScore,VolumeScore,VolumeSpike,VolatilityScore,TrendScore,cluster,watch_label,final_signal,y,predicted_signal,confidence
31357,UNF,0.992907,0.99308,0.989619,4.763651,1.0,0.989619,1,üî• HOT,STRONG BUY,1,STRONG BUY,0.999915
31505,UNF,0.992712,0.99322,0.989831,5.601591,1.0,0.986441,1,üî• HOT,STRONG BUY,1,STRONG BUY,0.999915
31606,UNF,0.994276,0.993266,0.993266,5.781297,1.0,0.989899,1,üî• HOT,STRONG BUY,1,STRONG BUY,0.999915
13602,GPCR,0.977899,0.992754,0.985507,12.510911,0.992754,0.869565,1,üî• HOT,STRONG BUY,1,STRONG BUY,0.999915
14411,KYMR,0.986879,0.992908,0.985816,8.808506,0.992908,0.957447,1,üî• HOT,STRONG BUY,1,STRONG BUY,0.999915


In [21]:

# Write the shortlist into the k-means data directory, leaving out the index column.
output_csv = DATA_KMEANS / "interesting_stocks.csv"
interesting_stocks.to_csv(output_csv, index=False)