In [2]:
import os
import pandas as pd
import numpy as np
import re
from pathlib import Path 

In [None]:
# Input snapshots are read from data/hotscore; generated charts are written
# to output/logistic_regression.
DATA_HOT_SCORE = Path("data/hotscore")
OUTPUT_DIR = Path("output/logistic_regression")

# Create both folders up front (a no-op when they already exist).
DATA_HOT_SCORE.mkdir(parents=True, exist_ok=True)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

In [4]:
def latest_file_in_directory(directory=DATA_HOT_SCORE):
    """Return the name of the last ``hotscore_*.csv`` file in ``directory``.

    "Last" means the lexicographically greatest file name.
    NOTE(review): presumably the names embed a sortable timestamp so that this
    is also the most recent snapshot -- confirm against the file producer.

    Parameters
    ----------
    directory : str or path-like
        Folder to scan (not recursive).

    Returns
    -------
    str
        Bare file name (not a full path) of the selected CSV.

    Raises
    ------
    ValueError
        If ``directory`` contains no file named ``hotscore_*.csv``.
    """
    candidates = [
        name for name in os.listdir(directory)
        if name.startswith("hotscore_") and name.endswith(".csv")
    ]
    if not candidates:
        # max() on an empty sequence also raises ValueError, but with a message
        # that says nothing about which folder or pattern was searched.
        raise ValueError(
            f"No hotscore_*.csv files found in {os.fspath(directory)!r}"
        )
    return max(candidates)


In [5]:
# Pick the latest hotscore snapshot and load it into a DataFrame.
latest_file = latest_file_in_directory(DATA_HOT_SCORE)
latest_path = os.path.join(DATA_HOT_SCORE, latest_file)
df = pd.read_csv(latest_path)

# (rows, columns) of the loaded snapshot
df.shape

(45622, 12)

In [6]:
# Quick look at the first five rows of the loaded snapshot.
display(df.head(n=5))

Unnamed: 0,symbol,date,HotScore,TrendScore,regularMarketPrice,regularMarketChangePercent,VolumeSpike,averageDailyVolume3Month,MomentumScore,VolumeScore,VolatilityScore,marketCap
0,AA,2025-11-26 20:46:26,0.794401,0.520833,41.845,6.74745,0.940394,6727448.0,0.903646,0.802083,0.726562,10836350000.0
1,AAUC,2025-11-26 20:46:26,0.846094,0.854167,19.18,9.788214,1.052893,342331.0,0.96875,0.848958,0.622396,2378320000.0
2,ALAB,2025-11-26 20:46:26,0.773307,0.723958,156.16,7.860205,0.638383,6266829.0,0.942708,0.518229,0.947917,26375990000.0
3,ANF,2025-11-26 20:46:26,0.905599,0.828125,96.205,6.610155,1.88349,2070173.0,0.898438,0.958333,0.864583,4583525000.0
4,ARWR,2025-11-26 20:46:26,0.95638,0.734375,58.675,25.400724,2.501594,2311350.0,1.0,0.976562,0.955729,8112262000.0


In [7]:
# Derive the target and the log features in one chained step. `assign` returns
# a new frame, and each lambda sees the columns produced by the ones before it.
df = df.assign(
    # Target: 1 when HotScore is strictly above 0.85, else 0.
    is_hot=lambda d: (d["HotScore"] > 0.85).astype(int),
    # Floor both columns at 1 so the logs below never see 0 or negatives.
    marketCap=lambda d: d["marketCap"].clip(lower=1),
    averageDailyVolume3Month=lambda d: d["averageDailyVolume3Month"].clip(lower=1),
    # Natural-log transforms of the floored columns.
    log_marketCap=lambda d: np.log(d["marketCap"]),
    log_volume=lambda d: np.log(d["averageDailyVolume3Month"]),
)

# Model inputs (symbol, date and HotScore are deliberately left out).
features = [
    "regularMarketPrice",
    "regularMarketChangePercent",
    "VolumeSpike",
    "log_marketCap",
    "log_volume",
]

X = df.loc[:, features]
y = df["is_hot"]

# Identifier columns, carried along so predictions can be labelled later.
symbol_date = df.loc[:, ["symbol", "date"]]

In [8]:
from sklearn.model_selection import train_test_split

# Stratified 75/25 split with a fixed seed. The identifier frame is split
# alongside X and y so every test row can be mapped back to its symbol/date.
# NOTE(review): rows are split at random, so snapshots of the same symbol taken
# minutes apart can presumably land in both train and test -- confirm whether a
# grouped or time-based split is wanted.
(
    X_train, X_test,
    y_train, y_test,
    sym_train, sym_test,
) = train_test_split(
    X,
    y,
    symbol_date,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

In [9]:
# Look for non-finite values in BOTH splits. Checking only the training split
# would let NaN/inf that occur solely in the test split reach the scaler and
# the model.
has_inf = bool(np.isinf(X_train.values).any() or np.isinf(X_test.values).any())
has_nan = bool(np.isnan(X_train.values).any() or np.isnan(X_test.values).any())
needs_cleaning = has_inf or has_nan
print("Need cleaning?", needs_cleaning)

if needs_cleaning:
    print("Cleaning X_train and X_test...")
    # Treat +/-inf as missing. Rebind instead of using inplace=True so no
    # frame derived from another one is mutated behind the scenes.
    X_train = X_train.replace([np.inf, -np.inf], np.nan)
    X_test = X_test.replace([np.inf, -np.inf], np.nan)

    # Drop incomplete rows and keep every companion object on the same index.
    keep_train_idx = X_train.dropna().index
    X_train = X_train.loc[keep_train_idx]
    y_train = y_train.loc[keep_train_idx]
    sym_train = sym_train.loc[keep_train_idx]

    keep_test_idx = X_test.dropna().index
    X_test = X_test.loc[keep_test_idx]
    y_test = y_test.loc[keep_test_idx]
    sym_test = sym_test.loc[keep_test_idx]


Need cleaning? True
Cleaning X_train and X_test...


In [10]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training rows only, then apply that same
# transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report

# Logistic regression with balanced class weights, fitted on the scaled
# training features.
logreg_params = {
    "max_iter": 1000,
    "class_weight": "balanced",
    "solver": "lbfgs",
}
log_model = LogisticRegression(**logreg_params)
log_model.fit(X_train_scaled, y_train)

In [12]:
# Column 1 of predict_proba holds the probability of the positive class
# (is_hot == 1); predict gives the hard 0/1 labels.
test_probabilities = log_model.predict_proba(X_test_scaled)
y_proba = test_probabilities[:, 1]
y_pred = log_model.predict(X_test_scaled)

auc_value = roc_auc_score(y_test, y_proba)
print("ROC AUC:", auc_value)

report_text = classification_report(y_test, y_pred)
print(report_text)

ROC AUC: 0.8691047832440331
              precision    recall  f1-score   support

           0       0.90      0.81      0.85      8212
           1       0.61      0.76      0.68      3125

    accuracy                           0.80     11337
   macro avg       0.75      0.79      0.76     11337
weighted avg       0.82      0.80      0.80     11337



In [13]:
# One row per feature with its fitted coefficient, largest first. The
# coefficients refer to the standardized inputs the model was fitted on.
coef_table = pd.DataFrame(
    {"feature": features, "coefficient": log_model.coef_[0]}
)
coef_df = coef_table.sort_values(by="coefficient", ascending=False)

display(coef_df)

Unnamed: 0,feature,coefficient
1,regularMarketChangePercent,2.431432
2,VolumeSpike,2.01869
3,log_marketCap,0.70409
0,regularMarketPrice,0.025866
4,log_volume,-0.979273


In [14]:
# Test features plus identifiers, predictions and the true label. `.values`
# is used so the columns are attached by position, not by index alignment.
results = X_test.assign(
    symbol=sym_test["symbol"].values,
    date=sym_test["date"].values,
    pred_proba_hot=y_proba,
    pred_hot=y_pred,
    true_hot=y_test.values,
)

# Highest predicted probability first.
results_sorted = results.sort_values(by="pred_proba_hot", ascending=False)

# Show the five rows with the highest predicted probability.
display(results_sorted.head(5))

Unnamed: 0,regularMarketPrice,regularMarketChangePercent,VolumeSpike,log_marketCap,log_volume,symbol,date,pred_proba_hot,pred_hot,true_hot
14849,18.52,147.26303,55.681207,21.853444,14.764999,WVE,2025-12-08 21:39:21,1.0,1,1
44110,55.04,138.2684,1.938699,21.736518,12.384357,THH,2026-01-13 20:48:12,1.0,1,1
15099,18.52,147.26303,55.681207,21.853444,14.764999,WVE,2025-12-08 22:43:36,1.0,1,1
14599,18.34,144.7632,54.867382,21.843677,14.764999,WVE,2025-12-08 20:56:27,1.0,1,1
14969,69.98,102.48842,18.389559,22.169295,13.866346,GPCR,2025-12-08 22:19:37,1.0,1,1


In [15]:
import plotly.graph_objects as go
import matplotlib.cm as cm
import matplotlib.colors as colors 
import plotly.express as px

In [16]:
# The 50 symbols whose best test-set row has the highest predicted
# probability, and every test row belonging to those symbols.
peak_proba_by_symbol = results_sorted.groupby("symbol")["pred_proba_hot"].max()
top_symbols = peak_proba_by_symbol.sort_values(ascending=False).head(50).index

in_top_symbols = results_sorted["symbol"].isin(top_symbols)
df_top = results_sorted[in_top_symbols]

In [17]:
# Symbol x snapshot matrix of predicted probabilities.
# Note: DataFrame.pivot raises if a (symbol, date) pair occurs more than once.
heatmap_df = df_top.pivot(index="symbol", columns="date", values="pred_proba_hot")

heatmap_labels = {
    "x": "Date",
    "y": "Symbol",
    "color": "Predicted Hot Probability",
}
fig_heatmap = px.imshow(
    heatmap_df,
    labels=heatmap_labels,
    color_continuous_scale="YlOrRd",
    text_auto=False,
    aspect="auto",
)

# Black background with white text.
heatmap_layout = {
    "title": "Predicted Hot-Stock Probability Heatmap",
    "xaxis_nticks": 20,
    "yaxis": {"categoryorder": "total ascending"},
    "plot_bgcolor": "black",
    "paper_bgcolor": "black",
    "font_color": "white",
}
fig_heatmap.update_layout(**heatmap_layout)

# Save as a standalone HTML page that loads plotly.js from the CDN.
chart_path = os.path.join(OUTPUT_DIR, "predicted_hot_probability.html")
fig_heatmap.write_html(chart_path, include_plotlyjs="cdn")


In [18]:
# First 100 rows of df_top (it inherits the probability-descending order of
# results_sorted): volume spike vs. price change, sized by log volume.
bubble_rows = df_top.head(100)

fig_bubble = px.scatter(
    bubble_rows,
    x="VolumeSpike",
    y="regularMarketChangePercent",
    size="log_volume",
    size_max=25,
    color="pred_proba_hot",
    color_continuous_scale="YlOrRd",
    hover_name="symbol",
    hover_data=["date", "regularMarketPrice", "log_marketCap"],
)

# Titles plus black background with white text.
bubble_layout = {
    "title": "Hot Stocks Feature Space",
    "xaxis_title": "Volume Spike",
    "yaxis_title": "Price Change %",
    "plot_bgcolor": "black",
    "paper_bgcolor": "black",
    "font_color": "white",
}
fig_bubble.update_layout(**bubble_layout)

# Save as a standalone HTML page that loads plotly.js from the CDN.
chart_path = os.path.join(OUTPUT_DIR, "hot_stocks_feature_space.html")
fig_bubble.write_html(chart_path, include_plotlyjs="cdn")

In [19]:
# Best predicted probability per symbol; keep the 20 highest as a flat table.
peak_proba = results_sorted.groupby("symbol")["pred_proba_hot"].max()
top20 = peak_proba.sort_values(ascending=False).head(20).reset_index()

fig_bar = px.bar(
    top20,
    x="symbol",
    y="pred_proba_hot",
    text="pred_proba_hot",
    color="pred_proba_hot",
    color_continuous_scale="YlOrRd",
)

# Titles plus black background with white text.
bar_layout = {
    "title": "Top 20 Predicted Hot Stocks",
    "xaxis_title": "Symbol",
    "yaxis_title": "Predicted Probability",
    "plot_bgcolor": "black",
    "paper_bgcolor": "black",
    "font_color": "white",
}
fig_bar.update_layout(**bar_layout)

# Save as a standalone HTML page that loads plotly.js from the CDN.
chart_path = os.path.join(OUTPUT_DIR, "top20_predicted_hot_stocks.html")
fig_bar.write_html(chart_path, include_plotlyjs="cdn")

In [20]:
# Example using market cap bins
results_sorted['marketCap_bin'] = pd.qcut(results_sorted['log_marketCap'], 4, labels=['Small','Mid','Large','Mega'])

fig_tree = px.treemap(
    results_sorted.head(100),
    path=['marketCap_bin','symbol'],
    values='pred_proba_hot',
    color='pred_proba_hot',
    color_continuous_scale='YlOrRd',
)

fig_tree.update_layout(
    title="Treemap of Top 100 Hot Stocks by Market Cap",
    plot_bgcolor='black',
    paper_bgcolor='black',
    font_color='white'
)

chart_path = os.path.join(OUTPUT_DIR, f"treemap_hot_stocks.html")
fig_tree.write_html(chart_path, include_plotlyjs='cdn')


In [21]:
# 3D view of the 100 highest-probability rows: volume spike, price change and
# log market cap on the axes, colour = predicted probability, size = log volume.
scatter3d_rows = results_sorted.head(100)

fig_3d = px.scatter_3d(
    scatter3d_rows,
    x="VolumeSpike",
    y="regularMarketChangePercent",
    z="log_marketCap",
    size="log_volume",
    size_max=20,
    color="pred_proba_hot",
    color_continuous_scale="YlOrRd",
    hover_name="symbol",
)

# Axis titles plus black background with white text.
scene_titles = {
    "xaxis_title": "VolumeSpike",
    "yaxis_title": "Price Change %",
    "zaxis_title": "log(MarketCap)",
}
fig_3d.update_layout(
    title="3D Hot Stock Feature Space",
    scene=scene_titles,
    plot_bgcolor="black",
    paper_bgcolor="black",
    font_color="white",
)

# Save as a standalone HTML page that loads plotly.js from the CDN.
chart_path = os.path.join(OUTPUT_DIR, "3d_hot_stocks.html")
fig_3d.write_html(chart_path, include_plotlyjs="cdn")


In [22]:
# Take top 200 predicted hot stocks for performance & cool effect
df_top3d = results_sorted.sort_values("pred_proba_hot", ascending=False).head(200)

fig_3d_cool = px.scatter_3d(
    df_top3d,
    x="regularMarketChangePercent",
    y="VolumeSpike",
    z="log_marketCap",
    color="pred_proba_hot",
    size="log_volume",
    hover_name="symbol",
    hover_data=["date", "regularMarketPrice"],
    color_continuous_scale="Turbo",  # vibrant neon-style colors
    size_max=20
)

# Dark theme & style adjustments
fig_3d_cool.update_layout(
    title="üî• 3D Galaxy of Hot Stocks üî•",
    scene=dict(
        xaxis_title='Price Change %',
        yaxis_title='Volume Spike',
        zaxis_title='log(MarketCap)',
        xaxis=dict(backgroundcolor="black", gridcolor="gray", showbackground=True),
        yaxis=dict(backgroundcolor="black", gridcolor="gray", showbackground=True),
        zaxis=dict(backgroundcolor="black", gridcolor="gray", showbackground=True),
    ),
    plot_bgcolor='black',
    paper_bgcolor='black',
    font_color='white'
)

# Optional: make markers ‚Äúglow‚Äù by adjusting opacity
fig_3d_cool.update_traces(marker=dict(opacity=0.9, line=dict(width=0.5, color='white')))

chart_path = os.path.join(OUTPUT_DIR, f"3d_galaxy_hot_stocks.html")
fig_3d_cool.write_html(chart_path, include_plotlyjs='cdn')



In [23]:
# ----------------------------
# 1. Select the rows to animate
# ----------------------------
# The 100 symbols whose best test-set row has the highest predicted probability.
top_symbols = results_sorted.groupby('symbol')['pred_proba_hot'].max().sort_values(ascending=False).head(100).index
df_ani = results_sorted[results_sorted['symbol'].isin(top_symbols)].copy()

# Parse the timestamps and sort chronologically so the frames play in time order
df_ani['date'] = pd.to_datetime(df_ani['date'])
df_ani = df_ani.sort_values('date')

# ----------------------------
# 2. Animated 3D scatter (galaxy)
# ----------------------------
fig_3d_anim = px.scatter_3d(
    df_ani,
    x="regularMarketChangePercent",
    y="VolumeSpike",
    z="log_marketCap",
    color="pred_proba_hot",
    size="log_volume",
    hover_name="symbol",
    hover_data=["regularMarketPrice", "log_marketCap"],
    animation_frame=df_ani['date'].dt.strftime('%Y-%m-%d %H:%M:%S'),  # animate by snapshot
    color_continuous_scale="Turbo",
    size_max=25
)

# ----------------------------
# 3. Style layout for dark theme
# ----------------------------
fig_3d_anim.update_layout(
    # The title previously contained mis-decoded bytes instead of the emoji.
    title="🌌 Animated 3D Galaxy of Hot Stocks 🌌",
    scene=dict(
        xaxis_title='Price Change %',
        yaxis_title='Volume Spike',
        zaxis_title='log(MarketCap)',
        xaxis=dict(backgroundcolor="black", gridcolor="gray", showbackground=True),
        yaxis=dict(backgroundcolor="black", gridcolor="gray", showbackground=True),
        zaxis=dict(backgroundcolor="black", gridcolor="gray", showbackground=True),
    ),
    plot_bgcolor='black',
    paper_bgcolor='black',
    font_color='white'
)

# Optional: slightly transparent markers for galaxy effect
fig_3d_anim.update_traces(marker=dict(opacity=0.8, line=dict(width=0.5, color='white')))

# Save as a standalone HTML page that loads plotly.js from the CDN.
chart_path = os.path.join(OUTPUT_DIR, "3d_animated_hot_stocks.html")
fig_3d_anim.write_html(chart_path, include_plotlyjs='cdn')

In [24]:
# ----------------------------
# Select the rows to animate
# ----------------------------
# The 100 symbols whose best test-set row has the highest predicted probability.
top_symbols = results_sorted.groupby('symbol')['pred_proba_hot'].max().sort_values(ascending=False).head(100).index
df_top = results_sorted[results_sorted['symbol'].isin(top_symbols)].copy()
df_top['date'] = pd.to_datetime(df_top['date'])

# ----------------------------
# Create spiral coordinates
# ----------------------------
# Within each snapshot, rank rows by predicted probability; the rank sets the
# angle (scaled by the largest rank overall) and the probability sets the
# distance from the centre.
df_top['rank'] = df_top.groupby('date')['pred_proba_hot'].rank(method='first')
df_top['theta'] = 2 * np.pi * df_top['rank'] / df_top['rank'].max()
df_top['radius'] = df_top['pred_proba_hot'] * 10  # scale radius by hot probability
df_top['x'] = df_top['radius'] * np.cos(df_top['theta'])
df_top['y'] = df_top['radius'] * np.sin(df_top['theta'])
df_top['z'] = df_top['log_marketCap']


def _spiral_trace(snapshot):
    """Build the Scatter3d marker trace for the rows of a single snapshot."""
    return go.Scatter3d(
        x=snapshot['x'],
        y=snapshot['y'],
        z=snapshot['z'],
        mode='markers',
        marker=dict(
            size=snapshot['log_volume'],
            color=snapshot['pred_proba_hot'],
            colorscale='Turbo',
            opacity=0.8,
            line=dict(width=0.5, color='white')
        ),
        text=snapshot['symbol'],
        # Marker size and z are log values, so the hover labels say so.
        hovertemplate="<b>%{text}</b><br>Hot: %{marker.color:.2f}<br>log(Volume): %{marker.size:.2f}<br>log(MarketCap): %{z:.2f}<extra></extra>"
    )


# One (label, rows) pair per snapshot, in chronological order.
snapshots = [(str(t), snapshot) for t, snapshot in df_top.groupby('date')]

# ----------------------------
# Build interactive 3D scatter
# ----------------------------
# Each animation frame replaces the figure's traces by position, so the base
# figure must hold exactly ONE trace (the first snapshot). Adding a trace per
# snapshot would overlay all snapshots and leave all but trace 0 untouched
# when the animation plays.
initial_traces = [_spiral_trace(snapshots[0][1])] if snapshots else []
fig = go.Figure(data=initial_traces)

# ----------------------------
# Layout
# ----------------------------
fig.update_layout(
    # The title previously contained mis-decoded bytes instead of the emoji.
    title="🌌 Spiral Galaxy of Hot Stocks 🌌",
    scene=dict(
        xaxis_title='X Spiral',
        yaxis_title='Y Spiral',
        zaxis_title='log(MarketCap)',
        xaxis=dict(backgroundcolor="black", gridcolor="gray", showbackground=True),
        yaxis=dict(backgroundcolor="black", gridcolor="gray", showbackground=True),
        zaxis=dict(backgroundcolor="black", gridcolor="gray", showbackground=True),
    ),
    plot_bgcolor='black',
    paper_bgcolor='black',
    font_color='white',
    # Play / Pause buttons driving the frames defined below.
    updatemenus=[dict(type='buttons', showactive=False,
                      buttons=[dict(label='Play',
                                    method='animate',
                                    args=[None, {"frame": {"duration": 800, "redraw": True},
                                                 "fromcurrent": True, "transition": {"duration": 300}}]),
                               dict(label='Pause',
                                    method='animate',
                                    args=[[None], {"frame": {"duration": 0, "redraw": False},
                                                   "mode": "immediate",
                                                   "transition": {"duration": 0}}])])]
)

# ----------------------------
# Add animation frames
# ----------------------------
# One frame per snapshot, each carrying the single trace that replaces trace 0.
frames = [go.Frame(data=[_spiral_trace(snapshot)], name=label) for label, snapshot in snapshots]

fig.frames = frames

# Save as a standalone HTML page that loads plotly.js from the CDN.
chart_path = os.path.join(OUTPUT_DIR, "3d_spiral_hot_stocks.html")
fig.write_html(chart_path, include_plotlyjs='cdn')