In [1]:
import os
import pandas as pd
import numpy as np
import re
from pathlib import Path 

In [2]:
DATA_HOT_SCORE = Path("data/hotscore")
OUTPUT_DIR = Path("output/logistic_regression")

for p in (DATA_HOT_SCORE, OUTPUT_DIR):
    p.mkdir(parents=True, exist_ok=True)

In [3]:
def latest_file_in_directory(directory=DATA_HOT_SCORE):
    latest_file = max(
        f for f in os.listdir(directory)
        if f.startswith("hotscore_") and f.endswith(".csv")
    )
    return latest_file


In [4]:
latest_file = latest_file_in_directory(DATA_HOT_SCORE)
df = pd.read_csv(os.path.join(DATA_HOT_SCORE, latest_file))
df.shape

(120300, 11)

In [5]:
display(df.head())

Unnamed: 0,symbol,HotScore,TrendScore,regularMarketPrice,regularMarketChangePercent,VolumeSpike,averageDailyVolume3Month,MomentumScore,VolumeScore,VolatilityScore,marketCap
0,ACM,0.748669,0.83432,106.285,3.490756,1.012732,1506045.0,0.668639,0.784024,0.784024,14011910000.0
1,ADUS,0.796366,0.967419,109.4799,5.563492,0.786109,192519.0,0.924812,0.571429,0.879699,2023547000.0
2,ADUS,0.780318,0.95599,108.335,4.459551,0.954628,192519.0,0.853301,0.621027,0.843521,2002385000.0
3,AFRM,0.606071,0.464286,60.67,4.694507,0.179526,5662649.0,0.764286,0.471429,0.635714,20209820000.0
4,AFRM,0.801711,0.574144,62.03,7.040549,0.58522,5662649.0,0.91635,0.760456,0.787072,20662850000.0


In [9]:
df = df.copy()

# Target: HotScore > 0.85
df["is_hot"] = (df["HotScore"] > 0.85).astype(int)

# Clip values to avoid log(0) or negative values
df["marketCap"] = df["marketCap"].clip(lower=1)
df["averageDailyVolume3Month"] = df["averageDailyVolume3Month"].clip(lower=1)

# Log-transform
df["log_marketCap"] = np.log(df["marketCap"])
df["log_volume"] = np.log(df["averageDailyVolume3Month"])

# Features (exclude symbol, date, HotScore)
features = [
    "regularMarketPrice",
    "regularMarketChangePercent",
    "VolumeSpike",
    "log_marketCap",
    "log_volume"
]

X = df[features]
y = df["is_hot"]

# Keep symbol and date for final table
symbol_hotScore = df[["symbol", "HotScore"]]

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test, sym_train, sym_test = train_test_split(
    X, y, symbol_hotScore, test_size=0.25, random_state=42, stratify=y
)

In [11]:
has_inf = np.isinf(X_train.values).any()
has_nan = np.isnan(X_train.values).any()
needs_cleaning = has_inf or has_nan
print("Need cleaning?", needs_cleaning)

if needs_cleaning:
    print("Cleaning X_train and X_test...")
    X_train.replace([np.inf, -np.inf], np.nan, inplace=True)
    X_test.replace([np.inf, -np.inf], np.nan, inplace=True)

    # Keep indexes aligned
    keep_train_idx = X_train.dropna().index
    X_train = X_train.loc[keep_train_idx]
    y_train = y_train.loc[keep_train_idx]

    keep_test_idx = X_test.dropna().index
    X_test = X_test.loc[keep_test_idx]
    y_test = y_test.loc[keep_test_idx]
    sym_test = sym_test.loc[keep_test_idx]


Need cleaning? False


In [12]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report

log_model = LogisticRegression(
    max_iter=1000,
    class_weight="balanced",
    solver="lbfgs"
)
log_model.fit(X_train_scaled, y_train)

In [14]:
y_proba = log_model.predict_proba(X_test_scaled)[:, 1]
y_pred = log_model.predict(X_test_scaled)

print("ROC AUC:", roc_auc_score(y_test, y_proba))
print(classification_report(y_test, y_pred))

ROC AUC: 0.8973420293881036
              precision    recall  f1-score   support

           0       0.85      0.87      0.86     18440
           1       0.79      0.76      0.77     11635

    accuracy                           0.83     30075
   macro avg       0.82      0.81      0.82     30075
weighted avg       0.82      0.83      0.83     30075



In [15]:
coef_df = pd.DataFrame({
    "feature": features,
    "coefficient": log_model.coef_[0]
}).sort_values("coefficient", ascending=False)

display(coef_df)

Unnamed: 0,feature,coefficient
1,regularMarketChangePercent,2.000064
3,log_marketCap,0.97881
2,VolumeSpike,0.603338
0,regularMarketPrice,0.512197
4,log_volume,-0.21311


In [16]:
results = X_test.copy()
results["symbol"] = sym_test["symbol"].values
results["pred_proba_hot"] = y_proba
results["pred_hot"] = y_pred
results["true_hot"] = y_test.values

# Sort by predicted probability descending
results_sorted = results.sort_values("pred_proba_hot", ascending=False)

# Show top 5 predicted hottest stocks
display(results_sorted.head(5))

Unnamed: 0,regularMarketPrice,regularMarketChangePercent,VolumeSpike,log_marketCap,log_volume,symbol,pred_proba_hot,pred_hot,true_hot
120138,37.508,28.320219,6.389348,21.988665,14.169041,TDC,0.999998,1,1
120293,248.51,24.4915,3.184867,25.277299,15.640263,VRT,0.999997,1,1
120292,248.51,24.4915,3.077002,25.277299,15.640263,VRT,0.999997,1,1
120049,79.93,29.54619,2.126834,22.034882,12.778475,DIOD,0.999996,1,1
120194,67.095,24.296038,4.175055,23.398716,14.887448,BWA,0.999984,1,1


In [17]:
import plotly.graph_objects as go
import matplotlib.cm as cm
import matplotlib.colors as colors 
import plotly.express as px

In [18]:
top_symbols = results_sorted.groupby('symbol')['pred_proba_hot'].max().sort_values(ascending=False).head(50).index
df_top = results_sorted[results_sorted['symbol'].isin(top_symbols)]

In [20]:
heatmap_df = df_top.pivot(index='symbol', columns='regularMarketPrice', values='pred_proba_hot')

fig_heatmap = px.imshow(
    heatmap_df,
    labels=dict(x="HotScore", y="Symbol", color="Predicted Hot Probability"),
    color_continuous_scale="YlOrRd",
    text_auto=False,
    aspect="auto"
)

fig_heatmap.update_layout(
    title="Predicted Hot-Stock Probability Heatmap",
    xaxis_nticks=20,
    yaxis={'categoryorder':'total ascending'},
    plot_bgcolor='black',
    paper_bgcolor='black',
    font_color='white'
)

chart_path = os.path.join(OUTPUT_DIR, f"predicted_hot_probability.html")
fig_heatmap.write_html(chart_path, include_plotlyjs='cdn')


ValueError: Index contains duplicate entries, cannot reshape

In [None]:
fig_bubble = px.scatter(
    df_top.head(100),  # top 100 predicted hot stocks
    x="VolumeSpike",
    y="regularMarketChangePercent",
    size="log_volume",
    color="pred_proba_hot",
    hover_name="symbol",
    hover_data=["date", "regularMarketPrice", "log_marketCap"],
    color_continuous_scale="YlOrRd",
    size_max=25
)

fig_bubble.update_layout(
    title="Hot Stocks Feature Space",
    xaxis_title="Volume Spike",
    yaxis_title="Price Change %",
    plot_bgcolor='black',
    paper_bgcolor='black',
    font_color='white'
)

chart_path = os.path.join(OUTPUT_DIR, f"hot_stocks_feature_space.html")
fig_bubble.write_html(chart_path, include_plotlyjs='cdn')

In [None]:
top20 = results_sorted.groupby('symbol')['pred_proba_hot'].max().sort_values(ascending=False).head(20).reset_index()
fig_bar = px.bar(
    top20,
    x='symbol',
    y='pred_proba_hot',
    color='pred_proba_hot',
    color_continuous_scale='YlOrRd',
    text='pred_proba_hot'
)

fig_bar.update_layout(
    title="Top 20 Predicted Hot Stocks",
    xaxis_title="Symbol",
    yaxis_title="Predicted Probability",
    plot_bgcolor='black',
    paper_bgcolor='black',
    font_color='white'
)

chart_path = os.path.join(OUTPUT_DIR, f"top20_predicted_hot_stocks.html")
fig_bar.write_html(chart_path, include_plotlyjs='cdn')

In [None]:
# Example using market cap bins
results_sorted['marketCap_bin'] = pd.qcut(results_sorted['log_marketCap'], 4, labels=['Small','Mid','Large','Mega'])

fig_tree = px.treemap(
    results_sorted.head(100),
    path=['marketCap_bin','symbol'],
    values='pred_proba_hot',
    color='pred_proba_hot',
    color_continuous_scale='YlOrRd',
)

fig_tree.update_layout(
    title="Treemap of Top 100 Hot Stocks by Market Cap",
    plot_bgcolor='black',
    paper_bgcolor='black',
    font_color='white'
)

chart_path = os.path.join(OUTPUT_DIR, f"treemap_hot_stocks.html")
fig_tree.write_html(chart_path, include_plotlyjs='cdn')


In [None]:
fig_3d = px.scatter_3d(
    results_sorted.head(100),
    x="VolumeSpike",
    y="regularMarketChangePercent",
    z="log_marketCap",
    color="pred_proba_hot",
    size="log_volume",
    hover_name="symbol",
    color_continuous_scale='YlOrRd',
    size_max=20
)

fig_3d.update_layout(
    title="3D Hot Stock Feature Space",
    scene=dict(
        xaxis_title='VolumeSpike',
        yaxis_title='Price Change %',
        zaxis_title='log(MarketCap)'
    ),
    plot_bgcolor='black',
    paper_bgcolor='black',
    font_color='white'
)

chart_path = os.path.join(OUTPUT_DIR, f"3d_hot_stocks.html")
fig_3d.write_html(chart_path, include_plotlyjs='cdn')


In [None]:
# Take top 200 predicted hot stocks for performance & cool effect
df_top3d = results_sorted.sort_values("pred_proba_hot", ascending=False).head(200)

fig_3d_cool = px.scatter_3d(
    df_top3d,
    x="regularMarketChangePercent",
    y="VolumeSpike",
    z="log_marketCap",
    color="pred_proba_hot",
    size="log_volume",
    hover_name="symbol",
    hover_data=["date", "regularMarketPrice"],
    color_continuous_scale="Turbo",  # vibrant neon-style colors
    size_max=20
)

# Dark theme & style adjustments
fig_3d_cool.update_layout(
    title="üî• 3D Galaxy of Hot Stocks üî•",
    scene=dict(
        xaxis_title='Price Change %',
        yaxis_title='Volume Spike',
        zaxis_title='log(MarketCap)',
        xaxis=dict(backgroundcolor="black", gridcolor="gray", showbackground=True),
        yaxis=dict(backgroundcolor="black", gridcolor="gray", showbackground=True),
        zaxis=dict(backgroundcolor="black", gridcolor="gray", showbackground=True),
    ),
    plot_bgcolor='black',
    paper_bgcolor='black',
    font_color='white'
)

# Optional: make markers ‚Äúglow‚Äù by adjusting opacity
fig_3d_cool.update_traces(marker=dict(opacity=0.9, line=dict(width=0.5, color='white')))

chart_path = os.path.join(OUTPUT_DIR, f"3d_galaxy_hot_stocks.html")
fig_3d_cool.write_html(chart_path, include_plotlyjs='cdn')



In [None]:
top_symbols = results_sorted.groupby('symbol')['pred_proba_hot'].max().sort_values(ascending=False).head(100).index
df_ani = results_sorted[results_sorted['symbol'].isin(top_symbols)].copy()

# Ensure date is sorted
df_ani['date'] = pd.to_datetime(df_ani['date'])
df_ani = df_ani.sort_values('date')

# ----------------------------
# 2Ô∏è‚É£ Animated 3D scatter (galaxy)
# ----------------------------
fig_3d_anim = px.scatter_3d(
    df_ani,
    x="regularMarketChangePercent",
    y="VolumeSpike",
    z="log_marketCap",
    color="pred_proba_hot",
    size="log_volume",
    hover_name="symbol",
    hover_data=["regularMarketPrice", "log_marketCap"],
    animation_frame=df_ani['date'].dt.strftime('%Y-%m-%d %H:%M:%S'),  # animate by snapshot
    color_continuous_scale="Turbo",
    size_max=25
)

# ----------------------------
# 3Ô∏è‚É£ Style layout for dark theme
# ----------------------------
fig_3d_anim.update_layout(
    title="üåå Animated 3D Galaxy of Hot Stocks üåå",
    scene=dict(
        xaxis_title='Price Change %',
        yaxis_title='Volume Spike',
        zaxis_title='log(MarketCap)',
        xaxis=dict(backgroundcolor="black", gridcolor="gray", showbackground=True),
        yaxis=dict(backgroundcolor="black", gridcolor="gray", showbackground=True),
        zaxis=dict(backgroundcolor="black", gridcolor="gray", showbackground=True),
    ),
    plot_bgcolor='black',
    paper_bgcolor='black',
    font_color='white'
)

# Optional: slightly transparent markers for galaxy effect
fig_3d_anim.update_traces(marker=dict(opacity=0.8, line=dict(width=0.5, color='white')))

chart_path = os.path.join(OUTPUT_DIR, f"3d_animated_hot_stocks.html")
fig_3d_anim.write_html(chart_path, include_plotlyjs='cdn')

In [None]:
top_symbols = results_sorted.groupby('symbol')['pred_proba_hot'].max().sort_values(ascending=False).head(100).index
df_top = results_sorted[results_sorted['symbol'].isin(top_symbols)].copy()
df_top['date'] = pd.to_datetime(df_top['date'])

# ----------------------------
# Create spiral coordinates
# ----------------------------
# Assign each symbol an angle based on rank
df_top['rank'] = df_top.groupby('date')['pred_proba_hot'].rank(method='first')
df_top['theta'] = 2 * np.pi * df_top['rank'] / df_top['rank'].max()
df_top['radius'] = df_top['pred_proba_hot'] * 10  # scale radius by hot probability
df_top['x'] = df_top['radius'] * np.cos(df_top['theta'])
df_top['y'] = df_top['radius'] * np.sin(df_top['theta'])
df_top['z'] = df_top['log_marketCap']

# ----------------------------
# Build interactive 3D scatter
# ----------------------------
fig = go.Figure()

# Add trace per snapshot for animation
for t, frame in df_top.groupby('date'):
    fig.add_trace(go.Scatter3d(
        x=frame['x'],
        y=frame['y'],
        z=frame['z'],
        mode='markers',
        marker=dict(
            size=frame['log_volume'],
            color=frame['pred_proba_hot'],
            colorscale='Turbo',
            opacity=0.8,
            line=dict(width=0.5, color='white')
        ),
        name=str(t),
        text=frame['symbol'],
        hovertemplate="<b>%{text}</b><br>Hot: %{marker.color:.2f}<br>Volume: %{marker.size:.0f}<br>MarketCap: %{z:.2f}<extra></extra>"
    ))

# ----------------------------
# Layout
# ----------------------------
fig.update_layout(
    title="üåå Spiral Galaxy of Hot Stocks üåå",
    scene=dict(
        xaxis_title='X Spiral',
        yaxis_title='Y Spiral',
        zaxis_title='log(MarketCap)',
        xaxis=dict(backgroundcolor="black", gridcolor="gray", showbackground=True),
        yaxis=dict(backgroundcolor="black", gridcolor="gray", showbackground=True),
        zaxis=dict(backgroundcolor="black", gridcolor="gray", showbackground=True),
    ),
    plot_bgcolor='black',
    paper_bgcolor='black',
    font_color='white',
    updatemenus=[dict(type='buttons', showactive=False,
                      buttons=[dict(label='Play',
                                    method='animate',
                                    args=[None, {"frame": {"duration": 800, "redraw": True},
                                                 "fromcurrent": True, "transition": {"duration": 300}}]),
                               dict(label='Pause',
                                    method='animate',
                                    args=[[None], {"frame": {"duration": 0, "redraw": False},
                                                   "mode": "immediate",
                                                   "transition": {"duration": 0}}])])]
)

# ----------------------------
# Add animation frames
# ----------------------------
frames = [go.Frame(data=[go.Scatter3d(
    x=frame['x'],
    y=frame['y'],
    z=frame['z'],
    mode='markers',
    marker=dict(
        size=frame['log_volume'],
        color=frame['pred_proba_hot'],
        colorscale='Turbo',
        opacity=0.8,
        line=dict(width=0.5, color='white')
    ),
    text=frame['symbol'],
    hovertemplate="<b>%{text}</b><br>Hot: %{marker.color:.2f}<br>Volume: %{marker.size:.0f}<br>MarketCap: %{z:.2f}<extra></extra>"
)], name=str(t)) for t, frame in df_top.groupby('date')]

fig.frames = frames

chart_path = os.path.join(OUTPUT_DIR, f"3d_spiral_hot_stocks.html")
fig.write_html(chart_path, include_plotlyjs='cdn')