In [1]:
import os
import pandas as pd
import numpy as np
import glob
import re
from pathlib import Path

from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import classification_report, roc_auc_score
from xgboost import XGBClassifier

In [2]:
# Project directories: daily data, hot-score snapshots, and chart output.
DATA_DAILY = Path("data/daily")
DATA_HOT_SCORE = Path("data/hotscore")
OUTPUT_DIR = Path("output/classification")

# Create each directory (and any missing parents); existing ones are left alone.
for directory in [DATA_DAILY, DATA_HOT_SCORE, OUTPUT_DIR]:
    directory.mkdir(parents=True, exist_ok=True)

In [4]:
def latest_file_in_directory(directory=DATA_HOT_SCORE):
    """Return the name of the latest ``hotscore_*.csv`` file in ``directory``.

    "Latest" is the lexicographically greatest file name, so the result is
    only chronologically correct if the names embed a sortable timestamp
    (TODO confirm the naming scheme of the hot-score exports).

    Args:
        directory: Folder to scan. Defaults to ``DATA_HOT_SCORE``.

    Returns:
        The bare file name (not a full path) of the selected CSV.

    Raises:
        ValueError: If ``directory`` holds no ``hotscore_*.csv`` file.
    """
    candidates = [
        name for name in os.listdir(directory)
        if name.startswith("hotscore_") and name.endswith(".csv")
    ]
    if not candidates:
        # max() on an empty sequence raises a ValueError with no hint about
        # which directory was scanned; raise the same type with a clear message.
        raise ValueError(f"No hotscore_*.csv files found in {directory}")
    return max(candidates)

In [16]:
# Pick the latest hot-score snapshot file and load it into a DataFrame.
latest_file = latest_file_in_directory(DATA_HOT_SCORE)
snapshot_path = os.path.join(DATA_HOT_SCORE, latest_file)
score_data = pd.read_csv(snapshot_path)

# Sanity check: (rows, columns) of the loaded table.
print(score_data.shape)

(45622, 12)


In [17]:
# Preview the first five rows (head() defaults to n=5).
score_data.head()

Unnamed: 0,symbol,date,HotScore,TrendScore,regularMarketPrice,regularMarketChangePercent,VolumeSpike,averageDailyVolume3Month,MomentumScore,VolumeScore,VolatilityScore,marketCap
0,AA,2025-11-26 20:46:26,0.794401,0.520833,41.845,6.74745,0.940394,6727448.0,0.903646,0.802083,0.726562,10836350000.0
1,AAUC,2025-11-26 20:46:26,0.846094,0.854167,19.18,9.788214,1.052893,342331.0,0.96875,0.848958,0.622396,2378320000.0
2,ALAB,2025-11-26 20:46:26,0.773307,0.723958,156.16,7.860205,0.638383,6266829.0,0.942708,0.518229,0.947917,26375990000.0
3,ANF,2025-11-26 20:46:26,0.905599,0.828125,96.205,6.610155,1.88349,2070173.0,0.898438,0.958333,0.864583,4583525000.0
4,ARWR,2025-11-26 20:46:26,0.95638,0.734375,58.675,25.400724,2.501594,2311350.0,1.0,0.976562,0.955729,8112262000.0


In [None]:
# Label a row "hot" (1) when its HotScore is at or above the 85th percentile
# of all rows; everything else is labelled 0.
threshold = score_data['HotScore'].quantile(0.85)
score_data['target_hot'] = (score_data['HotScore'] >= threshold).astype(int)

# Class balance. Printed explicitly because a bare expression in the middle
# of a cell is evaluated but never displayed.
print(score_data['target_hot'].value_counts(normalize=True))

# Model inputs. TrendScore IS part of the feature set.
# NOTE(review): the target is a threshold on HotScore. If HotScore is itself
# computed from the *Score columns below, the classifier is mostly re-learning
# that formula (target leakage) and its metrics will look inflated -- confirm.
features = [
    'regularMarketPrice',
    'regularMarketChangePercent',
    'VolumeSpike',
    'MomentumScore',
    'VolumeScore',
    'VolatilityScore',
    'TrendScore'
]
X = score_data[features]
y = score_data['target_hot']

In [19]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import classification_report, roc_auc_score
from xgboost import XGBClassifier
 
# Time-ordered cross-validation: every fold trains on an initial run of rows
# and tests on the rows that immediately follow it.
# NOTE(review): TimeSeriesSplit splits by row position, so this assumes the
# rows of X are already in chronological order -- confirm.
tscv = TimeSeriesSplit(n_splits=5)

model = XGBClassifier(
    n_estimators=300,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='logloss',
    random_state=42
)

# Clean infinities / NaNs once, up front. The operation is element-wise, so
# doing it before the split gives the same per-fold data as doing it inside
# the loop, without repeating the work or mutating frames in place.
X_clean = X.replace([np.inf, -np.inf], np.nan).fillna(0)

# Optional log-transform for large features; skipped for columns not in X.
for col in ['marketCap', 'regularMarketVolume']:
    if col in X_clean.columns:
        X_clean[col] = np.log1p(X_clean[col])

fold_aucs = []
for fold, (train_idx, test_idx) in enumerate(tscv.split(X_clean), 1):
    X_train, X_test = X_clean.iloc[train_idx], X_clean.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # The same estimator object is re-fitted on every fold.
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    probs = model.predict_proba(X_test)[:, 1]

    print(f"Fold {fold} Classification Report:")
    print(classification_report(y_test, preds))
    auc = roc_auc_score(y_test, probs)
    fold_aucs.append(auc)
    print(f"Fold {fold} ROC AUC: {auc:.4f}\n")

# Single summary figure across folds.
print(f"Mean ROC AUC across {len(fold_aucs)} folds: {np.mean(fold_aucs):.4f}")

# After the loop, `model`, `y_test`, `preds` and `probs` all refer to the
# LAST fold only; later cells that use them inherit that state.


Fold 1 Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.98      0.98      6628
           1       0.88      0.90      0.89       975

    accuracy                           0.97      7603
   macro avg       0.93      0.94      0.94      7603
weighted avg       0.97      0.97      0.97      7603

Fold 1 ROC AUC: 0.9944

Fold 2 Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      6794
           1       0.98      0.90      0.94       809

    accuracy                           0.99      7603
   macro avg       0.98      0.95      0.97      7603
weighted avg       0.99      0.99      0.99      7603

Fold 2 ROC AUC: 0.9994

Fold 3 Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      6937
           1       0.96      0.68      0.79       666

    accuracy                           0.97 

In [23]:
# 1) Compute hot probabilities for every row using the fitted model.
# Apply the same inf/NaN cleaning that was used on the training folds, so the
# model scores the kind of input it was fitted on (raw +/-inf or NaN values
# would otherwise reach predict_proba untouched).
# NOTE(review): `model` was last fitted inside the cross-validation loop, so
# many of these rows were part of its training data; treat the probabilities
# as in-sample scores rather than out-of-sample forecasts.
X_scoring = X.replace([np.inf, -np.inf], np.nan).fillna(0)
score_data['hot_probability'] = model.predict_proba(X_scoring)[:, 1]
display(score_data.head(5))

Unnamed: 0,symbol,date,HotScore,TrendScore,regularMarketPrice,regularMarketChangePercent,VolumeSpike,averageDailyVolume3Month,MomentumScore,VolumeScore,VolatilityScore,marketCap,target_hot,hot_probability
0,AA,2025-11-26 20:46:26,0.794401,0.520833,41.845,6.74745,0.940394,6727448.0,0.903646,0.802083,0.726562,10836350000.0,0,5.274221e-07
1,AAUC,2025-11-26 20:46:26,0.846094,0.854167,19.18,9.788214,1.052893,342331.0,0.96875,0.848958,0.622396,2378320000.0,0,9.081012e-06
2,ALAB,2025-11-26 20:46:26,0.773307,0.723958,156.16,7.860205,0.638383,6266829.0,0.942708,0.518229,0.947917,26375990000.0,0,4.869898e-05
3,ANF,2025-11-26 20:46:26,0.905599,0.828125,96.205,6.610155,1.88349,2070173.0,0.898438,0.958333,0.864583,4583525000.0,0,0.0102378
4,ARWR,2025-11-26 20:46:26,0.95638,0.734375,58.675,25.400724,2.501594,2311350.0,1.0,0.976562,0.955729,8112262000.0,1,0.9998713


In [26]:
import plotly.graph_objects as go
import matplotlib.cm as cm
import matplotlib.colors as colors 
import plotly.express as px

In [None]:
# Optional: keep only the 30 symbols with the highest peak probability so the
# heatmap stays readable.
peak_by_symbol = score_data.groupby('symbol')['hot_probability'].max()
top_symbols = peak_by_symbol.sort_values(ascending=False).head(30).index

in_top = score_data['symbol'].isin(top_symbols)
df_chart = score_data[in_top]

# Reshape to one row per symbol and one column per snapshot date.
heatmap_df = df_chart.pivot(index='symbol', columns='date', values='hot_probability')

# Interactive heatmap with the probability printed in each cell.
axis_labels = dict(x="date", y="Symbol", color="Hot Probability")
fig = px.imshow(
    heatmap_df,
    labels=axis_labels,
    aspect="auto",
    color_continuous_scale="YlOrRd",
    text_auto=True,
)

fig.update_layout(
    title="AI Hot-Stock Probability Heatmap",
    xaxis_nticks=20,
    yaxis={'categoryorder': 'total ascending'},
)

# Save as a standalone HTML page that loads plotly.js from the CDN.
chart_path = os.path.join(OUTPUT_DIR, "superchart-1.0.html")
fig.write_html(chart_path, include_plotlyjs='cdn')


In [28]:
# Keep the 20 symbols with the highest peak probability for clarity.
peak_by_symbol = score_data.groupby('symbol')['hot_probability'].max()
top_symbols = peak_by_symbol.sort_values(ascending=False).head(20).index

df_chart = score_data[score_data['symbol'].isin(top_symbols)]

fig = go.Figure()

# One line (with markers) per symbol, drawn in date order.
for symbol in top_symbols:
    df_sym = df_chart.loc[df_chart['symbol'] == symbol].sort_values('date')
    symbol_trace = go.Scatter(
        x=df_sym['date'],
        y=df_sym['hot_probability'],
        mode='lines+markers',
        name=symbol,
        line=dict(width=2),
        marker=dict(size=6),
        hovertemplate='Symbol: %{text}<br>Time: %{x}<br>Prob: %{y:.3f}',
        text=[symbol] * len(df_sym),
    )
    fig.add_trace(symbol_trace)

# Within every snapshot, take the top_n rows with the highest probability.
top_n = 3
ranked = df_chart.sort_values(['date', 'hot_probability'], ascending=[True, False])
top_per_snapshot = ranked.groupby('date').head(top_n)

# Overlay those rows as red stars.
star_trace = go.Scatter(
    x=top_per_snapshot['date'],
    y=top_per_snapshot['hot_probability'],
    mode='markers',
    marker=dict(size=10, color='red', symbol='star'),
    name='Top Hot',
    hovertemplate='Top Hot Symbol: %{text}<br>Time: %{x}<br>Prob: %{y:.3f}',
    text=top_per_snapshot['symbol'],
)
fig.add_trace(star_trace)

# Layout: dark theme, fixed 0-1 probability axis, legend left of the plot.
legend_position = dict(x=-0.15, y=1, xanchor="left", yanchor="top")
fig.update_layout(
    title="AI Hot-Stock Probabilities Over Time",
    xaxis_title="Snapshot Time",
    yaxis_title="Hot Probability",
    yaxis=dict(range=[0, 1]),
    hovermode='closest',
    legend_title="Symbols",
    template="plotly_dark",
    legend=legend_position,
)

# Save as a standalone HTML page that loads plotly.js from the CDN.
chart_path = os.path.join(OUTPUT_DIR, "superchart-2.0.html")
fig.write_html(chart_path, include_plotlyjs='cdn')

In [29]:
import plotly.express as px

# Rank symbols by their highest hot probability and keep the top 20.
df_bar = (
    score_data
    .groupby('symbol', as_index=False)
    .agg(max_hot_probability=('hot_probability', 'max'))
    .sort_values('max_hot_probability', ascending=False)
    .head(20)
)

# Horizontal bar chart, coloured by the same value it plots.
# The title previously contained mis-encoded bytes in place of the emoji and
# the em dash; they are restored here so the chart heading renders correctly.
fig = px.bar(
    df_bar,
    x="max_hot_probability",
    y="symbol",
    orientation="h",
    color="max_hot_probability",
    color_continuous_scale="Turbo",
    title="🧠 AI Hot Stocks — Max Probability Ranking",
    template="plotly_dark"
)

# Fix the probability axis to 0-1 and leave room on the left for symbol labels.
fig.update_layout(
    xaxis=dict(range=[0, 1]),
    height=700,
    margin=dict(l=160, r=40, t=60, b=40),
    coloraxis_colorbar=dict(title="Hot Prob")
)

# Save as a standalone HTML page that loads plotly.js from the CDN.
fig.write_html(
    os.path.join(OUTPUT_DIR, "superchart-3.0.html"),
    include_plotlyjs="cdn"
)


In [30]:
import plotly.express as px


# Take ONE snapshot at a time (latest or selectable).
# NOTE(review): 'date' has not been converted to datetime at this point, so
# max() compares the raw column values; this picks the latest snapshot only if
# those values sort chronologically (e.g. ISO-formatted strings) -- confirm.
latest_snapshot = score_data['date'].max()

# The 15 highest-probability rows of that snapshot.
df_snapshot = (
    score_data[score_data['date'] == latest_snapshot]
    .sort_values('hot_probability', ascending=False)
    .head(15)
)

# The title previously contained mis-encoded bytes in place of the emoji and
# the em dash; they are restored here so the chart heading renders correctly.
fig = px.bar(
    df_snapshot,
    x='symbol',
    y='hot_probability',
    color='hot_probability',
    color_continuous_scale='Turbo',
    title=f"🧠 AI Hot Stocks — Snapshot {latest_snapshot}",
    template='plotly_dark'
)

fig.update_layout(
    yaxis=dict(range=[0,1]),
    xaxis_title="Symbol",
    yaxis_title="Hot Probability",
    height=700
)

# Save as a standalone HTML page that loads plotly.js from the CDN.
fig.write_html(
    os.path.join(OUTPUT_DIR, "superchart-4.0.html"),
    include_plotlyjs="cdn"
)


In [32]:
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
import os

# Work on a copy so that parsing the dates does not alter score_data.
df_chart = score_data.copy()
df_chart['date'] = pd.to_datetime(df_chart['date'])

# Top 15 symbols by peak probability
top_symbols = (
    df_chart.groupby('symbol')['hot_probability']
    .max()
    .sort_values(ascending=False)
    .head(15)
    .index
)

# Filter, then order chronologically. sort_values returns a new frame, so the
# column assignment below does not write into a slice of the unfiltered frame,
# and the rolling window runs over consecutive snapshots of each symbol.
df_chart = df_chart[df_chart['symbol'].isin(top_symbols)].sort_values('date')

# Smooth probabilities: mean of the current and up to two previous snapshots.
df_chart['hot_probability_smooth'] = (
    df_chart.groupby('symbol')['hot_probability']
    .transform(lambda x: x.rolling(3, min_periods=1).mean())
)

# Pivot for stream chart: one row per date, one column per symbol.
# Snapshots where a symbol is absent are filled with 0.
pivot_df = (
    df_chart.pivot(
        index='date',
        columns='symbol',
        values='hot_probability_smooth'
    )
    .fillna(0)
    .sort_index()
)

# Put the columns in ranking order.
pivot_df = pivot_df[top_symbols]

# Plotly-native colour palette. Named `palette` rather than `colors` so it
# does not overwrite the `matplotlib.colors` module imported as `colors`.
palette = px.colors.qualitative.Bold
color_map = {sym: palette[i % len(palette)] for i, sym in enumerate(top_symbols)}

fig = go.Figure()

# Stream layers: every trace shares one stackgroup, so the areas are stacked.
for sym in top_symbols:
    fig.add_trace(go.Scatter(
        x=pivot_df.index,
        y=pivot_df[sym],
        stackgroup='one',
        mode='lines',
        name=sym,
        line=dict(
            width=1.2,
            color=color_map[sym]
        ),
        fillcolor=color_map[sym],
        opacity=0.55,
        hovertemplate=(
            "<b>%{fullData.name}</b><br>"
            "Time: %{x}<br>"
            "Hot Prob: %{y:.3f}<extra></extra>"
        )
    ))

# Highlight top-3 per snapshot
top_n = 3
top_per_snapshot = (
    df_chart.sort_values(
        ['date', 'hot_probability_smooth'],
        ascending=[True, False]
    )
    .groupby('date')
    .head(top_n)
)

# NOTE(review): these markers sit at each symbol's own smoothed value, while
# the stream layers are stacked, so a star will not generally lie on its
# symbol's band -- confirm this is the intended look.
# The emoji in the legend name and hover text were previously mis-encoded.
fig.add_trace(go.Scatter(
    x=top_per_snapshot['date'],
    y=top_per_snapshot['hot_probability_smooth'],
    mode='markers',
    name='🔥 Top AI Picks',
    marker=dict(
        size=12,
        color='gold',
        symbol='star',
        line=dict(width=1.5, color='#ff9900')
    ),
    hovertemplate=(
        "<b>🔥 %{text}</b><br>"
        "Time: %{x}<br>"
        "Prob: %{y:.3f}<extra></extra>"
    ),
    text=top_per_snapshot['symbol']
))

# Layout: dark theme, no grid lines, horizontal legend below the plot.
fig.update_layout(
    title="🧠 AI Market Streamflow",
    template="plotly_dark",
    hovermode="x unified",
    yaxis=dict(
        title="Relative Hot Probability",
        showgrid=False,
        zeroline=False
    ),
    xaxis=dict(
        title="Date",
        showgrid=False
    ),
    legend=dict(
        orientation="h",
        y=-0.2,
        x=0,
        font=dict(size=11)
    ),
    margin=dict(l=40, r=40, t=80, b=80),
    height=720
)

# Save as a standalone HTML page that loads plotly.js from the CDN.
chart_path = os.path.join(OUTPUT_DIR, "superchart-5.0.html")
fig.write_html(chart_path, include_plotlyjs="cdn")

In [33]:
import plotly.graph_objects as go

# Work on a copy so that parsing the dates does not alter score_data.
df = score_data.copy()
df['date'] = pd.to_datetime(df['date'])

# Select top 15 symbols by max probability
top_symbols = (
    df.groupby('symbol')['hot_probability']
      .max()
      .sort_values(ascending=False)
      .head(15)
      .index.tolist()
)

# Filter, then order chronologically. sort_values returns a new frame, so the
# column assignments below do not write into a slice of the unfiltered frame,
# and the rolling window runs over consecutive snapshots of each symbol.
df = df[df['symbol'].isin(top_symbols)].sort_values('date')

# Smooth probabilities: mean of the current and up to two previous snapshots.
df['hot_smooth'] = df.groupby('symbol')['hot_probability'] \
                     .transform(lambda x: x.rolling(3, min_periods=1).mean())

# Map symbols to y-axis lanes (lane 0 = highest-ranked symbol).
symbol_to_y = {sym: i for i, sym in enumerate(top_symbols)}
df['y_lane'] = df['symbol'].map(symbol_to_y)

# Create figure
fig = go.Figure()

# Plot ribbon markers + subtle connecting line per symbol.
# Each symbol stays on its own horizontal lane; marker colour carries the
# smoothed probability.
for sym in top_symbols:
    df_sym = df[df['symbol'] == sym].sort_values('date')
    fig.add_trace(go.Scatter(
        x=df_sym['date'],
        y=[symbol_to_y[sym]]*len(df_sym),  # lane
        mode='markers+lines',
        marker=dict(
            size=14,
            color=df_sym['hot_smooth'],       # numeric array
            colorscale='Turbo',
            cmin=0,
            cmax=1,
            showscale=(sym == top_symbols[-1]),  # only show one colorbar
            colorbar=dict(title="Hot Probability", thickness=12)
        ),
        line=dict(width=2, color='rgba(255,255,255,0.1)'),  # subtle line
        text=[sym]*len(df_sym),
        customdata=df_sym['hot_smooth'],
        hovertemplate='Symbol: %{text}<br>Time: %{x}<br>Prob: %{customdata:.3f}',
        name=sym,
        showlegend=False
    ))

# Highlight top 3 hot per snapshot
top_n = 3
top_hits = (
    df.sort_values(['date','hot_smooth'], ascending=[True,False])
      .groupby('date')
      .head(top_n)
)

# Gold stars on the lanes of the top symbols of each snapshot.
# The emoji in the hover text was previously mis-encoded.
fig.add_trace(go.Scatter(
    x=top_hits['date'],
    y=top_hits['y_lane'],
    mode='markers',
    marker=dict(
        size=18,
        color='rgba(255,215,0,0.9)',
        line=dict(width=2, color='orange'),
        symbol='star'
    ),
    text=top_hits['symbol'],
    customdata=top_hits['hot_smooth'],
    hovertemplate='🔥 Top Hot<br>Symbol: %{text}<br>Prob: %{customdata:.3f}',
    name='Top Hot'
))

# Layout: label each lane with its symbol instead of the numeric lane index.
fig.update_layout(
    title="AI Probability Ribbon",
    xaxis_title="Date",
    yaxis=dict(
        title="Symbols",
        tickmode='array',
        tickvals=list(symbol_to_y.values()),
        ticktext=list(symbol_to_y.keys()),
        automargin=True
    ),
    yaxis_range=[-1, len(top_symbols)],
    hovermode='closest',
    template='plotly_dark',
    margin=dict(l=180, r=60, t=60, b=40)
)

# Save as a standalone HTML page that loads plotly.js from the CDN.
chart_path = os.path.join(OUTPUT_DIR, "superchart-6.0.html")
fig.write_html(chart_path, include_plotlyjs='cdn')

In [35]:
# Pair each importance value from the fitted model with its feature name,
# then order from most to least important.
raw_importance = pd.Series(model.feature_importances_, index=features)
importance = raw_importance.sort_values(ascending=False)

importance

VolumeScore                   0.312086
MomentumScore                 0.240701
VolatilityScore               0.195399
TrendScore                    0.116529
regularMarketChangePercent    0.060130
VolumeSpike                   0.042544
regularMarketPrice            0.032611
dtype: float32

In [36]:
import plotly.graph_objects as go

fig = go.Figure()

# Horizontal bars, one per feature, coloured by the importance value.
fig.add_trace(go.Bar(
    x=importance.values,
    y=importance.index,
    orientation='h',
    marker=dict(
        color=importance.values,
        colorscale='Viridis'
    ),
    hovertemplate=
        "<b>Feature:</b> %{y}<br>" +
        "<b>Importance:</b> %{x:.4f}<extra></extra>"
))

# The title previously contained mis-encoded bytes in place of the emoji and
# the em dash; they are restored here so the chart heading renders correctly.
fig.update_layout(
    title="🧠 AI Feature Importance — What the Model Cares About",
    xaxis_title="Importance",
    yaxis_title="Feature",
    template="plotly_dark",
    height=600,
    margin=dict(l=180, r=40, t=60, b=40)
)

# Save as a standalone HTML page that loads plotly.js from the CDN.
chart_path = os.path.join(OUTPUT_DIR, "ai_feature-1.0.html")
fig.write_html(chart_path, include_plotlyjs='cdn')

In [37]:
from sklearn.metrics import confusion_matrix
import numpy as np

# `y_test` and `preds` are whatever the cross-validation loop left behind,
# i.e. the labels and predictions of its LAST fold only.
# Stored as `conf_matrix` rather than `cm` so the `matplotlib.cm` module
# imported under that alias is not overwritten.
conf_matrix = confusion_matrix(y_test, preds)
labels = ["Not Hot", "Hot"]

# Rows of the matrix are actual labels, columns are predicted labels.
fig = go.Figure(data=go.Heatmap(
    z=conf_matrix,
    x=labels,
    y=labels,
    colorscale="Blues",
    hovertemplate=
        "Predicted: %{x}<br>" +
        "Actual: %{y}<br>" +
        "Count: %{z}<extra></extra>"
))

# The title previously contained mis-encoded bytes in place of the emoji and
# the em dash; they are restored here so the chart heading renders correctly.
fig.update_layout(
    title="🤖 Confusion Matrix — Model Decisions",
    xaxis_title="Predicted Label",
    yaxis_title="Actual Label",
    template="plotly_dark",
    height=500
)

# Save as a standalone HTML page that loads plotly.js from the CDN.
chart_path = os.path.join(OUTPUT_DIR, "ai_confusion_matrix-1.0.html")
fig.write_html(chart_path, include_plotlyjs='cdn')

In [38]:
fig = go.Figure()

# Histogram of the predicted hot probability over all rows of score_data.
fig.add_trace(go.Histogram(
    x=score_data['hot_probability'],
    nbinsx=40,
    marker=dict(color="#00f2ff"),
    hovertemplate=
        "Probability Bin: %{x}<br>" +
        "Count: %{y}<extra></extra>"
))

# The title previously contained mis-encoded bytes in place of the emoji and
# the em dash; they are restored here so the chart heading renders correctly.
fig.update_layout(
    title="📊 AI Confidence Distribution — Hot Probability",
    xaxis_title="Predicted Hot Probability",
    yaxis_title="Count",
    template="plotly_dark",
    height=500
)

# Save as a standalone HTML page that loads plotly.js from the CDN.
chart_path = os.path.join(OUTPUT_DIR, "ai_probability_distribution-1.0.html")
fig.write_html(chart_path, include_plotlyjs='cdn')