In [1]:
import os
import pandas as pd
import numpy as np
import glob
import re
from pathlib import Path

from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from xgboost import XGBClassifier

In [2]:
# Project-relative working folders.
# DATA_HOT_SCORE holds the hotscore_*.csv snapshots loaded further down and
# OUTPUT_DIR receives the generated chart HTML files; DATA_DAILY is created
# here but is not read by the cells visible in this notebook.
DATA_DAILY = Path("data") / "daily"
DATA_HOT_SCORE = Path("data") / "hotscore"
OUTPUT_DIR = Path("output") / "classification"

# Make sure each folder exists (parents included); existing folders are left alone.
for folder in [DATA_DAILY, DATA_HOT_SCORE, OUTPUT_DIR]:
    folder.mkdir(parents=True, exist_ok=True)

In [4]:

# Load every per-day snapshot file (hotscore_YYYYMMDD.csv) into one frame.
# The paths are sorted so the load order does not depend on the order in which
# the filesystem happens to list the directory.
files = sorted(glob.glob(str(DATA_HOT_SCORE / "hotscore_*.csv")))
if not files:
    raise FileNotFoundError(
        f"No hotscore_*.csv files found in {DATA_HOT_SCORE.resolve()}"
    )

all_data = []
skipped_files = []

for file in files:
    # The snapshot date exists only in the file name, so look at the base name.
    date_match = re.search(r'hotscore_(\d{8})\.csv', Path(file).name)
    if date_match is None:
        # Rows without a date would end up with a missing snapshot_date and
        # cannot be grouped or ordered by day, so the file is reported
        # instead of being mixed into the data set.
        skipped_files.append(file)
        continue

    df = pd.read_csv(file)
    df['snapshot_date'] = pd.to_datetime(date_match.group(1), format='%Y%m%d')
    all_data.append(df)

if skipped_files:
    print(
        f"Skipped {len(skipped_files)} file(s) without a YYYYMMDD date in the name:",
        skipped_files,
    )

if not all_data:
    raise ValueError(
        "None of the hotscore_*.csv files carries a YYYYMMDD date in its name."
    )

# One long frame ordered by day, then symbol. The original row index produced
# by the concatenation is kept (no reset), as before.
score_data = pd.concat(all_data, ignore_index=True)
score_data = score_data.sort_values(['snapshot_date', 'symbol'])

print("Loaded rows:", score_data.shape)

Loaded rows: (699514, 12)


In [5]:
def _at_or_above_p85(day_scores):
    """Flag the scores of one snapshot date that reach that date's 85th percentile."""
    return day_scores >= day_scores.quantile(0.85)


# Cross-sectional label: within each snapshot_date, rows whose HotScore is at
# or above that date's 85th percentile get 1, all others 0.
# NOTE: a later cell assigns `target_hot` again using a single global
# threshold, so this per-date labelling is not what the model ends up seeing.
hot_flags = score_data.groupby('snapshot_date')['HotScore'].transform(_at_or_above_p85)
score_data['target_hot'] = hot_flags.astype(int)


In [7]:
# Label a row "hot" (1) when its HotScore is at or above the 85th percentile of
# ALL rows. This assignment replaces any per-snapshot labelling done earlier,
# so the global threshold is the definition the model trains on.
# NOTE(review): the threshold is computed over every date at once, so labels
# of early snapshots depend on later data — confirm this is intended.
threshold = score_data['HotScore'].quantile(0.85)
score_data['target_hot'] = (score_data['HotScore'] >= threshold).astype(int)

# Model inputs. TrendScore IS part of the feature set (an older comment here
# claimed it had been removed, which did not match the list).
# NOTE(review): the label is a pure threshold on HotScore, and the recorded
# cross-validation output is a perfect 1.00 on every metric. Presumably
# HotScore is derived from these score columns, in which case the classifier
# only re-learns that formula — verify before trusting the metrics.
features = [
    'regularMarketPrice',
    'regularMarketChangePercent',
    'VolumeSpike',
    'MomentumScore',
    'VolumeScore',
    'VolatilityScore',
    'TrendScore'
]
X = score_data[features]
y = score_data['target_hot']

# Class balance of the label; kept as the last expression so the notebook
# actually displays it.
y.value_counts(normalize=True)

In [8]:
# TimeSeriesSplit, classification_report, roc_auc_score and XGBClassifier are
# imported in the notebook's first cell, so they are not imported again here.

def clean_features(frame):
    """Return a cleaned copy of a feature frame for fitting or predicting.

    Infinite values become NaN and every NaN is then replaced by 0. If the
    frame contains 'marketCap' or 'regularMarketVolume', those columns are
    log1p-transformed (neither is in the current feature list, so this step
    is a no-op unless they are added). The input frame is not modified.
    """
    cleaned = frame.replace([np.inf, -np.inf], np.nan).fillna(0)
    for col in ('marketCap', 'regularMarketVolume'):
        if col in cleaned.columns:
            cleaned[col] = np.log1p(cleaned[col])
    return cleaned


# Expanding-window splits over the row order of X (rows are sorted by
# snapshot_date, then symbol).
# NOTE(review): the split is by row position, so a single snapshot_date can be
# cut between the train and test side of a fold — confirm that is acceptable.
tscv = TimeSeriesSplit(n_splits=5)

model = XGBClassifier(
    n_estimators=300,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='logloss',
    random_state=42
)

fold_aucs = []

for fold, (train_idx, test_idx) in enumerate(tscv.split(X), 1):
    X_train = clean_features(X.iloc[train_idx])
    X_test = clean_features(X.iloc[test_idx])
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # The same estimator object is fitted on every fold, so after the loop
    # `model`, `y_test`, `preds` and `probs` hold the state of the LAST fold;
    # later cells rely on exactly those variables.
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    probs = model.predict_proba(X_test)[:, 1]

    print(f"Fold {fold} Classification Report:")
    print(classification_report(y_test, preds))
    auc = roc_auc_score(y_test, probs)
    fold_aucs.append(auc)
    print(f"Fold {fold} ROC AUC: {auc:.4f}\n")

# One summary figure across folds, so the folds need not be compared by eye.
print(f"Mean ROC AUC over {len(fold_aucs)} folds: {np.mean(fold_aucs):.4f}")


Fold 1 Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    101191
           1       1.00      1.00      1.00     15394

    accuracy                           1.00    116585
   macro avg       1.00      1.00      1.00    116585
weighted avg       1.00      1.00      1.00    116585

Fold 1 ROC AUC: 1.0000

Fold 2 Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    101351
           1       1.00      1.00      1.00     15234

    accuracy                           1.00    116585
   macro avg       1.00      1.00      1.00    116585
weighted avg       1.00      1.00      1.00    116585

Fold 2 ROC AUC: 1.0000

Fold 3 Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    101526
           1       1.00      1.00      1.00     15059

    accuracy                           1.00 

In [9]:
# Step 1: compute hot probabilities for every row.
# The model was fitted on inputs where +/-inf and NaN had been replaced by 0,
# so the same clean-up is applied here before predicting; scoring the raw X
# would feed the model values it never saw during training.
# NOTE: `model` is the estimator left over from the last cross-validation
# fold, so the rows of that fold's training part are scored in-sample.
X_scoring = X.replace([np.inf, -np.inf], np.nan).fillna(0)
score_data['hot_probability'] = model.predict_proba(X_scoring)[:, 1]

display(score_data.head(5))

Unnamed: 0,symbol,HotScore,TrendScore,regularMarketPrice,regularMarketChangePercent,VolumeSpike,averageDailyVolume3Month,MomentumScore,VolumeScore,VolatilityScore,marketCap,snapshot_date,target_hot,hot_probability
0,AA,0.794401,0.520833,41.845,6.74745,0.940394,6727448.0,0.903646,0.802083,0.726562,10836350000.0,2026-01-17,0,9.292819e-08
50,AA,0.794401,0.520833,41.845,6.74745,0.940394,6727448.0,0.903646,0.802083,0.726562,10836350000.0,2026-01-17,0,9.292819e-08
100,AA,0.773989,0.557951,41.56,6.02041,1.039631,6727448.0,0.876011,0.749326,0.746631,10762550000.0,2026-01-17,0,5.246805e-08
150,AA,0.789218,0.571429,41.57,6.04592,1.136707,6727448.0,0.881402,0.778976,0.754717,10765130000.0,2026-01-17,0,6.185797e-08
200,AA,0.789218,0.571429,41.57,6.04592,1.136707,6727448.0,0.881402,0.778976,0.754717,10765130000.0,2026-01-17,0,6.185797e-08


In [13]:
import plotly.graph_objects as go
import matplotlib.cm as cm
import matplotlib.colors as colors 
import plotly.express as px

# Shared chart settings reused by the plotting cells below.
DARK_TEMPLATE = 'plotly_dark'   # template name passed to the figures
PROB_RANGE = [0, 1]             # axis range used for probability axes
CHART_HEIGHT = 700              # figure height passed to update_layout
MARGINS = {"l": 160, "r": 40, "t": 60, "b": 40}   # left / right / top / bottom


In [16]:
# The 30 symbols whose highest predicted probability (over all rows) is largest.
top_symbols = (
    score_data.groupby('symbol')['hot_probability']
    .max()
    .sort_values(ascending=False)
    .head(30)
    .index
)

# Rows of those symbols only; the line-chart and stream-chart cells reuse it.
df_chart = score_data[score_data['symbol'].isin(top_symbols)]

# symbol x snapshot_date grid; several rows for the same (symbol, date) pair
# are averaged into one cell.
heatmap_df = df_chart.pivot_table(
    index='symbol',
    columns='snapshot_date',
    values='hot_probability',
    aggfunc='mean'
)

fig = px.imshow(
    heatmap_df,
    aspect="auto",
    color_continuous_scale="YlOrRd",
    template=DARK_TEMPLATE,
    # The title used to contain mis-decoded UTF-8 bytes; the intended emoji
    # and em dash are restored.
    title="🔥 AI Probability Heatmap — Cross-Sectional Strength"
)

fig.update_layout(
    height=800,
    margin=dict(l=180, r=40, t=80, b=60),
    coloraxis_colorbar=dict(title="Hot Prob")
)

fig.write_html(OUTPUT_DIR / "superchart-1.0.html", include_plotlyjs="cdn")


In [17]:

# One line per symbol (first 20 entries of `top_symbols`) showing how the
# predicted probability evolves over the snapshot dates.
fig = go.Figure()

for sym in top_symbols[:20]:
    # The data holds several rows per (symbol, snapshot_date). Plotting them
    # raw repeats the same x value and draws vertical zig-zags, so each date
    # is collapsed to its mean first — the same aggregation the heatmap uses.
    daily_prob = (
        df_chart.loc[df_chart['symbol'] == sym]
        .groupby('snapshot_date')['hot_probability']
        .mean()
        .sort_index()
    )
    fig.add_trace(go.Scatter(
        x=daily_prob.index,
        y=daily_prob.values,
        mode='lines',
        name=sym,
        line=dict(width=2),
        hovertemplate=(
            "<b>%{fullData.name}</b><br>"
            "Date: %{x}<br>"
            "Hot Prob: %{y:.3f}<extra></extra>"
        )
    ))

fig.update_layout(
    # Title restored from mis-decoded UTF-8 (emoji and em dash).
    title="📈 AI Hot Stocks — Probability Evolution",
    template=DARK_TEMPLATE,
    height=750,
    yaxis=dict(range=PROB_RANGE, title="Hot Probability"),
    xaxis=dict(title="Date"),
    hovermode="x unified",
    margin=MARGINS,
    legend=dict(font=dict(size=10))
)

fig.write_html(OUTPUT_DIR / "superchart-2.0.html", include_plotlyjs="cdn")


In [19]:
# plotly.express is already imported as `px` in the chart-setup cell above.

# Top 20 symbols by their highest predicted probability, best first.
df_bar = (
    score_data
    .groupby('symbol', as_index=False)
    .agg(max_hot_probability=('hot_probability', 'max'))
    .sort_values('max_hot_probability', ascending=False)
    .head(20)
)

fig = px.bar(
    df_bar,
    x="max_hot_probability",
    y="symbol",
    orientation="h",
    color="max_hot_probability",
    color_continuous_scale="Turbo",
    # Title restored from mis-decoded UTF-8 (emoji and em dash).
    title="🧠 AI Hot Stocks — Max Probability Ranking",
    template=DARK_TEMPLATE
)

fig.update_layout(
    xaxis=dict(range=PROB_RANGE, title="Hot Probability"),
    # A horizontal bar chart draws the first row at the bottom; reversing the
    # axis puts the highest-ranked symbol at the top, as a ranking should read.
    yaxis=dict(title="Symbol", autorange="reversed"),
    height=CHART_HEIGHT,
    margin=MARGINS,
    coloraxis_colorbar=dict(title="Hot Prob")
)

fig.write_html(
    OUTPUT_DIR / "superchart-3.0.html",
    include_plotlyjs="cdn"
)


In [18]:
# Most recent snapshot date present in the data.
latest_date = score_data['snapshot_date'].max()

# Top 15 symbols on that date. The data can hold several rows per symbol for
# one date; without collapsing them, the same symbol could fill several of the
# 15 slots and its bars would stack above the fixed 0-1 axis. Each symbol is
# therefore reduced to its mean probability first (the aggregation the heatmap
# and stream charts also use).
df_latest = (
    score_data.loc[score_data['snapshot_date'] == latest_date]
    .groupby('symbol', as_index=False)['hot_probability']
    .mean()
    .sort_values('hot_probability', ascending=False)
    .head(15)
)

fig = px.bar(
    df_latest,
    x='symbol',
    y='hot_probability',
    color='hot_probability',
    color_continuous_scale="Turbo",
    # Title restored from mis-decoded UTF-8 (emoji and em dash).
    title=f"🧠 AI Hot Stocks — Snapshot {latest_date.date()}",
    template=DARK_TEMPLATE
)

fig.update_layout(
    yaxis=dict(range=PROB_RANGE, title="Hot Probability"),
    xaxis_title="Symbol",
    height=750,
    margin=dict(l=120, r=40, t=80, b=60),
    coloraxis_colorbar=dict(title="Hot Prob")
)

fig.write_html(
    OUTPUT_DIR / "superchart-4.0.html",
    include_plotlyjs="cdn"
)


In [20]:
# plotly.graph_objects is already imported as `go` in the chart-setup cell above.

# snapshot_date x symbol grid of mean probabilities; missing cells become 0.
pivot_df = (
    df_chart.pivot_table(
        index='snapshot_date',
        columns='symbol',
        values='hot_probability',
        aggfunc='mean'
    )
    .fillna(0)
    .sort_index()
)

# The chart stacks the first 15 symbol columns. Stacked raw probabilities can
# add up to far more than 1, which the fixed 0-1 axis used to cut off. Since
# the axis is titled "Relative Hot Probability", each date is normalised so
# the plotted symbols sum to 1 and the whole stack fits the axis.
stream_raw = pivot_df[pivot_df.columns[:15]]
date_totals = stream_raw.sum(axis=1)
# Dates whose total is 0 would divide by zero; they are shown as all-zero.
stream_share = stream_raw.div(date_totals.where(date_totals > 0), axis=0).fillna(0)

fig = go.Figure()

for sym in stream_share.columns:
    fig.add_trace(go.Scatter(
        x=stream_share.index,
        y=stream_share[sym],
        stackgroup='one',
        mode='lines',
        name=sym,
        line=dict(width=1.2),
        opacity=0.6,
        # The hover shows the raw probability next to the plotted share.
        customdata=stream_raw[sym],
        hovertemplate=(
            "<b>%{fullData.name}</b><br>"
            "Date: %{x}<br>"
            "Hot Prob: %{customdata:.3f}<br>"
            "Share: %{y:.3f}<extra></extra>"
        )
    ))

fig.update_layout(
    # Title restored from mis-decoded UTF-8 (emoji).
    title="🌊 AI Market Streamflow",
    template=DARK_TEMPLATE,
    yaxis=dict(range=PROB_RANGE, title="Relative Hot Probability"),
    xaxis=dict(title="Date"),
    hovermode="x unified",
    height=800,
    margin=dict(l=80, r=40, t=80, b=80),
    legend=dict(orientation="h", y=-0.2)
)

fig.write_html(OUTPUT_DIR / "superchart-5.0.html", include_plotlyjs="cdn")


In [21]:
# plotly.graph_objects (`go`) is already imported in the chart-setup cell above.

# Top 15 symbols by peak probability.
# NOTE: this rebinds `top_symbols` (an earlier cell holds a 30-symbol index
# under the same name) and `df`; re-running earlier chart cells after this
# one will see the 15-symbol list.
top_symbols = (
    score_data.groupby('symbol')['hot_probability']
      .max()
      .sort_values(ascending=False)
      .head(15)
      .index.tolist()
)

ribbon_rows = score_data.loc[
    score_data['symbol'].isin(top_symbols),
    ['symbol', 'snapshot_date', 'hot_probability']
].copy()

# Use a real time axis.
ribbon_rows['snapshot_date'] = pd.to_datetime(ribbon_rows['snapshot_date'])

# The data holds several rows per (symbol, snapshot_date). Collapse them to one
# mean value per pair so that (a) the 3-period smoothing spans three snapshot
# dates instead of three rows of the same day and (b) the "top 3 per snapshot"
# selection cannot pick the same symbol more than once.
df = (
    ribbon_rows
    .groupby(['symbol', 'snapshot_date'], as_index=False)['hot_probability']
    .mean()
    .sort_values(['symbol', 'snapshot_date'])
    .reset_index(drop=True)
)

# Smooth probabilities (3-period rolling mean within each symbol).
df['hot_smooth'] = (
    df.groupby('symbol')['hot_probability']
      .transform(lambda x: x.rolling(3, min_periods=1).mean())
)

# Map symbols to y-axis lanes (lane 0 = highest peak probability).
symbol_to_y = {sym: i for i, sym in enumerate(top_symbols)}
df['y_lane'] = df['symbol'].map(symbol_to_y)

fig = go.Figure()

# Ribbon markers plus a subtle connecting line, one trace per symbol.
for sym in top_symbols:
    df_sym = df[df['symbol'] == sym].sort_values('snapshot_date')

    fig.add_trace(go.Scatter(
        x=df_sym['snapshot_date'],
        y=[symbol_to_y[sym]] * len(df_sym),
        mode='markers+lines',
        marker=dict(
            size=14,
            color=df_sym['hot_smooth'],
            colorscale='Turbo',
            cmin=0,
            cmax=1,
            showscale=(sym == top_symbols[-1]),  # one colorbar only
            colorbar=dict(
                title="Hot Probability",
                thickness=12
            )
        ),
        line=dict(
            width=2,
            color='rgba(255,255,255,0.08)'
        ),
        text=[sym] * len(df_sym),
        customdata=df_sym['hot_smooth'],
        hovertemplate=(
            "<b>%{text}</b><br>"
            "Date: %{x}<br>"
            "Hot Prob: %{customdata:.3f}"
            "<extra></extra>"
        ),
        name=sym,
        showlegend=False
    ))

# Highlight the top 3 symbols (by smoothed probability) on every snapshot date.
top_n = 3

top_hits = (
    df.sort_values(['snapshot_date', 'hot_smooth'], ascending=[True, False])
      .groupby('snapshot_date')
      .head(top_n)
)

fig.add_trace(go.Scatter(
    x=top_hits['snapshot_date'],
    y=top_hits['y_lane'],
    mode='markers',
    marker=dict(
        size=18,
        color='rgba(255,215,0,0.95)',
        line=dict(width=2, color='orange'),
        symbol='star'
    ),
    text=top_hits['symbol'],
    customdata=top_hits['hot_smooth'],
    # Hover text and legend name restored from mis-decoded UTF-8 (emoji).
    hovertemplate=(
        "🔥 <b>%{text}</b><br>"
        "Date: %{x}<br>"
        "Prob: %{customdata:.3f}"
        "<extra></extra>"
    ),
    name='🔥 Top AI Picks'
))

# Layout, in the same dark style as the other charts.
fig.update_layout(
    # Title restored from mis-decoded UTF-8 (emoji and em dash).
    title="🧠 AI Probability Ribbon — Cross-Sectional Dominance",
    template="plotly_dark",
    xaxis_title="Date",
    yaxis=dict(
        title="Symbols",
        tickmode='array',
        tickvals=list(symbol_to_y.values()),
        ticktext=list(symbol_to_y.keys()),
        automargin=True
    ),
    yaxis_range=[-1, len(top_symbols)],
    hovermode='closest',
    margin=dict(l=180, r=60, t=80, b=60),
    height=750
)

# Path built the same way as in the other chart cells.
chart_path = OUTPUT_DIR / "superchart-6.0.html"
fig.write_html(chart_path, include_plotlyjs='cdn')


In [22]:
# Pair each feature name with the importance reported by the fitted model,
# then order from most to least important. `model` is the estimator left by
# the cross-validation loop (its last fit).
raw_importance = pd.Series(data=model.feature_importances_, index=features)
importance = raw_importance.sort_values(ascending=False)

importance

VolumeScore                   0.303372
MomentumScore                 0.226652
VolatilityScore               0.210394
TrendScore                    0.118397
regularMarketChangePercent    0.055645
VolumeSpike                   0.052690
regularMarketPrice            0.032849
dtype: float32

In [23]:
# plotly.graph_objects is already imported as `go` in the chart-setup cell above.

# Horizontal bar chart of the feature importances computed in the previous cell.
fig = go.Figure()

fig.add_trace(go.Bar(
    x=importance.values,
    y=importance.index,
    orientation='h',
    marker=dict(
        color=importance.values,
        colorscale='Viridis'
    ),
    hovertemplate=
        "<b>Feature:</b> %{y}<br>" +
        "<b>Importance:</b> %{x:.4f}<extra></extra>"
))

fig.update_layout(
    # Title restored from mis-decoded UTF-8 (emoji and em dash).
    title="🧠 AI Feature Importance — What the Model Cares About",
    xaxis_title="Importance",
    # `importance` is sorted descending and horizontal bars are drawn from the
    # bottom up; reversing the axis puts the most important feature at the top.
    yaxis=dict(title="Feature", autorange="reversed"),
    template="plotly_dark",
    height=600,
    margin=dict(l=180, r=40, t=60, b=40)
)

# Plain string (the former f-string had no placeholders); path built the same
# way as in the other chart cells.
chart_path = OUTPUT_DIR / "ai_feature-1.0.html"
fig.write_html(chart_path, include_plotlyjs='cdn')

In [24]:
from sklearn.metrics import confusion_matrix

# `y_test` and `preds` are the variables left behind by the cross-validation
# loop, so everything in this cell describes the LAST fold only.
class_ids = [0, 1]
labels = ["Not Hot", "Hot"]

# Passing the class ids keeps the matrix 2x2 (rows = actual, columns =
# predicted) even if one class is missing from the fold. The result gets its
# own name because `cm` is already the alias of the imported matplotlib.cm module.
conf_matrix = confusion_matrix(y_test, preds, labels=class_ids)
accuracy = accuracy_score(y_test, preds)
precision = precision_score(y_test, preds)
recall = recall_score(y_test, preds)
f1 = f1_score(y_test, preds)

fig = go.Figure(data=go.Heatmap(
    z=conf_matrix,
    x=labels,
    y=labels,
    colorscale="Blues",
    hovertemplate=
        "Predicted: %{x}<br>" +
        "Actual: %{y}<br>" +
        "Count: %{z}<extra></extra>"
))

fig.update_layout(
    title=dict(
        text=(
            # Title restored from mis-decoded UTF-8 (emoji and em dash).
            "🤖 Confusion Matrix — Model Decisions<br><sup>"
            f"Accuracy: {accuracy:.3f} | "
            f"Precision: {precision:.3f} | "
            f"Recall: {recall:.3f} | "
            f"F1-score: {f1:.3f}"
            "</sup>"
        )
    ),
    xaxis_title="Predicted Label",
    # A heatmap draws the first row at the bottom; reversing the axis shows the
    # first actual class ("Not Hot") in the top row, the usual matrix layout.
    yaxis=dict(title="Actual Label", autorange="reversed"),
    template="plotly_dark",
    height=800
)

# Plain string (the former f-string had no placeholders).
chart_path = OUTPUT_DIR / "ai_confusion_matrix-1.0.html"
fig.write_html(chart_path, include_plotlyjs='cdn')

In [25]:
# Histogram of the predicted hot probabilities of all rows.
fig = go.Figure()

fig.add_trace(go.Histogram(
    x=score_data['hot_probability'],
    nbinsx=40,
    marker=dict(color="#00f2ff"),
    hovertemplate=
        "Probability Bin: %{x}<br>" +
        "Count: %{y}<extra></extra>"
))

fig.update_layout(
    # Title restored from mis-decoded UTF-8 (emoji and em dash).
    title="📊 AI Confidence Distribution — Hot Probability",
    xaxis_title="Predicted Hot Probability",
    yaxis_title="Count",
    template="plotly_dark",
    height=500
)

# Plain string (the former f-string had no placeholders); path built the same
# way as in the other chart cells.
chart_path = OUTPUT_DIR / "ai_probability_distribution-1.0.html"
fig.write_html(chart_path, include_plotlyjs='cdn')