In [52]:
import os
import pandas as pd
import numpy as np
import glob
import re
from pathlib import Path

from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import classification_report, roc_auc_score
from xgboost import XGBClassifier

In [4]:
DATA_DAILY = Path("data/daily")
HOT_SCORE_OUT = Path("data/hotscore")
OUTPUT_DIR = Path("output/classification")

for p in (DATA_DAILY, HOT_SCORE_OUT, OUTPUT_DIR):
    p.mkdir(parents=True, exist_ok=True)

In [12]:
def extract_timestamp(filename):
    ts = re.search(r'_(\d{14})', filename).group(1)
    return pd.to_datetime(ts, format='%Y%m%d%H%M%S')

In [14]:
hot_files = sorted(glob.glob(str(DATA_DAILY / "hot_stocks_*.csv")))

hot_dfs = []
for f in hot_files:
    df = pd.read_csv(f)
    df['snapshot_time'] = extract_timestamp(f)
    hot_dfs.append(df)

hot_data = pd.concat(hot_dfs, ignore_index=True)

print(hot_data.shape)
hot_data.head()

(35711, 13)


Unnamed: 0,symbol,regularMarketPrice,regularMarketChangePercent,regularMarketVolume,averageDailyVolume3Month,marketCap,VolumeSpike,MomentumScore,VolumeScore,VolatilityScore,TrendScore,HotScore,snapshot_time
0,URBN,77.07,12.823895,8497781.0,2098090.0,6913018000.0,4.050246,0.986979,0.994792,0.921875,0.804688,0.958464,2025-11-26 20:46:26
1,SYM,87.51,13.222928,5076274.0,2168859.0,51875260000.0,2.340527,0.989583,0.973958,0.940104,0.820312,0.957292,2025-11-26 20:46:26
2,ARWR,58.675,25.400724,5782060.0,2311350.0,8112262000.0,2.501594,1.0,0.976562,0.955729,0.734375,0.95638,2025-11-26 20:46:26
3,DUOL,188.24,7.100598,2265762.0,1926600.0,8701837000.0,1.176042,0.919271,0.880208,0.960938,0.901042,0.912109,2025-11-26 20:46:26
4,VERA,33.48,13.761466,2616403.0,1539957.0,2140312000.0,1.69901,0.994792,0.942708,0.807292,0.697917,0.909375,2025-11-26 20:46:26


In [13]:
score_files = sorted(glob.glob(str(HOT_SCORE_OUT / "hotscore_*.csv")))

score_dfs = []
for f in score_files:
    df = pd.read_csv(f)
    df['score_time'] = extract_timestamp(f)
    score_dfs.append(df)


score_data = pd.concat(score_dfs, ignore_index=True)


print(score_data.shape)
score_data.head()

(9266476, 9)


Unnamed: 0,symbol,date,HotScore,regularMarketPrice,regularMarketChangePercent,VolumeSpike,averageDailyVolume3Month,marketCap,score_time
0,AA,2025-11-26 20:46:26,0.794401,41.845,6.74745,0.940394,6727448.0,10836350000.0,2025-11-26 20:46:38
1,AAUC,2025-11-26 20:46:26,0.846094,19.18,9.788214,1.052893,342331.0,2378320000.0,2025-11-26 20:46:38
2,ALAB,2025-11-26 20:46:26,0.773307,156.16,7.860205,0.638383,6266829.0,26375990000.0,2025-11-26 20:46:38
3,ANF,2025-11-26 20:46:26,0.905599,96.205,6.610155,1.88349,2070173.0,4583525000.0,2025-11-26 20:46:38
4,ARWR,2025-11-26 20:46:26,0.95638,58.675,25.400724,2.501594,2311350.0,8112262000.0,2025-11-26 20:46:38


In [17]:
hot_data = hot_data.sort_values(['snapshot_time', 'symbol']).reset_index(drop=True)
score_data = score_data.sort_values(['score_time', 'symbol']).reset_index(drop=True)

merged = pd.merge_asof(
    hot_data,
    score_data,
    by='symbol',
    left_on='snapshot_time',
    right_on='score_time',
    direction='forward'
)


merged.head()

Unnamed: 0,symbol,regularMarketPrice_x,regularMarketChangePercent_x,regularMarketVolume,averageDailyVolume3Month_x,marketCap_x,VolumeSpike_x,MomentumScore,VolumeScore,VolatilityScore,...,HotScore_x,snapshot_time,date,HotScore_y,regularMarketPrice_y,regularMarketChangePercent_y,VolumeSpike_y,averageDailyVolume3Month_y,marketCap_y,score_time
0,AA,41.845,6.74745,6326454.0,6727448.0,10836350000.0,0.940394,0.903646,0.802083,0.726562,...,0.794401,2025-11-26 20:46:26,2025-11-26 20:46:26,0.794401,41.845,6.74745,0.940394,6727448.0,10836350000.0,2025-11-26 20:46:38
1,AAUC,19.18,9.788214,360438.0,342331.0,2378320000.0,1.052893,0.96875,0.848958,0.622396,...,0.846094,2025-11-26 20:46:26,2025-11-26 20:46:26,0.846094,19.18,9.788214,1.052893,342331.0,2378320000.0,2025-11-26 20:46:38
2,ALAB,156.16,7.860205,4000639.0,6266829.0,26375990000.0,0.638383,0.942708,0.518229,0.947917,...,0.773307,2025-11-26 20:46:26,2025-11-26 20:46:26,0.773307,156.16,7.860205,0.638383,6266829.0,26375990000.0,2025-11-26 20:46:38
3,ANF,96.205,6.610155,3899150.0,2070173.0,4583525000.0,1.88349,0.898438,0.958333,0.864583,...,0.905599,2025-11-26 20:46:26,2025-11-26 20:46:26,0.905599,96.205,6.610155,1.88349,2070173.0,4583525000.0,2025-11-26 20:46:38
4,ARWR,58.675,25.400724,5782060.0,2311350.0,8112262000.0,2.501594,1.0,0.976562,0.955729,...,0.95638,2025-11-26 20:46:26,2025-11-26 20:46:26,0.95638,58.675,25.400724,2.501594,2311350.0,8112262000.0,2025-11-26 20:46:38


In [18]:
merged.columns.tolist()


['symbol',
 'regularMarketPrice_x',
 'regularMarketChangePercent_x',
 'regularMarketVolume',
 'averageDailyVolume3Month_x',
 'marketCap_x',
 'VolumeSpike_x',
 'MomentumScore',
 'VolumeScore',
 'VolatilityScore',
 'TrendScore',
 'HotScore_x',
 'snapshot_time',
 'date',
 'HotScore_y',
 'regularMarketPrice_y',
 'regularMarketChangePercent_y',
 'VolumeSpike_y',
 'averageDailyVolume3Month_y',
 'marketCap_y',
 'score_time']

In [19]:
# 1. Rename snapshot feature columns (_x → clean names)
merged = merged.rename(columns={
    'regularMarketPrice_x': 'regularMarketPrice',
    'regularMarketChangePercent_x': 'regularMarketChangePercent',
    'VolumeSpike_x': 'VolumeSpike',
    'averageDailyVolume3Month_x': 'averageDailyVolume3Month',
    'marketCap_x': 'marketCap',
    'HotScore_y': 'HotScore_future'
})

# 2. Drop leaky / duplicated columns
drop_cols = [
    'HotScore_x',
    'regularMarketPrice_y',
    'regularMarketChangePercent_y',
    'VolumeSpike_y',
    'averageDailyVolume3Month_y',
    'marketCap_y',
    'date'
]

merged = merged.drop(columns=[c for c in drop_cols if c in merged.columns])

# 3. Drop rows without a future label
merged = merged.dropna(subset=['HotScore_future'])
merged.head()

Unnamed: 0,symbol,regularMarketPrice,regularMarketChangePercent,regularMarketVolume,averageDailyVolume3Month,marketCap,VolumeSpike,MomentumScore,VolumeScore,VolatilityScore,TrendScore,snapshot_time,HotScore_future,score_time
0,AA,41.845,6.74745,6326454.0,6727448.0,10836350000.0,0.940394,0.903646,0.802083,0.726562,0.520833,2025-11-26 20:46:26,0.794401,2025-11-26 20:46:38
1,AAUC,19.18,9.788214,360438.0,342331.0,2378320000.0,1.052893,0.96875,0.848958,0.622396,0.854167,2025-11-26 20:46:26,0.846094,2025-11-26 20:46:38
2,ALAB,156.16,7.860205,4000639.0,6266829.0,26375990000.0,0.638383,0.942708,0.518229,0.947917,0.723958,2025-11-26 20:46:26,0.773307,2025-11-26 20:46:38
3,ANF,96.205,6.610155,3899150.0,2070173.0,4583525000.0,1.88349,0.898438,0.958333,0.864583,0.828125,2025-11-26 20:46:26,0.905599,2025-11-26 20:46:38
4,ARWR,58.675,25.400724,5782060.0,2311350.0,8112262000.0,2.501594,1.0,0.976562,0.955729,0.734375,2025-11-26 20:46:26,0.95638,2025-11-26 20:46:38


In [24]:
threshold = merged['HotScore_future'].quantile(0.90)
merged['target_hot'] = (merged['HotScore_future'] >= threshold).astype(int)

merged['target_hot'].value_counts(normalize=True)

features = [
    'regularMarketPrice',
    'regularMarketChangePercent',
    'VolumeSpike',
    'MomentumScore',
    'VolumeScore',
    'VolatilityScore',
    'TrendScore'
]

X = merged[features]
y = merged['target_hot']


In [29]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import classification_report, roc_auc_score
from xgboost import XGBClassifier
 
tscv = TimeSeriesSplit(n_splits=5)

model = XGBClassifier(
    n_estimators=300,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='logloss',
    random_state=42
)

for fold, (train_idx, test_idx) in enumerate(tscv.split(X), 1):
    X_train, X_test = X.iloc[train_idx].copy(), X.iloc[test_idx].copy()
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # clean infinities / NaNs
    X_train.replace([np.inf, -np.inf], np.nan, inplace=True)
    X_test.replace([np.inf, -np.inf], np.nan, inplace=True)
    X_train.fillna(0, inplace=True)
    X_test.fillna(0, inplace=True)

    # optional log-transform for large features
    for col in ['marketCap', 'regularMarketVolume']:
        if col in X_train.columns:
            X_train[col] = np.log1p(X_train[col])
            X_test[col] = np.log1p(X_test[col])

    # train
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    probs = model.predict_proba(X_test)[:, 1]

    print(f"Fold {fold} Classification Report:")
    print(classification_report(y_test, preds))
    auc = roc_auc_score(y_test, probs)
    print(f"Fold {fold} ROC AUC: {auc:.4f}\n")


Fold 1 Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.82      0.89      5422
           1       0.27      0.70      0.39       510

    accuracy                           0.81      5932
   macro avg       0.62      0.76      0.64      5932
weighted avg       0.91      0.81      0.84      5932

Fold 1 ROC AUC: 0.8394

Fold 2 Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.94      0.92      5196
           1       0.37      0.24      0.29       736

    accuracy                           0.86      5932
   macro avg       0.64      0.59      0.60      5932
weighted avg       0.83      0.86      0.84      5932

Fold 2 ROC AUC: 0.7285

Fold 3 Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.97      0.96      5519
           1       0.35      0.23      0.27       413

    accuracy                           0.92 

In [35]:
# 1️⃣ Compute hot probabilities
merged['hot_probability'] = model.predict_proba(X)[:, 1]

# 2️⃣ Optional: remove duplicate symbols per snapshot if needed
signals = (
    merged.sort_values(['snapshot_time', 'symbol', 'hot_probability'], ascending=[True, True, False])
    .drop_duplicates(subset=['snapshot_time', 'symbol'], keep='first')
)

# 3️⃣ Keep only relevant columns for inspection
all_signals = signals[['snapshot_time', 'symbol', 'hot_probability']]

# 4️⃣ Show all (or first 50 rows for sanity)
all_signals.head(50)


Unnamed: 0,snapshot_time,symbol,hot_probability
0,2025-11-26 20:46:26,AA,0.001505
1,2025-11-26 20:46:26,AAUC,0.003092
2,2025-11-26 20:46:26,ALAB,0.109779
3,2025-11-26 20:46:26,ANF,0.386333
4,2025-11-26 20:46:26,ARWR,0.964688
5,2025-11-26 20:46:26,ASML,0.00384
6,2025-11-26 20:46:26,ATGE,0.045056
7,2025-11-26 20:46:26,ATMU,0.794815
8,2025-11-26 20:46:26,BBAR,0.001214
9,2025-11-26 20:46:26,BE,0.009931


In [None]:
import plotly.express as px

# Ensure data is clean
all_signals['snapshot_time'] = pd.to_datetime(all_signals['snapshot_time'])

# Optional: pick top 50 symbols by max probability for readability
top_symbols = (
    all_signals.groupby('symbol')['hot_probability']
    .max()
    .sort_values(ascending=False)
    .head(50)
    .index
)

df_chart = all_signals[all_signals['symbol'].isin(top_symbols)]

# Create pivot for heatmap: symbols vs snapshots
heatmap_df = df_chart.pivot(index='symbol', columns='snapshot_time', values='hot_probability')

# Plot interactive heatmap
fig = px.imshow(
    heatmap_df,
    labels=dict(x="Snapshot Time", y="Symbol", color="Hot Probability"),
    aspect="auto",
    color_continuous_scale="YlOrRd",
    text_auto=True
)

fig.update_layout(
    title="AI Hot-Stock Probability Heatmap",
    xaxis_nticks=20,
    yaxis={'categoryorder':'total ascending'}
)


chart_path = os.path.join(OUTPUT_DIR, f"superchart_interactive-1.0.html")
fig.write_html(chart_path, include_plotlyjs='cdn')




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [None]:
import plotly.graph_objects as go

# Ensure snapshot_time is datetime
all_signals['snapshot_time'] = pd.to_datetime(all_signals['snapshot_time'])

# Select top 20 symbols by max probability for clarity
top_symbols = (
    all_signals.groupby('symbol')['hot_probability']
    .max()
    .sort_values(ascending=False)
    .head(20)
    .index
)

df_chart = all_signals[all_signals['symbol'].isin(top_symbols)]

# Create figure
fig = go.Figure()

# Add a line for each symbol
for symbol in top_symbols:
    df_sym = df_chart[df_chart['symbol'] == symbol].sort_values('snapshot_time')
    fig.add_trace(go.Scatter(
        x=df_sym['snapshot_time'],
        y=df_sym['hot_probability'],
        mode='lines+markers',
        name=symbol,
        line=dict(width=2),
        marker=dict(size=6),
        hovertemplate='Symbol: %{text}<br>Time: %{x}<br>Prob: %{y:.3f}',
        text=[symbol]*len(df_sym)
    ))

# Highlight top-N per snapshot (e.g., top 3)
top_n = 3
top_per_snapshot = (
    df_chart.sort_values(['snapshot_time', 'hot_probability'], ascending=[True, False])
            .groupby('snapshot_time')
            .head(top_n)
)

fig.add_trace(go.Scatter(
    x=top_per_snapshot['snapshot_time'],
    y=top_per_snapshot['hot_probability'],
    mode='markers',
    marker=dict(size=10, color='red', symbol='star'),
    name='Top Hot',
    hovertemplate='Top Hot Symbol: %{text}<br>Time: %{x}<br>Prob: %{y:.3f}',
    text=top_per_snapshot['symbol']
))

# Layout
fig.update_layout(
    title="Superchart 2.0 — AI Hot-Stock Probabilities Over Time",
    xaxis_title="Snapshot Time",
    yaxis_title="Hot Probability",
    yaxis=dict(range=[0, 1]),
    hovermode='closest',
    legend_title="Symbols",
    template="plotly_dark"
)


chart_path = os.path.join(OUTPUT_DIR, f"superchart_interactive-2.0.html")
fig.write_html(chart_path, include_plotlyjs='cdn')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [None]:
import plotly.graph_objects as go 
import numpy as np 
import plotly.io as pio 


# Ensure datetime
all_signals['snapshot_time'] = pd.to_datetime(all_signals['snapshot_time'])

# Select top 20 symbols by max probability for clarity
top_symbols = (
    all_signals.groupby('symbol')['hot_probability']
    .max()
    .sort_values(ascending=False)
    .head(20)
    .index
)

df_chart = all_signals[all_signals['symbol'].isin(top_symbols)]

# Compute top 10% threshold per snapshot
thresholds = df_chart.groupby('snapshot_time')['hot_probability'].quantile(0.90)
df_chart = df_chart.merge(thresholds.rename("top10_threshold"), on='snapshot_time')

# Create figure
fig = go.Figure()

# Add a shaded area for top 10% probabilities
snapshot_times = sorted(df_chart['snapshot_time'].unique())
threshold_values = [thresholds[st] for st in snapshot_times]

fig.add_trace(go.Scatter(
    x=snapshot_times + snapshot_times[::-1],  # for filled area
    y=threshold_values + [0]*len(threshold_values),
    fill='toself',
    fillcolor='rgba(255, 165, 0, 0.2)',  # light orange
    line=dict(color='rgba(255,255,255,0)'),
    hoverinfo='skip',
    showlegend=True,
    name='Top 10% Threshold'
))

# Add lines for each symbol
for symbol in top_symbols:
    df_sym = df_chart[df_chart['symbol'] == symbol].sort_values('snapshot_time')
    fig.add_trace(go.Scatter(
        x=df_sym['snapshot_time'],
        y=df_sym['hot_probability'],
        mode='lines+markers',
        name=symbol,
        line=dict(width=2),
        marker=dict(size=6),
        hovertemplate='Symbol: %{text}<br>Time: %{x}<br>Prob: %{y:.3f}',
        text=[symbol]*len(df_sym)
    ))

# Highlight top 3 per snapshot
top_n = 3
top_per_snapshot = (
    df_chart.sort_values(['snapshot_time', 'hot_probability'], ascending=[True, False])
            .groupby('snapshot_time')
            .head(top_n)
)

fig.add_trace(go.Scatter(
    x=top_per_snapshot['snapshot_time'],
    y=top_per_snapshot['hot_probability'],
    mode='markers',
    marker=dict(size=10, color='red', symbol='star'),
    name='Top Hot',
    hovertemplate='Top Hot Symbol: %{text}<br>Time: %{x}<br>Prob: %{y:.3f}',
    text=top_per_snapshot['symbol']
))

# Layout
fig.update_layout(
    title="Superchart 2.1 — AI Hot-Stock Probabilities with Top 10% Overlay",
    xaxis_title="Snapshot Time",
    yaxis_title="Hot Probability",
    yaxis=dict(range=[0, 1]),
    hovermode='closest',
    legend_title="Symbols",
    template="plotly_dark"
)

chart_path = os.path.join(OUTPUT_DIR, f"superchart_interactive-3.0.html")
fig.write_html(chart_path, include_plotlyjs='cdn')


✅ Superchart 2.1 exported:
 - Interactive HTML: superchart2_interactive.html
 - Static PNG: superchart2_static.png




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [None]:
import plotly.graph_objects as go
import pandas as pd
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors

# Copy data
df_chart = all_signals.copy()
df_chart['snapshot_time'] = pd.to_datetime(df_chart['snapshot_time'])

# Top 15 symbols
top_symbols = (
    df_chart.groupby('symbol')['hot_probability']
    .max()
    .sort_values(ascending=False)
    .head(15)
    .index
)
df_chart = df_chart[df_chart['symbol'].isin(top_symbols)]

# Apply rolling average for smoothing
df_chart['hot_probability_smooth'] = df_chart.groupby('symbol')['hot_probability'].transform(lambda x: x.rolling(3, min_periods=1).mean())

# Prepare color palette for symbols
cmap = cm.get_cmap('plasma', len(top_symbols))
symbol_colors = {sym: cmap(i) for i, sym in enumerate(top_symbols)}

# Pivot for stacked area
pivot_df = df_chart.pivot(index='snapshot_time', columns='symbol', values='hot_probability_smooth').fillna(0)
pivot_df = pivot_df[top_symbols]  # ensure order

# Normalize probabilities for gradient alpha
norm = colors.Normalize(vmin=df_chart['hot_probability_smooth'].min(), vmax=df_chart['hot_probability_smooth'].max())

# Create figure
fig = go.Figure()

# Stacked area with gradient effect via rgba alpha
for sym in top_symbols:
    stream_values = pivot_df[sym].values
    alpha_values = 0.2 + 0.6 * norm(stream_values)  # min alpha 0.2, max 0.8
    # Plot line per symbol
    fig.add_trace(go.Scatter(
        x=pivot_df.index,
        y=stream_values,
        mode='lines',
        stackgroup='one',
        line=dict(width=1.5, color='rgba({},{},{},{})'.format(
            int(symbol_colors[sym][0]*255),
            int(symbol_colors[sym][1]*255),
            int(symbol_colors[sym][2]*255),
            0.5  # base alpha for stacked area
        )),
        fillcolor='rgba({},{},{},{})'.format(
            int(symbol_colors[sym][0]*255),
            int(symbol_colors[sym][1]*255),
            int(symbol_colors[sym][2]*255),
            0.4  # slightly brighter
        ),
        name=sym,
        hovertemplate='Symbol: %{fullData.name}<br>Time: %{x}<br>Prob: %{y:.3f}'
    ))

# Highlight top 3 per snapshot with “glow” (larger yellow stars)
top_n = 3
top_per_snapshot = (
    df_chart.sort_values(['snapshot_time','hot_probability_smooth'], ascending=[True,False])
            .groupby('snapshot_time')
            .head(top_n)
)
fig.add_trace(go.Scatter(
    x=top_per_snapshot['snapshot_time'],
    y=top_per_snapshot['hot_probability_smooth'],
    mode='markers',
    marker=dict(size=14, color='yellow', symbol='star', line=dict(width=2, color='orange')),
    name='Top Hot',
    hovertemplate='Top Hot Symbol: %{text}<br>Time: %{x}<br>Prob: %{y:.3f}',
    text=top_per_snapshot['symbol']
))

# Layout
fig.update_layout(
    title="Superchart 4.1 — Smooth Gradient Streams with Glow Effect",
    xaxis_title="Snapshot Time",
    yaxis_title="Hot Probability",
    yaxis=dict(range=[0,1]),
    hovermode='closest',
    legend_title="Symbols",
    template="plotly_dark"
)


chart_path = os.path.join(OUTPUT_DIR, f"superchart_interactive-4.0.html")
fig.write_html(chart_path, include_plotlyjs='cdn')



The get_cmap function was deprecated in Matplotlib 3.7 and will be removed in 3.11. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap()`` or ``pyplot.get_cmap()`` instead.



In [None]:
importance = pd.Series(
model.feature_importances_,
index=features
).sort_values(ascending=False)

importance

VolatilityScore               0.210050
regularMarketPrice            0.163759
MomentumScore                 0.149428
regularMarketChangePercent    0.127269
VolumeScore                   0.122236
TrendScore                    0.121782
VolumeSpike                   0.105476
dtype: float32