In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from pathlib import Path
import matplotlib.pyplot as plt


# Directory layout used by the notebook: CSV inputs are globbed from
# DATA_HOTSCORE and the feature file is written under ML_DIR.
DATA_HOTSCORE = Path("data/hotscore")
ML_DIR = Path("ml")

# Create both directories up front so a fresh checkout can run the notebook
# without failing on a missing folder (no-op when they already exist).
DATA_HOTSCORE.mkdir(parents=True, exist_ok=True)
ML_DIR.mkdir(parents=True, exist_ok=True)

# Widen the pandas display so wide frames are not truncated in cell output.
DISPLAY_OPTIONS = {
    "display.max_columns": 100,
    "display.width": 120,
}
for option_name, option_value in DISPLAY_OPTIONS.items():
    pd.set_option(option_name, option_value)

In [2]:
# Collect every CSV under DATA_HOTSCORE, ordered by file name.
files = sorted(DATA_HOTSCORE.glob("*.csv"))
print(f"Found {len(files)} daily files")

# Fail early with an actionable message: pd.concat raises a bare
# "No objects to concatenate" ValueError when handed an empty list.
if not files:
    raise FileNotFoundError(f"No CSV files found in {DATA_HOTSCORE.resolve()}")

# Read each file and tag its rows with the originating file name so the
# source of every row is still known after concatenation.
dfs = [pd.read_csv(f).assign(source_file=f.name) for f in files]

# ignore_index=True discards the per-file row numbers and builds one
# continuous RangeIndex over the combined frame.
full_df = pd.concat(dfs, ignore_index=True)
full_df.head()


Found 20 daily files


Unnamed: 0,symbol,HotScore,TrendScore,regularMarketPrice,regularMarketChangePercent,VolumeSpike,averageDailyVolume3Month,MomentumScore,VolumeScore,VolatilityScore,marketCap,source_file
0,AA,0.794401,0.520833,41.845,6.74745,0.940394,6727448.0,0.903646,0.802083,0.726562,10836350000.0,hotscore_20260117.csv
1,AAUC,0.846094,0.854167,19.18,9.788214,1.052893,342331.0,0.96875,0.848958,0.622396,2378320000.0,hotscore_20260117.csv
2,ALAB,0.773307,0.723958,156.16,7.860205,0.638383,6266829.0,0.942708,0.518229,0.947917,26375990000.0,hotscore_20260117.csv
3,ANF,0.905599,0.828125,96.205,6.610155,1.88349,2070173.0,0.898438,0.958333,0.864583,4583525000.0,hotscore_20260117.csv
4,ARWR,0.95638,0.734375,58.675,25.400724,2.501594,2311350.0,1.0,0.976562,0.955729,8112262000.0,hotscore_20260117.csv


In [4]:
# Summarise the combined frame: index range, per-column non-null counts and dtypes.
# NOTE(review): the saved output of this cell lists a `date` column that the
# other cells' outputs do not show — it looks stale; confirm with
# Restart Kernel -> Run All.
full_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 576014 entries, 0 to 576013
Data columns (total 13 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   symbol                      576014 non-null  object 
 1   date                        576014 non-null  object 
 2   HotScore                    573626 non-null  float64
 3   TrendScore                  573626 non-null  float64
 4   regularMarketPrice          573626 non-null  float64
 5   regularMarketChangePercent  573626 non-null  float64
 6   VolumeSpike                 573626 non-null  float64
 7   averageDailyVolume3Month    573626 non-null  float64
 8   MomentumScore               573626 non-null  float64
 9   VolumeScore                 573626 non-null  float64
 10  VolatilityScore             573626 non-null  float64
 11  marketCap                   573626 non-null  float64
 12  source_file                 576014 non-null  object 
dtypes: float64(10)

In [3]:

# Absolute number of missing values in each column.
full_df.isna().sum()

symbol                           0
HotScore                      2388
TrendScore                    2388
regularMarketPrice            2388
regularMarketChangePercent    2388
VolumeSpike                   2388
averageDailyVolume3Month      2388
MomentumScore                 2388
VolumeScore                   2388
VolatilityScore               2388
marketCap                     2388
source_file                      0
dtype: int64

In [4]:
# Fraction of missing values in each column, worst column first.
full_df.isna().mean().sort_values(ascending=False)

HotScore                      0.003414
TrendScore                    0.003414
regularMarketPrice            0.003414
regularMarketChangePercent    0.003414
VolumeScore                   0.003414
VolumeSpike                   0.003414
averageDailyVolume3Month      0.003414
MomentumScore                 0.003414
marketCap                     0.003414
VolatilityScore               0.003414
symbol                        0.000000
source_file                   0.000000
dtype: float64

In [5]:
# Columns to be converted to numeric values. A name that is not present in
# the loaded frame is skipped instead of raising a KeyError.
NUMERIC_COLS = [
    "regularMarketPrice", "regularMarketChangePercent", "regularMarketVolume",
    "averageDailyVolume3Month", "marketCap", "VolumeSpike",
    "MomentumScore", "VolumeScore", "VolatilityScore",
    "TrendScore", "HotScore",
]

# errors="coerce" replaces unparseable entries with NaN rather than raising.
columns_to_convert = [name for name in NUMERIC_COLS if name in full_df.columns]
for name in columns_to_convert:
    full_df[name] = pd.to_numeric(full_df[name], errors="coerce")


In [6]:
# A row is kept only if the price and all of the listed scores are present.
required_columns = [
    "regularMarketPrice", "HotScore",
    "MomentumScore", "VolumeScore", "VolatilityScore",
]
full_df = full_df.dropna(subset=required_columns, how="any")

print("Rows after cleaning:", len(full_df))
full_df.head()

Rows after cleaning: 697126


Unnamed: 0,symbol,HotScore,TrendScore,regularMarketPrice,regularMarketChangePercent,VolumeSpike,averageDailyVolume3Month,MomentumScore,VolumeScore,VolatilityScore,marketCap,source_file
0,AA,0.794401,0.520833,41.845,6.74745,0.940394,6727448.0,0.903646,0.802083,0.726562,10836350000.0,hotscore_20260117.csv
1,AAUC,0.846094,0.854167,19.18,9.788214,1.052893,342331.0,0.96875,0.848958,0.622396,2378320000.0,hotscore_20260117.csv
2,ALAB,0.773307,0.723958,156.16,7.860205,0.638383,6266829.0,0.942708,0.518229,0.947917,26375990000.0,hotscore_20260117.csv
3,ANF,0.905599,0.828125,96.205,6.610155,1.88349,2070173.0,0.898438,0.958333,0.864583,4583525000.0,hotscore_20260117.csv
4,ARWR,0.95638,0.734375,58.675,25.400724,2.501594,2311350.0,1.0,0.976562,0.955729,8112262000.0,hotscore_20260117.csv


In [9]:
# Keep the last occurrence of every (symbol, HotScore) pair, then order the
# result by the same key and rebuild a clean 0..n-1 index.
# NOTE(review): HotScore is a float metric rather than a snapshot identifier.
# If the intent is one row per symbol per source file, the key should probably
# be ["symbol", "source_file"] — confirm before relying on this dataset.
dedup_key = ["symbol", "HotScore"]
full_df = (
    full_df
    .drop_duplicates(subset=dedup_key, keep="last")
    .sort_values(dedup_key)
    .reset_index(drop=True)
)

print("Rows after dedup:", len(full_df))


Rows after dedup: 36294


In [11]:
# Score and size columns carried into the ML base table.
FEATURES = [
    "HotScore", "MomentumScore", "VolumeScore", "VolatilityScore",
    "TrendScore", "VolumeSpike", "marketCap",
]

# The base table is the symbol, the feature columns and the price.
# .copy() detaches it from full_df so later edits do not touch the source.
selected_columns = ["symbol", *FEATURES, "regularMarketPrice"]
features_df = full_df.loc[:, selected_columns].copy()

features_df.head()


Unnamed: 0,symbol,HotScore,MomentumScore,VolumeScore,VolatilityScore,TrendScore,VolumeSpike,marketCap,regularMarketPrice
0,AA,0.552353,0.670588,0.470588,0.541176,0.447059,0.119812,11187250000.0,43.2
1,AA,0.590458,0.389313,0.763359,0.664122,0.541985,0.216348,13784650000.0,53.23
2,AA,0.612044,0.635036,0.59854,0.620438,0.562044,0.143758,13016830000.0,50.265
3,AA,0.624837,0.751634,0.555556,0.601307,0.470588,0.21747,12083260000.0,46.66
4,AA,0.645033,0.788079,0.589404,0.602649,0.423841,0.238094,11257430000.0,43.471


In [12]:
# Persist the feature table as CSV inside the ML directory.
# NOTE(review): to_csv is called with its default index handling, so the row
# index is written as an unnamed first column; pass index=False if consumers
# of this file do not expect that column — confirm.
base_file = ML_DIR.joinpath("features_base.csv")
features_df.to_csv(base_file)

print("Saved ML base dataset:", base_file)


Saved ML base dataset: ml\features_base.csv
