In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from pathlib import Path
import matplotlib.pyplot as plt


# Input directory that is scanned for the daily HotScore CSV files, and the
# output directory where the ML base dataset is written.
DATA_HOTSCORE = Path("data/hotscore")
ML_DIR = Path("ml")

# Create both directories if they are missing. mkdir(parents=True,
# exist_ok=True) is idempotent, so re-running this cell is safe and a
# separate ML_DIR.mkdir() call is not needed.
for p in (DATA_HOTSCORE, ML_DIR):
    p.mkdir(parents=True, exist_ok=True)


# Show up to 100 columns and use a 120-character display width so wide
# frames are not truncated when rendered.
pd.set_option("display.max_columns", 100)
pd.set_option("display.width", 120)

In [2]:
# Collect the CSV files in sorted order so the concatenation order (and
# therefore the row order of full_df) is reproducible between runs.
files = sorted(DATA_HOTSCORE.glob("*.csv"))
print(f"Found {len(files)} daily files")

# pd.concat() fails with an opaque "No objects to concatenate" on an empty
# list; stop early with a message that points at the input directory.
if not files:
    raise FileNotFoundError(
        f"No CSV files found in {DATA_HOTSCORE.resolve()}"
    )

dfs = []
for f in files:
    df = pd.read_csv(f)
    # Record which file each row came from; the same rows can appear in
    # several files, so this is needed to tell the copies apart later.
    df["source_file"] = f.name
    dfs.append(df)

# ignore_index=True gives the combined frame a fresh 0..n-1 index.
full_df = pd.concat(dfs, ignore_index=True)
full_df.head()


Found 12 daily files


Unnamed: 0,symbol,date,HotScore,TrendScore,regularMarketPrice,regularMarketChangePercent,VolumeSpike,averageDailyVolume3Month,MomentumScore,VolumeScore,VolatilityScore,marketCap,source_file
0,AA,2025-11-26 20:46:26,0.794401,0.520833,41.845,6.74745,0.940394,6727448.0,0.903646,0.802083,0.726562,10836350000.0,hotscore_20260117.csv
1,AAUC,2025-11-26 20:46:26,0.846094,0.854167,19.18,9.788214,1.052893,342331.0,0.96875,0.848958,0.622396,2378320000.0,hotscore_20260117.csv
2,ALAB,2025-11-26 20:46:26,0.773307,0.723958,156.16,7.860205,0.638383,6266829.0,0.942708,0.518229,0.947917,26375990000.0,hotscore_20260117.csv
3,ANF,2025-11-26 20:46:26,0.905599,0.828125,96.205,6.610155,1.88349,2070173.0,0.898438,0.958333,0.864583,4583525000.0,hotscore_20260117.csv
4,ARWR,2025-11-26 20:46:26,0.95638,0.734375,58.675,25.400724,2.501594,2311350.0,1.0,0.976562,0.955729,8112262000.0,hotscore_20260117.csv


In [4]:
# Print the column dtypes and non-null counts of the combined frame.
full_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 576014 entries, 0 to 576013
Data columns (total 13 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   symbol                      576014 non-null  object 
 1   date                        576014 non-null  object 
 2   HotScore                    573626 non-null  float64
 3   TrendScore                  573626 non-null  float64
 4   regularMarketPrice          573626 non-null  float64
 5   regularMarketChangePercent  573626 non-null  float64
 6   VolumeSpike                 573626 non-null  float64
 7   averageDailyVolume3Month    573626 non-null  float64
 8   MomentumScore               573626 non-null  float64
 9   VolumeScore                 573626 non-null  float64
 10  VolatilityScore             573626 non-null  float64
 11  marketCap                   573626 non-null  float64
 12  source_file                 576014 non-null  object 
dtypes: float64(10)

In [6]:

# Absolute number of missing values in each column.
null_mask = full_df.isna()
null_mask.sum()

symbol                           0
date                             0
HotScore                      2388
TrendScore                    2388
regularMarketPrice            2388
regularMarketChangePercent    2388
VolumeSpike                   2388
averageDailyVolume3Month      2388
MomentumScore                 2388
VolumeScore                   2388
VolatilityScore               2388
marketCap                     2388
source_file                      0
dtype: int64

In [7]:
# Fraction of missing values in each column, largest first.
null_share = full_df.isna().mean()
null_share.sort_values(ascending=False)

HotScore                      0.004146
regularMarketPrice            0.004146
TrendScore                    0.004146
VolumeSpike                   0.004146
regularMarketChangePercent    0.004146
VolumeScore                   0.004146
VolatilityScore               0.004146
averageDailyVolume3Month      0.004146
MomentumScore                 0.004146
marketCap                     0.004146
symbol                        0.000000
date                          0.000000
source_file                   0.000000
dtype: float64

In [8]:
# Parse the timestamp strings; values that cannot be parsed become NaT.
parsed_dates = pd.to_datetime(full_df["date"], errors="coerce")
full_df["date"] = parsed_dates

# Discard rows without a symbol or a valid timestamp, order the remaining
# rows by symbol and then time, and renumber the index from zero.
full_df = full_df.dropna(subset=["symbol", "date"])
full_df = full_df.sort_values(["symbol", "date"])
full_df = full_df.reset_index(drop=True)

full_df[["symbol", "date"]].head(10)

Unnamed: 0,symbol,date
0,AA,2025-11-26 20:46:26
1,AA,2025-11-26 20:46:26
2,AA,2025-11-26 20:46:26
3,AA,2025-11-26 20:46:26
4,AA,2025-11-26 20:46:26
5,AA,2025-11-26 20:46:26
6,AA,2025-11-26 20:46:26
7,AA,2025-11-26 20:46:26
8,AA,2025-11-26 20:46:26
9,AA,2025-11-26 20:46:26


In [9]:
# Columns to coerce to a numeric dtype. Names that are not present in
# full_df are skipped below rather than raising a KeyError.
NUMERIC_COLS = [
    "regularMarketPrice", "regularMarketChangePercent",
    "regularMarketVolume", "averageDailyVolume3Month",
    "marketCap", "VolumeSpike",
    "MomentumScore", "VolumeScore", "VolatilityScore",
    "TrendScore", "HotScore",
]

# errors="coerce" turns any value that cannot be parsed into NaN.
present_numeric = [name for name in NUMERIC_COLS if name in full_df.columns]
for name in present_numeric:
    full_df[name] = pd.to_numeric(full_df[name], errors="coerce")


In [10]:
# A row is kept only if every one of these columns has a value.
required_cols = [
    "regularMarketPrice", "HotScore",
    "MomentumScore", "VolumeScore", "VolatilityScore",
]
full_df = full_df.dropna(how="any", subset=required_cols)

print("Rows after cleaning:", full_df.shape[0])
full_df.head()

Rows after cleaning: 573626


Unnamed: 0,symbol,date,HotScore,TrendScore,regularMarketPrice,regularMarketChangePercent,VolumeSpike,averageDailyVolume3Month,MomentumScore,VolumeScore,VolatilityScore,marketCap,source_file
0,AA,2025-11-26 20:46:26,0.794401,0.520833,41.845,6.74745,0.940394,6727448.0,0.903646,0.802083,0.726562,10836350000.0,hotscore_20260117.csv
1,AA,2025-11-26 20:46:26,0.794401,0.520833,41.845,6.74745,0.940394,6727448.0,0.903646,0.802083,0.726562,10836350000.0,hotscore_20260119.csv
2,AA,2025-11-26 20:46:26,0.794401,0.520833,41.845,6.74745,0.940394,6727448.0,0.903646,0.802083,0.726562,10836350000.0,hotscore_20260120.csv
3,AA,2025-11-26 20:46:26,0.794401,0.520833,41.845,6.74745,0.940394,6727448.0,0.903646,0.802083,0.726562,10836350000.0,hotscore_20260121.csv
4,AA,2025-11-26 20:46:26,0.794401,0.520833,41.845,6.74745,0.940394,6727448.0,0.903646,0.802083,0.726562,10836350000.0,hotscore_20260122.csv


In [11]:
# The same (symbol, date) snapshot appears in several source files. Keep one
# row per snapshot: the copy from the lexicographically last source_file.
# Assumes file names sort chronologically (hotscore_YYYYMMDD.csv) - confirm.
#
# Sorting here with source_file as an explicit tie-breaker makes keep="last"
# deterministic, instead of depending on the row order left behind by
# earlier cells.
#
# NOTE(review): snapshots one second apart with identical values (e.g.
# 20:46:26 and 20:46:27) are different keys and both survive this
# exact-match dedup - confirm whether they should be collapsed.
full_df = (
    full_df
    .sort_values(["symbol", "date", "source_file"])
    .drop_duplicates(subset=["symbol", "date"], keep="last")
)

print("Rows after dedup:", len(full_df))


Rows after dedup: 49873


In [12]:
# Score and size columns carried into the ML base dataset.
FEATURES = [
    "HotScore", "MomentumScore", "VolumeScore", "VolatilityScore",
    "TrendScore", "VolumeSpike", "marketCap",
]

# Identifier columns first, then the features, then the price column,
# which is kept alongside the features but is not part of FEATURES.
selected_columns = ["symbol", "date", *FEATURES, "regularMarketPrice"]
features_df = full_df.loc[:, selected_columns].copy()

features_df.head()


Unnamed: 0,symbol,date,HotScore,MomentumScore,VolumeScore,VolatilityScore,TrendScore,VolumeSpike,marketCap,regularMarketPrice
11,AA,2025-11-26 20:46:26,0.794401,0.903646,0.802083,0.726562,0.520833,0.940394,10836350000.0,41.845
23,AA,2025-11-26 20:46:27,0.794401,0.903646,0.802083,0.726562,0.520833,0.940394,10836350000.0,41.845
35,AA,2025-11-26 21:03:59,0.773989,0.876011,0.749326,0.746631,0.557951,1.039631,10762550000.0,41.56
47,AA,2025-11-26 21:17:41,0.789218,0.881402,0.778976,0.754717,0.571429,1.136707,10765130000.0,41.57
59,AA,2025-11-26 21:17:42,0.789218,0.881402,0.778976,0.754717,0.571429,1.136707,10765130000.0,41.57


In [13]:
# Write the feature table to ML_DIR as CSV, without the index column.
base_file = ML_DIR.joinpath("features_base.csv")
features_df.to_csv(path_or_buf=base_file, index=False)

print("Saved ML base dataset:", base_file)


Saved ML base dataset: ml\features_base.csv
