In [49]:
import pandas as pd
from sqlalchemy import create_engine
from datetime import datetime, timezone
import os
from dotenv import load_dotenv

# Load environment variables and connect to DB
load_dotenv()
DATABASE_URL = os.getenv("DATABASE_URL")
if not DATABASE_URL:
    # Fail early with a readable message instead of letting create_engine
    # choke on None.
    raise RuntimeError(
        "DATABASE_URL is not set; define it in the environment or in a .env file."
    )
engine = create_engine(DATABASE_URL)


def _read_optional_table(table_name):
    """Read a whole table, falling back to an empty DataFrame on failure.

    Args:
        table_name: Name of the table to select from. Only pass trusted,
            hard-coded names: the value is interpolated into the SQL text.

    Returns:
        The table as a DataFrame with ``date`` parsed as datetime, or an
        empty DataFrame if the read raised (a notice is printed).
    """
    try:
        return pd.read_sql(f"SELECT * FROM {table_name}", engine, parse_dates=["date"])
    except Exception as e:
        # Best-effort by design: downstream code checks `.empty` and skips
        # the corresponding merge.
        print(f"⚠️ Skipping {table_name}: {e}")
        return pd.DataFrame()


# Load primary data (required: any failure here propagates)
technical_df = pd.read_sql("SELECT * FROM technical_features", engine, parse_dates=["date"])
sentiment_df = pd.read_sql("SELECT * FROM sentiment_data", engine, parse_dates=["date"])

# Optional data loads
macro_df = _read_optional_table("macro_data")
fundamentals_df = _read_optional_table("fundamental_data")
import pandas as pd
from sqlalchemy import create_engine
from datetime import datetime, timezone
import os
from dotenv import load_dotenv

# NOTE: a verbatim second copy of the .env / engine setup and of the four
# table loads used to live here. The statements at the top of this cell already
# define `engine`, `technical_df`, `sentiment_df`, `macro_df` and
# `fundamentals_df`, so the copy only opened a second engine and re-ran every
# query; it has been removed.

# Aggregate sentiment to one row per (ticker, date): mean, standard deviation
# and number of scored items.
sentiment_agg = (
    sentiment_df.groupby(["ticker", "date"])
    .agg(
        sentiment_avg=("sentiment_score", "mean"),
        sentiment_std=("sentiment_score", "std"),
        sentiment_count=("sentiment_score", "count"),
    )
    .reset_index()
)

# Merge technical + sentiment (left join keeps every technical row)
merged = pd.merge(technical_df, sentiment_agg, on=["ticker", "date"], how="left")

# pd.merge_asof requires BOTH inputs to be sorted by the `on` key ("date")
# across the whole frame. Sorting by ["ticker", "date"] only satisfies that
# when there is a single ticker; with several it raises
# "left keys must be sorted". Sort by date for the as-of joins and restore the
# (ticker, date) order afterwards.
merged = merged.sort_values(["date", "ticker"])

# 🔁 Merge most recent fundamental data per ticker
if not fundamentals_df.empty:
    # Normalise the key name first, so a table that already uses "ticker"
    # does not fail on a missing "symbol" column.
    if "ticker" not in fundamentals_df.columns:
        fundamentals_df = fundamentals_df.rename(columns={"symbol": "ticker"})
    fundamentals_df = fundamentals_df.sort_values("date", kind="stable")

    fundamentals_df.columns.name = None  # flatten any pivoted column names
    merged = pd.merge_asof(
        merged,
        fundamentals_df,
        by="ticker",
        on="date",
        direction="backward",  # latest fundamentals row at or before each date
        allow_exact_matches=True,
    )

# 🔁 Merge most recent macro data
# NOTE(review): columns present in both optional tables (e.g. `fetched_at`)
# get pandas' default `_x` / `_y` suffixes here; kept as-is so the stored
# schema does not change.
if not macro_df.empty:
    macro_df = macro_df.sort_values("date", kind="stable")
    merged = pd.merge_asof(
        merged,
        macro_df,
        on="date",
        direction="backward",  # latest macro row at or before each date
        allow_exact_matches=True,
    )

# Back to the per-ticker chronological order the rest of the notebook expects.
merged = merged.sort_values(["ticker", "date"]).reset_index(drop=True)

# Final metadata
merged["merged_at"] = datetime.now(timezone.utc)

# Save to DB (replaces any existing merged_features table)
merged.to_sql("merged_features", engine, if_exists="replace", index=False)
print(f"✅ Merged features table created with {len(merged)} rows.")


✅ Merged features table created with 6643 rows.


In [51]:
# Read the persisted merge back and order it per ticker by date, so that a
# positional shift inside each ticker group steps forward in time.
df = pd.read_sql(
    "SELECT * FROM merged_features", engine, parse_dates=["date"]
).sort_values(["ticker", "date"])

# ➕ 5-day forward return: relative change between the current close and the
# close five rows later for the same ticker. The last five rows of each ticker
# have no future close and therefore get NaN.
close_five_rows_ahead = df.groupby("ticker")["close"].shift(-5)
df["target_return_5d"] = (close_five_rows_ahead - df["close"]) / df["close"]

In [52]:
# Show the frame, now carrying the target_return_5d column, for a visual check.
df

Unnamed: 0,ticker,date,close,volume,return_1d,sma_5,sma_20,ema_10,rsi_14,macd,...,inventoryTurnover,roe,capexPerShare,fetched_at_x,interest_rate,cpi,unemployment,fetched_at_y,merged_at,target_return_5d
0,NVDA,1999-01-22,0.041016,2714688000,,,,0.041016,,0.000000,...,,,,NaT,4.63,164.300,4.3,2025-06-20 05:38:26.502628,2025-06-20 10:39:04.918889+00:00,-0.034938
1,NVDA,1999-01-25,0.045313,510480000,0.104764,,,0.041797,,0.000343,...,,,,NaT,4.63,164.300,4.3,2025-06-20 05:38:26.502628,2025-06-20 10:39:04.918889+00:00,-0.109196
2,NVDA,1999-01-26,0.041797,343200000,-0.077594,,,0.041797,,0.000327,...,,,,NaT,4.63,164.300,4.3,2025-06-20 05:38:26.502628,2025-06-20 10:39:04.918889+00:00,-0.109027
3,NVDA,1999-01-27,0.041667,244368000,-0.003110,,,0.041774,,0.000300,...,,,,NaT,4.63,164.300,4.3,2025-06-20 05:38:26.502628,2025-06-20 10:39:04.918889+00:00,-0.087503
4,NVDA,1999-01-28,0.041536,227520000,-0.003144,0.042266,,0.041730,,0.000266,...,,,,NaT,4.63,164.300,4.3,2025-06-20 05:38:26.502628,2025-06-20 10:39:04.918889+00:00,-0.034476
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6638,NVDA,2025-06-12,145.000000,162365000,0.015193,143.228003,137.869002,141.212175,69.711685,6.107597,...,3.237996,0.918729,0.131786,2025-06-20 05:38:22.167635,4.33,321.465,4.2,2025-06-20 05:38:26.502628,2025-06-20 10:39:04.918889+00:00,
6639,NVDA,2025-06-13,141.970001,180820600,-0.020897,143.278003,138.226002,141.349962,66.501866,5.784155,...,3.237996,0.918729,0.131786,2025-06-20 05:38:22.167635,4.33,321.465,4.2,2025-06-20 05:38:26.502628,2025-06-20 10:39:04.918889+00:00,
6640,NVDA,2025-06-16,144.690002,183133700,0.019159,143.690002,138.690502,141.957242,64.885008,5.681810,...,3.237996,0.918729,0.131786,2025-06-20 05:38:22.167635,4.33,321.465,4.2,2025-06-20 05:38:26.502628,2025-06-20 10:39:04.918889+00:00,
6641,NVDA,2025-06-17,144.119995,139108000,-0.003940,143.722000,139.118002,142.350470,65.138207,5.491405,...,3.237996,0.918729,0.131786,2025-06-20 05:38:22.167635,4.33,321.465,4.2,2025-06-20 05:38:26.502628,2025-06-20 10:39:04.918889+00:00,


In [50]:

# 🏷️ Encode trading signals
def encode_signal(r, threshold=0.015):
    """Turn a forward return into a discrete trading label.

    Args:
        r: Forward return as a fraction (0.02 means +2%). May be missing
            (None / NaN) when the future price is not known yet.
        threshold: Symmetric band around zero; returns strictly above
            ``threshold`` are "BUY", strictly below ``-threshold`` are "SELL".
            Defaults to 0.015.

    Returns:
        "BUY", "SELL" or "HOLD", or None when ``r`` is missing.
    """
    # NaN compares False against everything, so without this guard rows whose
    # future return is unknown would fall through and be labelled "HOLD".
    if pd.isna(r):
        return None
    if r > threshold:
        return "BUY"
    if r < -threshold:
        return "SELL"
    return "HOLD"

# Label each row from its 5-day forward return.
df["trade_signal"] = df["target_return_5d"].map(encode_signal)

# Persist the labelled frame, replacing any existing final_features table.
df.to_sql(name="final_features", con=engine, if_exists="replace", index=False)
print(f"✅ target_return_5d and trade_signal saved to final_features ({len(df)} rows)")


✅ target_return_5d and trade_signal saved to final_features (6643 rows)
