In [33]:
# graph_build_config.py

import json
from pathlib import Path
from datetime import datetime, date, timedelta
from collections import defaultdict

import numpy as np
import pandas as pd
import torch
from torch_geometric.data import HeteroData

# Root folder for every input file used by this notebook.
DATA_DIR = Path("data")
# Folder of per-author speech JSON files (read by load_speeches).
SPEECH_FOLDER = DATA_DIR / "text_data/"
# Folder of per-speech topic-score JSON files (read by oad_topic_scores).
TOPIC_SCORE_FOLDER = DATA_DIR / "topic_scores/"
# Excel workbook with the historical rate series (read by load_rates).
RATES_FILE = DATA_DIR / "price_data/2025-10-26 Fed Funds 12M 6M Historical Swap Rates.xlsx"

LOOKBACK_DAYS = 30   # rolling window length
# NOTE(review): load_rates() in this notebook returns a single "Rate" column,
# so "fed_funds" is not a column of that frame -- confirm how the code that
# consumes TARGET_COLUMN maps it.
TARGET_COLUMN = "fed_funds"   # or "asw"
PREDICT_DELTA = True         # True = predict Δy (change in the target), False = level
# Speeches dated before this are skipped by load_speeches. Kept as a datetime
# (not a date) because it is compared against parse_date() results, which are
# datetime objects.
START_DATE = datetime(2018, 1, 1)

In [34]:
# date_utils.py

from datetime import datetime, date

# Accepted input layouts, tried in this order; the first match wins.
_DATE_FORMATS = (
    "%Y-%m-%d",           # 2023-08-25
    "%Y/%m/%d",           # 2023/08/25
    "%Y-%m-%dT%H:%M:%S",  # 2023-08-25T00:00:00
    "%B %d, %Y",          # August 25, 2023
    "%b %d, %Y",          # Aug 25, 2023
)


def parse_date(dstr: str) -> datetime:
    """
    Parse a date string written in one of several common formats.

    Surrounding whitespace is ignored. The formats in ``_DATE_FORMATS`` are
    tried in order and the first one that matches is used; add formats there
    if your data differs.

    Note that the result is a ``datetime.datetime`` (itself a subclass of
    ``datetime.date``), not a bare ``date``: date-only formats yield midnight,
    and the ISO "T" format keeps its time of day. This is intentional --
    ``load_speeches`` compares the result against the ``datetime``
    ``START_DATE``, and comparing a plain ``date`` with a ``datetime`` raises
    ``TypeError``.

    Args:
        dstr: Text containing a single date.

    Returns:
        The parsed value as a naive ``datetime.datetime``.

    Raises:
        ValueError: If ``dstr`` matches none of the known formats.
    """
    dstr = dstr.strip()
    for fmt in _DATE_FORMATS:
        try:
            return datetime.strptime(dstr, fmt)
        except ValueError:
            # Not this layout; fall through to the next candidate.
            continue
    raise ValueError(f"Unrecognized date format: {dstr}")


In [55]:
import glob

def load_speeches(path=SPEECH_FOLDER):
    """
    Load every ``*.json`` file directly inside ``path`` into one dict of speeches.

    Each file is read as a JSON list of records; every record must provide
    the keys ``"id"``, ``"date"`` and ``"text"``. Records whose parsed date is
    earlier than ``START_DATE`` are skipped.

    Args:
        path: Folder containing the speech JSON files.

    Returns:
        dict mapping speech id -> ``{"author", "text", "date"}`` where
        ``author`` is the file name up to its first ".", ``text`` is the
        record's text and ``date`` is the value returned by ``parse_date``.

    Raises:
        KeyError: If a record lacks one of the required keys.
        ValueError: If a record's date is in no format ``parse_date`` knows.
    """
    # Sorted so that, when the same id occurs in more than one file, the record
    # that wins does not depend on the filesystem's listing order.
    json_files = sorted(glob.glob(str(path) + "/*.json"))

    speeches = {}

    for json_file in json_files:
        # Path(...).name copes with both "/" and "\\" separators, unlike a
        # manual split on "/"; the author is the file name before its first dot.
        author = Path(json_file).name.split(".")[0]

        with open(json_file, "r", encoding="utf-8") as f:
            raw = json.load(f)

        for row in raw:
            sid = row["id"]
            # Parse once and reuse; the name avoids shadowing datetime.date.
            speech_date = parse_date(row["date"])
            if speech_date < START_DATE:
                continue

            speeches[sid] = {
                "author": author,
                "text": row["text"],
                "date": speech_date,
            }
    return speeches

def load_topic_scores(path=TOPIC_SCORE_FOLDER, model_key="gpt-5"):
    """
    Load topic scores from every ``*.json`` file directly inside ``path``.

    Each file is read as a JSON list of records; every record must provide
    an ``"id"`` key and a ``model_key`` key.

    Args:
        path: Folder containing the topic-score JSON files.
        model_key: Record key whose value is stored as the score. Defaults to
            ``"gpt-5"``; pass another key if your JSON differs.

    Returns:
        dict mapping speech id -> ``row[model_key]``.

    Raises:
        KeyError: If a record lacks ``"id"`` or ``model_key``.
    """
    # Sorted so that, when the same id occurs in more than one file, the record
    # that wins does not depend on the filesystem's listing order.
    json_files = sorted(glob.glob(str(path) + "/*.json"))
    scores = {}

    for json_file in json_files:
        with open(json_file, "r", encoding="utf-8") as f:
            raw = json.load(f)

        for row in raw:
            scores[row["id"]] = row[model_key]
    return scores


# Backward-compatible alias: the function was originally defined under this
# misspelled name, and existing cells still call it.
oad_topic_scores = load_topic_scores

def load_rates(path=RATES_FILE):
    """
    Read the rates workbook and return its ``"Rate"`` series indexed by date.

    Every cell of the ``"Date"`` column is converted to text, cut at the first
    space (which drops a trailing time-of-day part, if any) and passed to
    ``parse_date``. The frame is then indexed by ``"Date"``, sorted by that
    index in ascending order, and reduced to the ``"Rate"`` column.

    Args:
        path: Excel file to read; it must contain "Date" and "Rate" columns.

    Returns:
        DataFrame with the single column ``"Rate"``, indexed by the parsed
        dates.
    """
    sheet = pd.read_excel(path)
    # Keep only the text before the first space, then parse that token.
    sheet["Date"] = sheet["Date"].apply(
        lambda cell: parse_date(str(cell).split(" ")[0])
    )
    by_date = sheet.set_index("Date").sort_index()
    return by_date[["Rate"]]

In [57]:
# Load the three inputs into notebook-level variables and print a quick
# size check for each.
rates = load_rates()  # DataFrame with a single "Rate" column, indexed by Date
print(rates)
speeches = load_speeches()  # dict: speech id -> {"author", "text", "date"}
print(len(speeches))  # speeches kept after the START_DATE filter
scores = oad_topic_scores()  # dict: speech id -> that record's "gpt-5" value
print(len(scores))  # scores are not date-filtered, so this can exceed len(speeches)


                Rate
Date                
2018-06-04  2.141069
2018-06-05  2.156929
2018-06-06  2.150165
2018-06-07  2.169461
2018-06-08  2.159822
...              ...
2025-10-21  3.407672
2025-10-22  3.395943
2025-10-23  3.438809
2025-10-24  3.427455
2025-10-27  3.431500

[1841 rows x 1 columns]
1038
1176
