In [25]:
import os
import re
import glob
import pandas as pd
from bs4 import BeautifulSoup

# Directory layout: raw HTML inputs live under RAW_DATA_DIR, CSV outputs under PROCESSED_DIR.
RAW_DATA_DIR = "raw_data"
PROCESSED_DIR = "processed_data"

# One sub-folder per document type inside the raw-data root.
ANNOUNCEMENTS_DIR, INTERMEETING_DIR = (
    os.path.join(RAW_DATA_DIR, subfolder)
    for subfolder in ("announcements", "intermeeting")
)

# Make sure the output folder exists before any CSV is written.
os.makedirs(PROCESSED_DIR, exist_ok=True)

In [27]:
def extract_text_from_html(file_path):
    """Return the visible text of an HTML file as a single space-joined string.

    The file is read as UTF-8 (undecodable bytes are ignored), parsed with
    BeautifulSoup's 'html.parser', and stripped of <script> and <style>
    elements before the text is extracted.

    Args:
        file_path: Path of the HTML file to read.

    Returns:
        The extracted text with empty fragments removed, or "" if anything
        goes wrong (the error is printed, not raised).
    """
    try:
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as handle:
            markup = handle.read()

        parsed = BeautifulSoup(markup, 'html.parser')

        # Script and style bodies are not page text; remove them from the tree.
        for tag in parsed(["script", "style"]):
            tag.extract()

        raw_text = parsed.get_text(separator=' ')

        # Trim every line, break it on double spaces, and keep the non-empty pieces.
        fragments = []
        for raw_line in raw_text.splitlines():
            for piece in raw_line.strip().split("  "):
                piece = piece.strip()
                if piece:
                    fragments.append(piece)

        return ' '.join(fragments)
    except Exception as e:
        # Best-effort: report the failure and let the caller continue with empty text.
        print(f"Error processing {file_path}: {e}")
        return ""

def find_fed_statement_section(text):
    """Cut the statement body out of the full page text.

    The statement is taken to start at the earliest occurrence of any known
    opening phrase and to end just before the first closing marker that
    appears after that start.

    Args:
        text: Full text extracted from the page.

    Returns:
        - `text` unchanged when no opening phrase is present;
        - everything from the start position onward when no closing marker
          follows it;
        - otherwise the slice between the start and the nearest closing marker.
    """
    # Common indicators of the start of the actual statement.
    statement_indicators = [
        "For immediate release",
        "Information received since",
        "Recent indicators suggest",
        "The Federal Open Market Committee",
        "The Committee seeks"
    ]

    start_positions = [
        pos for pos in (text.find(marker) for marker in statement_indicators)
        if pos >= 0
    ]
    if not start_positions:
        return text

    start_idx = min(start_positions)

    # Markers for content that follows the statement.
    end_indicators = [
        "Voting for the",
        "Implementation Note",
        "For media inquiries",
        "Last Update:"
    ]

    # Search strictly after the start. A plain text.find(marker) only reports
    # the first occurrence, so a marker that also appears before the statement
    # (e.g. in a page header) would hide the occurrence that really ends it.
    end_positions = [
        pos for pos in (text.find(marker, start_idx + 1) for marker in end_indicators)
        if pos >= 0
    ]
    if not end_positions:
        return text[start_idx:]

    return text[start_idx:min(end_positions)]

def clean_text(text):
    """Normalise extracted text for sentence splitting.

    Steps, in order:
      1. Remove every character that is not a word character, whitespace,
         or one of ``. , ; : ! ? ( ) -``.
      2. Replace any run of two or more punctuation marks from ``. , ; : ! ?``
         (optionally separated by whitespace) with a single ". ".
      3. Collapse all whitespace runs to one space and trim the ends.

    Whitespace is collapsed last because steps 1 and 2 can themselves create
    adjacent spaces (a removed symbol between two spaces, or the space that
    ". " inserts in front of an existing one).

    NOTE(review): step 1 also drops "/", "%", "$" and quotes, so a fraction
    such as "1/4" becomes "14" — confirm this is acceptable for downstream use.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned, single-spaced, stripped text.
    """
    text = re.sub(r'[^\w\s.,;:!?()-]', '', text)

    text = re.sub(r'[.,;:!?](\s*[.,;:!?])+', '. ', text)

    text = re.sub(r'\s+', ' ', text)

    return text.strip()

def simple_sentence_tokenize(text):
    """
    Split text into sentences using regular expressions only.

    A boundary is whitespace that follows '.', '!' or '?' and precedes an
    uppercase ASCII letter. Any resulting piece longer than 50 words is split
    again at every period followed by whitespace. Pieces are stripped and
    empty ones are dropped; falsy input gives an empty list.
    """
    if not text:
        return []

    pieces = []
    for candidate in re.split(r'(?<=[.!?])\s+(?=[A-Z])', text):
        if len(candidate.split()) <= 50:
            pieces.append(candidate)
            continue
        # Over-long piece: fall back to splitting on every period.
        pieces.extend(re.split(r'(?<=\.)\s+', candidate))

    trimmed = (piece.strip() for piece in pieces)
    return [piece for piece in trimmed if piece]

def identify_topic(sentence, topic_keywords):
    """
    Decide whether a sentence belongs to a topic.

    The sentence matches when some primary keyword occurs in it and at least
    one context word occurs within `window` words on either side of that
    keyword. Matching is case-insensitive and substring-based, so "expect"
    also matches "expected".

    Multi-word primary keywords (e.g. "federal funds rate") are matched
    against a span of as many consecutive words as the keyword has. Comparing
    them with single whitespace-split tokens can never succeed, which would
    silently disable every multi-word keyword.

    Args:
        sentence: Sentence text to classify.
        topic_keywords: Dict with keys 'primary_keywords' (list of str),
            'context_words' (list of str) and 'window' (int, words of context
            kept on each side of the keyword).

    Returns:
        True if any primary keyword / context word pair is found within the
        window, otherwise False.
    """
    primary_keywords = topic_keywords['primary_keywords']
    # Lower-case the context words once instead of on every comparison.
    context_words = [context.lower() for context in topic_keywords['context_words']]
    window = topic_keywords['window']

    words = sentence.lower().split()
    if not words:
        return False
    normalized_sentence = ' '.join(words)

    for primary in primary_keywords:
        primary_tokens = primary.lower().split()
        if not primary_tokens:
            # An empty keyword would match everything; ignore it.
            continue
        phrase = ' '.join(primary_tokens)

        # Cheap rejection before scanning word positions.
        if phrase not in normalized_sentence:
            continue

        span = len(primary_tokens)
        for i in range(len(words) - span + 1):
            if phrase not in ' '.join(words[i:i + span]):
                continue

            # Window covers `window` words before the keyword, the keyword
            # itself, and `window` words after it.
            start = max(0, i - window)
            end = min(len(words), i + span + window)
            window_text = ' '.join(words[start:end])

            if any(context in window_text for context in context_words):
                return True

    return False

In [29]:
def process_files(directory, is_statement=True):
    """Process all HTML files in the given directory into one DataFrame.

    For every ``*.html`` file whose name contains an 8-digit date (YYYYMMDD)
    the text is extracted, optionally narrowed to the statement section,
    cleaned, split into sentences, and each sentence is tagged against four
    keyword-defined topics.

    Files are visited in sorted path order so the row order does not depend on
    the operating system's directory listing. The returned frame always has
    the full set of columns, even when no file yields a row, so callers can
    sort or select by column name without a KeyError.

    Args:
        directory: Folder that contains the HTML files.
        is_statement: When True, only the statement section of each page is
            kept and rows are labelled 'statement'; otherwise the whole page
            text is used and rows are labelled 'intermeeting'.

    Returns:
        pandas.DataFrame with one row per successfully processed file and the
        columns: date, document_type, file_path, full_text, <topic>_text for
        each topic, num_sentences_total, and num_<topic> for each topic.
        Files that raise an error are reported via print and skipped.
    """
    monetary_policy_keywords = {
        'primary_keywords': [
            "federal funds rate", "policy rate", "interest rate", "monetary policy stance",
            "tightening", "easing", "accommodative policy", "restrictive policy", "basis points",
            "target rate", "rate decision", "interest rate target", "tightening cycle",
            "policy tightening", "rate easing", "policy easing", "rate cut", "rate hike", "bps",
            "neutral stance", "normalization", "monetary tightening", "monetary easing",
            "policy shift", "rate setting", "policy adjustment", "real interest rate",
            "nominal interest rate", "terminal rate", "peak rate", "lower bound", "upper bound"],
        'context_words': [
            "increase", "decrease", "maintain", "adjust", "change", "vote", "decision",
            "committee", "unanimous", "raise", "lower", "pause", "resume", "keep", "hold", "cut",
            "hike", "boost", "reduce", "accelerate", "slow", "voted", "majority", "dissent"],
        'window': 20}

    economic_conditions_keywords = {
        'primary_keywords': [
            "inflation", "employment", "unemployment", "economic activity", "growth",
            "GDP", "consumer spending", "business investment", "labor market",
            "price stability", "economic outlook", "output", "real GDP", "nominal GDP", "industrial production",
            "manufacturing output", "personal consumption", "retail sales", "job creation", "wage growth",
            "jobless claims", "participation rate", "productivity", "economic indicators", "recession", "recovery",
            "economic strength", "economic weakness", "economic expansion", "economic contraction",
            "macroeconomic conditions", "price pressure", "cost of living", "core inflation",
            "headline inflation", "CPI", "PCE", "employment cost index", "labor force",],
        'context_words': [
            "increase", "decrease", "improve", "deteriorate", "strengthen", "weaken",
            "expand", "contract", "moderate", "elevated", "stable", "volatile", "cool",
            "heat up", "surge", "plunge", "accelerate", "slow", "trend", "fluctuate",
            "persist", "wane", "remain strong", "remain weak", "pick up", "soften",
            "decline", "boost", "drag", "recover", "rebound", "stagnate"],
        'window': 25}

    forward_guidance_keywords = {
        'primary_keywords': [
            "future", "coming months", "coming meetings", "outlook", "path", "trajectory",
            "forward guidance", "anticipate", "expect", "foresee", "project", "projection",
            "forecast", "estimate", "plan", "intention", "guidance", "signal",
            "likely", "expected path", "policy path", "ahead", "in the near term",
            "in the medium term", "in the long term", "forward-looking", "views",
            "baseline scenario", "confidence", "uncertainty", "assumption", "time horizon"],
        'context_words': [
            "policy", "rate", "stance", "adjust", "accommodation", "tightening",
            "restrictive", "neutral", "appropriate", "Decision", "change", "maintain", "increase",
            "decrease", "reassess", "monitor", "evaluate", "data dependent", "determine", "meeting",
            "statement", "communication", "signal", "consider", "deliberate", "direction", "approach",
            "commitment", "resolve", "probable", "projection error", "scenarios", "clarity"],
        'window': 30}

    balance_sheet_keywords = {
        'primary_keywords': [
            "balance sheet", "asset purchase", "securities", "Treasury securities",
            "agency debt", "mortgage-backed securities", "MBS", "portfolio", "holdings",
            "reinvestment", "quantitative easing", "QE", "runoff", "securities holdings", "bond purchases",
            "liquidity operations", "Fed assets", "balance sheet expansion", "balance sheet reduction", "QT",
            "quantitative tightening", "securities portfolio", "rolloff", "maturity schedule", "portfolio runoff",
            "policy normalization", "Fed balance sheet", "reserve balance", "longer-term securities",
            "monetary base", "monetary aggregates", "balance sheet unwind", "balance sheet management", "reserve drainage"],
        'context_words': [
            "increase", "decrease", "maintain", "continue", "reduce", "expand",
            "cap", "taper", "normalize", "roll over", "maturity", "adjust", "hold steady",
            "wind down", "phase out", "scale back", "limit", "reinvest", "cease", "initiate",
            "accelerate", "slow", "resume", "halt", "redeem", "run off", "implement", "conduct",
            "shift", "monitor", "evaluate", "transition", "framework", "absorb", "inject", "drain"],
        'window': 25}

    # Topic name -> keyword spec. Insertion order fixes the column order below.
    topic_keywords = {
        'monetary_policy': monetary_policy_keywords,
        'economic_conditions': economic_conditions_keywords,
        'forward_guidance': forward_guidance_keywords,
        'balance_sheet': balance_sheet_keywords,
    }

    # Fixed output schema, so an empty result still carries every column.
    result_columns = (
        ['date', 'document_type', 'file_path', 'full_text']
        + [f'{topic}_text' for topic in topic_keywords]
        + ['num_sentences_total']
        + [f'num_{topic}' for topic in topic_keywords]
    )

    results = []
    file_count = 0
    # sorted(): glob returns paths in arbitrary, platform-dependent order.
    for file_path in sorted(glob.glob(os.path.join(directory, "*.html"))):
        try:
            file_count += 1
            if file_count % 10 == 0:
                print(f"Processed {file_count} files...")

            # The document date is the first 8-digit run in the file name.
            file_name = os.path.basename(file_path)
            date_match = re.search(r'(\d{8})', file_name)
            if not date_match:
                continue

            date = pd.to_datetime(date_match.group(1), format='%Y%m%d')

            full_text = extract_text_from_html(file_path)
            text = find_fed_statement_section(full_text) if is_statement else full_text

            cleaned_text = clean_text(text)
            sentences = simple_sentence_tokenize(cleaned_text)

            # Nothing to classify for this file.
            if not sentences:
                print(f"No sentences found in {file_path}")
                continue

            topic_sentences = {
                topic: [s for s in sentences if identify_topic(s, keywords)]
                for topic, keywords in topic_keywords.items()
            }

            result = {
                'date': date,
                'document_type': 'statement' if is_statement else 'intermeeting',
                'file_path': file_path,
                'full_text': cleaned_text,
            }
            for topic, matched in topic_sentences.items():
                result[f'{topic}_text'] = ' '.join(matched)
            result['num_sentences_total'] = len(sentences)
            for topic, matched in topic_sentences.items():
                result[f'num_{topic}'] = len(matched)

            results.append(result)

        except Exception as e:
            # Best-effort batch job: report the failing file and carry on.
            print(f"Error processing {file_path}: {e}")

    return pd.DataFrame(results, columns=result_columns)


In [31]:
def main():
    """Run the full pipeline: statements, intermeeting minutes, then the combined set.

    Each document type is processed from its own input folder and written to
    its own CSV in PROCESSED_DIR; the two frames are then concatenated,
    ordered by date and written to a third CSV.
    """
    print("Processing FED statements!")
    statements_df = process_files(ANNOUNCEMENTS_DIR, is_statement=True)
    statements_csv = os.path.join(PROCESSED_DIR, "processed_statements.csv")
    statements_df.to_csv(statements_csv, index=False)

    print("Processing intermeeting minutes!")
    intermeeting_df = process_files(INTERMEETING_DIR, is_statement=False)
    intermeeting_csv = os.path.join(PROCESSED_DIR, "processed_intermeeting.csv")
    intermeeting_df.to_csv(intermeeting_csv, index=False)

    print("Combining datasets!")
    stacked = pd.concat([statements_df, intermeeting_df], ignore_index=True)
    combined_df = stacked.sort_values('date')
    combined_csv = os.path.join(PROCESSED_DIR, "processed_fed_documents.csv")
    combined_df.to_csv(combined_csv, index=False)

    print("Text processing complete!")
    n_statements, n_intermeeting = len(statements_df), len(intermeeting_df)
    print(f"Processed {n_statements} statements and {n_intermeeting} intermeeting minutes.")

# Run the pipeline only when this code is executed directly, not when it is imported.
if __name__ == "__main__":
    main()

Processing FED statements!
Processed 10 files...
Processed 20 files...
Processed 30 files...
Processed 40 files...
Processed 50 files...
Processed 60 files...
Processed 70 files...
Processed 80 files...
Processed 90 files...
Processed 100 files...
Processed 110 files...
Processed 120 files...
Processed 130 files...
Processed 140 files...
Processed 150 files...
Processed 160 files...
Processed 170 files...
Processed 180 files...
Processed 190 files...
Processed 200 files...
Processing intermeeting minutes!
Processed 10 files...
Processed 20 files...
Processed 30 files...
Processed 40 files...
Processed 50 files...
Processed 60 files...
Processed 70 files...
Processed 80 files...
Processed 90 files...
Processed 100 files...
Processed 110 files...
Processed 120 files...
Processed 130 files...
Processed 140 files...
Processed 150 files...
Processed 160 files...
Processed 170 files...
Processed 180 files...
Processed 190 files...
Processed 200 files...
Combining datasets!
Text processing co