## json fixer


In [77]:
import pandas as pd
import io
import re
import json

# Load the JSON file
with open("data/processed_data/87e4de2f.json", 'r') as f:
    data = json.load(f)

# Extract CSV string and clean markdown formatting
csv_str = data['csv'].replace('```\n', '').replace('```', '').strip()

# Split into lines
lines = csv_str.split('\n')

# Process each line to fix unquoted ranges
fixed_lines = []
for line in lines:
    # Split by comma
    parts = line.split(', ')
    
    fixed_parts = []
    i = 0
    while i < len(parts):
        part = parts[i]
        
        # Check if this looks like the start of a range (number followed by another number)
        # Pattern: a number/negative number followed by a space and another number
        if i + 1 < len(parts) and re.match(r'^-?\d+\.?\d*\s*-?$', part):
            # This might be incomplete - check next part
            next_part = parts[i + 1]
            if re.match(r'^-?\d+\.?\d*', next_part):
                # Combine them and quote
                combined = f'"{part} {next_part}"'
                fixed_parts.append(combined)
                i += 2
                continue
        
        # Check if part contains spaces (unquoted range)
        if ' ' in part.strip() and part.strip() and not part.startswith('"'):
            # Quote it
            fixed_parts.append(f'"{part.strip()}"')
        else:
            fixed_parts.append(part.strip())
        
        i += 1
    
    fixed_lines.append(', '.join(fixed_parts))

# Join back into CSV string
fixed_csv = '\n'.join(fixed_lines)

# Try parsing with pandas
try:
    df = pd.read_csv(io.StringIO(fixed_csv), dtype=str)
    print("✓ Successfully parsed CSV!")
    print(f"\nShape: {df.shape}")
    print(f"\nColumns: {list(df.columns)}")
    print(f"\nFirst few rows:")
    print(df.head())
    
    # Save the fixed CSV
    df.to_csv("data/processed_data/fbb172b7_fixed.csv", index=False)
    print("\n✓ Saved fixed CSV to: data/processed_data/fbb172b7_fixed.csv")
    
except Exception as e:
    print(f"✗ Error: {e}")
    print("\n--- Fixed CSV content (first 500 chars) ---")
    print(fixed_csv[:500])
    
    # Alternative: Manual parsing with error tolerance
    print("\n\nTrying alternative parsing method...")
    
    # Parse header
    header = fixed_lines[0].split(', ')
    print(f"Header has {len(header)} columns")
    
    # Parse data rows
    data_rows = []
    for i, line in enumerate(fixed_lines[1:], start=2):
        parts = line.split(', ')
        print(f"Line {i}: {len(parts)} fields")
        if len(parts) != len(header):
            print(f"  Warning: Expected {len(header)}, got {len(parts)}")
            print(f"  Content: {line[:100]}...")
        data_rows.append(parts)
    
    # Create dataframe manually
    df = pd.DataFrame(data_rows, columns=header)
    print(f"\n✓ Manually created DataFrame")
    print(df.head())

✓ Successfully parsed CSV!

Shape: (11, 16)

Columns: ['Variable', ' "2020 Median"', ' "2021 Median"', ' "2022 Median"', ' "2023 Median"', ' "Longer run Median"', ' "2020 Central Tendency"', ' "2021 Central Tendency"', ' "2022 Central Tendency"', ' "2023 Central Tendency"', ' "Longer run Central Tendency"', ' "2020 Range"', ' "2021 Range"', ' "2022 Range"', ' "2023 Range"', ' "Longer run Range"']

First few rows:
             Variable  "2020 Median"  "2021 Median"     "2022 Median"  \
0  Change in real GDP     "-3.7 4.0"      "3.0 2.5"   "1.9 -4.0 -3.0"   
1     June projection     "-6.5 5.0"      "3.5 1.8"   "1.8 -7.6 -5.5"   
2   Unemployment rate      "7.6 5.5"      "4.6 4.0"    "4.1 7.0 -8.0"   
3     June projection      "9.3 6.5"      "5.5 4.1"   "4.1 9.0 -10.0"   
4       PCE inflation      "1.2 1.7"      "1.8 2.0"    "2.0 1.1 -1.3"   

   "2023 Median"  "Longer run Median"  "2020 Central Tendency"  \
0     "3.6 -4.7"           "2.5 -3.3"               "2.4 -3.0"   
1     "4.5 -

In [16]:
import requests

url = "https://www.federalreserve.gov/monetarypolicy/fomcprojtabl20250319.htm"

# 1. Define headers to look like a browser (Chrome)
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

try:
    # 2. Make the request
    response = requests.get(url, headers=headers)
    
    # 3. Check for errors (200 = OK, 404 = Not Found, 403 = Forbidden)
    response.raise_for_status() 
    
    # 4. Get the HTML
    html_content = response.text
    print("Download successful!")

except requests.exceptions.RequestException as e:
    print(f"Error downloading page: {e}")

Download successful!


In [19]:
import pandas as pd 

html_table = pd.read_html(html_content)
print('len tables:', len(html_table))
# for i , table in enumerate (html_table):
    # print(f"Table {i}:")
    # print(table.head())  # Print first 5 rows of each table
print(html_table[0].to_markdown(index=False))

len tables: 26
| ('Variable', 'Variable')                | ('Median1', '2025')                     | ('Median1', '2026')                     | ('Median1', '2027')                     | ('Median1', 'Longer run')               | ('Central Tendency2', '2025')           | ('Central Tendency2', '2026')           | ('Central Tendency2', '2027')           | ('Central Tendency2', 'Longer run')     | ('Range3', '2025')                      | ('Range3', '2026')                      | ('Range3', '2027')                      | ('Range3', 'Longer run')                |
|:----------------------------------------|:----------------------------------------|:----------------------------------------|:----------------------------------------|:----------------------------------------|:----------------------------------------|:----------------------------------------|:----------------------------------------|:----------------------------------------|:----------------------------------------|:---------------

  html_table = pd.read_html(html_content)


In [27]:
import pandas as pd
import requests

url = "https://www.federalreserve.gov/monetarypolicy/fomcprojtabl20250319.htm"

# fetch with headers to avoid blocking
html = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}).text
html.encode('utf-8') # Fix encoding issues
# FIX: Tell pandas the header is distributed across the first 2 rows (0 and 1)
dfs = pd.read_html(html, header=[0, 1]) 

# Inspect the first table
df = dfs[0]
print(df.head())

              Variable Median1                      Central Tendency2  \
              Variable    2025 2026 2027 Longer run              2025   
0   Change in real GDP     1.7  1.8  1.8        1.8         1.5â1.9   
1  December projection     2.1  2.0  1.9        1.8         1.8â2.2   
2    Unemployment rate     4.4  4.3  4.3        4.2         4.3â4.4   
3  December projection     4.3  4.3  4.3        4.2         4.2â4.5   
4        PCE inflation     2.7  2.2  2.0        2.0         2.6â2.9   

                                       Range3                                   
        2026       2027 Longer run       2025       2026       2027 Longer run  
0  1.6â1.9  1.6â2.0  1.7â2.0  1.0â2.4  0.6â2.5  0.6â2.5  1.5â2.5  
1  1.9â2.1  1.8â2.0  1.7â2.0  1.6â2.5  1.4â2.5  1.5â2.5  1.7â2.5  
2  4.2â4.5  4.1â4.4  3.9â4.3  4.1â4.6  4.1â4.7  3.9â4.7  3.5â4.5  
3  4.1â4.4  4.0â4.4  3.9â4.3  4.2â4.5  3.9â4.6  3.8â4.5  3.5â4.5  
4 

  dfs = pd.read_html(html, header=[0, 1])
