# Data Integration: Inflation & Consumer Spending

1. Gets CPI data from FRED API
2. Gets PCE data from FRED website
3. Merges them together
4. Creates useful new variables (like real spending)
5. Checks data quality
6. Saves everything with metadata

**Time period:** January 2015 to December 2024

## Setup

In [1]:
# Import libraries
from pathlib import Path
from datetime import datetime, timezone
import json
import hashlib
import io
import re

import pandas as pd
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

In [2]:
# Set up folders
ROOT = Path.cwd()
RAW = ROOT / "data" / "raw"
PROCESSED = ROOT / "data" / "processed"
# Date range
START = "2015-01-01"
END = "2024-12-31"


## Helper Functions

In [3]:
def sha256_checksum(filepath):
    #Calculate SHA-256 checksum of a file
    h = hashlib.sha256()
    with open(filepath, 'rb') as f:
        for chunk in iter(lambda: f.read(8192), b""):
            h.update(chunk)
    return h.hexdigest()

def save_metadata(csv_path, info):
    #Save metadata JSON file next to the CSV
    info['file_size_bytes'] = csv_path.stat().st_size
    info['sha256'] = sha256_checksum(csv_path)
    info['retrieved_at_utc'] = datetime.now(timezone.utc).isoformat()
    
    json_path = csv_path.with_name(csv_path.stem + '_metadata.json')
    json_path.write_text(json.dumps(info, indent=2))
    return json_path

def create_session():
    #Create HTTP session with retry logic
    session = requests.Session()
    retry = Retry(total=5, backoff_factor=0.5, status_forcelist=[429, 500, 502, 503, 504])
    session.mount('https://', HTTPAdapter(max_retries=retry))
    return session


## Load API Key

In [4]:
# Read API key from file
api_key_file = ROOT / "fred_api_key.txt"
api_key = api_key_file.read_text(encoding='utf-8-sig').strip()

# Clean it up (remove quotes, spaces, etc.)
api_key = re.sub(r'\s+', '', api_key.strip('"').strip("'"))

## 1. Get CPI Data (from API)

In [5]:
# API request
url = "https://api.stlouisfed.org/fred/series/observations"
params = {
    'series_id': 'CPIAUCSL',
    'api_key': api_key,
    'file_type': 'json',
    'observation_start': START,
    'observation_end': END,
}

session = create_session()
response = session.get(url, params=params, timeout=60)
response.raise_for_status()

# Parse data
data = response.json()['observations']
cpi_df = pd.DataFrame(data)[['date', 'value']]
cpi_df['date'] = pd.to_datetime(cpi_df['date'])
cpi_df['value'] = pd.to_numeric(cpi_df['value'])
cpi_df = cpi_df.rename(columns={'value': 'cpi'})

# Save
cpi_path = RAW / 'CPIAUCSL.csv'
cpi_df.to_csv(cpi_path, index=False)

# Save metadata
save_metadata(cpi_path, {
    'series_id': 'CPIAUCSL',
    'source': 'FRED API',
    'description': 'Consumer Price Index for All Urban Consumers',
    'row_count': len(cpi_df),
})

print(f"CPI data saved: {len(cpi_df)} rows")
cpi_df.head()

CPI data saved: 120 rows


Unnamed: 0,date,cpi
0,2015-01-01,234.747
1,2015-02-01,235.342
2,2015-03-01,235.976
3,2015-04-01,236.222
4,2015-05-01,237.001


## 2. Get PCE Data (from CSV)

In [6]:
# Try multiple download URLs
urls = [
    "https://fred.stlouisfed.org/series/PCE/downloaddata/PCE.csv",
    "https://fred.stlouisfed.org/graph/fredgraph.csv?id=PCE",
]

session = create_session()
for url in urls:
    try:
        response = session.get(url, timeout=60)
        response.raise_for_status()
        
        # Parse CSV
        pce_df = pd.read_csv(io.BytesIO(response.content))
        
        # Clean up column names
        pce_df.columns = [c.strip().lower() for c in pce_df.columns]
        
        # Rename columns to standard names
        if 'observation_date' in pce_df.columns:
            pce_df = pce_df.rename(columns={'observation_date': 'date'})
        if 'value' in pce_df.columns:
            pce_df = pce_df.rename(columns={'value': 'pce'})
        elif 'pce' not in pce_df.columns:
            # If there's only one other column, use it
            other_cols = [c for c in pce_df.columns if c != 'date']
            if len(other_cols) == 1:
                pce_df = pce_df.rename(columns={other_cols[0]: 'pce'})
        
        # Keep only what we need
        pce_df = pce_df[['date', 'pce']]
        pce_df['date'] = pd.to_datetime(pce_df['date'])
        pce_df['pce'] = pd.to_numeric(pce_df['pce'])
        
        # Filter to our date range
        pce_df = pce_df[(pce_df['date'] >= START) & (pce_df['date'] <= END)]
        
        print(f"Downloaded from: {url}")
        break
    except Exception as e:
        print(f"Failed: {url}")
        continue

# Save
pce_path = RAW / 'PCE.csv'
pce_df.to_csv(pce_path, index=False)

# Save metadata
save_metadata(pce_path, {
    'series_id': 'PCE',
    'source': 'FRED CSV Download',
    'description': 'Personal Consumption Expenditures',
    'row_count': len(pce_df),
})

pce_df.head()

Failed: https://fred.stlouisfed.org/series/PCE/downloaddata/PCE.csv
Downloaded from: https://fred.stlouisfed.org/graph/fredgraph.csv?id=PCE


Unnamed: 0,date,pce
672,2015-01-01,12066.7
673,2015-02-01,12116.6
674,2015-03-01,12176.1
675,2015-04-01,12209.1
676,2015-05-01,12275.4


## 3. Merge Datasets

In [7]:
# Merge on date
merged = pd.merge(cpi_df, pce_df, on='date', how='inner')
merged = merged.sort_values('date').reset_index(drop=True)

merged.head()

Unnamed: 0,date,cpi,pce
0,2015-01-01,234.747,12066.7
1,2015-02-01,235.342,12116.6
2,2015-03-01,235.976,12176.1
3,2015-04-01,236.222,12209.1
4,2015-05-01,237.001,12275.4


## 4. Create New Variables (Enrichment)

In [8]:
# 1. CPI Index (2015-01 = 100)
base_cpi = merged.loc[merged['date'] == '2015-01-01', 'cpi'].iloc[0]
merged['cpi_index_2015_01_100'] = (merged['cpi'] / base_cpi) * 100

# 2. Real PCE (inflation-adjusted)
merged['real_pce'] = merged['pce'] / (merged['cpi_index_2015_01_100'] / 100)

# 3. Year-over-year growth rates
merged['pce_yoy_pct'] = merged['pce'].pct_change(12) * 100
merged['real_pce_yoy_pct'] = merged['real_pce'].pct_change(12) * 100
merged['cpi_yoy_pct'] = merged['cpi'].pct_change(12) * 100

merged.tail()

Unnamed: 0,date,cpi,pce,cpi_index_2015_01_100,real_pce,pce_yoy_pct,real_pce_yoy_pct,cpi_yoy_pct
115,2024-08-01,314.131,20001.3,133.816833,14946.774343,5.52938,2.844206,2.610914
116,2024-09-01,314.851,20147.6,134.123546,15021.672655,5.772229,3.260378,2.432541
117,2024-10-01,315.564,20226.0,134.427277,15046.053485,5.843198,3.189772,2.571403
118,2024-11-01,316.449,20313.6,134.804279,15068.957902,5.916398,3.117612,2.714168
119,2024-12-01,317.603,20514.3,135.295872,15162.546897,6.382659,3.412279,2.872366


## 5. Quality Check

In [9]:
# Missing values
missing = merged.isna().sum()
for col, count in missing.items():
    if count > 0:
        print(f"{col}: {count} missing")
        if '_yoy_' in col:
            print(f"OK: First 12 months have no prior year")
    else:
        print(f"{col}: no missing values")

# Duplicates
duplicates = merged.duplicated('date').sum()
print(f"\n{'!' if duplicates == 0 else ' '} Duplicate dates: {duplicates}")

# Date coverage
print(f"\n Date range: {merged['date'].min().date()} to {merged['date'].max().date()}")

date: no missing values
cpi: no missing values
pce: no missing values
cpi_index_2015_01_100: no missing values
real_pce: no missing values
pce_yoy_pct: 12 missing
OK: First 12 months have no prior year
real_pce_yoy_pct: 12 missing
OK: First 12 months have no prior year
cpi_yoy_pct: 12 missing
OK: First 12 months have no prior year

! Duplicate dates: 0

 Date range: 2015-01-01 to 2024-12-01


## 6. Save Final Dataset

In [10]:
# Save processed data
output_path = PROCESSED / 'macro_monthly.csv'
merged.to_csv(output_path, index=False)

# Save metadata
save_metadata(output_path, {
    'description': 'Integrated CPI and PCE data with derived variables',
    'sources': ['CPIAUCSL (FRED API)', 'PCE (FRED CSV)'],
    'row_count': len(merged),
    'columns': list(merged.columns),
    'derived_variables': {
        'cpi_index_2015_01_100': 'CPI normalized to 2015-01 = 100',
        'real_pce': 'Inflation-adjusted PCE (2015 dollars)',
        'pce_yoy_pct': 'YoY % change in nominal PCE',
        'real_pce_yoy_pct': 'YoY % change in real PCE',
        'cpi_yoy_pct': 'YoY % change in CPI (inflation rate)'
    }
})

print(f"Final dataset saved: {output_path}")

Final dataset saved: /Users/shihaoyu/Desktop/IS477_project/data/processed/macro_monthly.csv
