In [25]:
import pandas as pd
import numpy as np
from fredapi import Fred
import yfinance as yf
import os

start_date = '2003-01-01'
end_date = '2025-05-16'
fred = Fred(api_key='paste fred api key here')

def fetch_eia_data(filename, start_date, end_date):
    df = pd.read_excel(filename, sheet_name=1, skiprows=2)
    
    if not pd.api.types.is_datetime64_any_dtype(df['Date']):
        df['Date'] = pd.to_datetime(df['Date'], format='%b %d, %Y')
    
    df = df[(df['Date'] >= start_date) & (df['Date'] <= end_date)]
    return df

In [26]:
DATA_DIR = "data"  # Local folder
os.makedirs(DATA_DIR, exist_ok=True)  # Create if missing

# Path to your local file
DATASET_EIA = os.path.join(DATA_DIR, "RNGWHHDd.xls")

# Fetch Natural Gas Spot Prices
natural_gas_prices = fetch_eia_data(DATASET_EIA, start_date, end_date)
df = pd.DataFrame(natural_gas_prices).rename(columns={'Henry Hub Natural Gas Spot Price (Dollars per Million Btu)': 'henry_hub_nat_gas_price'}).set_index('Date') 

df['cushing_crude_oil_price'] = yf.download('CL=F', start_date, end_date)['Close']

df['Momentum_5'] = df['cushing_crude_oil_price'].rolling(window=5).apply(lambda x: (np.diff(x) > 0).sum(), raw=True)
df['Momentum_10'] = df['cushing_crude_oil_price'].rolling(window=10).apply(lambda x: (np.diff(x) > 0).sum(), raw=True)
df['MA_5'] = df['cushing_crude_oil_price'].rolling(window=5).mean()
df['MA_10'] = df['cushing_crude_oil_price'].rolling(window=10).mean()

df['dow_jones_adj_close_price'] = yf.download('^DJI', start=start_date, end=end_date)['Close']
df['nasdaq_adj_close_price'] = fred.get_series('NASDAQCOM', observation_start=start_date, observation_end=end_date)
df['sp_adj_close_price'] = yf.download('^GSPC', start=start_date, end=end_date)['Close']

df['eur_to_usd_exchange_rate'] = yf.download('EURUSD=X', start=start_date, end=end_date)['Close']
df['usd_to_uk_exchange_rate'] = fred.get_series('DEXUSUK', observation_start=start_date, observation_end=end_date)
df['jpy_to_usd_exchange_rate'] = fred.get_series('DEXJPUS', observation_start=start_date, observation_end=end_date)

daily_index = pd.date_range(start=start_date, end=end_date, freq='D')
df['federal_funds_rate'] = fred.get_series('FEDFUNDS', observation_start=start_date, observation_end=end_date).resample('D').ffill().reindex(daily_index, method='ffill')
df['bank_prime_loan_rate'] = fred.get_series('DPRIME', observation_start=start_date, observation_end=end_date)
df['treasury_1_year_rate'] = fred.get_series('GS1', observation_start=start_date, observation_end=end_date).resample('D').ffill().reindex(daily_index, method='ffill')
df['treasury_10_year_rate'] = fred.get_series('GS10', observation_start=start_date, observation_end=end_date).resample('D').ffill().reindex(daily_index, method='ffill')

df['breakeven_inflation_5_year_rate'] = fred.get_series('T5YIE', observation_start=start_date, observation_end=end_date)
df['breakeven_inflation_10_year_rate'] = fred.get_series('T10YIE', observation_start=start_date, observation_end=end_date)

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


In [27]:
# Applying natural logarithm to all columns except interest rates
for column in df.columns:
    if 'rate' not in column:
        df[column] = np.log(df[column])

# Handling any infinities or NaNs that arise from logarithmic transformation or empty data points
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.ffill(inplace=True) # Forward fill

df = df.iloc[10:]
# Export to CSV
df.to_csv(os.path.join(DATA_DIR, 'compiled_dataset.csv'), index=True)

print("Dataset compiled and saved to 'compiled_dataset.csv'.")

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


Dataset compiled and saved to 'compiled_dataset.csv'.
