# Gold Price Forecasting with Geopolitical Risk
## Time-Series Forecasting using XGBoost

This notebook builds a forecasting model for next-day gold prices using:
- Historical gold and silver prices
- Geopolitical risk indices (GPRD, GPRD_ACT, GPRD_THREAT)
- Engineered features: lags, rolling statistics, calendar features

## 1. Imports & Path Setup

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from xgboost import XGBRegressor
import pickle

# Plot styling shared by every figure in this notebook.
sns.set_style("whitegrid")

# Project layout: everything hangs off the parent of the working directory.
BASE_DIR = Path("..").resolve()
DATA_RAW_DIR = BASE_DIR.joinpath("data", "raw")
OUTPUTS_PLOTS_DIR = BASE_DIR.joinpath("outputs", "plots")
OUTPUTS_FORECASTS_DIR = BASE_DIR.joinpath("outputs", "forecasts")
MODELS_DIR = BASE_DIR.joinpath("models")

# Make sure every output location exists before later cells write to it.
for _output_dir in (OUTPUTS_PLOTS_DIR, OUTPUTS_FORECASTS_DIR, MODELS_DIR):
    _output_dir.mkdir(parents=True, exist_ok=True)

print(f"Base directory: {BASE_DIR}")
print(f"Data directory: {DATA_RAW_DIR}")

## Multi-Metal Data Collection

Fetch prices for all eight metals: the precious metals (Gold, Silver, Platinum, Palladium) from FRED and the industrial metals (Copper, Aluminum, Nickel, Zinc) from MetalPriceAPI.

In [None]:
# ==============================================================================
# MULTI-METAL DATA COLLECTION
# ==============================================================================

import requests
from fredapi import Fred
from datetime import datetime, timedelta
import time

# API keys
# Credentials are read from the environment so they never live in the notebook
# or its version history.
#   FRED_API_KEY      - get one from https://fred.stlouisfed.org/docs/api/api_key.html
#   METALPRICEAPI_KEY - MetalPriceAPI access key
# NOTE(review): a MetalPriceAPI key used to be hard-coded on this line; treat it
# as leaked and rotate it.
import os

FRED_API_KEY = os.environ.get('FRED_API_KEY', 'your_fred_api_key_here')
METALPRICEAPI_KEY = os.environ.get('METALPRICEAPI_KEY', '')

if not METALPRICEAPI_KEY:
    print('Warning: METALPRICEAPI_KEY is not set; MetalPriceAPI requests will fail.')

# Initialize FRED
fred = Fred(api_key=FRED_API_KEY)

# ==============================================================================
# METAL CONFIGURATION
# ==============================================================================

# Precious metals from FRED (full historical data 2015-2026): metal -> FRED series id
_FRED_SERIES_IDS = {
    'GOLD': 'GOLDAMGBD228NLBM',
    'SILVER': 'SILVERPRICE',
    'PLATINUM': 'DPLATINUMUSD',
    'PALLADIUM': 'DPALLADIUMUSD',
}

# Industrial metals from MetalPriceAPI (2025-06-19 onwards): metal -> API symbol
_METALPRICEAPI_SYMBOLS = {
    'COPPER': 'XCU',
    'ALUMINUM': 'ALU',
    'NICKEL': 'NI',
    'ZINC': 'ZNC',
}

# Per-metal fetch configuration. Insertion order (FRED metals first) is the
# order in which the metals are fetched and their columns are created.
METAL_CONFIG = {}
for _metal, _series_id in _FRED_SERIES_IDS.items():
    METAL_CONFIG[_metal] = {'source': 'fred', 'series_id': _series_id}
for _metal, _symbol in _METALPRICEAPI_SYMBOLS.items():
    METAL_CONFIG[_metal] = {'source': 'metalpriceapi', 'symbol': _symbol}

# ==============================================================================
# DATA FETCHING FUNCTIONS
# ==============================================================================

def fetch_metalpriceapi_timeframe(symbols, start_date, end_date, api_key, timeout=30):
    """
    Fetch historical metal prices from the MetalPriceAPI ``timeframe`` endpoint.

    The endpoint accepts at most 365 days per request; keeping
    ``start_date``..``end_date`` within that limit is the caller's job.

    Parameters
    ----------
    symbols : iterable of str
        Metal symbols, sent comma-joined as the ``currencies`` query parameter.
    start_date, end_date : str
        Range bounds, passed through unchanged as query parameters.
    api_key : str
        MetalPriceAPI access key.
    timeout : float, default 30
        Seconds to wait for the HTTP response. Without a timeout a stalled
        connection would hang the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        One row per date (ascending DatetimeIndex) and one column per key of
        the per-day ``rates`` mapping, holding ``1 / rate``. An empty DataFrame
        is returned when ``symbols`` is empty, when the request fails, or when
        the payload is not in the expected shape; the reason is printed.
    """
    symbols = list(symbols)
    if not symbols:
        # Nothing to ask for: skip the network round-trip entirely.
        return pd.DataFrame()

    base_url = 'https://api.metalpriceapi.com/v1/timeframe'

    params = {
        'api_key': api_key,
        'start_date': start_date,
        'end_date': end_date,
        'base': 'USD',
        'currencies': ','.join(symbols)
    }

    try:
        response = requests.get(base_url, params=params, timeout=timeout)
        response.raise_for_status()
        data = response.json()
    except requests.RequestException as e:
        # Connection problems, timeouts and non-2xx responses.
        print(f"Error fetching MetalPriceAPI data: {e}")
        return pd.DataFrame()
    except ValueError as e:
        # Response body was not valid JSON.
        print(f"Error fetching MetalPriceAPI data: {e}")
        return pd.DataFrame()

    if not isinstance(data, dict) or not data.get('success'):
        error_info = data.get('error') if isinstance(data, dict) else None
        error_msg = error_info.get('info', 'Unknown error') if isinstance(error_info, dict) else 'Unknown error'
        print(f"API Error: {error_msg}")
        return pd.DataFrame()

    try:
        df = pd.DataFrame.from_dict(data['rates'], orient='index')
        df.index = pd.to_datetime(df.index)
    except (KeyError, ValueError, TypeError, AttributeError) as e:
        # 'rates' missing or not a {date: {symbol: rate}} mapping.
        print(f"Error fetching MetalPriceAPI data: {e}")
        return pd.DataFrame()

    df = df.sort_index()

    # Convert from rate to price (invert)
    return 1 / df


def _iter_date_windows(start_date, end_date, max_days=365):
    """Yield consecutive ('YYYY-MM-DD', 'YYYY-MM-DD') windows covering the range, each at most ``max_days`` calendar days long."""
    window_start = pd.Timestamp(start_date)
    last_day = pd.Timestamp(end_date)
    while window_start <= last_day:
        window_end = min(window_start + pd.Timedelta(days=max_days - 1), last_day)
        yield window_start.strftime('%Y-%m-%d'), window_end.strftime('%Y-%m-%d')
        window_start = window_end + pd.Timedelta(days=1)


def fetch_all_metals_data(start_date='2015-01-01', end_date=None, metalpriceapi_start='2025-06-19'):
    """
    Fetch all metals from both FRED and MetalPriceAPI.

    Parameters
    ----------
    start_date : str
        First observation date requested from FRED.
    end_date : str or None
        Last date requested from both sources; defaults to today.
    metalpriceapi_start : str
        First date requested from MetalPriceAPI (free-tier history limit).

    Returns
    -------
    pandas.DataFrame
        Date-indexed frame with one ``<METAL>_PRICE`` column per metal that
        could be fetched, sorted by date and forward-filled. Metals whose
        fetch failed are simply absent; the failure is printed.
    """
    if end_date is None:
        end_date = datetime.now().strftime('%Y-%m-%d')

    # FRED metals (full history)
    print('=' * 60)
    print('FETCHING FRED DATA (Gold, Silver, Platinum, Palladium)')
    print('=' * 60)

    fred_series = {}
    for metal, config in METAL_CONFIG.items():
        if config['source'] != 'fred':
            continue
        try:
            print(f"Fetching {metal}... ", end='')
            series = fred.get_series(
                config['series_id'],
                observation_start=start_date,
                observation_end=end_date
            )
            fred_series[f'{metal}_PRICE'] = series
            print(f"\u2713 {len(series)} records")
            time.sleep(0.5)  # Rate limiting

        except Exception as e:
            # Best effort: one failing series must not abort the other metals.
            print(f"\u2717 Error: {e}")

    # Combine on the union of dates. Assigning the series one by one into an
    # empty frame would align every later series to the first one's index and
    # silently drop dates that only the later series have.
    df_all = pd.concat(fred_series, axis=1) if fred_series else pd.DataFrame()

    # MetalPriceAPI metals (limited history)
    print('\n' + '=' * 60)
    print('FETCHING METALPRICEAPI DATA (Copper, Aluminum, Nickel, Zinc)')
    print('=' * 60)
    print(f'Note: Free tier data starts from {metalpriceapi_start}')

    symbol_to_name = {
        config['symbol']: metal
        for metal, config in METAL_CONFIG.items()
        if config['source'] == 'metalpriceapi'
    }
    metalpriceapi_symbols = list(symbol_to_name)

    if metalpriceapi_symbols:
        print(f"Fetching {len(metalpriceapi_symbols)} metals... ", end='')

        # The timeframe endpoint allows at most 365 days per request, so the
        # range is fetched window by window.
        frames = []
        for i, (window_start, window_end) in enumerate(
            _iter_date_windows(metalpriceapi_start, end_date)
        ):
            if i:
                time.sleep(0.5)  # Rate limiting between consecutive requests
            chunk = fetch_metalpriceapi_timeframe(
                metalpriceapi_symbols,
                window_start,
                window_end,
                METALPRICEAPI_KEY
            )
            if not chunk.empty:
                frames.append(chunk)

        df_metalapi = pd.DataFrame()
        if frames:
            df_metalapi = pd.concat(frames)
            df_metalapi = df_metalapi[~df_metalapi.index.duplicated(keep='last')].sort_index()

            # Keep only the symbols that were requested; any other column in
            # the response would otherwise raise a KeyError while renaming.
            known_symbols = [col for col in df_metalapi.columns if col in symbol_to_name]
            df_metalapi = df_metalapi[known_symbols].rename(
                columns={symbol: f"{symbol_to_name[symbol]}_PRICE" for symbol in known_symbols}
            )

        if not df_metalapi.empty:
            # Merge with main dataframe
            if fred_series:
                df_all = df_all.join(df_metalapi, how='outer')
            else:
                df_all = df_metalapi.copy()
            print(f"\u2713 {len(df_metalapi)} records")
        else:
            print('\u2717 Failed')

    # Sort by date, then forward fill missing values (weekends, holidays)
    df_all = df_all.sort_index().ffill()

    return df_all


# ==============================================================================
# EXECUTE DATA COLLECTION
# ==============================================================================

# Run the collection from the notebook's fixed start date up to today.
print('Starting multi-metal data collection...')
print(f"Date range: 2015-01-01 to {datetime.now().strftime('%Y-%m-%d')}\n")

df_metals_raw = fetch_all_metals_data(start_date='2015-01-01')

# Summary of what came back: size, date span, columns and gaps.
_summary_rule = '=' * 60
print('\n' + _summary_rule)
print('DATA COLLECTION SUMMARY')
print(_summary_rule)
print(f"Total records: {len(df_metals_raw)}")
print(f"Date range: {df_metals_raw.index.min()} to {df_metals_raw.index.max()}")
print(f"Metals collected: {len(df_metals_raw.columns)}")
print(f"\nColumns:\n{list(df_metals_raw.columns)}")
print(f"\nMissing values per metal:")
print(df_metals_raw.isnull().sum())

# Display sample
print(f"\nFirst 5 rows:")
print(df_metals_raw.head())

print(f"\nLast 5 rows:")
print(df_metals_raw.tail())

# Save raw data
import os  # left in place so `os` stays defined for any cell that expects it
_data_dir = BASE_DIR / 'data'
_data_dir.mkdir(parents=True, exist_ok=True)
output_path = str(_data_dir / 'all_metals_raw.csv')
df_metals_raw.to_csv(output_path)
print(f"\n\u2713 Raw data saved to: {output_path}")


## Geopolitical Risk Data (GPRD)

Add geopolitical risk indices from FRED and combine with metal price data.

In [None]:
# ==============================================================================
# ADD GEOPOLITICAL RISK DATA
# ==============================================================================

print('\n' + '=' * 60)
print('ADDING GEOPOLITICAL RISK DATA (GPRD)')
print('=' * 60)

# Fetch GPRD data: output column name -> FRED series id.
# NOTE(review): confirm these ids really are the geopolitical-risk series the
# column names promise; 'GEPUCURRENT' looks like an economic-policy-uncertainty
# series rather than a geopolitical-risk one.
gprd_series_ids = {
    'GPRD': 'GEPUCURRENT',
    'GPRD_ACT': 'GPDACT',
    'GPRD_THREAT': 'GPDTHAT'
}

gprd_columns = {}

for name, series_id in gprd_series_ids.items():
    try:
        print(f"Fetching {name}... ", end='')
        series = fred.get_series(
            series_id,
            observation_start='2015-01-01',
            observation_end=datetime.now().strftime('%Y-%m-%d')
        )
        gprd_columns[name] = series
        print(f"\u2713 {len(series)} records")
        time.sleep(0.5)

    except Exception as e:
        # Best effort: a missing risk series should not stop the notebook.
        print(f"\u2717 Error: {e}")

# Union of dates across the risk series that were fetched.
df_gprd = pd.concat(gprd_columns, axis=1) if gprd_columns else pd.DataFrame()

# Combine metals + GPRD
if gprd_columns and len(df_metals_raw) > 0:
    df_gprd = df_gprd[~df_gprd.index.duplicated(keep='last')].sort_index()
    # As-of alignment: every metals date gets the most recent risk value at or
    # before it. An exact-date join would lose risk observations published on
    # dates that are not in the metals index. The trailing bfill only covers
    # metals dates that precede the first risk observation.
    gprd_aligned = df_gprd.reindex(df_metals_raw.index, method='ffill').bfill()
    df_combined = df_metals_raw.join(gprd_aligned, how='left')
else:
    df_combined = df_metals_raw.copy()

# Metal price columns are deliberately NOT back-filled. Back-filling would copy
# a metal's first observed price into every earlier date and fabricate years of
# flat history for metals whose data starts late. Rows before a metal's first
# observation therefore stay NaN for that metal.

print(f"\n\u2713 Combined dataset shape: {df_combined.shape}")
print(f"Columns: {list(df_combined.columns)}")

# Save combined data
output_path = str(BASE_DIR / 'data' / 'metals_gprd_combined.csv')
df_combined.to_csv(output_path)
print(f"\u2713 Combined data saved to: {output_path}")

# Use this for the rest of the analysis
df = df_combined.copy()


## Feature Engineering - All Metals

Create lag features, rolling statistics, time-based features, and target variables for all 8 metals.

In [None]:
# ==============================================================================
# FEATURE ENGINEERING - ALL METALS
# ==============================================================================

print('\n' + '=' * 60)
print('FEATURE ENGINEERING FOR ALL METALS')
print('=' * 60)

df_feat = df.copy()

# Define which metals we have
METAL_NAMES = ['GOLD', 'SILVER', 'PLATINUM', 'PALLADIUM', 'COPPER', 'ALUMINUM', 'NICKEL', 'ZINC']
GPRD_NAMES = ['GPRD', 'GPRD_ACT', 'GPRD_THREAT']

# (feature prefix, source column) for every series actually present in the
# data, metals first and risk indices after, which fixes the column order.
feature_sources = [
    (metal, f'{metal}_PRICE') for metal in METAL_NAMES if f'{metal}_PRICE' in df_feat.columns
] + [
    (gprd, gprd) for gprd in GPRD_NAMES if gprd in df_feat.columns
]

# Lag and rolling columns are collected here and attached with one concat.
# Inserting this many columns one at a time fragments the frame and makes
# pandas emit a PerformanceWarning.
engineered = {}

# ==============================================================================
# 1. LAG FEATURES (for each metal and risk index)
# ==============================================================================

print('\n1. Creating lag features...')
lag_periods = [1, 2, 3, 5, 10]

for prefix, source_col in feature_sources:
    for lag in lag_periods:
        engineered[f'{prefix}_LAG_{lag}'] = df_feat[source_col].shift(lag)
    print(f"  \u2713 {prefix}: {len(lag_periods)} lag features")

# ==============================================================================
# 2. ROLLING STATISTICS (for each metal and risk index)
# ==============================================================================

print('\n2. Creating rolling statistics...')
windows = [5, 10, 20, 30]

for prefix, source_col in feature_sources:
    for window in windows:
        rolling = df_feat[source_col].rolling(window)
        engineered[f'{prefix}_ROLL_MEAN_{window}'] = rolling.mean()
        engineered[f'{prefix}_ROLL_STD_{window}'] = rolling.std()
    print(f"  \u2713 {prefix}: {len(windows) * 2} rolling features")

if engineered:
    df_feat = pd.concat([df_feat, pd.concat(engineered, axis=1)], axis=1)

# ==============================================================================
# 3. TIME-BASED FEATURES
# ==============================================================================

print('\n3. Creating time-based features...')
df_feat['YEAR'] = df_feat.index.year
df_feat['MONTH'] = df_feat.index.month
df_feat['QUARTER'] = df_feat.index.quarter
df_feat['DAYOFWEEK'] = df_feat.index.dayofweek
df_feat['DAYOFYEAR'] = df_feat.index.dayofyear
print('  \u2713 5 time features')

# ==============================================================================
# 4. TARGET VARIABLES (next-day price for each metal)
# ==============================================================================

print('\n4. Creating target variables...')
target_cols = []
for metal in METAL_NAMES:
    price_col = f'{metal}_PRICE'
    if price_col in df_feat.columns:
        df_feat[f'{metal}_TARGET'] = df_feat[price_col].shift(-1)
        target_cols.append(f'{metal}_TARGET')
        print(f"  \u2713 {metal}_TARGET")

# Drop the rows that cannot have complete features/targets by construction:
# the warm-up rows at the start (longest lag, or longest rolling window minus
# one) and the final row, which has no next-day target. This is done by
# position rather than with a global dropna(), so a metal whose history starts
# late (NaN in its own columns) cannot wipe out the rows of every other metal.
initial_rows = len(df_feat)
warmup_rows = max(max(lag_periods), max(windows) - 1)
df_feat = df_feat.iloc[warmup_rows:-1].copy()
if target_cols:
    # A row with no target at all is useless for every model.
    df_feat = df_feat.dropna(how='all', subset=target_cols)
final_rows = len(df_feat)

print(f"\n\u2713 Feature engineering complete!")
print(f"  Rows dropped (NaN): {initial_rows - final_rows}")
print(f"  Final dataset shape: {df_feat.shape}")
print(f"  Total features: {df_feat.shape[1]}")

# Save feature-engineered data
output_path = str(BASE_DIR / 'data' / 'metals_features_engineered.csv')
df_feat.to_csv(output_path)
print(f"\u2713 Feature data saved to: {output_path}")


## Train XGBoost Models for Each Metal

Train individual XGBoost models for each of the 8 metals and evaluate performance.

In [None]:
# ==============================================================================
# TRAIN INDIVIDUAL MODELS FOR EACH METAL
# ==============================================================================

from sklearn.model_selection import train_test_split
import joblib
import os

print('\n' + '=' * 60)
print('TRAINING XGBOOST MODELS FOR ALL METALS')
print('=' * 60)

# Create models directory
os.makedirs(str(MODELS_DIR), exist_ok=True)
os.makedirs(str(BASE_DIR / 'outputs'), exist_ok=True)

# Store model performance
model_performance = {}

# Columns to exclude from features (raw prices, GPRD raw, targets)
exclude_cols = (
    [f'{metal}_PRICE' for metal in METAL_NAMES]
    + list(GPRD_NAMES)
    + [f'{metal}_TARGET' for metal in METAL_NAMES]
)

# The feature set is the same for every metal, so it is built once.
_excluded = set(exclude_cols)
feature_cols = [col for col in df_feat.columns if col not in _excluded]

# Fewest usable rows a metal needs before a chronological 80/20 split and a
# fit are attempted; below this the metal is skipped instead of crashing.
MIN_SAMPLES_PER_MODEL = 20

# Train model for each metal
for metal in METAL_NAMES:
    target_col = f'{metal}_TARGET'

    # Check if we have data for this metal
    if target_col not in df_feat.columns:
        print(f"\n\u2717 Skipping {metal}: No data available")
        continue

    print(f"\n{'=' * 60}")
    print(f"TRAINING MODEL: {metal}")
    print(f"{'=' * 60}")

    # Prepare features and target, keeping only rows where the target is known
    valid_idx = df_feat[target_col].notna()
    X = df_feat.loc[valid_idx, feature_cols]
    y = df_feat.loc[valid_idx, target_col]

    print(f"Dataset size: {len(X)} samples, {len(feature_cols)} features")

    if len(X) < MIN_SAMPLES_PER_MODEL:
        print(f"\u2717 Skipping {metal}: only {len(X)} usable rows "
              f"(need at least {MIN_SAMPLES_PER_MODEL})")
        continue

    # Chronological train/test split (80/20): no shuffling, so the test set is
    # the most recent 20% of rows.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, shuffle=False
    )

    print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")

    # Train XGBoost model
    model = XGBRegressor(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=5,
        random_state=42,
        n_jobs=-1
    )

    print('Training model... ', end='')
    model.fit(X_train, y_train)
    print('\u2713 Done')

    # Evaluate
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    train_mae = mean_absolute_error(y_train, y_train_pred)
    test_mae = mean_absolute_error(y_test, y_test_pred)

    print(f"\nPerformance:")
    print(f"  Train RMSE: ${train_rmse:.2f}")
    print(f"  Test RMSE:  ${test_rmse:.2f}")
    print(f"  Train MAE:  ${train_mae:.2f}")
    print(f"  Test MAE:   ${test_mae:.2f}")

    # Store performance
    model_performance[metal] = {
        'train_rmse': train_rmse,
        'test_rmse': test_rmse,
        'train_mae': train_mae,
        'test_mae': test_mae,
        'n_samples': len(X),
        'n_features': len(feature_cols)
    }

    # Save model
    model_path = str(MODELS_DIR / f'{metal.lower()}_xgb_model.pkl')
    joblib.dump(model, model_path)
    print(f"\u2713 Model saved: {model_path}")

    # Save feature columns for this model
    feature_cols_path = str(MODELS_DIR / f'{metal.lower()}_feature_cols.pkl')
    joblib.dump(feature_cols, feature_cols_path)
    print(f"\u2713 Feature columns saved: {feature_cols_path}")

# ==============================================================================
# MODEL PERFORMANCE SUMMARY
# ==============================================================================

_perf_rule = '=' * 60
print('\n' + _perf_rule)
print('MODEL PERFORMANCE SUMMARY')
print(_perf_rule)

if not model_performance:
    print('No models were trained (no data available).')
else:
    # One row per metal, one column per metric, rounded for display.
    performance_df = pd.DataFrame(model_performance).transpose().round(2)
    print(performance_df)

    # Save performance summary
    perf_path = str(BASE_DIR / 'outputs' / 'model_performance_all_metals.csv')
    performance_df.to_csv(perf_path)
    print(f"\n\u2713 Performance summary saved to: {perf_path}")


## 2. Load and Inspect Data

In [None]:
# Historical gold / silver / geopolitical-risk dataset stored under data/raw.
file_path = DATA_RAW_DIR.joinpath("Gold-Silver-GeopoliticalRisk_HistoricalData.csv")
df = pd.read_csv(file_path)

# Normalize column names: trim surrounding whitespace and upper-case them
df.columns = df.columns.map(lambda column_name: column_name.strip().upper())

print(f"Dataset shape: {df.shape}")
df.head()

In [None]:
# Column dtypes and non-null counts of the freshly loaded frame, to spot missing data before cleaning.
df.info()

In [None]:
# Parse the dates, order the rows chronologically and index by date
df = (
    df.assign(DATE=pd.to_datetime(df['DATE']))
    .sort_values('DATE')
    .set_index('DATE')
)

# Keep only the columns used by the model
_model_columns = ['GOLD_PRICE', 'SILVER_PRICE', 'GPRD', 'GPRD_ACT', 'GPRD_THREAT']
df = df.loc[:, _model_columns]

# Fill gaps from the previous observation first, then from the next one
df = df.ffill()
df = df.bfill()

print(f"\nData after cleaning:")
print(f"Shape: {df.shape}")
print(f"Date range: {df.index.min()} to {df.index.max()}")
print(f"\nMissing values:\n{df.isnull().sum()}")

In [None]:
# Summary statistics of the cleaned price and risk columns.
df.describe()

## 3. Exploratory Data Analysis

In [None]:
# Gold and silver price history on a shared axis
fig, ax = plt.subplots(figsize=(12, 4))
for _column, _label in (('GOLD_PRICE', 'Gold'), ('SILVER_PRICE', 'Silver')):
    ax.plot(df.index, df[_column], label=_label, alpha=0.8)
ax.set_title("Gold & Silver Spot Prices (1985–2025)", fontsize=14)
ax.set_xlabel('Date')
ax.set_ylabel('Price (USD)')
ax.legend()
ax.grid(True, alpha=0.3)
fig.tight_layout()
fig.savefig(OUTPUTS_PLOTS_DIR / "gold_silver_timeseries.png", dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Geopolitical risk index over time
fig, ax = plt.subplots(figsize=(12, 3))
ax.plot(df.index, df['GPRD'], color='crimson', alpha=0.8)
ax.set_title("Geopolitical Risk Index (GPRD)", fontsize=14)
ax.set_xlabel('Date')
ax.set_ylabel('GPRD')
ax.grid(True, alpha=0.3)
fig.tight_layout()
fig.savefig(OUTPUTS_PLOTS_DIR / "gprd_timeseries.png", dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Pairwise correlations between the price and risk columns
corr_matrix = df.corr()

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", center=0, fmt='.2f', ax=ax)
ax.set_title("Correlation Matrix", fontsize=14)
fig.tight_layout()
fig.savefig(OUTPUTS_PLOTS_DIR / "corr_matrix.png", dpi=300, bbox_inches='tight')
plt.show()

## 4. Feature Engineering

In [None]:
# Target: the gold price one row ahead (the final row has no target)
next_day_gold = df['GOLD_PRICE'].shift(periods=-1)
df['GOLD_TARGET'] = next_day_gold

print(f"Target variable created: GOLD_TARGET")
print(f"First few values: {df['GOLD_TARGET'].head()}")

In [None]:
# Lagged copies of gold, silver and GPRD (values from `lag` rows earlier)
lags = [1, 2, 5, 10, 20]
_lag_sources = (('GOLD', 'GOLD_PRICE'), ('SILVER', 'SILVER_PRICE'), ('GPRD', 'GPRD'))

for lag in lags:
    for _prefix, _source in _lag_sources:
        df[f'{_prefix}_LAG_{lag}'] = df[_source].shift(lag)

print(f"\nLag features created for lags: {lags}")
print(f"New columns: {[c for c in df.columns if 'LAG' in c][:6]}...")

In [None]:
# Rolling mean/std of gold and rolling mean of GPRD over trailing windows
windows = [5, 10, 20]

for window in windows:
    gold_window = df['GOLD_PRICE'].rolling(window)
    df[f'GOLD_ROLL_MEAN_{window}'] = gold_window.mean()
    df[f'GOLD_ROLL_STD_{window}'] = gold_window.std()
    gprd_window = df['GPRD'].rolling(window)
    df[f'GPRD_ROLL_MEAN_{window}'] = gprd_window.mean()

print(f"\nRolling features created for windows: {windows}")
print(f"New columns: {[c for c in df.columns if 'ROLL' in c][:6]}...")

In [None]:
# Calendar features taken from the DatetimeIndex
_calendar_features = (('YEAR', 'year'), ('MONTH', 'month'), ('DAYOFWEEK', 'dayofweek'))
for _feature_name, _index_attr in _calendar_features:
    df[_feature_name] = getattr(df.index, _index_attr)

print(f"\nTime-based features created: YEAR, MONTH, DAYOFWEEK")

In [None]:
# Keep only fully populated rows; lags, rolling windows and the target leave
# NaNs at the edges of the frame.
print(f"\nBefore dropping NaNs: {df.shape}")
_complete_rows = df.notna().all(axis=1)
df_model = df.loc[_complete_rows].copy()
print(f"After dropping NaNs: {df_model.shape}")
print(f"Rows dropped: {len(df) - len(df_model)}")

In [None]:
# Dtypes and non-null counts of the modelling frame after the NaN rows were dropped.
df_model.info()

## 5. Train/Validation Split

In [None]:
# Chronological split: rows dated before the cutoff train the model, rows on
# or after it are held out for validation.
cutoff_date = "2020-01-01"

train = df_model.loc[df_model.index < cutoff_date]
val = df_model.loc[df_model.index >= cutoff_date]

# Every column except the target is used as a feature
feature_cols = df_model.columns.drop('GOLD_TARGET').tolist()
X_train, y_train = train[feature_cols], train['GOLD_TARGET']
X_val, y_val = val[feature_cols], val['GOLD_TARGET']

print(f"Train set: {X_train.shape}, period: {train.index.min()} to {train.index.max()}")
print(f"Validation set: {X_val.shape}, period: {val.index.min()} to {val.index.max()}")
print(f"\nNumber of features: {len(feature_cols)}")

## 6. Baseline Models

In [None]:
# Baseline 1: naive forecast, where tomorrow's price is predicted to equal today's
y_val_naive = val['GOLD_PRICE'].reindex(y_val.index)

naive_mse = mean_squared_error(y_val, y_val_naive)
rmse_naive = np.sqrt(naive_mse)
mae_naive = mean_absolute_error(y_val, y_val_naive)

print(f"Naive Baseline (today's price):")
print(f"  RMSE: {rmse_naive:.2f}")
print(f"  MAE: {mae_naive:.2f}")

In [None]:
# Baseline 2: 5-day moving average
# The average is kept as a standalone Series instead of being stored on
# df_model. Adding a column to df_model at this point would silently become an
# extra model feature if the train/validation split cell were re-run, because
# that cell uses every df_model column except the target.
gold_ma_5 = df_model['GOLD_PRICE'].rolling(5).mean()
val_ma = gold_ma_5.loc[y_val.index]

# The first rows of a rolling mean are NaN; score only dates where the average
# exists so the metrics cannot fail on missing values.
ma_available = val_ma.notna()

rmse_ma = np.sqrt(mean_squared_error(y_val[ma_available], val_ma[ma_available]))
mae_ma = mean_absolute_error(y_val[ma_available], val_ma[ma_available])

print(f"\n5-Day Moving Average Baseline:")
print(f"  RMSE: {rmse_ma:.2f}")
print(f"  MAE: {mae_ma:.2f}")

## 7. Train XGBoost Model

In [None]:
# Hyperparameters for the single-target gold model
xgb_params = {
    'n_estimators': 300,
    'max_depth': 5,
    'learning_rate': 0.05,
    'subsample': 0.9,
    'colsample_bytree': 0.9,
    'random_state': 42,
    'n_jobs': -1,
}
xgb_model = XGBRegressor(**xgb_params)

print("Training XGBoost model...")
xgb_model.fit(X_train, y_train)
print("Model training complete!")

In [None]:
# Predict the validation period
y_val_pred = xgb_model.predict(X_val)

# Error metrics on the validation set
val_errors = y_val - y_val_pred
rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
mae = mean_absolute_error(y_val, y_val_pred)
mape = np.mean(np.abs(val_errors / y_val)) * 100
r2 = r2_score(y_val, y_val_pred)

print(f"\nXGBoost Model Performance:")
print(f"  RMSE: {rmse:.2f}")
print(f"  MAE: {mae:.2f}")
print(f"  MAPE: {mape:.2f}%")
print(f"  R²: {r2:.4f}")

# Relative RMSE change against each baseline (positive = XGBoost is better)
print(f"\nImprovement over baselines:")
print(f"  vs Naive: {((rmse_naive - rmse) / rmse_naive * 100):.1f}% reduction in RMSE")
print(f"  vs 5-day MA: {((rmse_ma - rmse) / rmse_ma * 100):.1f}% reduction in RMSE")

## 8. Visualizations

In [None]:
# Actual vs predicted gold price over the validation period
fig, ax = plt.subplots(figsize=(14, 5))
ax.plot(y_val.index, y_val, label="Actual", alpha=0.8, linewidth=1.5)
ax.plot(y_val.index, y_val_pred, label="Predicted (XGBoost)", alpha=0.8, linewidth=1.5)
ax.set_title(f"Gold Price – Actual vs Predicted (RMSE={rmse:.2f}, R²={r2:.4f})", fontsize=14)
ax.set_xlabel('Date')
ax.set_ylabel('Gold Price (USD)')
ax.legend()
ax.grid(True, alpha=0.3)
fig.tight_layout()
fig.savefig(OUTPUTS_PLOTS_DIR / "gold_actual_vs_predicted_xgb.png", dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# The 20 features the fitted model weights most heavily
importances = xgb_model.feature_importances_
fi = (
    pd.Series(importances, index=feature_cols)
    .sort_values(ascending=False)
    .head(20)
)

# Plotted ascending so the most important feature ends up at the top of the chart
fig, ax = plt.subplots(figsize=(10, 8))
fi.sort_values().plot(kind="barh", color='steelblue', ax=ax)
ax.set_title("Top 20 Feature Importances (XGBoost)", fontsize=14)
ax.set_xlabel('Importance')
fig.tight_layout()
fig.savefig(OUTPUTS_PLOTS_DIR / "feature_importance_xgb.png", dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Residual diagnostics: residuals against predictions, and their distribution
residuals = y_val - y_val_pred

fig, (ax_scatter, ax_hist) = plt.subplots(1, 2, figsize=(12, 4))

ax_scatter.scatter(y_val_pred, residuals, alpha=0.5)
ax_scatter.axhline(y=0, color='r', linestyle='--', linewidth=2)
ax_scatter.set_xlabel('Predicted Values')
ax_scatter.set_ylabel('Residuals')
ax_scatter.set_title('Residual Plot')
ax_scatter.grid(True, alpha=0.3)

ax_hist.hist(residuals, bins=50, edgecolor='black', alpha=0.7)
ax_hist.set_xlabel('Residuals')
ax_hist.set_ylabel('Frequency')
ax_hist.set_title('Residual Distribution')
ax_hist.grid(True, alpha=0.3)

fig.tight_layout()
fig.savefig(OUTPUTS_PLOTS_DIR / "residual_analysis.png", dpi=300, bbox_inches='tight')
plt.show()

## 9. Save Model and Forecasts

In [None]:
# Persist the fitted single-target gold model with pickle.
# NOTE(review): the per-metal training loop earlier in this notebook writes a
# joblib file with this same name (MODELS_DIR / 'gold_xgb_model.pkl') whenever
# it trains GOLD. This cell overwrites that file with a model trained on a
# different feature set, so it no longer matches 'gold_feature_cols.pkl'.
# Confirm which model downstream consumers expect under this name.
model_path = MODELS_DIR / "gold_xgb_model.pkl"
with model_path.open("wb") as model_file:
    pickle.dump(xgb_model, model_file)

print(f"Model saved to: {model_path}")

In [None]:
# Validation-period forecasts next to the actual values, one row per date
forecast_columns = {
    "DATE": y_val.index,
    "GOLD_ACTUAL": y_val.values,
    "GOLD_PREDICTED": y_val_pred,
}
forecast_df = pd.DataFrame(forecast_columns)

forecast_path = OUTPUTS_FORECASTS_DIR / "gold_val_forecasts_xgb.csv"
forecast_df.to_csv(forecast_path, index=False)

print(f"Forecasts saved to: {forecast_path}")
print(f"\nFirst few rows:")
forecast_df.head()

In [None]:
# Store the feature list together with the validation metrics so the model can
# be reloaded with the right inputs later
validation_metrics = {'rmse': rmse, 'mae': mae, 'mape': mape, 'r2': r2}
feature_info = {'feature_cols': feature_cols, 'metrics': validation_metrics}

feature_path = MODELS_DIR / "feature_info.pkl"
with feature_path.open("wb") as info_file:
    pickle.dump(feature_info, info_file)

print(f"Feature info saved to: {feature_path}")

## Summary

This notebook has:
1. Loaded and cleaned gold, silver, and geopolitical risk data
2. Created engineered features (lags, rolling stats, calendar features)
3. Built baseline models for comparison
4. Trained an XGBoost model to predict next-day gold prices
5. Evaluated model performance with multiple metrics
6. Visualized results and feature importance
7. Saved the trained model and forecasts

Next steps:
- Run the Streamlit app for interactive exploration
- Consider hyperparameter tuning
- Explore SHAP values for model interpretability
- Test on different time periods