In [None]:
"""
Reinforcement Learning for Monetary Policy Optimization (v2)
============================================================
Complete implementation with visualizations

Authors: Leonardo Luksic, Krisha Chandnani, Ignacio Orueta
LSE - February 2026

OUTPUT PATH: /Users/leoss/Desktop/Portfolio/Website-/Central bank/Outputs
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from scipy import stats
import warnings
import copy
import os
warnings.filterwarnings('ignore')

# ============================================================================
# CONFIGURATION
# ============================================================================
# Directory that load_and_prepare_data() reads the FRED CSV exports from.
# NOTE(review): machine-specific absolute path — adjust before running elsewhere.
DATA_PATH = '/Users/leoss/Downloads'
# Directory the PNG figures produced by main() are written to.
OUTPUT_PATH = '/Users/leoss/Desktop/Portfolio/Website-/Central bank/Outputs'

# Create output directory if it doesn't exist.
# This runs at import time, so merely importing the module touches the disk.
os.makedirs(OUTPUT_PATH, exist_ok=True)

# ============================================================================
# 1. DATA LOADING AND PREPROCESSING
# ============================================================================

def load_and_prepare_data(data_path):
    """Read the four FRED CSV exports and merge them into a quarterly panel.

    Expects ``CPIAUCSL.csv``, ``UNRATE.csv``, ``FEDFUNDS-1.csv`` and
    ``GDPC1-1.csv`` inside ``data_path``, each with an ``observation_date``
    column plus one value column.

    Args:
        data_path: Directory containing the CSV files.

    Returns:
        DataFrame indexed by the GDP file's dates with the columns
        ``output_gap``, ``inflation``, ``unemployment`` and ``fed_rate``;
        rows with any missing value are dropped.
    """
    rule = "=" * 70
    print(rule)
    print("LOADING DATA")
    print(rule)

    def _read(filename):
        # All exports share the same date column, which becomes the index.
        return pd.read_csv(
            f'{data_path}/{filename}',
            parse_dates=['observation_date'],
            index_col='observation_date',
        )

    cpi = _read('CPIAUCSL.csv')
    unrate = _read('UNRATE.csv')
    fedfunds = _read('FEDFUNDS-1.csv')
    gdp = _read('GDPC1-1.csv')

    print(f"✓ CPI: {len(cpi)} monthly obs")
    print(f"✓ Unemployment: {len(unrate)} monthly obs")
    print(f"✓ Fed Funds: {len(fedfunds)} monthly obs")
    print(f"✓ Real GDP: {len(gdp)} quarterly obs")

    # CPI keeps the first monthly reading of each quarter; inflation is the
    # percentage change against the value four quarters earlier.
    cpi_q = cpi.resample('QS').first()
    cpi_q['inflation'] = cpi_q['CPIAUCSL'].pct_change(4) * 100

    # The two rate series are averaged within each quarter.
    unrate_q = unrate.resample('QS').mean()
    fedfunds_q = fedfunds.resample('QS').mean()

    # Output gap: percentage deviation of GDP from a centred 40-quarter
    # rolling mean (at least 10 observations required per window).
    gdp.columns = ['gdp']
    trend = gdp['gdp'].rolling(window=40, min_periods=10, center=True).mean()
    gdp['gdp_trend'] = trend
    gdp['output_gap'] = ((gdp['gdp'] - trend) / trend) * 100

    # Align every series on the GDP dates, then keep only complete rows.
    merged = pd.DataFrame(
        {
            'output_gap': gdp['output_gap'],
            'inflation': cpi_q['inflation'],
            'unemployment': unrate_q['UNRATE'],
            'fed_rate': fedfunds_q['FEDFUNDS'],
        },
        index=gdp.index,
    )
    merged = merged.dropna()

    first_obs = merged.index.min().strftime('%Y-%m')
    last_obs = merged.index.max().strftime('%Y-%m')
    print(f"\n✓ Merged quarterly data: {len(merged)} observations")
    print(f"  Period: {first_obs} to {last_obs}")

    return merged


def create_features(data, n_lags=2):
    """Add lagged and first-difference columns to the quarterly panel.

    For every lag ``k`` in ``1..n_lags`` the columns ``L{k}_inflation``,
    ``L{k}_output_gap`` and ``L{k}_fed_rate`` are appended, followed by
    ``inflation_change``, ``output_gap_change`` and ``rate_change`` (current
    value minus its first lag). Rows containing any missing value are dropped.

    Args:
        data: DataFrame with ``inflation``, ``output_gap`` and ``fed_rate``.
        n_lags: Number of lags to generate. The change columns read the
            ``L1_*`` columns, so this must be at least 1.

    Returns:
        A new DataFrame; ``data`` itself is not modified.
    """
    print("\n" + "="*70)
    print("CREATING FEATURES")
    print("="*70)

    base_columns = ('inflation', 'output_gap', 'fed_rate')
    change_names = {
        'inflation': 'inflation_change',
        'output_gap': 'output_gap_change',
        'fed_rate': 'rate_change',
    }

    frame = data.copy()

    for k in range(1, n_lags + 1):
        for column in base_columns:
            frame[f'L{k}_{column}'] = frame[column].shift(k)

    for column in base_columns:
        frame[change_names[column]] = frame[column] - frame[f'L1_{column}']

    # The shifts leave NaNs in the leading rows; discard them.
    frame = frame.dropna()

    print(f"✓ Created {n_lags} lags + change features")
    print(f"✓ Final dataset: {len(frame)} observations")

    return frame


# ============================================================================
# 2. ECONOMY MODELS
# ============================================================================

def _fit_best_mlp(train, test, features, target, architectures):
    """Fit one scaled MLP per architecture and keep the best by hold-out R².

    Args:
        train: DataFrame holding the training rows.
        test: DataFrame holding the hold-out rows.
        features: Column names used as regressors.
        target: Name of the column to predict.
        architectures: Iterable of ``hidden_layer_sizes`` tuples to try.

    Returns:
        ``(model, scaler, r2, mse)`` for the winning architecture. The scaler
        is fitted on the training features only; both metrics are computed
        on ``test``.

    Raises:
        ValueError: If no architecture produced a comparable R² (for example
            because the hold-out set is too small for R² to be defined).
    """
    scaler = StandardScaler()
    X_train = scaler.fit_transform(train[features])
    X_test = scaler.transform(test[features])
    y_train, y_test = train[target], test[target]

    best_model, best_r2 = None, -np.inf
    for arch in architectures:
        model = MLPRegressor(
            hidden_layer_sizes=arch, activation='relu', solver='adam',
            alpha=0.01, max_iter=1500, early_stopping=True,
            validation_fraction=0.15, n_iter_no_change=20,
            random_state=42, verbose=False
        )
        model.fit(X_train, y_train)
        r2 = r2_score(y_test, model.predict(X_test))
        # Strict '>' keeps the first architecture on ties.
        if r2 > best_r2:
            best_model, best_r2 = model, r2

    if best_model is None:
        raise ValueError(
            f"No architecture yielded a valid R² for target '{target}' "
            f"(train={len(train)} rows, test={len(test)} rows)"
        )

    mse = mean_squared_error(y_test, best_model.predict(X_test))
    return best_model, scaler, best_r2, mse


def train_economy_models(df, test_ratio=0.2):
    """Train the output-gap and inflation transition models.

    The frame is split chronologically (first ``1 - test_ratio`` share for
    training, the remainder for testing). For each target a small set of MLP
    architectures is fitted and the one with the highest test R² is kept.

    NOTE(review): the architecture is selected on the same hold-out set whose
    R²/MSE are reported, so the printed metrics are optimistically biased.
    A separate validation split would be needed for an unbiased estimate.

    Args:
        df: Feature frame containing the target columns and their lags.
        test_ratio: Share of the most recent rows reserved for testing.

    Returns:
        Dict with the fitted models (``model_y``, ``model_pi``), their
        scalers (``scaler_y``, ``scaler_pi``), the feature name lists
        (``features_y``, ``features_pi``) and a ``metrics`` dict holding
        ``y_r2``, ``y_mse``, ``pi_r2`` and ``pi_mse``.

    Raises:
        ValueError: If no architecture yields a valid R² for a target.
    """
    print("\n" + "="*70)
    print("TRAINING ECONOMY MODELS")
    print("="*70)

    split_idx = int(len(df) * (1 - test_ratio))
    train = df.iloc[:split_idx]
    test = df.iloc[split_idx:]

    print(f"Train: {len(train)} obs | Test: {len(test)} obs")

    architectures = [(32, 16), (64, 32), (64, 32, 16), (128, 64)]

    # Output gap depends only on lagged variables.
    features_y = ['L1_output_gap', 'L2_output_gap', 'L1_inflation', 'L2_inflation',
                  'L1_fed_rate', 'L2_fed_rate']
    model_y, scaler_y, r2_y, mse_y = _fit_best_mlp(
        train, test, features_y, 'output_gap', architectures
    )

    print(f"\n✓ Output Gap Model (best arch: {model_y.hidden_layer_sizes}):")
    print(f"  Test MSE: {mse_y:.4f} | R²: {r2_y:.4f}")

    # Inflation additionally sees the contemporaneous output gap.
    features_pi = ['output_gap', 'L1_output_gap', 'L1_inflation', 'L2_inflation',
                   'L1_fed_rate', 'L2_fed_rate']
    model_pi, scaler_pi, r2_pi, mse_pi = _fit_best_mlp(
        train, test, features_pi, 'inflation', architectures
    )

    print(f"\n✓ Inflation Model (best arch: {model_pi.hidden_layer_sizes}):")
    print(f"  Test MSE: {mse_pi:.4f} | R²: {r2_pi:.4f}")

    return {
        'model_y': model_y, 'model_pi': model_pi,
        'scaler_y': scaler_y, 'scaler_pi': scaler_pi,
        'features_y': features_y, 'features_pi': features_pi,
        'metrics': {'y_r2': r2_y, 'y_mse': mse_y, 'pi_r2': r2_pi, 'pi_mse': mse_pi}
    }


# ============================================================================
# 3. ENVIRONMENT
# ============================================================================

class EconomyEnv:
    """Simulated economy environment for RL.

    The state holds the two most recent values of inflation, output gap and
    policy rate. Each step the learned models predict next-quarter output gap
    and inflation from that state; the action selects the policy rate from a
    discrete grid and becomes the newest rate lag for the following step.

    Args:
        models: Dict with ``model_y``, ``model_pi`` (objects exposing
            ``predict``), ``scaler_y``, ``scaler_pi`` (objects exposing
            ``transform``), ``features_y`` and ``features_pi``.
        historical_df: DataFrame with ``inflation``, ``output_gap`` and
            ``fed_rate`` columns, used to draw initial states and to compute
            normalisation statistics. Needs at least 3 rows.
        inflation_target: Inflation level with zero inflation loss.
        output_gap_target: Output gap with zero output loss.
        omega_pi: Weight on squared inflation deviation.
        omega_y: Weight on squared output-gap deviation.
        omega_smooth: Weight on the squared rate change.
        max_steps: Episode length in steps.
        n_actions: Number of evenly spaced rates on the 0-16 grid.
        normalize_states: Whether observations are standardised by default.

    Raises:
        ValueError: If ``historical_df`` has fewer than 3 rows.
    """

    def __init__(self, models, historical_df,
                 inflation_target=2.0, output_gap_target=0.0,
                 omega_pi=0.5, omega_y=0.5, omega_smooth=0.05,
                 max_steps=40, n_actions=17, normalize_states=True):

        self.model_y = models['model_y']
        self.model_pi = models['model_pi']
        self.scaler_y = models['scaler_y']
        self.scaler_pi = models['scaler_pi']
        self.features_y = models['features_y']
        self.features_pi = models['features_pi']

        self.historical_df = historical_df.reset_index(drop=True)
        # reset() reads rows 1 and 2 at minimum; fail early with a clear
        # message instead of an IndexError from deep inside iloc.
        if len(self.historical_df) < 3:
            raise ValueError(
                "historical_df needs at least 3 rows to seed an episode, "
                f"got {len(self.historical_df)}"
            )

        self.inflation_target = inflation_target
        self.output_gap_target = output_gap_target
        self.omega_pi = omega_pi
        self.omega_y = omega_y
        self.omega_smooth = omega_smooth
        self.max_steps = max_steps

        self.n_actions = n_actions
        self.rate_values = np.linspace(0, 16, n_actions)

        self.normalize_states = normalize_states
        self.state_dim = 6

        # Always computed so that an explicit normalize=True request works
        # even when normalize_states is False.
        self._compute_state_stats()

        self.reset()

    def _compute_state_stats(self):
        """Compute per-feature mean/std of historical states for scaling.

        Each historical state pairs row ``i`` (current) with row ``i - 1``
        (previous) for ``i >= 2``, in the same column layout that
        ``_get_state_array`` produces.
        """
        values = self.historical_df[
            ['inflation', 'output_gap', 'fed_rate']
        ].to_numpy(dtype=float)
        current, previous = values[2:], values[1:-1]

        states = np.empty((len(current), 6))
        states[:, 0::2] = current    # pi_t1, y_t1, i_t1
        states[:, 1::2] = previous   # pi_t2, y_t2, i_t2

        self.state_mean = states.mean(axis=0)
        # Small epsilon avoids division by zero for constant columns.
        self.state_std = states.std(axis=0) + 1e-8

    def reset(self, seed=None):
        """Start a new episode from a randomly drawn historical quarter.

        Args:
            seed: If given, reseeds NumPy's *global* random generator before
                drawing the start index, which makes the draw reproducible
                but also affects other users of ``np.random``.

        Returns:
            The initial observation (see ``_get_state_array``).
        """
        if seed is not None:
            np.random.seed(seed)

        max_start = len(self.historical_df) - 3
        # randint's upper bound is exclusive; max(3, ...) keeps the range
        # non-empty for short histories (start index is then always 2).
        start_idx = np.random.randint(2, max(3, max_start))

        row_t1 = self.historical_df.iloc[start_idx]
        row_t2 = self.historical_df.iloc[start_idx - 1]

        self.state = {
            'pi_t1': row_t1['inflation'], 'pi_t2': row_t2['inflation'],
            'y_t1': row_t1['output_gap'], 'y_t2': row_t2['output_gap'],
            'i_t1': row_t1['fed_rate'], 'i_t2': row_t2['fed_rate']
        }

        self.step_count = 0
        self.episode_history = []
        self.cumulative_reward = 0

        return self._get_state_array()

    def _get_state_array(self, normalize=None):
        """Return the state as ``[pi_t1, pi_t2, y_t1, y_t2, i_t1, i_t2]``.

        Args:
            normalize: ``True``/``False`` to force or disable
                standardisation; ``None`` follows ``self.normalize_states``.
        """
        raw = np.array([
            self.state['pi_t1'], self.state['pi_t2'],
            self.state['y_t1'], self.state['y_t2'],
            self.state['i_t1'], self.state['i_t2']
        ], dtype=np.float32)

        use_normalized = self.normalize_states if normalize is None else normalize
        if use_normalized:
            return (raw - self.state_mean) / self.state_std
        return raw

    def get_raw_state(self):
        """Return the current state in original (unscaled) units."""
        return self._get_state_array(normalize=False)

    def step(self, action):
        """Advance the economy one quarter under the chosen policy rate.

        The predictions for this step use only the two *previous* rates; the
        rate chosen now enters the reward through the smoothing term and
        becomes the newest rate lag for the next step.

        Args:
            action: Index into ``self.rate_values``.

        Returns:
            ``(observation, reward, done, info)`` where ``info`` carries the
            predicted ``inflation`` and ``output_gap`` and the chosen
            ``rate``.

        Raises:
            IndexError: If ``action`` is outside ``[0, n_actions)``. Negative
                indices are rejected rather than wrapping around the grid.
        """
        action_idx = int(action)
        if not 0 <= action_idx < len(self.rate_values):
            raise IndexError(
                f"action {action_idx} out of range for {len(self.rate_values)} actions"
            )
        rate = self.rate_values[action_idx]

        # Feature order must match models['features_y'].
        features_y = np.array([[
            self.state['y_t1'], self.state['y_t2'],
            self.state['pi_t1'], self.state['pi_t2'],
            self.state['i_t1'], self.state['i_t2']
        ]])
        features_y_scaled = self.scaler_y.transform(features_y)
        next_y = float(self.model_y.predict(features_y_scaled)[0])
        # Clipping keeps the simulation from drifting to extreme values.
        next_y = np.clip(next_y, -10, 10)

        # Feature order must match models['features_pi']; the freshly
        # predicted output gap is the contemporaneous regressor.
        features_pi = np.array([[
            next_y, self.state['y_t1'],
            self.state['pi_t1'], self.state['pi_t2'],
            self.state['i_t1'], self.state['i_t2']
        ]])
        features_pi_scaled = self.scaler_pi.transform(features_pi)
        next_pi = float(self.model_pi.predict(features_pi_scaled)[0])
        next_pi = np.clip(next_pi, -5, 20)

        inflation_loss = (next_pi - self.inflation_target) ** 2
        output_loss = (next_y - self.output_gap_target) ** 2
        smoothing_loss = (rate - self.state['i_t1']) ** 2

        # Reward is the negative weighted quadratic loss.
        reward = -(
            self.omega_pi * inflation_loss +
            self.omega_y * output_loss +
            self.omega_smooth * smoothing_loss
        )

        self.episode_history.append({
            'inflation': next_pi, 'output_gap': next_y,
            'rate': rate, 'reward': reward
        })

        # Shift the lags: new values become t-1, old t-1 becomes t-2.
        self.state = {
            'pi_t1': next_pi, 'pi_t2': self.state['pi_t1'],
            'y_t1': next_y, 'y_t2': self.state['y_t1'],
            'i_t1': rate, 'i_t2': self.state['i_t1']
        }

        self.step_count += 1
        self.cumulative_reward += reward
        done = self.step_count >= self.max_steps

        info = {'inflation': next_pi, 'output_gap': next_y, 'rate': rate}

        return self._get_state_array(), reward, done, info

    def get_episode_stats(self):
        """Summarise the current episode; empty dict if no step was taken."""
        if not self.episode_history:
            return {}
        hist = self.episode_history
        return {
            'mean_inflation': np.mean([h['inflation'] for h in hist]),
            'mean_output_gap': np.mean([h['output_gap'] for h in hist]),
            'mean_rate': np.mean([h['rate'] for h in hist]),
            'total_reward': self.cumulative_reward
        }


# ============================================================================
# 4. Q-LEARNING AGENT
# ============================================================================

class ImprovedQLearningAgent:
    def __init__(self, state_dim, n_actions, gamma=0.95,
                 epsilon_start=1.0, epsilon_end=0.10, epsilon_decay=0.998,
                 buffer_size=10000, batch_size=128, target_update_freq=20):
        
        self.state_dim = state_dim
        self.n_actions = n_actions
        self.gamma = gamma
        self.batch_size = batch_size
        self.target_update_freq = target_update_freq
        
        self.epsilon = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_decay = epsilon_decay
        
        self.buffer = []
        self.buffer_size = buffer_size
        
        self.scaler = StandardScaler()
        self.scaler_fitted = False
        
        self.q_network = MLPRegressor(
            hidden_layer_sizes=(128, 64, 32), activation='relu',
            solver='adam', alpha=0.001, max_iter=1, warm_start=True,
            random_state=42
        )
        
        self.target_network = None
        self.update_count = 0
        self.initialized = False
        self.min_buffer = 200
        
    def _encode_state_action(self, state, action):
        action_onehot = np.zeros(self.n_actions)
        action_onehot[action] = 1
        return np.concatenate([state, action_onehot])
    
    def _get_all_q_values(self, state, network=None):
        if network is None:
            network = self.q_network
        
        if not self.initialized:
            return np.zeros(self.n_actions)
        
        q_values = []
        for a in range(self.n_actions):
            sa = self._encode_state_action(state, a).reshape(1, -1)
            sa_scaled = self.scaler.transform(sa)
            q_values.append(network.predict(sa_scaled)[0])
        
        return np.array(q_values)
    
    def choose_action(self, state, greedy=False):
        if not greedy and np.random.random() < self.epsilon:
            return np.random.randint(self.n_actions)
        
        q_values = self._get_all_q_values(state)
        return int(np.argmax(q_values))
    
    def store_transition(self, state, action, reward, next_state, done):
        self.buffer.append((state.copy(), action, reward, next_state.copy(), done))
        if len(self.buffer) > self.buffer_size:
            self.buffer.pop(0)
    
    def update(self):
        if len(self.buffer) < self.min_buffer:
            return None
        
        indices = np.random.choice(len(self.buffer), self.batch_size, replace=False)
        batch = [self.buffer[i] for i in indices]
        
        X, y = [], []
        
        for state, action, reward, next_state, done in batch:
            sa = self._encode_state_action(state, action)
            X.append(sa)
            
            if done:
                target = reward
            else:
                if self.target_network is not None:
                    next_q = self._get_all_q_values(next_state, self.target_network)
                else:
                    next_q = self._get_all_q_values(next_state)
                target = reward + self.gamma * np.max(next_q)
            
            y.append(target)
        
        X = np.array(X)
        y = np.array(y)
        
        if not self.scaler_fitted:
            self.scaler.fit(X)
            self.scaler_fitted = True
            X_scaled = self.scaler.transform(X)
            self.q_network.fit(X_scaled, y)
            self.initialized = True
            self._update_target_network()
        else:
            X_scaled = self.scaler.transform(X)
            self.q_network.fit(X_scaled, y)
        
        self.update_count += 1
        if self.update_count % self.target_update_freq == 0:
            self._update_target_network()
        
        y_pred = self.q_network.predict(X_scaled)
        loss = np.mean((y - y_pred) ** 2)
        
        return loss
    
    def _update_target_network(self):
        self.target_network = copy.deepcopy(self.q_network)
    
    def decay_epsilon(self):
        self.epsilon = max(self.epsilon_end, self.epsilon * self.epsilon_decay)


# ============================================================================
# 5. TAYLOR RULE
# ============================================================================

def taylor_rule(inflation, output_gap, r_star=2.0, pi_star=2.0, alpha_pi=1.5, alpha_y=0.5):
    """Return the Taylor-rule policy rate, clipped to the 0-16 range.

    Uses the form ``i = r* + pi* + alpha_pi * (pi - pi*) + alpha_y * y``.
    With the defaults this is the standard Taylor (1993) rule, whose total
    response to inflation is 1.5. The intercept must be ``r* + pi*`` rather
    than ``r* + pi``: adding current inflation on top of ``alpha_pi = 1.5``
    would give a total inflation response of 2.5.

    Args:
        inflation: Current inflation rate in percent.
        output_gap: Current output gap in percent.
        r_star: Neutral real interest rate.
        pi_star: Inflation target.
        alpha_pi: Total response to the inflation gap.
        alpha_y: Response to the output gap.

    Returns:
        The prescribed nominal rate, limited to ``[0, 16]``.
    """
    rate = r_star + pi_star + alpha_pi * (inflation - pi_star) + alpha_y * output_gap
    return np.clip(rate, 0, 16)

def taylor_rule_inertial(inflation, output_gap, prev_rate, r_star=2.0, pi_star=2.0,
                         alpha_pi=1.5, alpha_y=0.5, rho=0.8):
    """Return an interest-rate-smoothed Taylor rate, clipped to 0-16.

    The unclipped Taylor prescription
    ``r* + pi* + alpha_pi * (pi - pi*) + alpha_y * y`` is blended with the
    previous rate: ``rho * prev_rate + (1 - rho) * taylor_rate``.

    Args:
        inflation: Current inflation rate in percent.
        output_gap: Current output gap in percent.
        prev_rate: Policy rate set in the previous period.
        r_star: Neutral real interest rate.
        pi_star: Inflation target.
        alpha_pi: Total response to the inflation gap.
        alpha_y: Response to the output gap.
        rho: Weight on the previous rate (0 = no smoothing).

    Returns:
        The smoothed nominal rate, limited to ``[0, 16]``.
    """
    taylor_rate = r_star + pi_star + alpha_pi * (inflation - pi_star) + alpha_y * output_gap
    rate = rho * prev_rate + (1 - rho) * taylor_rate
    return np.clip(rate, 0, 16)

def evaluate_policy(env, policy_fn, n_episodes=50, policy_name="Policy"):
    """Roll out a policy for several episodes and summarise its returns.

    Args:
        env: Environment exposing ``reset()``, ``step(action)`` and a
            ``cumulative_reward`` attribute.
        policy_fn: Callable ``(state, env) -> action``.
        n_episodes: Number of episodes to run.
        policy_name: Label stored in the result.

    Returns:
        Dict with ``name``, ``mean_reward``, ``std_reward`` and the list of
        per-episode ``rewards``.
    """
    def _play_one_episode():
        observation = env.reset()
        finished = False
        while not finished:
            chosen = policy_fn(observation, env)
            observation, _, finished, _ = env.step(chosen)
        # The environment accumulates the episode return itself.
        return env.cumulative_reward

    returns = [_play_one_episode() for _ in range(n_episodes)]

    summary = {'name': policy_name}
    summary['mean_reward'] = np.mean(returns)
    summary['std_reward'] = np.std(returns)
    summary['rewards'] = returns
    return summary


# ============================================================================
# 6. TRAINING
# ============================================================================

def train_agent(env, agent, n_episodes=1500, print_every=100):
    """Train the agent by interacting with the environment.

    Each episode is played to completion while transitions are stored; the
    agent then performs one update and one epsilon decay. A progress line is
    printed every ``print_every`` episodes.

    Args:
        env: Environment exposing ``reset``, ``step``, ``cumulative_reward``
            and ``get_episode_stats``.
        agent: Agent exposing ``choose_action``, ``store_transition``,
            ``update``, ``decay_epsilon`` and ``epsilon``.
        n_episodes: Number of training episodes.
        print_every: Reporting interval in episodes.

    Returns:
        ``(episode_rewards, episode_losses)``; a falsy loss (including the
        ``None`` returned before learning starts) is recorded as ``0``.
    """
    print("\n" + "="*70)
    print("TRAINING RL AGENT")
    print("="*70)

    reward_log, loss_log = [], []

    for ep_number in range(1, n_episodes + 1):
        observation = env.reset()
        finished = False

        while not finished:
            chosen = agent.choose_action(observation)
            following, step_reward, finished, _ = env.step(chosen)
            agent.store_transition(observation, chosen, step_reward, following, finished)
            observation = following

        # One learning step and one exploration decay per episode.
        latest_loss = agent.update()
        agent.decay_epsilon()

        reward_log.append(env.cumulative_reward)
        loss_log.append(latest_loss or 0)

        if ep_number % print_every != 0:
            continue

        recent_avg = np.mean(reward_log[-print_every:])
        ep_stats = env.get_episode_stats()
        print(f"Ep {ep_number:4d} | Avg Reward: {recent_avg:8.2f} | "
              f"ε: {agent.epsilon:.3f} | "
              f"π: {ep_stats['mean_inflation']:.1f}% | y: {ep_stats['mean_output_gap']:.1f}%")

    return reward_log, loss_log


# ============================================================================
# 7. VISUALIZATION
# ============================================================================

def plot_training(episode_rewards, episode_losses, save_path=None):
    """Plot per-episode rewards and losses with 20-episode moving averages.

    Args:
        episode_rewards: Sequence of episode returns.
        episode_losses: Sequence of episode losses; falsy or non-positive
            entries are left out of the loss panel.
        save_path: If given, the figure is also written to this path.

    Returns:
        The matplotlib Figure.
    """
    window = 20
    kernel = np.ones(window) / window

    def _moving_average(series):
        # 'valid' mode yields one value per full window; x positions start
        # at window-1 so each average sits on its last contributing point.
        averaged = np.convolve(series, kernel, mode='valid')
        positions = range(window - 1, len(series))
        return positions, averaged

    fig, (reward_ax, loss_ax) = plt.subplots(2, 1, figsize=(12, 8))

    # --- Rewards panel ---
    reward_ax.plot(episode_rewards, alpha=0.3, color='steelblue')
    if len(episode_rewards) >= window:
        xs, ys = _moving_average(episode_rewards)
        reward_ax.plot(xs, ys,
                       color='steelblue', linewidth=2, label=f'{window}-ep moving avg')
    reward_ax.set_xlabel('Episode')
    reward_ax.set_ylabel('Episode Reward')
    reward_ax.set_title('Training Rewards')
    reward_ax.legend()
    reward_ax.grid(True, alpha=0.3)

    # --- Loss panel ---
    valid_losses = [l for l in episode_losses if l and l > 0]
    if valid_losses:
        loss_ax.plot(valid_losses, color='coral', alpha=0.5)
        if len(valid_losses) >= window:
            xs, ys = _moving_average(valid_losses)
            loss_ax.plot(xs, ys, color='coral', linewidth=2)
    loss_ax.set_xlabel('Episode')
    loss_ax.set_ylabel('Q-Learning Loss')
    loss_ax.set_title('Training Loss')
    loss_ax.grid(True, alpha=0.3)

    plt.tight_layout()
    if save_path:
        plt.savefig(save_path, dpi=150, bbox_inches='tight')
        print(f"✓ Saved {save_path}")
    return fig


def plot_policy_comparison(env, agent, n_steps=40, seed=42, save_path=None):
    """Plot inflation, output gap and rate paths for three policies.

    The greedy RL agent, the Taylor rule and the inertial Taylor rule are
    each rolled out for up to ``n_steps`` steps after ``env.reset(seed=seed)``.

    Args:
        env: Environment to simulate in.
        agent: Trained agent used greedily.
        n_steps: Maximum number of steps per rollout.
        seed: Seed passed to every ``env.reset`` call.
        save_path: If given, the figure is also written to this path.

    Returns:
        The matplotlib Figure.
    """
    def _closest_action(environment, rate):
        # Snap a continuous rate onto the environment's discrete grid.
        return np.argmin(np.abs(environment.rate_values - rate))

    def rl_policy(state, env):
        return agent.choose_action(state, greedy=True)

    def taylor_policy(state, env):
        pi_now, _, gap_now, *_ = env.get_raw_state()
        return _closest_action(env, taylor_rule(pi_now, gap_now))

    def taylor_inertial_policy(state, env):
        unscaled = env.get_raw_state()
        target = taylor_rule_inertial(unscaled[0], unscaled[2], unscaled[4])
        return _closest_action(env, target)

    policies = [
        (rl_policy, "RL Agent", 'steelblue', '-'),
        (taylor_policy, "Taylor Rule", 'coral', '--'),
        (taylor_inertial_policy, "Inertial Taylor", 'forestgreen', ':')
    ]

    fig, axes = plt.subplots(3, 1, figsize=(14, 10), sharex=True)
    series_names = ('inflation', 'output_gap', 'rate')

    for policy_fn, name, color, ls in policies:
        state = env.reset(seed=seed)
        trace = {key: [] for key in series_names}

        for _ in range(n_steps):
            state, _, done, info = env.step(policy_fn(state, env))
            for key in series_names:
                trace[key].append(info[key])
            if done:
                break

        quarters = range(len(trace['inflation']))
        # One panel per series, in the same order as series_names.
        for panel, key in zip(axes, series_names):
            panel.plot(quarters, trace[key], ls, color=color, linewidth=2, label=name)

    inflation_ax, gap_ax, rate_ax = axes

    inflation_ax.axhline(2.0, color='green', linestyle=':', alpha=0.7, label='Target (2%)')
    gap_ax.axhline(0.0, color='green', linestyle=':', alpha=0.7, label='Target (0%)')

    inflation_ax.set_ylabel('Inflation (%)')
    inflation_ax.set_title('Policy Comparison: Simulated Economy')
    gap_ax.set_ylabel('Output Gap (%)')
    rate_ax.set_xlabel('Quarters')
    rate_ax.set_ylabel('Interest Rate (%)')

    for panel in axes:
        panel.legend(loc='upper right')
        panel.grid(True, alpha=0.3)

    plt.tight_layout()
    if save_path:
        plt.savefig(save_path, dpi=150, bbox_inches='tight')
        print(f"✓ Saved {save_path}")
    return fig


def plot_reward_distribution(results_list, save_path=None):
    """Draw a box plot of per-episode rewards for each evaluated policy.

    When at least two results are supplied, the p-value of an independent
    two-sample t-test between the first two is printed in the top-left
    corner of the axes.

    Args:
        results_list: List of dicts as returned by ``evaluate_policy`` (each
            needs ``name`` and ``rewards``).
        save_path: If given, the figure is also written to this path.

    Returns:
        The matplotlib Figure.
    """
    fig, ax = plt.subplots(figsize=(10, 6))

    data = [r['rewards'] for r in results_list]
    labels = [r['name'] for r in results_list]

    # Tick labels are set separately: boxplot's `labels` keyword is
    # deprecated in newer Matplotlib, set_xticklabels works on all versions.
    bp = ax.boxplot(data, patch_artist=True)
    ax.set_xticklabels(labels)

    colors = ['steelblue', 'coral', 'forestgreen']
    for position, patch in enumerate(bp['boxes']):
        # Cycle through the palette so extra policies are still coloured.
        patch.set_facecolor(colors[position % len(colors)])
        patch.set_alpha(0.6)

    # Report the actual episode count instead of a hard-coded number; omit
    # it when the policies were evaluated on different numbers of episodes.
    episode_counts = {len(rewards) for rewards in data}
    if len(episode_counts) == 1:
        title = f'Policy Performance Distribution ({episode_counts.pop()} episodes)'
    else:
        title = 'Policy Performance Distribution'

    ax.set_ylabel('Episode Reward')
    ax.set_title(title)
    ax.grid(True, alpha=0.3, axis='y')

    if len(results_list) >= 2:
        _, p_val = stats.ttest_ind(results_list[0]['rewards'], results_list[1]['rewards'])
        ax.text(0.02, 0.98, f't-test p-value: {p_val:.4f}',
               transform=ax.transAxes, fontsize=10, verticalalignment='top')

    plt.tight_layout()
    if save_path:
        plt.savefig(save_path, dpi=150, bbox_inches='tight')
        print(f"✓ Saved {save_path}")
    return fig


# ============================================================================
# 8. MAIN
# ============================================================================

def main():
    """Run the full pipeline: data, economy models, baselines, RL, plots.

    Steps, in order: load and featurise the data, train the transition
    models, build the environment and agent, evaluate the two Taylor-rule
    baselines, train the agent, evaluate it, and save three figures under
    ``OUTPUT_PATH``.

    Returns:
        Tuple ``(env, agent, models, episode_rewards, rl_results,
        taylor_results, inertial_results)``.
    """
    def _banner(title):
        print("\n" + "="*70)
        print(title)
        print("="*70)

    def _closest_action(environment, rate):
        # Snap a continuous rate onto the environment's discrete grid.
        return np.argmin(np.abs(environment.rate_values - rate))

    _banner("REINFORCEMENT LEARNING FOR MONETARY POLICY")
    print(f"Output path: {OUTPUT_PATH}")

    raw_panel = load_and_prepare_data(DATA_PATH)
    df = create_features(raw_panel)
    models = train_economy_models(df)

    _banner("CREATING ENVIRONMENT")

    env = EconomyEnv(
        models=models, historical_df=df,
        inflation_target=2.0, output_gap_target=0.0,
        omega_pi=0.5, omega_y=0.5, omega_smooth=0.05,
        max_steps=40, n_actions=17, normalize_states=True
    )

    print(f"✓ Environment created (Actions: {env.n_actions})")

    agent = ImprovedQLearningAgent(
        state_dim=env.state_dim, n_actions=env.n_actions,
        gamma=0.95, epsilon_start=1.0, epsilon_end=0.10,
        epsilon_decay=0.998, buffer_size=10000, batch_size=128,
        target_update_freq=20
    )

    def rl_policy(state, env):
        return agent.choose_action(state, greedy=True)

    def taylor_policy(state, env):
        unscaled = env.get_raw_state()
        return _closest_action(env, taylor_rule(unscaled[0], unscaled[2]))

    def taylor_inertial_policy(state, env):
        unscaled = env.get_raw_state()
        target = taylor_rule_inertial(unscaled[0], unscaled[2], unscaled[4])
        return _closest_action(env, target)

    # Baselines are evaluated before training begins.
    _banner("BASELINE EVALUATION")

    taylor_results = evaluate_policy(env, taylor_policy, n_episodes=50, policy_name="Taylor Rule")
    inertial_results = evaluate_policy(env, taylor_inertial_policy, n_episodes=50, policy_name="Inertial Taylor")

    print(f"Taylor Rule:     {taylor_results['mean_reward']:.2f} ± {taylor_results['std_reward']:.2f}")
    print(f"Inertial Taylor: {inertial_results['mean_reward']:.2f} ± {inertial_results['std_reward']:.2f}")

    episode_rewards, episode_losses = train_agent(env, agent, n_episodes=1500, print_every=100)

    _banner("FINAL EVALUATION")

    rl_results = evaluate_policy(env, rl_policy, n_episodes=50, policy_name="RL Agent")

    print(f"\nResults (50 evaluation episodes):")
    print(f"  RL Agent:        {rl_results['mean_reward']:8.2f} ± {rl_results['std_reward']:.2f}")
    print(f"  Taylor Rule:     {taylor_results['mean_reward']:8.2f} ± {taylor_results['std_reward']:.2f}")
    print(f"  Inertial Taylor: {inertial_results['mean_reward']:8.2f} ± {inertial_results['std_reward']:.2f}")

    t_stat, p_val = stats.ttest_ind(rl_results['rewards'], taylor_results['rewards'])
    print(f"\n  RL vs Taylor t-test: t={t_stat:.2f}, p={p_val:.4f}")

    rl_mean = rl_results['mean_reward']
    taylor_mean = taylor_results['mean_reward']
    if rl_mean > taylor_mean:
        # Relative gain measured against the magnitude of the Taylor return.
        imp = ((rl_mean - taylor_mean) /
               abs(taylor_mean)) * 100
        print(f"\n✓ RL Agent outperforms Taylor Rule by {imp:.1f}%")

    _banner("SAVING VISUALIZATIONS")

    plot_training(episode_rewards, episode_losses,
                  save_path=f'{OUTPUT_PATH}/training_curves.png')

    plot_policy_comparison(env, agent, n_steps=40,
                           save_path=f'{OUTPUT_PATH}/policy_comparison.png')

    plot_reward_distribution([rl_results, taylor_results, inertial_results],
                             save_path=f'{OUTPUT_PATH}/reward_distribution.png')

    plt.close('all')

    _banner("COMPLETE")

    return env, agent, models, episode_rewards, rl_results, taylor_results, inertial_results


# Run the pipeline only when executed as a script (or notebook cell), not on
# import; `results` keeps main()'s return tuple available for inspection.
if __name__ == "__main__":
    results = main()


REINFORCEMENT LEARNING FOR MONETARY POLICY
Output path: /Users/leoss/Desktop/Portfolio/Website-/Central bank/Outputs
LOADING DATA
✓ CPI: 948 monthly obs
✓ Unemployment: 936 monthly obs
✓ Fed Funds: 858 monthly obs
✓ Real GDP: 315 quarterly obs

✓ Merged quarterly data: 285 observations
  Period: 1954-07 to 2025-07

CREATING FEATURES
✓ Created 2 lags + change features
✓ Final dataset: 283 observations

TRAINING ECONOMY MODELS
Train: 226 obs | Test: 57 obs

✓ Output Gap Model (best arch: (64, 32)):
  Test MSE: 1.8622 | R²: 0.7423

✓ Inflation Model (best arch: (32, 16)):
  Test MSE: 0.9391 | R²: 0.7620

CREATING ENVIRONMENT
✓ Environment created (Actions: 17)

BASELINE EVALUATION
Taylor Rule:     -194.60 ± 167.04
Inertial Taylor: -109.72 ± 115.93

TRAINING RL AGENT
Ep  100 | Avg Reward:  -282.02 | ε: 0.819 | π: 3.8% | y: -1.6%
Ep  200 | Avg Reward:  -250.62 | ε: 0.670 | π: 3.6% | y: -1.3%
Ep  300 | Avg Reward:  -227.09 | ε: 0.548 | π: 3.4% | y: -0.8%
Ep  400 | Avg Reward:  -179.41 | ε: 