I asked Claude to build me a gold price predictor with backtesting, and it failed miserably. Either I suck at prompting or the idea just wouldn't work. Price predictors require a great deal of mathematical modeling. I might consider learning quantitative trading.


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# For demonstration purposes, I'll generate synthetic data
# In practice, replace this with your actual data loading function
def generate_synthetic_gold_data(years=20, seed=42, end=None):
    """
    Generate synthetic hourly gold price data for demonstration
    Replace this function with actual data loading from your preferred source

    The series is the sum of a base price, a linear upward trend over the
    whole span, a yearly sine wave, a 24-hour sine wave, Gaussian noise and
    up to 50 randomly placed shock values.

    Parameters:
        years: length of the history, counted as 365 * years days back
            from `end` (default 20)
        seed: seed of the private random generator, so repeated calls with
            the same span give the same prices (default 42)
        end: datetime of the last sample; defaults to datetime.now()

    Returns:
        DataFrame with columns 'datetime' (hourly timestamps) and 'price'
    """
    # A private generator reproduces the stream of np.random.seed(seed)
    # without reseeding NumPy's global random state for the rest of the program
    rng = np.random.RandomState(seed)

    end_date = datetime.now() if end is None else end
    start_date = end_date - timedelta(days=365 * years)

    # A timedelta frequency is used instead of the 'H' alias, which newer
    # pandas releases deprecate, so this runs on old and new pandas alike
    date_range = pd.date_range(start=start_date, end=end_date, freq=timedelta(hours=1))
    n_points = len(date_range)
    hour_number = np.arange(n_points)

    # Base gold price with trend and seasonal patterns
    base_price = 1200
    trend = np.linspace(0, 800, n_points)  # upward trend across the span

    # Add various patterns
    seasonal = 50 * np.sin(2 * np.pi * hour_number / (365.25 * 24))
    daily_pattern = 20 * np.sin(2 * np.pi * hour_number / 24)
    noise = rng.normal(0, 30, n_points)

    # Economic shock events (random spikes); capped so very short spans
    # do not ask for more distinct positions than there are samples
    n_shocks = min(50, n_points)
    shocks = np.zeros(n_points)
    shock_indices = rng.choice(n_points, size=n_shocks, replace=False)
    shocks[shock_indices] = rng.normal(0, 100, n_shocks)

    prices = base_price + trend + seasonal + daily_pattern + noise + shocks

    return pd.DataFrame({
        'datetime': date_range,
        'price': prices
    })

def load_and_prepare_data():
    """
    Build the daily 6AM gold price table
    Modify this function to load your actual data

    Takes the hourly frame from generate_synthetic_gold_data(), keeps the
    rows whose timestamp falls in hour 6 and indexes them by calendar date.

    Returns:
        DataFrame with a single 'price' column and an index named 'date'
    """
    print("Loading gold price data...")

    # Load data (replace with actual data source)
    hourly = generate_synthetic_gold_data()

    # Keep only the 6AM observation of each day
    is_six_am = hourly['datetime'].dt.hour == 6
    six_am = hourly.loc[is_six_am, ['datetime', 'price']].copy()

    # Only the 6AM rows remain, so the date alone identifies each row
    six_am['date'] = six_am['datetime'].dt.date
    daily = six_am.set_index('date').drop(columns=['datetime'])

    first_day = daily.index.min()
    last_day = daily.index.max()
    print(f"Data loaded: {len(daily)} daily 6AM observations")
    print(f"Date range: {first_day} to {last_day}")

    return daily

def create_features(df):
    """
    Derive the model's input columns from the 'price' column

    Works on a copy of `df`, so the caller's frame is left untouched.
    Added columns, in order: lagged prices, moving averages with the
    price's relative distance to each, rolling volatility and range,
    percentage changes, an RSI approximation, Bollinger Bands, calendar
    fields read from the index, and sine/cosine encodings of weekday
    and month.

    Parameters:
        df: DataFrame with a 'price' column and an index that
            pd.to_datetime can convert

    Returns:
        A new DataFrame holding the original columns plus the features
    """
    print("Creating features...")

    data = df.copy()
    price = data['price']

    # Lagged prices
    for shift_by in (1, 2, 3, 5, 7, 14, 21, 30):
        data[f'price_lag_{shift_by}'] = price.shift(shift_by)

    # Moving averages and the price's relative distance from each
    for span in (3, 7, 14, 21, 30, 60, 90):
        average = price.rolling(window=span).mean()
        data[f'ma_{span}'] = average
        data[f'price_vs_ma_{span}'] = price / average - 1

    # Rolling volatility and high-low range
    for span in (7, 14, 21, 30):
        windowed = price.rolling(window=span)
        data[f'volatility_{span}'] = windowed.std()
        data[f'price_range_{span}'] = windowed.max() - windowed.min()

    # Percentage price changes
    for periods in (1, 2, 3, 7):
        data[f'price_change_{periods}'] = price.pct_change(periods)

    # RSI approximation: simple 14-row means of the up moves and down moves
    step = price.diff()
    average_gain = step.where(step > 0, 0).rolling(window=14).mean()
    average_loss = (-step.where(step < 0, 0)).rolling(window=14).mean()
    relative_strength = average_gain / average_loss
    data['rsi'] = 100 - (100 / (1 + relative_strength))

    # Bollinger Bands: 20-row mean plus/minus two standard deviations
    band_window = price.rolling(window=20)
    data['bb_middle'] = band_window.mean()
    data['bb_std'] = band_window.std()
    data['bb_upper'] = data['bb_middle'] + (data['bb_std'] * 2)
    data['bb_lower'] = data['bb_middle'] - (data['bb_std'] * 2)
    data['bb_position'] = (price - data['bb_lower']) / (data['bb_upper'] - data['bb_lower'])

    # Calendar fields, all read from one conversion of the index
    stamps = pd.to_datetime(data.index)
    calendar_fields = {
        'day_of_week': stamps.dayofweek,
        'day_of_month': stamps.day,
        'month': stamps.month,
        'quarter': stamps.quarter,
        'year': stamps.year,
    }
    for column, values in calendar_fields.items():
        data[column] = values

    # Cyclical encoding for weekday (period 7) and month (period 12)
    for column, period in (('day_of_week', 7), ('month', 12)):
        angle = 2 * np.pi * data[column] / period
        data[f'{column}_sin'] = np.sin(angle)
        data[f'{column}_cos'] = np.cos(angle)

    print(f"Created {len(data.columns)} features")
    return data

def prepare_target_variable(data):
    """
    Prepare target variable (next day's 6AM price)

    Adds a 'target' column holding the 'price' of the following row and
    drops the last row, which has no following price.

    Parameters:
        data: DataFrame with a 'price' column, ordered oldest to newest

    Returns:
        A new DataFrame; the frame passed in is not modified
    """
    # assign() builds a new frame, so the caller's data does not silently
    # gain a 'target' column as it did with in-place column assignment
    with_target = data.assign(target=data['price'].shift(-1))

    # Remove last row as it won't have a target; copy so later column
    # writes on the result never act on a slice of another frame
    return with_target.iloc[:-1].copy()

def train_and_evaluate_models(X_train, X_test, y_train, y_test):
    """
    Fit three regressors and score each on the train and test sets

    The candidates are a linear regression, a random forest and a
    gradient boosting model. Each is fitted on the training data, then
    RMSE, MAE and R² are computed for both sets and a short report is
    printed.

    Returns:
        (results, predictions): `results` maps model name to a dict of
        metrics plus the fitted estimator under 'model'; `predictions`
        maps model name to that model's test-set predictions
    """
    print("\nTraining models...")

    def score(actual, predicted):
        # RMSE, MAE and R² for one set of predictions
        rmse = np.sqrt(mean_squared_error(actual, predicted))
        mae = mean_absolute_error(actual, predicted)
        return rmse, mae, r2_score(actual, predicted)

    candidates = {
        'Linear Regression': LinearRegression(),
        'Random Forest': RandomForestRegressor(
            n_estimators=100, max_depth=10, random_state=42, n_jobs=-1
        ),
        'Gradient Boosting': GradientBoostingRegressor(
            n_estimators=100, max_depth=6, random_state=42
        ),
    }

    results, predictions = {}, {}

    for name, estimator in candidates.items():
        print(f"\nTraining {name}...")

        estimator.fit(X_train, y_train)

        fitted_train = estimator.predict(X_train)
        fitted_test = estimator.predict(X_test)

        train_rmse, train_mae, train_r2 = score(y_train, fitted_train)
        test_rmse, test_mae, test_r2 = score(y_test, fitted_test)

        results[name] = dict(
            train_rmse=train_rmse,
            test_rmse=test_rmse,
            train_mae=train_mae,
            test_mae=test_mae,
            train_r2=train_r2,
            test_r2=test_r2,
            model=estimator,
        )
        predictions[name] = fitted_test

        print(f"Train RMSE: {train_rmse:.2f}")
        print(f"Test RMSE: {test_rmse:.2f}")
        print(f"Test R²: {test_r2:.4f}")
        print(f"Test MAE: {test_mae:.2f}")

    return results, predictions

def plot_results(y_test, predictions, test_dates):
    """
    Draw a 2x2 diagnostic figure for the test-set predictions

    Panel 1 overlays actual and predicted prices for the last 100 test
    points. Panels 2-4 cover the model with the lowest test RMSE:
    actual vs predicted scatter, residuals vs predicted, and a residual
    histogram.

    Parameters:
        y_test: actual target values of the test set
        predictions: dict mapping model name to its test predictions
        test_dates: x-axis values for the test set
    """
    # Lowest-RMSE model; ties go to the first model in dict order
    rmse_by_model = {
        label: np.sqrt(mean_squared_error(y_test, forecast))
        for label, forecast in predictions.items()
    }
    best_model = min(rmse_by_model, key=rmse_by_model.get)
    best_forecast = predictions[best_model]
    residuals = y_test - best_forecast

    plt.figure(figsize=(15, 10))

    # Panel 1: time series comparison over the most recent 100 points
    recent = slice(-100, None)
    plt.subplot(2, 2, 1)
    plt.plot(test_dates[recent], y_test[recent], label='Actual', linewidth=2)
    for label, forecast in predictions.items():
        plt.plot(test_dates[recent], forecast[recent], label=f'{label}', alpha=0.7)
    plt.title('Last 100 Days: Actual vs Predicted Gold Prices (6AM)')
    plt.xlabel('Date')
    plt.ylabel('Price ($)')
    plt.legend()
    plt.xticks(rotation=45)
    plt.grid(True, alpha=0.3)

    # Panel 2: actual vs predicted, with the y = x reference line
    price_span = [y_test.min(), y_test.max()]
    plt.subplot(2, 2, 2)
    plt.scatter(y_test, best_forecast, alpha=0.5)
    plt.plot(price_span, price_span, 'r--', lw=2)
    plt.xlabel('Actual Price ($)')
    plt.ylabel('Predicted Price ($)')
    plt.title(f'Actual vs Predicted ({best_model})')
    plt.grid(True, alpha=0.3)

    # Panel 3: residuals against the predicted value
    plt.subplot(2, 2, 3)
    plt.scatter(best_forecast, residuals, alpha=0.5)
    plt.axhline(y=0, color='r', linestyle='--')
    plt.xlabel('Predicted Price ($)')
    plt.ylabel('Residuals ($)')
    plt.title(f'Residual Plot ({best_model})')
    plt.grid(True, alpha=0.3)

    # Panel 4: residual histogram
    plt.subplot(2, 2, 4)
    plt.hist(residuals, bins=30, edgecolor='black', alpha=0.7)
    plt.xlabel('Residuals ($)')
    plt.ylabel('Frequency')
    plt.title(f'Residual Distribution ({best_model})')
    plt.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

def backtest_trading_strategy(data, best_model, scaler, feature_cols, initial_capital=1000,
                            start_date='2025-01-01', end_date='2025-01-31'):
    """
    Backtest a long-only trading strategy driven by the model's predictions

    For every day in the window except the last, the model's prediction is
    compared with that day's price. A predicted move larger than 0.5% up
    buys with all available capital (when not already long); a predicted
    move larger than 0.5% down sells the whole holding (when long). Every
    trade pays a fee of 0.1% of its volume. Whatever is still held is sold
    at the last day's price, and the result is compared with a fee-free
    buy-and-hold over the same window. A log and a summary are printed.

    Parameters:
        data: DataFrame indexed by date, containing 'price' and every
            column named in `feature_cols`
        best_model: fitted estimator; only its predict() is used
        scaler: fitted scaler; only its transform() is used
        feature_cols: names of the feature columns fed to the scaler
        initial_capital: starting cash
        start_date, end_date: inclusive bounds of the backtest window,
            in any form pd.to_datetime accepts. The printed banner says
            "JANUARY 2025" regardless of these values.

    Returns:
        dict with 'initial_capital', 'final_capital', 'total_return',
        'buy_hold_return', 'excess_return', 'trades' and
        'portfolio_values', or None when fewer than two usable rows
        are available.

    If no row falls inside the window, the last 31 rows of `data` are
    used instead.
    """
    print("\n" + "="*80)
    print("BACKTESTING TRADING STRATEGY - JANUARY 2025")
    print("="*80)

    # Parse the window bounds once and compare them against the index
    window_start = pd.to_datetime(start_date)
    window_end = pd.to_datetime(end_date)
    index_stamps = pd.to_datetime(data.index)

    # Filter data for January 2025
    january_mask = (index_stamps >= window_start) & (index_stamps <= window_end)
    january_data = data[january_mask].copy()

    if len(january_data) == 0:
        print("No data available for January 2025 in the dataset.")
        print("This is expected since we're using synthetic data.")
        print("Simulating January 2025 trading with the last available data patterns...\n")

        # Use the last 31 days of available data as proxy for January 2025
        january_data = data.tail(31).copy()
        print(f"Using last {len(january_data)} days of data as January 2025 proxy")

    if len(january_data) < 2:
        print("Insufficient data for backtesting.")
        return

    print(f"Backtesting period: {january_data.index[0]} to {january_data.index[-1]}")
    print(f"Number of trading days: {len(january_data)}")

    # Initialize trading variables
    capital = initial_capital
    shares = 0
    position = 0  # 0 = no position, 1 = long, -1 = short
    trades = []
    portfolio_values = []

    # Trading parameters
    transaction_fee_rate = 0.001  # 0.1% of trading volume
    min_prediction_confidence = 0.005  # Minimum 0.5% predicted price change to trade

    print("\nTrading Parameters:")
    print(f"Initial Capital: ${capital:.2f}")
    print(f"Transaction Fee: {transaction_fee_rate*100:.1f}% of trading volume")
    print(f"Minimum Prediction Confidence: {min_prediction_confidence*100:.1f}%")

    # Prepare features for prediction; rows with missing features are skipped
    X_jan = january_data[feature_cols].dropna()

    # The loop needs a current day and a later closing day, so apply the same
    # two-row minimum to the rows that survived dropna() (an empty frame
    # would otherwise fail at X_jan.index[-1] below)
    n_days = len(X_jan)
    if n_days < 2:
        print("Insufficient data for backtesting.")
        return

    X_jan_scaled = scaler.transform(X_jan)
    predictions = best_model.predict(X_jan_scaled)

    print("\nDaily Trading Log:")
    print("-" * 80)
    print(f"{'Date':<12} {'Current':<8} {'Predicted':<10} {'Signal':<8} {'Shares':<8} {'Capital':<10} {'Total Value':<12}")
    print("-" * 80)

    for i in range(n_days - 1):  # the last day is only used to close out
        current_date = X_jan.index[i]
        current_price = january_data.loc[current_date, 'price']
        predicted_price = predictions[i]

        # Calculate expected return
        expected_return = (predicted_price - current_price) / current_price

        # Trading decision logic
        signal = "HOLD"

        # Only trade if prediction confidence is above threshold
        if abs(expected_return) > min_prediction_confidence:
            if expected_return > 0 and position <= 0:  # Buy signal
                # Close short position if any
                # NOTE(review): nothing in this function ever sets position
                # to -1, so the short branches are currently unreachable and
                # their accounting is unverified.
                if position == -1:
                    trading_volume = abs(shares * current_price)
                    trading_fee = trading_volume * transaction_fee_rate
                    capital += shares * current_price - trading_fee
                    trades.append({
                        'date': current_date,
                        'action': 'cover_short',
                        'shares': -shares,
                        'price': current_price,
                        'volume': trading_volume,
                        'fee': trading_fee,
                        'capital': capital
                    })
                    shares = 0

                # Open long position with all capital, fee included:
                # capital = x * price * (1 + fee_rate)
                # x = capital / (price * (1 + fee_rate))
                shares_to_buy = capital / (current_price * (1 + transaction_fee_rate))
                if shares_to_buy > 0:
                    trading_volume = shares_to_buy * current_price
                    trading_fee = trading_volume * transaction_fee_rate
                    shares = shares_to_buy
                    capital = capital - trading_volume - trading_fee
                    position = 1
                    signal = "BUY"
                    trades.append({
                        'date': current_date,
                        'action': 'buy',
                        'shares': shares,
                        'price': current_price,
                        'volume': trading_volume,
                        'fee': trading_fee,
                        'capital': capital
                    })

            elif expected_return < 0 and position >= 0:  # Sell signal
                # Close long position if any
                if position == 1:
                    trading_volume = shares * current_price
                    trading_fee = trading_volume * transaction_fee_rate
                    capital = trading_volume - trading_fee
                    trades.append({
                        'date': current_date,
                        'action': 'sell',
                        'shares': shares,
                        'price': current_price,
                        'volume': trading_volume,
                        'fee': trading_fee,
                        'capital': capital
                    })
                    shares = 0
                    # Only label the day "SELL" when something was actually
                    # sold; a bearish prediction while in cash stays "HOLD"
                    signal = "SELL"

                # Short selling is intentionally not opened here; the
                # strategy simply stays in cash.
                position = 0

        # Calculate total portfolio value
        if position == 1:  # Long position
            total_value = shares * current_price
        elif position == -1:  # Short position (unreachable, see note above)
            total_value = capital - shares * current_price
        else:  # Cash position
            total_value = capital

        portfolio_values.append({
            'date': current_date,
            'total_value': total_value,
            'capital': capital,
            'shares': shares,
            'position': position,
            'current_price': current_price
        })

        # Print daily log (show only first 10 and last 5 days to save space)
        if i < 10 or i >= n_days - 6:
            print(f"{str(current_date):<12} ${current_price:<7.2f} ${predicted_price:<9.2f} "
                  f"{signal:<8} {shares:<7.1f} ${capital:<9.2f} ${total_value:<11.2f}")
        elif i == 10:
            print("...")

    # Close any remaining positions at the end
    final_date = X_jan.index[-1]
    final_price = january_data.loc[final_date, 'price']

    if position == 1:  # Close long position
        trading_volume = shares * final_price
        trading_fee = trading_volume * transaction_fee_rate
        capital = trading_volume - trading_fee
        trades.append({
            'date': final_date,
            'action': 'final_sell',
            'shares': shares,
            'price': final_price,
            'volume': trading_volume,
            'fee': trading_fee,
            'capital': capital
        })
        shares = 0
    elif position == -1:  # Close short position (unreachable, see note above)
        trading_volume = abs(shares * final_price)
        trading_fee = trading_volume * transaction_fee_rate
        capital += shares * final_price - trading_fee
        trades.append({
            'date': final_date,
            'action': 'final_cover',
            'shares': -shares,
            'price': final_price,
            'volume': trading_volume,
            'fee': trading_fee,
            'capital': capital
        })
        shares = 0

    final_capital = capital

    # Calculate performance metrics
    total_return = (final_capital - initial_capital) / initial_capital

    # Calculate buy-and-hold benchmark (no fees applied)
    first_price = january_data.iloc[0]['price']
    last_price = january_data.iloc[-1]['price']
    buy_hold_return = (last_price - first_price) / first_price
    buy_hold_final = initial_capital * (1 + buy_hold_return)

    # Print results
    print("-" * 80)
    print("\nTRADING RESULTS SUMMARY:")
    print("="*50)
    print(f"Initial Capital: ${initial_capital:.2f}")
    print(f"Final Capital: ${final_capital:.2f}")
    print(f"Total Return: {total_return*100:.2f}%")
    print(f"Total Profit/Loss: ${final_capital - initial_capital:.2f}")

    print("\nBUY-AND-HOLD BENCHMARK:")
    print(f"Gold price start of period: ${first_price:.2f}")
    print(f"Gold price end of period: ${last_price:.2f}")
    print(f"Buy-and-hold return: {buy_hold_return*100:.2f}%")
    print(f"Buy-and-hold final value: ${buy_hold_final:.2f}")

    print("\nSTRATEGY vs BENCHMARK:")
    excess_return = total_return - buy_hold_return
    print(f"Excess return: {excess_return*100:.2f}%")
    print(f"Outperformance: ${final_capital - buy_hold_final:.2f}")

    print("\nTRADING STATISTICS:")
    print(f"Total number of trades: {len(trades)}")
    if trades:
        total_volume = sum(t.get('volume', 0) for t in trades)
        total_fees = sum(t.get('fee', 0) for t in trades)
        print(f"Total trading volume: ${total_volume:.2f}")
        print(f"Total trading fees: ${total_fees:.2f}")
        print(f"Trading fees as % of initial capital: {(total_fees/initial_capital)*100:.2f}%")
        print(f"Average trade size: ${total_volume/len(trades):.2f}")
        print(f"Average fee per trade: ${total_fees/len(trades):.2f}")

    return {
        'initial_capital': initial_capital,
        'final_capital': final_capital,
        'total_return': total_return,
        'buy_hold_return': buy_hold_return,
        'excess_return': excess_return,
        'trades': trades,
        'portfolio_values': portfolio_values
    }

def main():
    """
    Main function to run the gold price prediction pipeline

    Steps: load the daily 6AM prices, build features and the next-day
    target, drop incomplete rows, split chronologically (first ~80% for
    training, the rest for testing), scale the features with statistics
    from the training set, train and compare the models, report the one
    with the lowest test RMSE, backtest it, and plot the test predictions.

    Returns:
        (results, data, scaler, backtest_results): the per-model metrics
        dict, the final feature/target frame, the fitted scaler, and the
        backtest summary (None when the backtest had too little data)
    """
    print("=== Gold Price Prediction Model (6AM-7AM Daily) ===\n")

    # Load and prepare data
    df = load_and_prepare_data()

    # Create features
    data = create_features(df)

    # Prepare target variable
    data = prepare_target_variable(data)

    # Remove rows with NaN values (due to lagged features)
    data = data.dropna()

    print(f"\nFinal dataset shape: {data.shape}")
    print(f"Training period: {data.index.min()} to {data.index.max()}")

    # Prepare features and target; the raw price and the target itself
    # are excluded from the inputs
    feature_cols = [col for col in data.columns if col not in ['price', 'target']]
    X = data[feature_cols]
    y = data['target']

    print(f"Number of features: {len(feature_cols)}")

    # Split data (80% train, 20% test)
    # Use time-based split to avoid data leakage
    split_date = data.index[int(len(data) * 0.8)]

    train_mask = data.index <= split_date
    test_mask = data.index > split_date

    X_train, X_test = X[train_mask], X[test_mask]
    y_train, y_test = y[train_mask], y[test_mask]

    print(f"\nTrain set: {len(X_train)} samples")
    print(f"Test set: {len(X_test)} samples")
    print(f"Train period: {X_train.index.min()} to {X_train.index.max()}")
    print(f"Test period: {X_test.index.min()} to {X_test.index.max()}")

    # Scale features; the scaler is fitted on the training rows only
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train and evaluate models
    results, predictions = train_and_evaluate_models(
        X_train_scaled, X_test_scaled, y_train, y_test
    )

    # Print overall results
    print("\n" + "="*80)
    print("OVERALL TRAINING RESULTS")
    print("="*80)

    print("\nModel Performance Summary:")
    print("-" * 60)
    print(f"{'Model':<20} {'Test RMSE':<12} {'Test MAE':<12} {'Test R²':<10}")
    print("-" * 60)

    for model_name, metrics in results.items():
        print(f"{model_name:<20} {metrics['test_rmse']:<12.2f} "
              f"{metrics['test_mae']:<12.2f} {metrics['test_r2']:<10.4f}")

    # Find best model (lowest RMSE on the test set)
    best_model_name = min(results, key=lambda name: results[name]['test_rmse'])
    best_results = results[best_model_name]
    best_model = best_results['model']

    print("\n" + "="*50)
    print(f"BEST MODEL: {best_model_name}")
    print("="*50)
    print(f"Test RMSE: ${best_results['test_rmse']:.2f}")
    print(f"Test MAE: ${best_results['test_mae']:.2f}")
    print(f"Test R²: {best_results['test_r2']:.4f}")

    # Calculate percentage errors
    mean_price = y_test.mean()
    rmse_pct = (best_results['test_rmse'] / mean_price) * 100
    mae_pct = (best_results['test_mae'] / mean_price) * 100

    print(f"\nAverage gold price in test set: ${mean_price:.2f}")
    print(f"RMSE as % of average price: {rmse_pct:.2f}%")
    print(f"MAE as % of average price: {mae_pct:.2f}%")

    # Feature importance, reported only for models that expose it
    if hasattr(best_model, 'feature_importances_'):
        feature_importance = pd.DataFrame({
            'feature': feature_cols,
            'importance': best_model.feature_importances_
        }).sort_values('importance', ascending=False)

        print(f"\nTop 10 Most Important Features ({best_model_name}):")
        print("-" * 40)
        for _, row in feature_importance.head(10).iterrows():
            print(f"{row['feature']:<25} {row['importance']:.4f}")

    # Run backtesting simulation for January 2025
    backtest_results = backtest_trading_strategy(
        data, best_model, scaler, feature_cols,
        initial_capital=1000,
        start_date='2025-01-01',
        end_date='2025-01-31'
    )

    # Plot results; plotting is best-effort so a display problem does not
    # discard the results computed above
    try:
        plot_results(y_test.values, predictions, X_test.index)
    except Exception as e:
        print(f"\nNote: Could not display plots: {e}")
        print("This is normal in some environments.")

    print(f"\n{'='*80}")
    print("ANALYSIS COMPLETE")
    print("="*80)

    return results, data, scaler, backtest_results

# Run the analysis only when this file is executed directly (not on import).
# main()'s four return values are bound to module-level names.
if __name__ == "__main__":
    results, data, scaler, backtest_results = main()

ModuleNotFoundError: No module named 'pandas'