In [5]:
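# v2: Changed to 15 minute bars
# NOTE(review): the get_hist call in this cell still requests Interval.in_1_hour —
# confirm whether 15-minute or 1-hour bars are intended.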

# ================================
# 1. Data Retrieval and Preparation
# ================================

import os
from dotenv import load_dotenv
from tvDatafeed import TvDatafeedLive, Interval
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.signal import savgol_filter
from sklearn.metrics import mean_squared_error
import pmdarima as pm

# Load environment variables from .env file
load_dotenv()

# Retrieve TradingView credentials from environment variables.
# os.environ.get yields None for an unset variable; both values are handed to
# TvDatafeedLive unchanged.
username = os.environ.get("TRADINGVIEW_USERNAME")
password = os.environ.get("TRADINGVIEW_PASSWORD")

# Initialize TradingView live data feed
tv = TvDatafeedLive(username, password)

# Fetch historical data for XAUUSD (up to 5000 one-hour bars)
prices_raw = tv.get_hist(symbol='XAUUSD', exchange='ICMARKETS',
                         interval=Interval.in_1_hour, n_bars=5000)

# Fail early with a readable message instead of an opaque subscript error below.
# NOTE(review): get_hist is understood to hand back a non-DataFrame value
# (e.g. None) when no data is returned — confirm against the tvDatafeed version in use.
if not isinstance(prices_raw, pd.DataFrame) or prices_raw.empty:
    raise RuntimeError(
        "No price data returned for XAUUSD/ICMARKETS; check credentials, symbol and exchange."
    )

# Build the cleaned frame without mutating the raw download, so re-running this
# cell always starts from the same input:
#   1. coerce 'close' to numeric (unparseable values become NaN),
#   2. drop rows whose 'close' is NaN,
#   3. replace the index with a plain 0..n-1 range (the original index is discarded),
#      which the positional slicing further down relies on.
prices = (
    prices_raw
    .assign(close=pd.to_numeric(prices_raw['close'], errors='coerce'))
    .dropna(subset=['close'])
    .reset_index(drop=True)
)

# Verify total data points after cleaning
total_data_points = prices.shape[0]
print(f"Total data points after cleaning: {total_data_points}")

# ================================
# 2. Hyperparameter Optimization
# ================================

# Error metric used to score forecasts: Root Mean Squared Error (RMSE)
def calculate_rmse(actual, predicted):
    """Return the root mean squared error between `actual` and `predicted`.

    Both arguments are forwarded, in that order, to `mean_squared_error`;
    the square root of its result is returned.
    """
    mse = mean_squared_error(actual, predicted)
    return np.sqrt(mse)

# Define optimization parameters.
# All sizes below are counted in bars (rows of `prices`).
# NOTE(review): the data fetch uses Interval.in_1_hour, so 40 bars correspond to
# 40 hours; the earlier "20 or 40 = 5 or 10 hours" remark only holds for
# 15-minute bars — confirm the intended interval.
fixed_train_size = 4000       # Number of bars in every training window
fixed_seasonal = False        # Passed to auto_arima as `seasonal`
forecast_elements = 40        # Number of bars to forecast (and to score) per run
shift_step = 12               # Each successive run moves the train/test window 12 bars further back
num_runs = 40                 # Number of rolling window iterations

# Define ranges for Savitzky-Golay filter parameters
window_lengths = list(range(9, 40, 2))   # Odd lengths [9, 11, 13, ..., 39]
polyorders = [1]                         # Only polyorder 1 is searched; extend the list (e.g. [0, 1, 2]) to widen the grid

# Initialize a list to store results (one dict per window_length/polyorder combination)
results = []

# Grid search over Savitzky-Golay parameters.
#
# For every (window_length, polyorder) combination, `num_runs` rolling-window
# evaluations are performed. Run k uses a training window of `fixed_train_size`
# bars followed by a test window of `forecast_elements` bars, both moved
# k * shift_step bars back from the end of `prices`. The smoothed training series
# is fitted with auto_arima, forecast over the test horizon, and scored by RMSE
# against the smoothed train+test series. Every run records exactly one entry in
# rmse_list (NaN when the run could not be scored), and the NaN-ignoring mean is
# stored in `results`.
#
# NOTE(review): the target (trend_test) is smoothed with the same window_length
# that is being scored, so the reference series differs between combinations;
# RMSE values may therefore not be directly comparable across window lengths —
# confirm this is intended.
for window_length in window_lengths:
    for polyorder in polyorders:
        # Ensure that window_length is greater than polyorder
        if window_length <= polyorder:
            print(f"\nSkipping window_length={window_length}, polyorder={polyorder} as window_length <= polyorder.")
            continue

        # One RMSE (or NaN) per run for this parameter combination
        rmse_list = []

        print(f"\nOptimizing for window_length={window_length}, polyorder={polyorder}...")

        for run in range(num_runs):
            # Offset of this run's windows from the end of the data
            shift = run * shift_step

            # Positional (non-negative) slice bounds: [start_idx, end_train_idx) is
            # the training window, [test_start_idx, test_end_idx) the test window.
            start_idx = total_data_points - (fixed_train_size + forecast_elements + shift)
            end_train_idx = total_data_points - (forecast_elements + shift)
            test_start_idx = end_train_idx
            test_end_idx = end_train_idx + forecast_elements

            # Ensure indices are within bounds
            if start_idx < 0 or test_end_idx > total_data_points:
                print(f"  Skipping run {run} due to insufficient data (shift={shift}).")
                # Record the skip like every other unscored run so that
                # len(rmse_list) always equals the number of runs attempted.
                rmse_list.append(np.nan)
                continue

            # Slice the data for training and testing
            train_prices = prices.iloc[start_idx:end_train_idx]
            test_prices = prices.iloc[test_start_idx:test_end_idx]

            # Defensive check: catches a `prices` frame whose length no longer
            # matches total_data_points (e.g. cells executed out of order).
            if len(train_prices) != fixed_train_size or len(test_prices) != forecast_elements:
                print(f"  Run {run} failed: Incorrect slice sizes (train_size={len(train_prices)}, test_size={len(test_prices)}).")
                rmse_list.append(np.nan)
                continue

            # Extract the trend component using Savitzky-Golay filter on training data
            try:
                trend_train = savgol_filter(train_prices['close'], window_length=window_length, polyorder=polyorder)
            except ValueError as e:
                print(f"  Run {run} failed during Savitzky-Golay filter on training data: {e}")
                rmse_list.append(np.nan)
                continue

            # Fit Auto-ARIMA model
            try:
                smodel = pm.auto_arima(
                    trend_train,
                    start_p=1, start_q=1,
                    max_p=4, max_q=4,
                    seasonal=fixed_seasonal,
                    m=24 if fixed_seasonal else 1,    # Set m=24 for daily seasonality if seasonal=True
                    d=None,
                    D=1 if fixed_seasonal else 0,
                    trace=False,
                    error_action='ignore',
                    suppress_warnings=True,
                    stepwise=True
                )

                # Forecast the next `forecast_elements` data points of the trend
                forecast_trend = smodel.predict(n_periods=forecast_elements)
            except Exception as e:
                # Deliberately broad: a single failed fit must not abort the search.
                print(f"  Run {run} failed during Auto-ARIMA fitting or forecasting: {e}")
                rmse_list.append(np.nan)
                continue

            # Extract the actual trend for the test data
            try:
                # Concatenate train and test close prices
                combined_prices = pd.concat([train_prices['close'], test_prices['close']])

                # Apply the Savitzky-Golay filter to the combined data
                trend_total = savgol_filter(combined_prices, window_length=window_length, polyorder=polyorder)

                # Extract the trend for the test data (last `forecast_elements` elements)
                trend_test = trend_total[-forecast_elements:]
            except ValueError as e:
                print(f"  Run {run} failed during Savitzky-Golay filter on combined data: {e}")
                rmse_list.append(np.nan)
                continue

            # Calculate RMSE between forecasted trend and actual test trend.
            # Guarded like the steps above so that one unusable forecast is
            # recorded as NaN instead of aborting the whole search.
            try:
                rmse = calculate_rmse(trend_test, forecast_trend)
            except ValueError as e:
                print(f"  Run {run} failed during RMSE calculation: {e}")
                rmse_list.append(np.nan)
                continue
            rmse_list.append(rmse)

        # Average RMSE across all runs, ignoring NaN values. np.nanmean is only
        # called when at least one run produced a number; otherwise the average
        # is NaN without triggering numpy's empty-slice RuntimeWarning.
        rmse_values = np.asarray(rmse_list, dtype=float)
        if rmse_values.size and not np.isnan(rmse_values).all():
            avg_rmse = np.nanmean(rmse_values)
        else:
            avg_rmse = np.nan

        # Store the results
        results.append({
            'window_length': window_length,
            'polyorder': polyorder,
            'avg_rmse': avg_rmse
        })

        print(f"  Completed: window_length={window_length}, polyorder={polyorder}, avg_rmse={avg_rmse:.4f}")


Total data points after cleaning: 5000

Optimizing for window_length=9, polyorder=1...
  Completed: window_length=9, polyorder=1, avg_rmse=14.2563

Optimizing for window_length=11, polyorder=1...
  Completed: window_length=11, polyorder=1, avg_rmse=15.3665

Optimizing for window_length=13, polyorder=1...
  Completed: window_length=13, polyorder=1, avg_rmse=14.5405

Optimizing for window_length=15, polyorder=1...
  Completed: window_length=15, polyorder=1, avg_rmse=15.4419

Optimizing for window_length=17, polyorder=1...
  Completed: window_length=17, polyorder=1, avg_rmse=13.6919

Optimizing for window_length=19, polyorder=1...
  Completed: window_length=19, polyorder=1, avg_rmse=14.1228

Optimizing for window_length=21, polyorder=1...
  Completed: window_length=21, polyorder=1, avg_rmse=13.8252

Optimizing for window_length=23, polyorder=1...
  Completed: window_length=23, polyorder=1, avg_rmse=14.6427

Optimizing for window_length=25, polyorder=1...
  Completed: window_length=25, pol

KeyboardInterrupt: 

In [6]:
# Inspect the averages collected so far. The search cell above ended with a
# KeyboardInterrupt, so the list may not cover every entry of window_lengths.
results

[{'window_length': 9, 'polyorder': 1, 'avg_rmse': 14.256254459904557},
 {'window_length': 11, 'polyorder': 1, 'avg_rmse': 15.366522553487993},
 {'window_length': 13, 'polyorder': 1, 'avg_rmse': 14.540475285183401},
 {'window_length': 15, 'polyorder': 1, 'avg_rmse': 15.441859440444748},
 {'window_length': 17, 'polyorder': 1, 'avg_rmse': 13.691855210225066},
 {'window_length': 19, 'polyorder': 1, 'avg_rmse': 14.122827457982925},
 {'window_length': 21, 'polyorder': 1, 'avg_rmse': 13.825150597173637},
 {'window_length': 23, 'polyorder': 1, 'avg_rmse': 14.64266626538905},
 {'window_length': 25, 'polyorder': 1, 'avg_rmse': 14.638186274080425},
 {'window_length': 27, 'polyorder': 1, 'avg_rmse': 14.800861121036183},
 {'window_length': 29, 'polyorder': 1, 'avg_rmse': 15.552618143986933}]

In [4]:
# NOTE(review): this cell carries execution count In [4], i.e. it was run before
# the search cell (In [5]) was last executed, so the values shown below do not
# come from the run displayed above — presumably a run with
# forecast_elements = 20; confirm.
# results_20 is bound to the same list object as `results` (no copy is made).
results_20 = results
results_20

[{'window_length': 9, 'polyorder': 1, 'avg_rmse': 10.358556006131545},
 {'window_length': 11, 'polyorder': 1, 'avg_rmse': 10.207231999298028},
 {'window_length': 13, 'polyorder': 1, 'avg_rmse': 9.793895187059693},
 {'window_length': 15, 'polyorder': 1, 'avg_rmse': 10.337611023151792},
 {'window_length': 17, 'polyorder': 1, 'avg_rmse': 9.884226701402419},
 {'window_length': 19, 'polyorder': 1, 'avg_rmse': 10.128105257276419},
 {'window_length': 21, 'polyorder': 1, 'avg_rmse': 9.937187600675857},
 {'window_length': 23, 'polyorder': 1, 'avg_rmse': 10.224425267303285},
 {'window_length': 25, 'polyorder': 1, 'avg_rmse': 10.314315399853154},
 {'window_length': 27, 'polyorder': 1, 'avg_rmse': 10.343708970353992},
 {'window_length': 29, 'polyorder': 1, 'avg_rmse': 10.601500090018638},
 {'window_length': 31, 'polyorder': 1, 'avg_rmse': 10.589824084868155},
 {'window_length': 33, 'polyorder': 1, 'avg_rmse': 10.955053470108707},
 {'window_length': 35, 'polyorder': 1, 'avg_rmse': 11.04907115538840