# In [388]:
class XGBoostModel:
    """XGBoost regression strategy: fit, predict, derive signals, compute returns.

    The model is fitted on ``train_data[feature_cols]`` against the ``'target'``
    column, then used to predict on ``test_data``.  The pipeline adds the
    columns ``'xgb_pred'``, ``'signal'`` and ``'strategy_return'`` to the
    instance's private copy of ``test_data``; the caller's frame is not touched.

    Args:
        train_data: DataFrame containing ``feature_cols`` and ``'target'``.
        test_data: DataFrame containing ``feature_cols`` and ``'log_return'``.
            It is copied on construction.
        feature_cols: Column names used as model inputs.
        n_estimators, learning_rate, max_depth, subsample, colsample_bytree:
            Forwarded unchanged to ``xgb.XGBRegressor``.
        threshold: Absolute prediction level above which a long (+1) or
            short (-1) signal is emitted (default 0.0005, i.e. 0.05%).
    """

    def __init__(self, train_data, test_data, feature_cols, *,
                 n_estimators=100, learning_rate=0.1, max_depth=3,
                 subsample=0.8, colsample_bytree=0.8, threshold=0.0005):
        self.train_data = train_data
        # Work on a copy so the new columns never leak into the caller's frame.
        self.test_data = test_data.copy()
        # A list (not a tuple) so pandas always treats it as a column selection.
        self.feature_cols = list(feature_cols)
        self.model = None
        self.results = None

        # Hyperparameters
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.subsample = subsample
        self.colsample_bytree = colsample_bytree
        self.threshold = threshold  # 0.05% daily return threshold by default

    def run_pipeline(self):
        """Run training, prediction, signal generation and return calculation in order."""
        self._train_model()
        self._generate_predictions()
        self._generate_signals()
        self._calculate_returns()

    def _train_model(self):
        """Fit an ``XGBRegressor`` on the training features against ``'target'``."""
        X_train = self.train_data[self.feature_cols]
        y_train = self.train_data['target']

        self.model = xgb.XGBRegressor(
            n_estimators=self.n_estimators,
            learning_rate=self.learning_rate,
            max_depth=self.max_depth,
            subsample=self.subsample,
            colsample_bytree=self.colsample_bytree,
            random_state=42,  # fixed seed for reproducible fits
        )

        self.model.fit(X_train, y_train)

    def _generate_predictions(self):
        """Store the fitted model's predictions in ``test_data['xgb_pred']``."""
        X_test = self.test_data[self.feature_cols]
        self.test_data['xgb_pred'] = self.model.predict(X_test)

    def _generate_signals(self):
        """Convert predictions to trading signals: +1, -1, or 0 inside the band."""
        prediction = self.test_data['xgb_pred']
        self.test_data['signal'] = 0
        self.test_data.loc[prediction > self.threshold, 'signal'] = 1
        self.test_data.loc[prediction < -self.threshold, 'signal'] = -1

    def _calculate_returns(self):
        """Compute ``strategy_return`` as the previous row's signal times ``log_return``.

        When the index has several levels and one of them is named ``'date'``,
        the signal is lagged separately within each combination of the other
        levels, so a signal from one instrument is never applied to another
        instrument's return.  Otherwise the whole column is lagged by one row.
        Rows are assumed to be in chronological order (per instrument).
        """
        signal = self.test_data['signal']
        index = self.test_data.index

        if index.nlevels > 1 and 'date' in index.names:
            # Group by every non-date level (by position, so unnamed levels work).
            entity_levels = [
                pos for pos, name in enumerate(index.names) if name != 'date'
            ]
            lagged_signal = signal.groupby(level=entity_levels, sort=False).shift(1)
        else:
            lagged_signal = signal.shift(1)

        self.test_data['strategy_return'] = lagged_signal * self.test_data['log_return']

# Build the train/test partitions, then run both models on the test window.
feature_cols = fe.finished_features
# Keep only rows where every feature and the target are present.
model_df = features_data.dropna(subset=feature_cols + ['target']).copy()

# Window boundaries.  The training window is closed on both ends, the test
# window is half-open [start, end).
# NOTE(review): rows after train_end and before test_start are used by neither
# partition here — confirm this gap is intentional.
train_start, train_end = pd.Timestamp('2015-01-01'), pd.Timestamp('2021-01-01')
test_start, test_end = pd.Timestamp('2022-01-01'), pd.Timestamp('2024-01-01')

# Pull the 'date' level out once and reuse it for both masks.
model_dates = model_df.index.get_level_values('date')
train_mask = (model_dates >= train_start) & (model_dates <= train_end)
test_mask = (model_dates >= test_start) & (model_dates < test_end)

train_data = model_df[train_mask]
test_data = model_df[test_mask]

rb_model = RuleBasedModel(test_data)
rb_model.run_pipeline()

xgb_model = XGBoostModel(train_data, test_data, feature_cols)
xgb_model.run_pipeline()
