In [1]:
# Imports. The project root is appended to sys.path at the end of this cell so that project-local modules can be imported.
import os
import sys
import warnings

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Silence every warning for the rest of the session so library warnings do
# not clutter the cell outputs.
warnings.filterwarnings("ignore")

# Directory two levels above the current working directory (expected to be
# the project root when the notebook is run from its own folder).
root_path = os.path.abspath(os.path.join(os.getcwd(), "../.."))

# Make modules under root_path importable. The membership check keeps the
# cell idempotent: re-running it does not pile up duplicate sys.path entries.
if root_path not in sys.path:
    sys.path.append(root_path)

In [8]:
# Select which dataset to load
connector_name = "binance"
trading_pair = "BTC-USDT"
interval = "1s"

# The feature file lives at
#   <root_path>/data/features_df/<connector_name>|<trading_pair>|<interval>.parquet
features_filename = f"{connector_name}|{trading_pair}|{interval}.parquet"
features_path = os.path.join(root_path, "data", "features_df", features_filename)
df_with_features = pd.read_parquet(features_path)

# Columns that must not be fed to the model: the target itself
# ("close_type") plus the other non-feature columns listed here.
excluded_columns = [
    "timestamp",
    "tl",
    "stop_loss_time",
    "take_profit_time",
    "close_time",
    "close_type",
    "real_class",
    "ret",
]

# Every remaining column, in its original order, is a model input
feature_columns = [col for col in df_with_features.columns if col not in excluded_columns]

X = df_with_features[feature_columns]
y = df_with_features["close_type"]

# Show how many rows each class has before any re-balancing
initial_class_counts = y.value_counts().sort_index()
print("Initial class distribution:")
print(initial_class_counts)
print("\n")


# Balance the classes by down-sampling the neutral class (close_type == 0).
# The -1 and 1 rows are kept in full.
df_neg = df_with_features[df_with_features["close_type"] == -1]
df_pos = df_with_features[df_with_features["close_type"] == 1]
df_mid_all = df_with_features[df_with_features["close_type"] == 0]

# Target size for the neutral class: the mean size of the -1 and 1 classes.
# It is computed from the rows that are actually kept (so unexpected labels
# such as NaN cannot inflate it) and capped at the number of neutral rows,
# because sampling without replacement raises ValueError when n exceeds the
# population.
target_size = min((len(df_neg) + len(df_pos)) // 2, len(df_mid_all))
df_mid = df_mid_all.sample(n=target_size, random_state=42)

# Combine the balanced dataset
balanced_df = pd.concat([df_neg, df_mid, df_pos])

X_balanced = balanced_df[feature_columns]
y_balanced = balanced_df["close_type"]

# Print balanced distribution
print("Balanced class distribution:")
print(y_balanced.value_counts().sort_index())
print("\n")


# Hold out 40% of the balanced rows for evaluation. Rows are shuffled with a
# fixed seed so the split is reproducible.
# NOTE(review): if the rows are time-ordered, a shuffled split can place
# neighbouring samples in both train and test — consider a chronological
# split. TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X_balanced,
    y_balanced,
    test_size=0.4,
    shuffle=True,
    random_state=42,
)

# Random forest hyper-parameters: 500 trees of depth 3, trained on all
# available cores, with class weights adjusted inversely to class frequency.
rf_params = {
    "n_estimators": 500,
    "max_depth": 3,
    "random_state": 42,
    "n_jobs": -1,
    "class_weight": "balanced",
}
model = RandomForestClassifier(**rf_params)

# Fit on the training split
print("Training model...")
model.fit(X_train, y_train)

# Predict the class of every held-out row
y_pred = model.predict(X_test)

# Per-class precision / recall / F1 on the held-out rows
print("\nClassification Report:")
report = classification_report(y_test, y_pred)
print(report)

# Persist the trained model with joblib. Only the model is saved: no scaler
# is dumped here, so the log message no longer claims one.
print("\nSaving model...")
models_dir = os.path.join(root_path, "models")

# NOTE(review): the file name carries an "_xgb_model" suffix although the
# estimator trained in this notebook is a RandomForestClassifier. The name is
# left unchanged so anything loading this path keeps working — TODO confirm
# with consumers before renaming.
model_path = os.path.join(models_dir, f"{connector_name}_{trading_pair}_{interval}_xgb_model.joblib")

# NOTE(review): nothing is written to this path in this cell; the variable
# is kept only in case other cells reference it — TODO confirm and remove.
scaler_path = os.path.join(models_dir, f"{connector_name}_{trading_pair}_{interval}_scaler.joblib")

# Create models directory if it doesn't exist
os.makedirs(models_dir, exist_ok=True)

joblib.dump(model, model_path)

print(f"Model saved to: {model_path}")

Initial class distribution:
close_type
-1     63411
 0    112671
 1     64574
Name: count, dtype: int64


Balanced class distribution:
close_type
-1    63411
 0    63992
 1    64574
Name: count, dtype: int64


Training model...

Classification Report:
              precision    recall  f1-score   support

          -1       0.50      0.24      0.32     25212
           0       0.46      0.76      0.57     25566
           1       0.48      0.43      0.45     26013

    accuracy                           0.47     76791
   macro avg       0.48      0.47      0.45     76791
weighted avg       0.48      0.47      0.45     76791


Saving model and scaler...
Model saved to: /Users/dman/Documents/code/quants-lab/models/binance_BTC-USDT_1s_xgb_model.joblib


In [9]:
# Sanity check: number of rows per class in the held-out test split
y_test.value_counts()

close_type
 1    26013
 0    25566
-1    25212
Name: count, dtype: int64

In [11]:
# Sanity check: number of rows per class in the training split
y_train.value_counts()

close_type
 1    38561
 0    38426
-1    38199
Name: count, dtype: int64