In [6]:
import time
from pathlib import Path

import joblib  # For model saving
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

# Load the (undersampled) training data produced by the preprocessing step
train_data = pd.read_csv('../data/processed/undersampled_train_data.csv')

# Define features and target.
# Each model is given its own, disjoint list of input columns.
rf_features = [
    "var_81", "var_146", "var_12", "var_76", "var_174", "var_34", "var_21", "var_165",
    "var_109", "var_44", "var_166", "var_198", "var_192", "var_148", "var_33", "var_80",
    "var_169", "var_115", "var_92", "var_149", "var_154", "var_121", "var_107", "var_127",
    "var_122", "var_172", "var_177", "var_36", "var_108", "var_75", "var_188", "var_123",
    "var_87", "var_197", "var_86", "var_93", "var_31"
]

xgb_features = [
    "var_6", "var_53", "var_26", "var_110", "var_99", "var_190", "var_133", "var_22",
    "var_179", "var_2", "var_94", "var_40", "var_78", "var_173", "var_184", "var_170",
    "var_0", "var_1", "var_191", "var_67", "var_118", "var_147", "var_18", "var_164",
    "var_89", "var_35", "var_48", "var_95", "var_199", "var_155", "var_32", "var_5",
    "var_91", "var_90", "var_71", "var_157", "var_162", "var_130", "var_135", "var_52"
]

X_rf = train_data[rf_features]
X_xgb = train_data[xgb_features]
y = train_data['target']

# Scale the data.
# One scaler per feature subset, fitted on every row of the training set;
# these fitted scalers are the ones persisted at the end of the script.
scaler_rf = StandardScaler()
scaler_xgb = StandardScaler()

X_rf_scaled = scaler_rf.fit_transform(X_rf)
X_xgb_scaled = scaler_xgb.fit_transform(X_xgb)

# Stratified K-Fold Setup (shuffled, fixed seed for reproducible folds)
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Calculate scale_pos_weight = (#negatives / #positives).
# Use the vectorised Series reduction rather than the builtin sum(), which
# would iterate the Series element by element in Python.
class_0_count = int((y == 0).sum())
class_1_count = int((y == 1).sum())
if class_1_count == 0:
    # Fail loudly instead of propagating an infinite class weight.
    raise ValueError(
        "No samples with target == 1 found; cannot compute scale_pos_weight."
    )
scale_pos_weight = class_0_count / class_1_count

# Per-fold accuracy scores
rf_scores, xgb_scores = [], []
# Per-fold training times (seconds)
rf_times, xgb_times = [], []

# Stratified K-Fold Cross Validation
#
# Hyper-parameters do not change between folds, so they are defined once
# here instead of being rebuilt on every iteration.
params_rf = {
    'n_estimators': 500,
    'max_depth': 10,
    'min_samples_split': 4,
    'min_samples_leaf': 2,
    'max_features': 'sqrt',
    'bootstrap': False,
    'n_jobs': -1,
    'random_state': 42,
    'class_weight': {0: 1, 1: scale_pos_weight}
}
params_xgb = {
    'n_estimators': 500,
    'learning_rate': 0.02,
    'max_depth': 8,
    'colsample_bytree': 0.75,
    'subsample': 0.75,
    'gamma': 0.3,
    'reg_alpha': 0.1,
    'reg_lambda': 0.8,
    'scale_pos_weight': scale_pos_weight,
    'n_jobs': -1,
    'random_state': 42
}

# The split depends only on the number of rows and on y, so the unscaled
# frame can be passed; the fold indices are the same for both feature sets.
for train_idx, test_idx in skf.split(X_rf, y):
    # Fit the scalers on the training rows of this fold only, so that no
    # statistics from the held-out rows leak into the training transform.
    fold_scaler_rf = StandardScaler().fit(X_rf.iloc[train_idx])
    fold_scaler_xgb = StandardScaler().fit(X_xgb.iloc[train_idx])

    X_rf_train = fold_scaler_rf.transform(X_rf.iloc[train_idx])
    X_rf_test = fold_scaler_rf.transform(X_rf.iloc[test_idx])
    X_xgb_train = fold_scaler_xgb.transform(X_xgb.iloc[train_idx])
    X_xgb_test = fold_scaler_xgb.transform(X_xgb.iloc[test_idx])
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # Random Forest Model
    # perf_counter() is monotonic, unlike time.time(), so elapsed-time
    # measurements cannot be skewed by system clock adjustments.
    start_time = time.perf_counter()
    rf_model = RandomForestClassifier(**params_rf)
    rf_model.fit(X_rf_train, y_train)
    rf_times.append(time.perf_counter() - start_time)  # Save training time
    rf_scores.append(accuracy_score(y_test, rf_model.predict(X_rf_test)))

    # XGBoost Model
    start_time = time.perf_counter()
    xgb_model = XGBClassifier(**params_xgb)
    xgb_model.fit(X_xgb_train, y_train)
    xgb_times.append(time.perf_counter() - start_time)  # Save training time
    xgb_scores.append(accuracy_score(y_test, xgb_model.predict(X_xgb_test)))

# Summarise cross-validation accuracy as mean and standard deviation over folds
rf_acc_mean, rf_acc_std = np.mean(rf_scores), np.std(rf_scores)
xgb_acc_mean, xgb_acc_std = np.mean(xgb_scores), np.std(xgb_scores)
print(f"Random Forest Accuracy: {rf_acc_mean:.4f} ± {rf_acc_std:.4f}")
print(f"XGBoost Accuracy: {xgb_acc_mean:.4f} ± {xgb_acc_std:.4f}")

# Report the mean per-fold fit time of each model
rf_time_mean = np.mean(rf_times)
xgb_time_mean = np.mean(xgb_times)
print(f"Random Forest Average Training Time: {rf_time_mean:.2f} seconds")
print(f"XGBoost Average Training Time: {xgb_time_mean:.2f} seconds")

# Soft Voting Model (trained on all rows at once)
#
# Make sure the output directory exists *before* the long-running fit, so a
# missing folder cannot discard the trained model at save time.
Path('../models').mkdir(parents=True, exist_ok=True)

# The ensemble is fed a single matrix laid out as
# [scaled RF features | scaled XGB features]. VotingClassifier passes the same
# X to every estimator, so each estimator is wrapped in a pipeline that first
# selects its own column range. Without this, both models would be refit on
# all columns, i.e. on a different feature set than the one they were
# cross-validated with.
n_rf_cols = X_rf_scaled.shape[1]
n_all_cols = n_rf_cols + X_xgb_scaled.shape[1]

rf_branch = Pipeline([
    ('select', ColumnTransformer(
        [('rf_cols', 'passthrough', slice(0, n_rf_cols))]
    )),
    ('model', rf_model),
])
xgb_branch = Pipeline([
    ('select', ColumnTransformer(
        [('xgb_cols', 'passthrough', slice(n_rf_cols, n_all_cols))]
    )),
    ('model', xgb_model),
])

# VotingClassifier clones its estimators before fitting, so the last-fold
# fitted state of rf_model / xgb_model is not reused; only their
# hyper-parameters are.
voting_model = VotingClassifier(
    estimators=[('rf', rf_branch), ('xgb', xgb_branch)], voting='soft'
)
voting_model.fit(np.hstack((X_rf_scaled, X_xgb_scaled)), y)

# Save the Voting model
joblib.dump(voting_model, '../models/voting_model.pkl')

# Save the scalers (fitted on the full training set) so that inference can
# reproduce the same scaling before stacking the two feature blocks.
joblib.dump(scaler_rf, '../models/scaler_rf.pkl')
joblib.dump(scaler_xgb, '../models/scaler_xgb.pkl')

print("Voting model and scalers saved.")

Random Forest Accuracy: 0.7366 ± 0.0032
XGBoost Accuracy: 0.7955 ± 0.0008
Random Forest Average Training Time: 269.87 seconds
XGBoost Average Training Time: 23.85 seconds
Voting model and scalers saved.
