In [3]:
import pickle

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import f1_score, make_scorer
from sklearn.metrics import recall_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import SVC

In [4]:
# Read the labeled dataset from disk into a DataFrame.
df = pd.read_csv('output//final_data_labeled.csv')

# Name of the column holding the class label, and the columns used as
# model inputs.
target = 'label'
features = ['magnitude', 'EDA', 'HR']

# Feature matrix (one column per entry in `features`) and target vector.
X, y = df[features], df[target]

In [5]:
# Hold out 40% of the rows as a test set; the fixed seed makes the split
# reproducible across runs.
# NOTE(review): the split is not stratified -- if the label distribution is
# skewed, consider passing stratify=y. Confirm against the data.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)

# Untuned XGBoost classifier that the grid search clones for each candidate.
# NOTE(review): `use_label_encoder` appears to be deprecated in newer XGBoost
# releases -- confirm against the installed version before removing it.
xgb_clf = xgb.XGBClassifier(random_state=42, use_label_encoder=False)



In [6]:
# Search space for the XGBoost hyperparameters. Every combination is tried,
# i.e. 3 * 3 * 3 * 3 * 3 * 2 * 2 = 972 candidates.
param_grid = dict(
    max_depth=[3, 5, 7],              # maximum depth of each tree
    learning_rate=[0.01, 0.1, 0.3],   # shrinkage applied per boosting step
    n_estimators=[50, 100, 200],      # number of boosting rounds
    min_child_weight=[1, 3, 5],       # minimum instance-weight sum in a child
    gamma=[0, 0.1, 0.2],              # minimum loss reduction to split a node
    subsample=[0.8, 1],               # fraction of rows sampled per tree
    colsample_bytree=[0.8, 1],        # fraction of columns sampled per tree
)

In [7]:
# Score candidates by macro-averaged F1 (unweighted mean of the per-class F1
# scores); higher values are better.
f1_scorer = make_scorer(f1_score, average='macro', greater_is_better=True)

# Exhaustive search over `param_grid` with 5-fold cross-validation.
# n_jobs=-1 uses all available cores; verbose=2 logs progress per fit.
grid_search = GridSearchCV(
    estimator=xgb_clf,
    param_grid=param_grid,
    scoring=f1_scorer,
    cv=5,
    n_jobs=-1,
    verbose=2,
)

In [8]:
# Run the full cross-validated search on the training split. fit() returns the
# fitted search object itself, so the estimator refit with the best-scoring
# hyperparameters can be read straight off the result.
best_xgb_clf = grid_search.fit(X_train, y_train).best_estimator_



Fitting 5 folds for each of 972 candidates, totalling 4860 fits




In [None]:
# No second fit is needed: the grid search was created with GridSearchCV's
# default refit=True, so `best_xgb_clf` (its `best_estimator_`) has already
# been retrained on the whole of X_train / y_train using the winning
# hyperparameters. Calling fit() again would only repeat that work.
# Display the selected hyperparameters for the record instead.
grid_search.best_params_

In [None]:
# Predict labels for the held-out test split with the tuned classifier.
# These rows were excluded from the grid search and fitting above.
y_pred = best_xgb_clf.predict(X_test)

In [None]:
# Macro-averaged F1 on the held-out test set -- the same averaging used as the
# selection metric during the grid search.
xgb_f1_score = f1_score(y_true=y_test, y_pred=y_pred, average='macro')
print(f"XGBoost F1-score: {xgb_f1_score}")

In [None]:
# Persist the tuned classifier to disk so it can be reloaded without re-running
# the grid search. Requires `import pickle` in the notebook's import cell;
# without it this cell fails with NameError on a fresh kernel.
# The file is opened in binary mode, as pickle requires, and the `with` block
# closes it even if dumping fails.
# NOTE: only load this file back with pickle.load if it comes from a trusted
# source -- unpickling can execute arbitrary code.
with open('best_xgb_clf.pkl', 'wb') as f:
    pickle.dump(best_xgb_clf, f)