In [2]:
pip install xgboost

Collecting xgboost
  Downloading xgboost-2.1.3-py3-none-macosx_12_0_arm64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.3-py3-none-macosx_12_0_arm64.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-2.1.3
Note: you may need to restart the kernel to use updated packages.


In [3]:
import numpy as np
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier

In [4]:
# Load preprocessed data
X_train = np.load('../data/processed/X_train.npy')
X_test = np.load('../data/processed/X_test.npy')
y_train = np.load('../data/processed/y_train.npy')
y_test = np.load('../data/processed/y_test.npy')


In [5]:
# Define models to benchmark
models = {
    "Logistic Regression": LogisticRegression(),
    "Support Vector Machine": SVC(kernel='linear'),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "Naive Bayes": MultinomialNB(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

# Evaluate models
results = {}

In [6]:
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = accuracy
    print(f"{name} Accuracy: {accuracy:.4f}")
    print(classification_report(y_test, y_pred))
    joblib.dump(model, f'../models/{name.replace(" ", "_").lower()}.pkl')

Logistic Regression Accuracy: 0.9883
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4697
           1       0.99      0.99      0.99      4696

    accuracy                           0.99      9393
   macro avg       0.99      0.99      0.99      9393
weighted avg       0.99      0.99      0.99      9393

Support Vector Machine Accuracy: 0.9944
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4697
           1       0.99      0.99      0.99      4696

    accuracy                           0.99      9393
   macro avg       0.99      0.99      0.99      9393
weighted avg       0.99      0.99      0.99      9393

Random Forest Accuracy: 0.9991
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4697
           1       1.00      1.00      1.00      4696

    accuracy                           1.00      9393
   macro avg       1

Parameters: { "use_label_encoder" } are not used.



XGBoost Accuracy: 0.9987
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4697
           1       1.00      1.00      1.00      4696

    accuracy                           1.00      9393
   macro avg       1.00      1.00      1.00      9393
weighted avg       1.00      1.00      1.00      9393



In [7]:
# Compare and save results
results_df = pd.DataFrame(list(results.items()), columns=['Model', 'Accuracy'])
results_df.to_csv('../reports/model_benchmark_results.csv', index=False)

print("\nBenchmarking Completed. Results saved to reports/model_benchmark_results.csv")



Benchmarking Completed. Results saved to reports/model_benchmark_results.csv
