In [None]:
# Simple XGBoost Model for Patient Risk Prediction
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Column names: the label to predict and the model inputs
target = 'Risk Category'
features = [
    'Heart Rate',
    'Body Temperature',
    'Oxygen Saturation',
    'Systolic Blood Pressure',
    'Diastolic Blood Pressure',
    'Age',
    'Gender',
    'Weight (kg)',
    'Height (m)',
    'Derived_BMI',
]

# Read the vital-signs CSV into a DataFrame
df = pd.read_csv('human_vital_signs_dataset_2024.csv')

# Recode the textual Gender column as 1 (Male) / 0 (Female).
# Series.map leaves any value missing from the mapping as NaN.
gender_codes = {'Male': 1, 'Female': 0}
df['Gender'] = df['Gender'].map(gender_codes)

# Integer-encode the risk labels into a new column; the fitted encoder is
# kept so the original label names can be recovered later
label_encoder = LabelEncoder()
risk_labels = df[target]
df['Risk_Encoded'] = label_encoder.fit_transform(risk_labels)

# Feature matrix and label vector
X = df.loc[:, features]
y = df.loc[:, 'Risk_Encoded']

# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both partitions, and the fixed seed makes the
# split repeatable
split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Hyperparameters for the gradient-boosted classifier, gathered in one place
# NOTE(review): use_label_encoder is deprecated in recent XGBoost releases --
# confirm against the installed version before relying on it.
xgb_params = {
    'n_estimators': 100,         # number of boosted trees
    'max_depth': 6,              # depth limit for each tree
    'learning_rate': 0.1,        # shrinkage applied to each boosting step
    'random_state': 42,          # fixed seed for reproducibility
    'use_label_encoder': False,
    'eval_metric': 'logloss',
}
model = xgb.XGBClassifier(**xgb_params)

# Fit on the training partition
model.fit(X_train, y_train)

# Score the held-out partition: per-class probabilities and hard labels
y_pred_proba = model.predict_proba(X_test)
y_pred = model.predict(X_test)

# Evaluate the held-out predictions. All four scorers take (y_true, y_pred),
# so they are applied in one pass; precision/recall/F1 run with
# scikit-learn's default settings.
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

# Confusion matrix: rows are actual classes, columns are predicted classes
cm = confusion_matrix(y_test, y_pred)

# Print results
# Report layout: run summary, headline metrics, confusion matrix, class
# balance of the test split, then the most influential features.
print("=" * 50)
print("XGBOOST MODEL RESULTS")
print("=" * 50)
print(f"\nDataset Size: {len(df)} samples")
print(f"Training Samples: {len(X_train)}")
print(f"Test Samples: {len(X_test)}")
print(f"Features Used: {len(features)}")

print("\n" + "-" * 50)
print("PERFORMANCE METRICS")
print("-" * 50)
print(f"Accuracy:  {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1-Score:  {f1:.4f}")

# LabelEncoder assigns integer codes in sorted order of the original labels,
# so code 0 is not necessarily the "low" class. Take the display names from
# the fitted encoder instead of hard-coding which code means what.
class_names = [str(name) for name in label_encoder.classes_]
label_width = max(len(name) for name in class_names)
cell_width = max(label_width, 6)
row_indent = " " * (len("Actual ") + label_width + 1)

print("\n" + "-" * 50)
print("CONFUSION MATRIX")
print("-" * 50)
print(row_indent + "Predicted")
print(row_indent + " ".join(f"{name:>{cell_width}}" for name in class_names))
# Assumes every class occurs in the test split or predictions, so that row i
# of cm corresponds to encoded class i (the split above is stratified).
for name, row in zip(class_names, cm):
    counts = " ".join(f"{int(count):>{cell_width}d}" for count in row)
    print(f"Actual {name:<{label_width}} {counts}")

print("\n" + "-" * 50)
print("CLASS DISTRIBUTION")
print("-" * 50)
for encoded_label, name in enumerate(class_names):
    print(f"{name} in test set: {(y_test == encoded_label).sum()} samples")

# Feature importance: rank from most to least important and show only the
# leading entries, so the output matches the "Top N" heading.
top_n = 5
print("\n" + "-" * 50)
print(f"FEATURE IMPORTANCE (Top {top_n})")
print("-" * 50)
feature_importance = model.feature_importances_
ranked_features = sorted(
    zip(features, feature_importance),
    key=lambda pair: pair[1],
    reverse=True,
)
for feature, importance in ranked_features[:top_n]:
    print(f"{feature:30s}: {importance:.4f}")

XGBOOST MODEL RESULTS

Dataset Size: 200020 samples
Training Samples: 160016
Test Samples: 40004
Features Used: 10

--------------------------------------------------
PERFORMANCE METRICS
--------------------------------------------------
Accuracy:  0.9976 (99.76%)
Precision: 0.9977
Recall:    0.9972
F1-Score:  0.9975

--------------------------------------------------
CONFUSION MATRIX
--------------------------------------------------
          Predicted
          Low   High
Actual Low  20980     43
       High   53   18928

--------------------------------------------------
CLASS DISTRIBUTION
--------------------------------------------------
Low Risk in test set: 21023 samples
High Risk in test set: 18981 samples

--------------------------------------------------
FEATURE IMPORTANCE (Top 5)
--------------------------------------------------
Heart Rate                    : 0.4491
Body Temperature              : 0.0006
Oxygen Saturation             : 0.0006
Systolic Blood Pressure     