In [None]:
!pip install catboost


Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from catboost import CatBoostClassifier

# Load dataset
df = pd.read_csv('human_vital_signs_dataset_2024.csv')

# Select features and target
features = ['Heart Rate', 'Body Temperature', 'Oxygen Saturation',
            'Systolic Blood Pressure', 'Diastolic Blood Pressure',
            'Age', 'Gender', 'Weight (kg)', 'Height (m)', 'Derived_BMI']
target = 'Risk Category'

# Convert Gender to numeric
df['Gender'] = df['Gender'].map({'Male': 1, 'Female': 0})

# Encode target
label_encoder = LabelEncoder()
df['Risk_Encoded'] = label_encoder.fit_transform(df[target])

# Prepare data
X = df[features]
y = df['Risk_Encoded']

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Define categorical features (CatBoost handles these automatically)
categorical_features = ['Gender']

# Create CatBoost model
model = CatBoostClassifier(
    iterations=100,          # Number of trees
    depth=6,                # Depth of trees
    learning_rate=0.1,      # Learning rate
    random_seed=42,         # For reproducibility
    verbose=0               # No training output
)

# Train model
model.fit(X_train, y_train, cat_features=categorical_features)

# Make predictions
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

# Print results

print("CATBOOST MODEL RESULTS")

print(f"\nDataset Size: {len(df)} samples")
print(f"Training Samples: {len(X_train)}")
print(f"Test Samples: {len(X_test)}")
print(f"Features Used: {len(features)}")


print("PERFORMANCE METRICS")

print(f"Accuracy:  {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1-Score:  {f1:.4f}")


print("CONFUSION MATRIX")

print("          Predicted")
print("          Low   High")
print(f"Actual Low  {cm[0,0]:4d}   {cm[0,1]:4d}")
print(f"       High {cm[1,0]:4d}   {cm[1,1]:4d}")


print("\n""CLASS DISTRIBUTION")

print(f"Low Risk in test set: {(y_test == 0).sum()} samples")
print(f"High Risk in test set: {(y_test == 1).sum()} samples")

# Feature importance

print("FEATURE IMPORTANCE (Top 5)")

feature_importance = model.get_feature_importance()
for feature, importance in zip(features, feature_importance):
    print(f"{feature:30s}: {importance:.4f}")

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.