# Classification Analysis
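This notebook trains XGBoost classifiers for three tasks on the same feature matrix: binary classification (`max_diameter >= 45`), multi-class classification of the `Label` column, and a multi-class baseline evaluated with 5-fold stratified cross-validation.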

## 1. Import Required Libraries

In [4]:
import pandas as pd
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay, roc_auc_score, roc_curve
from sklearn.preprocessing import label_binarize

## 2. Load and Preprocess Data

In [5]:
# Read the preprocessed dataset; the path is relative to the working directory.
data_path = 'preprocessed_data_root_all.csv'
data = pd.read_csv(data_path)
print(f'Data loaded successfully with shape: {data.shape}')

# Features: every column except the ID column and the two columns the targets
# are built from, so neither target can leak into the model inputs.
non_feature_columns = ['ID', 'max_diameter', 'Label']
X = data.drop(columns=non_feature_columns)

# Binary target: 1 where max_diameter is at least 45, otherwise 0.
y_binary = data['max_diameter'].ge(45).astype(int)

# Multi-class target: the Label column, used unchanged.
y_multi = data['Label']

# Report the shapes so a row-count mismatch is visible right away.
print(f'Feature matrix shape: {X.shape}')
print(f'Binary target shape: {y_binary.shape}')
print(f'Multi-class target shape: {y_multi.shape}')

Data loaded successfully with shape: (215, 463)
Feature matrix shape: (215, 460)
Binary target shape: (215,)
Multi-class target shape: (215,)


## 3. Binary Classification

In [8]:
# Hold out 20% of the rows for testing. Stratifying on the binary target keeps
# the class ratio the same in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_binary,
    test_size=0.2,
    random_state=42,
    stratify=y_binary,
)

# Hyperparameters of the binary model, collected in one place.
# NOTE(review): no random_state is passed here, unlike the other model cells,
# even though row and column subsampling are enabled - confirm this is intended.
binary_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_depth': 3,
    'learning_rate': 0.01,
    'n_estimators': 1000,
    'subsample': 0.1,
    'colsample_bytree': 0.2,
}
model_binary = xgb.XGBClassifier(**binary_params)
model_binary.fit(X_train, y_train)

# Hard labels feed accuracy and the report; the second predict_proba column
# (the probability of class 1) feeds ROC AUC.
y_pred = model_binary.predict(X_test)
y_pred_proba = model_binary.predict_proba(X_test)[:, 1]

accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

print(f'Binary Classification Accuracy: {accuracy:.4f}')
print(f'Binary Classification ROC AUC Score: {roc_auc:.4f}')
print('Classification Report:')
print(classification_report(y_test, y_pred))

Binary Classification Accuracy: 0.7442
Binary Classification ROC AUC Score: 0.8182
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.81      0.83        32
           1       0.50      0.55      0.52        11

    accuracy                           0.74        43
   macro avg       0.67      0.68      0.67        43
weighted avg       0.75      0.74      0.75        43



## 4. Multi-Class Classification

In [9]:
# Hold out 20% of the rows for testing. Stratifying on the multi-class target
# keeps every label represented in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_multi,
    test_size=0.2,
    random_state=42,
    stratify=y_multi,
)

# Four-class model that outputs per-class probabilities (multi:softprob);
# predict() then returns one label per row.
model_multi = xgb.XGBClassifier(
    objective='multi:softprob',
    eval_metric='mlogloss',
    num_class=4,
    max_depth=4,
    learning_rate=0.1,
    n_estimators=100,
    subsample=0.3,
    colsample_bytree=0.6,
    min_child_weight=3,
    gamma=0.2,
    random_state=42,
)
model_multi.fit(X_train, y_train)

y_pred = model_multi.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f'Multi-Class Classification Accuracy: {accuracy:.4f}')
print('Classification Report:')
# zero_division=0 makes the report explicit for a class that receives no
# predictions: its precision is reported as 0 without sklearn raising an
# UndefinedMetricWarning. The printed numbers are unchanged.
print(classification_report(y_test, y_pred, zero_division=0))

Multi-Class Classification Accuracy: 0.5116
Classification Report:
              precision    recall  f1-score   support

           0       0.63      0.86      0.73        22
           1       0.20      0.20      0.20        10
           2       0.50      0.14      0.22         7
           3       0.00      0.00      0.00         4

    accuracy                           0.51        43
   macro avg       0.33      0.30      0.29        43
weighted avg       0.45      0.51      0.46        43



## 5. Baseline Model (5-Fold Stratified Cross-Validation)

In [10]:
# Stratified 5-fold cross-validation on the multi-class target. Shuffling with
# a fixed seed keeps the fold assignment reproducible.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# The hyperparameters are identical for every fold, so they are defined once.
baseline_params = {
    'objective': 'multi:softmax',
    'eval_metric': 'mlogloss',
    'num_class': 4,
    'max_depth': 6,
    'learning_rate': 0.1,
    'n_estimators': 1000,
    'subsample': 0.7,
    'colsample_bytree': 0.9,
    'min_child_weight': 2,
    'gamma': 0.1,
    'random_state': 42,
}

accuracy_scores = []
for fold, (train_idx, test_idx) in enumerate(skf.split(X, y_multi)):
    # The splitter yields integer positions, hence .iloc rather than .loc.
    X_train, y_train = X.iloc[train_idx], y_multi.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx], y_multi.iloc[test_idx]

    # A fresh model per fold, so nothing fitted on one fold carries over.
    model_baseline = xgb.XGBClassifier(**baseline_params)
    model_baseline.fit(X_train, y_train)

    y_pred = model_baseline.predict(X_test)
    fold_accuracy = accuracy_score(y_test, y_pred)
    accuracy_scores.append(fold_accuracy)

# np.std uses its default ddof=0, i.e. the population standard deviation
# across the five fold accuracies.
print(f'Mean Baseline Accuracy: {np.mean(accuracy_scores):.4f} ± {np.std(accuracy_scores):.4f}')

Mean Baseline Accuracy: 0.5628 ± 0.0271
