# Model Training and Evaluation
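This notebook trains and evaluates XGBoost models on a single preprocessed feature table for two tasks: regression of `max_diameter` and classification of `Label`. The fitted models are saved to disk at the end.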

## 1. Import Required Libraries

In [5]:
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold

## 2. Load Preprocessed Data

In [6]:
# Read the preprocessed table; the path is resolved relative to the
# notebook's working directory.
data_path = 'preprocessed_data_root_all.csv'
data = pd.read_csv(data_path)
print(f'Data loaded successfully with shape: {data.shape}')

# Separate the two targets from the feature matrix.
# 'max_diameter' is the regression target and 'Label' the classification
# target; both are dropped from the features together with the 'ID' column.
non_feature_columns = ['ID', 'max_diameter', 'Label']
y_reg, y_class = data['max_diameter'], data['Label']
X = data.drop(columns=non_feature_columns)

print(f'Feature matrix shape: {X.shape}')
print(f'Regression target shape: {y_reg.shape}')
print(f'Classification target shape: {y_class.shape}')

Data loaded successfully with shape: (215, 463)
Feature matrix shape: (215, 460)
Regression target shape: (215,)
Classification target shape: (215,)


## 3. Regression Analysis

In [7]:
# Hyperparameters shared by every fold model and by the final model, kept in
# one place so the cross-validated configuration is exactly the one that is
# refit below.
reg_params = dict(
    objective='reg:squarederror',
    eval_metric='rmse',
    max_depth=6,
    learning_rate=0.01,
    n_estimators=1000,
    subsample=0.7,
    colsample_bytree=0.9,
    min_child_weight=2,
    gamma=0.1,
    random_state=42,
)

# 5-fold shuffled cross-validation of the regressor on `y_reg`.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
rmse_scores, mae_scores, r2_scores = [], [], []
for train_idx, test_idx in kf.split(X):
    # Fold-specific names so this cell does not share split variables with
    # the classification cell.
    X_reg_train, X_reg_test = X.iloc[train_idx], X.iloc[test_idx]
    y_reg_train, y_reg_test = y_reg.iloc[train_idx], y_reg.iloc[test_idx]

    # A fresh model per fold: fold models are only used for scoring.
    fold_model = xgb.XGBRegressor(**reg_params)
    fold_model.fit(X_reg_train, y_reg_train)
    y_reg_pred = fold_model.predict(X_reg_test)

    rmse_scores.append(np.sqrt(mean_squared_error(y_reg_test, y_reg_pred)))
    mae_scores.append(mean_absolute_error(y_reg_test, y_reg_pred))
    r2_scores.append(r2_score(y_reg_test, y_reg_pred))

print(f'Mean RMSE: {np.mean(rmse_scores):.4f} ± {np.std(rmse_scores):.4f}')
print(f'Mean MAE: {np.mean(mae_scores):.4f} ± {np.std(mae_scores):.4f}')
print(f'Mean R2: {np.mean(r2_scores):.4f} ± {np.std(r2_scores):.4f}')

# Final model: refit once on every row. Previously `model_reg` was whichever
# model the last fold happened to train (on only 4/5 of the data), and that
# arbitrary fold model is what ended up being saved. The CV scores above are
# the performance estimate for this configuration.
model_reg = xgb.XGBRegressor(**reg_params)
model_reg.fit(X, y_reg)

Mean RMSE: 5.0383 ± 0.4567
Mean MAE: 3.8847 ± 0.2429
Mean R2: 0.5017 ± 0.1210


## 4. Multiclass Classification

In [9]:
# Hold out 20% of the rows for testing; stratifying on the label keeps the
# class proportions the same in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_class, test_size=0.2, random_state=42, stratify=y_class
)

# Multiclass XGBoost classifier. `num_class` is deliberately not passed:
# XGBClassifier determines the number of classes from the labels it sees in
# `fit`, so a hard-coded value only adds a number that can drift out of sync
# with the data.
model_class = xgb.XGBClassifier(
    objective='multi:softprob',
    eval_metric='mlogloss',
    max_depth=4,
    learning_rate=0.1,
    n_estimators=100,
    subsample=0.3,
    colsample_bytree=0.6,
    min_child_weight=3,
    gamma=0.2,
    random_state=42,
)
model_class.fit(X_train, y_train)

y_pred = model_class.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Classification Accuracy: {accuracy:.4f}')
print('Classification Report:')
# zero_division=0: a class that is never predicted has undefined precision;
# report it as 0 explicitly instead of relying on the default, which prints
# the same 0 but raises an UndefinedMetricWarning.
print(classification_report(y_test, y_pred, zero_division=0))

Classification Accuracy: 0.5116
Classification Report:
              precision    recall  f1-score   support

           0       0.63      0.86      0.73        22
           1       0.20      0.20      0.20        10
           2       0.50      0.14      0.22         7
           3       0.00      0.00      0.00         4

    accuracy                           0.51        43
   macro avg       0.33      0.30      0.29        43
weighted avg       0.45      0.51      0.46        43



## 5. Save Models

In [11]:
# Persist both fitted estimators under ./models.
# The directory is created first: joblib.dump does not create missing parent
# directories, so on a fresh checkout the original cell failed with
# FileNotFoundError.
model_dir = Path('models')
model_dir.mkdir(parents=True, exist_ok=True)

joblib.dump(model_reg, model_dir / 'xgb_regressor.joblib')
joblib.dump(model_class, model_dir / 'xgb_classifier.joblib')
print('Models saved successfully!')

Models saved successfully!
