## **Import Required Libraries**

In [128]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report,f1_score,accuracy_score,precision_score,recall_score
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder,StandardScaler
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings("ignore",category=FutureWarning)

## **Load the Datasets**

In [129]:
train_data=pd.read_csv("C:\\UOC pdf\\4th Year\\Machine Learning-02\\Data-Analysis-Project-2\\data\\train_data.csv")
test_data=pd.read_csv("C:\\UOC pdf\\4th Year\\Machine Learning-02\\Data-Analysis-Project-2\\data\\test_data.csv")

In [130]:
train_data=train_data.drop(columns=["Unnamed: 0"],axis=1)
test_data=test_data.drop(columns=["Unnamed: 0"],axis=1)

## **Split X_train,Y_train,X_test,Y_test**

In [131]:
X_train=train_data.drop(columns=["diagnosis"],axis=1)
Y_train=train_data["diagnosis"]
X_test=test_data.drop(columns=["diagnosis"],axis=1)
Y_test=test_data["diagnosis"]

In [132]:
print(f"Shape of the Training set:{X_train.shape,Y_train.shape}")
print(f"Shape of  the Testing set:{X_test.shape,Y_test.shape}")

Shape of the Training set:((700, 15), (700,))
Shape of  the Testing set:((300, 15), (300,))


## **Build the Pipeline**

In [133]:
num_cols=X_train.select_dtypes(include="number").columns  ##get the all numerical column names
cat_cols=X_train.select_dtypes(include=["object","category"]).columns ##get the all categorical column names

In [134]:
nominal_cols=["gender","pem_present","meditation_or_mindfulness"] ##nominal columns
ordinal_cols=["work_status","social_activity_level","exercise_frequency"] ##ordinal columns

#### **Define the Pipeline**

In [135]:
ordinal_pipeline=Pipeline(steps=[
    ("Ordinal Encoder",OrdinalEncoder())      ##Ordinal variable pipeline
])
nominal_pipeline=Pipeline(steps=[
    ("Nominal Encoder",OneHotEncoder(sparse_output=False,handle_unknown="ignore"))  ##Nominal variable pipeline
])

#### **Combine Pipeline**

In [136]:
Transfomers=ColumnTransformer(transformers=[
    ("Ordinal Pipeline",ordinal_pipeline,ordinal_cols), ##Combine the Ordinal Pipeline with Ordinal Columns
    ("Nominal Pipeline",nominal_pipeline,nominal_cols)   ##Combine the Nominal Pipeline with Nominal Columns
],
   n_jobs=-1,
   verbose=True,
   remainder="drop"
)

#### **End Pipeline**

In [137]:
final_pipeline=Pipeline(steps=[
    ("Preprocessing",Transfomers),  ##Preprocessing Step
    ("Resampling SMOTE",SMOTE(random_state=42)),  ##SMOTE Resampling Method
    ("XGB Classifier",XGBClassifier(reg_alpha=1,reg_lambda=1,min_child_weight=3)) ##Apply XGB Classifier
])

#### **Make the Reponse variable as 0,1,2**

In [138]:
Y_train=pd.DataFrame(Y_train)
Y_test=pd.DataFrame(Y_test)

In [139]:
## 1.Resposne variable
# Replace 0.0 with 'Both', 1.0 with 'Depression' and 2.0 with ME/CFS
Y_train["diagnosis"] = Y_train["diagnosis"].map({"Both":0,"Depression":1,"ME/CFS":2})
Y_test["diagnosis"] = Y_test["diagnosis"].map({"Both":0,"Depression":1,"ME/CFS":2})

#### **Execute Pipeline**

In [140]:
final_pipeline.fit(X_train,Y_train)  ## Execute the Final Pipeline

## **Model Evaluation**

In [141]:
Y_pred=final_pipeline.predict(X_test) ##get the prediction values

In [142]:
report_xgb=classification_report(Y_test,Y_pred) ##get the classification report for predictions

In [143]:
Y_pred_train=final_pipeline.predict(X_train) ##get the preidction value for training set

In [144]:
report_xgb_train=classification_report(Y_train,Y_pred_train) ##get the classification report for training 

#### **Get the Evaluation Metrics for Testing Data**

In [145]:
f1_test=f1_score(Y_test,Y_pred,average='weighted') ##F1 score for testing data
precision_test=precision_score(Y_test,Y_pred,average="weighted") ##precision for testing data
accuracy_test=accuracy_score(Y_test,Y_pred) ## accuracy score for testing data
recall_test=recall_score(Y_test,Y_pred,average="weighted") ## recall score for testing data

In [146]:
# Print results
print(f"Accuracy:  {accuracy_test:.4f}")
print(f"Precision: {precision_test:.4f}")
print(f"F1 Score:  {f1_test:.4f}")
print(f"Recall Score: {recall_test:.4f}")

Accuracy:  0.8100
Precision: 0.7998
F1 Score:  0.8034
Recall Score: 0.8100


#### **Get the Evaluation Metrics for Training Data**

In [147]:
f1_train=f1_score(Y_train,Y_pred_train,average="weighted") ##F1 score for training data
precision_train=precision_score(Y_train,Y_pred_train,average="weighted") ## precision score for training data
accuracy_train=accuracy_score(Y_train,Y_pred_train) ## accuracy score for training data
recall_train=recall_score(Y_train,Y_pred_train,average="weighted") ##recall score for training data

In [148]:
# Print results
print(f"Accuracy:  {accuracy_train:.4f}")
print(f"Precision: {precision_train:.4f}")
print(f"F1 Score:  {f1_train:.4f}")
print(f"Recall Score:{recall_train:.4f}")

Accuracy:  0.8657
Precision: 0.8627
F1 Score:  0.8623
Recall Score:0.8657


In [149]:
### print the classification report for testing data
print(report_xgb)

              precision    recall  f1-score   support

           0       0.47      0.37      0.41        54
           1       1.00      1.00      1.00       125
           2       0.74      0.81      0.77       121

    accuracy                           0.81       300
   macro avg       0.74      0.73      0.73       300
weighted avg       0.80      0.81      0.80       300



In [150]:
## print the classification report for training data
print(report_xgb_train)

              precision    recall  f1-score   support

           0       0.72      0.59      0.65       148
           1       1.00      1.00      1.00       276
           2       0.80      0.88      0.84       276

    accuracy                           0.87       700
   macro avg       0.84      0.82      0.83       700
weighted avg       0.86      0.87      0.86       700



## **Hyperparameter Tune using Grid SearchCV**

In [154]:
param_grid = {
    'XGB Classifier__n_estimators': [100, 200],
    'XGB Classifier__max_depth': [3, 5, 7],
    'XGB Classifier__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'XGB Classifier__subsample': [0.6, 0.8, 1.0],
    'XGB Classifier__colsample_bytree': [0.6, 0.8, 1.0],
    'XGB Classifier__reg_alpha': [0, 0.1, 1, 10],     # L1 regularization
    'XGB Classifier__reg_lambda': [0, 0.1, 1, 10]     # L2 regularization
}

In [155]:
grid_search=GridSearchCV(final_pipeline,param_grid=param_grid,cv=5,scoring="f1_weighted")

#### **Execute the Pipeline**

In [156]:
grid_search.fit(X_train,Y_train)

#### **Best Model**

In [158]:
best_parameters=grid_search.best_params_
print(best_parameters)

{'XGB Classifier__colsample_bytree': 0.8, 'XGB Classifier__learning_rate': 0.2, 'XGB Classifier__max_depth': 5, 'XGB Classifier__n_estimators': 200, 'XGB Classifier__reg_alpha': 0, 'XGB Classifier__reg_lambda': 0.1, 'XGB Classifier__subsample': 0.6}


In [159]:
best_model=grid_search.best_estimator_

In [160]:
best_model

## **Model Evaluation**

In [161]:
Y_pred_cv=best_model.predict(X_test) ##get the prediction values

In [162]:
report_xgb_cv=classification_report(Y_test,Y_pred_cv) ##get the classification report for predictions

In [163]:
Y_pred_train_cv=best_model.predict(X_train) ##get the preidction value for training set

In [164]:
report_xgb_train_cv=classification_report(Y_train,Y_pred_train_cv) ##get the classification report for training 

#### **Get the Evaluation Metrics for Testing Data**

In [165]:
f1_test=f1_score(Y_test,Y_pred_cv,average='weighted') ##F1 score for testing data
precision_test=precision_score(Y_test,Y_pred_cv,average="weighted") ##precision for testing data
accuracy_test=accuracy_score(Y_test,Y_pred_cv) ## accuracy score for testing data
recall_test=recall_score(Y_test,Y_pred_cv,average="weighted") ## recall score for testing data

In [166]:
# Print results
print(f"Accuracy:  {accuracy_test:.4f}")
print(f"Precision: {precision_test:.4f}")
print(f"F1 Score:  {f1_test:.4f}")
print(f"Recall Score: {recall_test:.4f}")

Accuracy:  0.8100
Precision: 0.7981
F1 Score:  0.8018
Recall Score: 0.8100


#### **Get the Evaluation Metrics for Training Data**

In [167]:
f1_train=f1_score(Y_train,Y_pred_train_cv,average="weighted") ##F1 score for training data
precision_train=precision_score(Y_train,Y_pred_train_cv,average="weighted") ## precision score for training data
accuracy_train=accuracy_score(Y_train,Y_pred_train_cv) ## accuracy score for training data
recall_train=recall_score(Y_train,Y_pred_train_cv,average="weighted") ##recall score for training data

In [168]:
# Print results
print(f"Accuracy:  {accuracy_train:.4f}")
print(f"Precision: {precision_train:.4f}")
print(f"F1 Score:  {f1_train:.4f}")
print(f"Recall Score:{recall_train:.4f}")

Accuracy:  0.8714
Precision: 0.8692
F1 Score:  0.8668
Recall Score:0.8714


## **Hyperparameter Tune Using Optuna**

In [169]:
import optuna
from sklearn.model_selection import cross_val_score

In [170]:
def objective(trials):
    ##define parameters 
    params = {
        "n_estimators": trials.suggest_int("n_estimators", 100, 500),
        "max_depth": trials.suggest_int("max_depth", 3, 15),
        "learning_rate": trials.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "subsample": trials.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trials.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trials.suggest_float("gamma", 0, 5),
        "reg_alpha": trials.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
        "reg_lambda": trials.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
        "min_child_weight": trials.suggest_int("min_child_weight", 1, 20),
        "scale_pos_weight": trials.suggest_float("scale_pos_weight", 1, 10),
        "reg_alpha": trials.suggest_float("reg_alpha", 1e-8, 10.0, log=True),     # L1 regularization
        "reg_lambda": trials.suggest_float("reg_lambda", 1e-8, 10.0, log=True),   # L2 regularization
        "n_jobs": -1,
        "use_label_encoder": False,
        "eval_metric": "logloss"  # or "mlogloss" for multiclass
    }

    ##Build the pipeline
    pipeline_optuna=Pipeline(steps=[
        ("Preprocessing",Transfomers),
        ("Resampling SMOTE",SMOTE(random_state=42)),
        ("Random Forest Classifier",XGBClassifier(**params,random_state=42))
    ])

    ##cross validation score
    score=cross_val_score(pipeline_optuna,X_train,Y_train,cv=5,scoring="f1_weighted")
    return score.mean()

#### **Run the Optimizer**

In [171]:
study=optuna.create_study(
    study_name="xgb_model"
    ,direction="maximize")
study.optimize(objective,n_trials=50) 

[I 2025-07-25 22:28:34,739] A new study created in memory with name: xgb_model
Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-07-25 22:28:39,594] Trial 0 finished with value: 0.7072013540670516 and parameters: {'n_estimators': 196, 'max_depth': 14, 'learning_rate': 0.13406235759072194, 'subsample': 0.8403079245137743, 'colsample_bytree': 0.8324942641278905, 'gamma': 4.80214839261941, 'reg_alpha': 8.786298974032826e-08, 'reg_lambda': 0.009211073052572594, 'min_

#### **Best Model**

In [172]:
best_model_optuna=study.best_params
print(f"Best Params:{best_model_optuna}")

Best Params:{'n_estimators': 218, 'max_depth': 8, 'learning_rate': 0.024393795806414396, 'subsample': 0.7101454952537408, 'colsample_bytree': 0.7988098844323009, 'gamma': 0.2627028781812354, 'reg_alpha': 1.5456837930390957e-06, 'reg_lambda': 0.0014544300859966855, 'min_child_weight': 1, 'scale_pos_weight': 8.412022531679098}


#### **Run the Pipeline with Best Parameters**

In [173]:
best_pipeline=Pipeline(steps=[
        ("Preprocessing",Transfomers),
        ("Resampling SMOTE",SMOTE(random_state=42)),
        ("Random Forest Classifier",XGBClassifier(random_state=42,**best_model_optuna))
    ])

#### **Train the Model with Best Parameters**

In [174]:
best_pipeline.fit(X_train,Y_train)

Parameters: { "scale_pos_weight" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


#### **Evaluate the Model**

In [175]:
Y_pred_optuna=best_pipeline.predict(X_test) ##get the prediction values

In [176]:
report_xgb_optuna=classification_report(Y_test,Y_pred_optuna) ##get the classification report for predictions

In [177]:
Y_pred_train_optuna=best_pipeline.predict(X_train) ##get the preidction value for training set

In [178]:
report_xgb_train_optuna=classification_report(Y_train,Y_pred_train_optuna) ##get the classification report for training 

#### **Get the Evaluation Metrics for Testing Data**

In [179]:
f1_test=f1_score(Y_test,Y_pred_optuna,average='weighted') ##F1 score for testing data
precision_test=precision_score(Y_test,Y_pred_optuna,average="weighted") ##precision for testing data
accuracy_test=accuracy_score(Y_test,Y_pred_optuna) ## accuracy score for testing data
recall_test=recall_score(Y_test,Y_pred_optuna,average="weighted") ## recall score for testing data

In [180]:
# Print results
print(f"Accuracy:  {accuracy_test:.4f}")
print(f"Precision: {precision_test:.4f}")
print(f"F1 Score:  {f1_test:.4f}")
print(f"Recall Score:{recall_test:.4f}")

Accuracy:  0.7967
Precision: 0.7790
F1 Score:  0.7843
Recall Score:0.7967


#### **Get the Evaluation Metrics for Training Data**

In [181]:
f1_train=f1_score(Y_train,Y_pred_train_optuna,average="weighted") ##F1 score for training data
precision_train=precision_score(Y_train,Y_pred_train_optuna,average="weighted") ## precision score for training data
accuracy_train=accuracy_score(Y_train,Y_pred_train_optuna) ## accuracy score for training data
recall_train=recall_score(Y_train,Y_pred_train_optuna,average="weighted") ##recall score for training data

In [182]:
# Print results
print(f"Accuracy:  {accuracy_train:.4f}")
print(f"Precision: {precision_train:.4f}")
print(f"F1 Score:  {f1_train:.4f}")
print(f"Recall Score:{recall_train:.4f}")

Accuracy:  0.8657
Precision: 0.8642
F1 Score:  0.8589
Recall Score:0.8657


## **Model Deployment using Mlflow**

In [183]:
import mlflow

In [184]:
import mlflow.sklearn
import mlflow.sklearn


mlflow.set_tracking_uri("http://127.0.0.1:5000/")
mlflow.set_experiment("XGB--Classifier")
#####Random Forest with Grid Search CV
with mlflow.start_run(run_name="XGB Classifier with Grid SearchCV"):
    mlflow.log_params(best_parameters)
    metrics_1={
        "accuracy":accuracy_score(Y_test,Y_pred_cv),
        "f1-score":f1_score(Y_test,Y_pred_cv,average='weighted'),
        "precision":precision_score(Y_test,Y_pred_cv,average="weighted"),
        "recall":recall_score(Y_test,Y_pred_cv,average="weighted")
    }

    for key,value in metrics_1.items():
        mlflow.log_metric(key,value)
    mlflow.sklearn.log_model(best_model,"XGB Classifier with Grid SearchCV")

##########################################################################################
### Random Forest with Optuna

with mlflow.start_run(run_name="XGB Classifier with Optuna"):
    mlflow.log_params(best_model_optuna)
    metrics_2={
        "accuracy":accuracy_score(Y_test,Y_pred_optuna),
        "f1-score":f1_score(Y_test,Y_pred_optuna,average='weighted'),
        "precision":precision_score(Y_test,Y_pred_optuna,average="weighted"),
        "recall":recall_score(Y_test,Y_pred_optuna,average="weighted")
    }
    for key,value in metrics_2.items():
        mlflow.log_metric(key,value)
    mlflow.sklearn.log_model(best_pipeline,"XGB Classifier with Optuna")
######################################################################################




🏃 View run XGB Classifier with Grid SearchCV at: http://127.0.0.1:5000/#/experiments/832579494267334109/runs/9d16d70c1a4f453bbd565e9ba3087c35
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/832579494267334109




🏃 View run XGB Classifier with Optuna at: http://127.0.0.1:5000/#/experiments/832579494267334109/runs/52662ae7ece347269da40c5e561802c4
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/832579494267334109
