## **Import Required Libraries**

In [39]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder,StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,f1_score,accuracy_score,recall_score,precision_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings("ignore",category=FutureWarning)

## **Load the Datasets**

In [40]:
train_data=pd.read_csv("C:\\UOC pdf\\4th Year\\Machine Learning-02\\Data-Analysis-Project-2\\data\\train_data.csv")
test_data=pd.read_csv("C:\\UOC pdf\\4th Year\\Machine Learning-02\\Data-Analysis-Project-2\\data\\test_data.csv")

In [41]:
train_data=train_data.drop(columns=["Unnamed: 0"],axis=1)
test_data=test_data.drop(columns=["Unnamed: 0"],axis=1)

## **Split X_train,Y_train,X_test,Y_test**

In [42]:
X_train=train_data.drop(columns=["diagnosis"],axis=1)
Y_train=train_data["diagnosis"]
X_test=test_data.drop(columns=["diagnosis"],axis=1)
Y_test=test_data["diagnosis"]

In [43]:
print(f"Shape of the Training set:{X_train.shape,Y_train.shape}")
print(f"Shape of  the Testing set:{X_test.shape,Y_test.shape}")

Shape of the Training set:((700, 15), (700,))
Shape of  the Testing set:((300, 15), (300,))


## **Build the Pipeline**

In [44]:
num_cols=X_train.select_dtypes(include="number").columns  ##get the all numerical column names
cat_cols=X_train.select_dtypes(include=["object","category"]).columns ##get the all categorical column names

In [45]:
nominal_cols=["gender","pem_present","meditation_or_mindfulness"] ##nominal columns
ordinal_cols=["work_status","social_activity_level","exercise_frequency"] ##ordinal columns

#### **Define the Pipeline**

In [46]:
ordinal_pipeline=Pipeline(steps=[
    ("Ordinal Encoder",OrdinalEncoder())      ##Ordinal variable pipeline
])
nominal_pieline=Pipeline(steps=[
    ("Nominal Encoder",OneHotEncoder(sparse_output=False,handle_unknown="ignore"))  ##Nominal variable pipeline
])

#### **Combine Pipeline**

In [47]:
Transfomers=ColumnTransformer(transformers=[
    ("Ordinal Pipeline",ordinal_pipeline,ordinal_cols), ##Combine the Ordinal Pipeline with Ordinal Columns
    ("Nominal Pipeline",nominal_pieline,nominal_cols)   ##Combine the Nominal Pipeline with Nominal Columns
],
   n_jobs=-1,
   verbose=True,
   remainder="drop"
)

#### **End Pipeline**

In [48]:
final_pipeline=Pipeline(steps=[
    ("Preprocessing",Transfomers),  ##Preprocessing Step
    ("Resampling SMOTE",SMOTE(random_state=42)),  ##SMOTE Resampling Method
    ("Random Forest Classifier",RandomForestClassifier(max_depth=10,min_samples_split=10,min_samples_leaf=5,random_state=42)) ##Apply random forest classifier
])

#### **Execute Pipeline**

In [49]:
final_pipeline.fit(X_train,Y_train)  ## Execute the Final Pipeline

## **Model Evaluation**

In [50]:
Y_pred=final_pipeline.predict(X_test) ##get the prediction values

In [51]:
report_rf=classification_report(Y_test,Y_pred) ##get the classification report for predictions

In [52]:
Y_pred_train=final_pipeline.predict(X_train) ##get the preidction value for training set

In [53]:
report_rf_train=classification_report(Y_train,Y_pred_train) ##get the classification report for training

#### **Get the Evaluation Metrics for Testing Data**

In [54]:
f1_test=f1_score(Y_test,Y_pred,average='weighted') ##F1 score for testing data
precision_test=precision_score(Y_test,Y_pred,average="weighted") ##precision for testing data
accuracy_test=accuracy_score(Y_test,Y_pred) ## accuracy score for testing data
recall_test=recall_score(Y_test,Y_pred,average="weighted") ## recall score for testing data

In [55]:
# Print results
print(f"Accuracy:  {accuracy_test:.4f}")
print(f"Precision: {precision_test:.4f}")
print(f"F1 Score:  {f1_test:.4f}")
print(f"Recall Score: {recall_test:.4f}")

Accuracy:  0.8067
Precision: 0.7990
F1 Score:  0.8020
Recall Score: 0.8067


#### **Get the Evaluation Metrics for Training Data**

In [56]:
f1_train=f1_score(Y_train,Y_pred_train,average="weighted") ##F1 score for training data
precision_train=precision_score(Y_train,Y_pred_train,average="weighted") ## precision score for training data
accuracy_train=accuracy_score(Y_train,Y_pred_train) ## accuracy score for training data
recall_train=recall_score(Y_train,Y_pred_train,average="weighted") ##recall score for training data

In [57]:
# Print results
print(f"Accuracy:  {accuracy_train:.4f}")
print(f"Precision: {precision_train:.4f}")
print(f"F1 Score:  {f1_train:.4f}")
print(f"Recall Score:{recall_train:.4f}")

Accuracy:  0.8414
Precision: 0.8380
F1 Score:  0.8390
Recall Score:0.8414


In [58]:
### print the classification report for testing data
print(report_rf)

              precision    recall  f1-score   support

        Both       0.46      0.39      0.42        54
  Depression       1.00      1.00      1.00       125
      ME/CFS       0.74      0.79      0.77       121

    accuracy                           0.81       300
   macro avg       0.73      0.73      0.73       300
weighted avg       0.80      0.81      0.80       300



In [59]:
## print the classification report for training data
print(report_rf_train)

              precision    recall  f1-score   support

        Both       0.64      0.57      0.60       148
  Depression       1.00      1.00      1.00       276
      ME/CFS       0.78      0.83      0.80       276

    accuracy                           0.84       700
   macro avg       0.81      0.80      0.80       700
weighted avg       0.84      0.84      0.84       700



## **Hyperparameter Tune using Grid  SearchCV**

In [60]:
param_grid = {
    'Random Forest Classifier__n_estimators': [100, 200, 300],
    'Random Forest Classifier__max_depth': [None,5,10,15],
    'Random Forest Classifier__min_samples_split': [2, 5, 10],
    'Random Forest Classifier__min_samples_leaf': [1, 2, 4,5],
    'Random Forest Classifier__max_features': ['sqrt', 'log2'],
    'Random Forest Classifier__bootstrap': [True, False],
    'Random Forest Classifier__criterion': ['gini', 'entropy']
}

In [61]:
grid_search=GridSearchCV(final_pipeline,param_grid=param_grid,cv=5,scoring="f1_weighted")

#### **Execute the Pipeline**

In [62]:
grid_search.fit(X_train,Y_train) ##execute the whole pipeline

#### **Best Model**

In [63]:
best_parameters=grid_search.best_params_
print(best_parameters)

{'Random Forest Classifier__bootstrap': False, 'Random Forest Classifier__criterion': 'gini', 'Random Forest Classifier__max_depth': 15, 'Random Forest Classifier__max_features': 'sqrt', 'Random Forest Classifier__min_samples_leaf': 1, 'Random Forest Classifier__min_samples_split': 2, 'Random Forest Classifier__n_estimators': 200}


In [64]:
best_model=grid_search.best_estimator_

In [65]:
best_model

## **Model Evaluation**

In [66]:
Y_pred_cv=best_model.predict(X_test) ##get the prediction values

In [67]:
report_rf_cv=classification_report(Y_test,Y_pred_cv) ##get the classification report for predictions

In [68]:
Y_pred_train_cv=best_model.predict(X_train) ##get the preidction value for training set

In [69]:
report_rf_train_cv=classification_report(Y_train,Y_pred_train_cv) ##get the classification report for training 

#### **Get the Evaluation Metrics for Testing Data**

In [70]:
f1_test=f1_score(Y_test,Y_pred_cv,average='weighted') ##F1 score for testing data
precision_test=precision_score(Y_test,Y_pred_cv,average="weighted") ##precision for testing data
accuracy_test=accuracy_score(Y_test,Y_pred_cv) ## accuracy score for testing data
recall_test=recall_score(Y_test,Y_pred_cv,average="weighted") ## recall score for testing data

In [71]:
# Print results
print(f"Accuracy:  {accuracy_test:.4f}")
print(f"Precision: {precision_test:.4f}")
print(f"F1 Score:  {f1_test:.4f}")
print(f"Recall Score: {recall_test:.4f}")

Accuracy:  0.7833
Precision: 0.7822
F1 Score:  0.7828
Recall Score: 0.7833


#### **Get the Evaluation Metrics for Training Data**

In [72]:
f1_train=f1_score(Y_train,Y_pred_train_cv,average="weighted") ##F1 score for training data
precision_train=precision_score(Y_train,Y_pred_train_cv,average="weighted") ## precision score for training data
accuracy_train=accuracy_score(Y_train,Y_pred_train_cv) ## accuracy score for training data
recall_train=recall_score(Y_train,Y_pred_train_cv,average="weighted") ##recall score for training data

In [73]:
# Print results
print(f"Accuracy:  {accuracy_train:.4f}")
print(f"Precision: {precision_train:.4f}")
print(f"F1 Score:  {f1_train:.4f}")
print(f"Recall Score:{recall_train:.4f}")

Accuracy:  0.8971
Precision: 0.8971
F1 Score:  0.8971
Recall Score:0.8971


## **Hyperparameter Tune using Optuna**

#### **Import Required Libraries**

In [74]:
import optuna
from sklearn.model_selection import cross_val_score

#### **Define an Objective Function**

In [75]:
def objective(trials):
    ##define parameters 
    params = {
        'n_estimators': trials.suggest_int('n_estimators', 100, 1000),
        'max_depth': trials.suggest_int('max_depth', 5, 50),
        'min_samples_split': trials.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trials.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trials.suggest_categorical('max_features', ['sqrt', 'log2', None]),
        'bootstrap': trials.suggest_categorical('bootstrap', [True, False]),
        'criterion': trials.suggest_categorical('criterion', ['gini', 'entropy', 'log_loss']),
        'class_weight': trials.suggest_categorical('class_weight', ['balanced', 'balanced_subsample', None]),
        'n_jobs': -1,
        
    }

    ##Build the pipeline
    pipeline_optuna=Pipeline(steps=[
        ("Preprocessing",Transfomers),
        ("Resampling SMOTE",SMOTE(random_state=42)),
        ("Random Forest Classifier",RandomForestClassifier(**params,random_state=42))
    ])

    ##cross validation score
    score=cross_val_score(pipeline_optuna,X_train,Y_train,cv=5,scoring="f1_weighted")
    return score.mean()

#### **Run the Optimizer**

In [76]:
study=optuna.create_study(
    study_name="rf_model"
    ,direction="maximize")
study.optimize(objective,n_trials=50)

[I 2025-07-25 22:27:11,117] A new study created in memory with name: rf_model
[I 2025-07-25 22:27:17,037] Trial 0 finished with value: 0.7001685130287986 and parameters: {'n_estimators': 304, 'max_depth': 16, 'min_samples_split': 4, 'min_samples_leaf': 10, 'max_features': None, 'bootstrap': True, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 0 with value: 0.7001685130287986.
[I 2025-07-25 22:27:18,790] Trial 1 finished with value: 0.7278693938634355 and parameters: {'n_estimators': 253, 'max_depth': 17, 'min_samples_split': 7, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'bootstrap': False, 'criterion': 'log_loss', 'class_weight': 'balanced'}. Best is trial 1 with value: 0.7278693938634355.
[I 2025-07-25 22:27:22,892] Trial 2 finished with value: 0.6912911739601035 and parameters: {'n_estimators': 583, 'max_depth': 27, 'min_samples_split': 11, 'min_samples_leaf': 9, 'max_features': None, 'bootstrap': False, 'criterion': 'log_loss', 'class_weight': 'balanced_subsa

#### **Best Model**

In [77]:
best_model_optuna=study.best_params
print(f"Best Params:{best_model_optuna}")

Best Params:{'n_estimators': 998, 'max_depth': 48, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_features': None, 'bootstrap': False, 'criterion': 'log_loss', 'class_weight': 'balanced'}


#### **Rebuild the Pipeline with Best Parameters**

In [78]:
best_pipeline=Pipeline(steps=[
        ("Preprocessing",Transfomers),
        ("Resampling SMOTE",SMOTE(random_state=42)),
        ("Random Forest Classifier",RandomForestClassifier(random_state=42,**best_model_optuna))
    ])

#### **Train the Model with Best Parameters**

In [79]:
best_pipeline.fit(X_train,Y_train)

#### **Evaluate the Model**

In [80]:
Y_pred_optuna=best_pipeline.predict(X_test) ##get the prediction values

In [81]:
report_rf_optuna=classification_report(Y_test,Y_pred_optuna) ##get the classification report for predictions

In [82]:
Y_pred_train_optuna=best_pipeline.predict(X_train) ##get the preidction value for training set

In [83]:
report_rf_train_optuna=classification_report(Y_train,Y_pred_train_optuna) ##get the classification report for training 

#### **Get the Evaluation Metrics for Testing Data**

In [84]:
f1_test=f1_score(Y_test,Y_pred_optuna,average='weighted') ##F1 score for testing data
precision_test=precision_score(Y_test,Y_pred_optuna,average="weighted") ##precision for testing data
accuracy_test=accuracy_score(Y_test,Y_pred_optuna) ## accuracy score for testing data
recall_test=recall_score(Y_test,Y_pred_optuna,average="weighted") ## recall score for testing data

In [85]:
# Print results
print(f"Accuracy:  {accuracy_test:.4f}")
print(f"Precision: {precision_test:.4f}")
print(f"F1 Score:  {f1_test:.4f}")
print(f"Recall Score:{recall_test:.4f}")

Accuracy:  0.7667
Precision: 0.7714
F1 Score:  0.7689
Recall Score:0.7667


#### **Get the Evaluation Metrics for Training Data**

In [86]:
f1_train=f1_score(Y_train,Y_pred_train_optuna,average="weighted") ##F1 score for training data
precision_train=precision_score(Y_train,Y_pred_train_optuna,average="weighted") ## precision score for training data
accuracy_train=accuracy_score(Y_train,Y_pred_train_optuna) ## accuracy score for training data
recall_train=recall_score(Y_train,Y_pred_train_optuna,average="weighted") ##recall score for training data

In [87]:
# Print results
print(f"Accuracy:  {accuracy_train:.4f}")
print(f"Precision: {precision_train:.4f}")
print(f"F1 Score:  {f1_train:.4f}")
print(f"Recall Score:{recall_train:.4f}")

Accuracy:  0.8829
Precision: 0.8856
F1 Score:  0.8838
Recall Score:0.8829


## **Model Deployement using Mlflow**

In [88]:
import mlflow

In [90]:
import mlflow.sklearn
import mlflow.sklearn


mlflow.set_tracking_uri("http://127.0.0.1:5000/")
mlflow.set_experiment("Random--Forest--Classifier")

##Random Forest without Hyperparamter Tune
with mlflow.start_run(run_name="Random Forest Classifier without Hyperparameter Tune"):
    params={
        "n_estimators":100,
        "criterion":"gini", 
        "max_depth":2, 
        "min_samples_leaf":1,
        "min_weight_fraction_leaf":0, 
        "max_features":"sqrt", 
        "max_leaf_nodes":None, 
        "min_impurity_decrease":0, 
        "bootstrap":True
    }
    mlflow.log_params(params)
    metrics_1={
        "accuracy":accuracy_score(Y_test,Y_pred),
        "f1-score":f1_score(Y_test,Y_pred,average='weighted'),
        "precision":precision_score(Y_test,Y_pred,average="weighted"),
        "recall":recall_score(Y_test,Y_pred,average="weighted")
    }
    for key,value in metrics_1.items():
        mlflow.log_metric(key,value)
    mlflow.sklearn.log_model(final_pipeline,"Random Forest Classifier without Hyperparameter Tune")
    ##############################################################################################
#####Random Forest with Grid Search CV

with mlflow.start_run(run_name="Random Forest Classifier with Grid SearchCV"):
    mlflow.log_params(best_parameters)
    metrics_2={
        "accuracy":accuracy_score(Y_test,Y_pred_cv),
        "f1-score":f1_score(Y_test,Y_pred_cv,average='weighted'),
        "precision":precision_score(Y_test,Y_pred_cv,average="weighted"),
        "recall":recall_score(Y_test,Y_pred_cv,average="weighted")
    }

    for key,value in metrics_2.items():
        mlflow.log_metric(key,value)
    mlflow.sklearn.log_model(best_model,"Random Forest Classifier with Grid SearchCV")
##########################################################################################
### Random Forest with Optuna

with mlflow.start_run(run_name="Random Forest Classifier with Optuna"):
    mlflow.log_params(best_parameters)
    metrics_3={
        "accuracy":accuracy_score(Y_test,Y_pred_optuna),
        "f1-score":f1_score(Y_test,Y_pred_optuna,average='weighted'),
        "precision":precision_score(Y_test,Y_pred_optuna,average="weighted"),
        "recall":recall_score(Y_test,Y_pred_optuna,average="weighted")
    }
    for key,value in metrics_3.items():
        mlflow.log_metric(key,value)
    mlflow.sklearn.log_model(best_pipeline,"Random Forest Classifier with Optuna")
######################################################################################

2025/07/25 22:35:39 INFO mlflow.tracking.fluent: Experiment with name 'Random--Forest--Classifier' does not exist. Creating a new experiment.




🏃 View run Random Forest Classifier without Hyperparameter Tune at: http://127.0.0.1:5000/#/experiments/913150392896147392/runs/8fec902f0d614120aaa2ac1dfa04fc71
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/913150392896147392




🏃 View run Random Forest Classifier with Grid SearchCV at: http://127.0.0.1:5000/#/experiments/913150392896147392/runs/361877da2e25442e97fe77aec9819500
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/913150392896147392




🏃 View run Random Forest Classifier with Optuna at: http://127.0.0.1:5000/#/experiments/913150392896147392/runs/8e96a49e9e0c4c4ca876d7b0ca9b6e58
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/913150392896147392
