# Import Libraries

In [7]:
! pip install mlxtend

Collecting mlxtend
  Downloading mlxtend-0.23.4-py3-none-any.whl.metadata (7.3 kB)
Collecting pandas>=0.24.2 (from mlxtend)
  Downloading pandas-2.2.3-cp313-cp313-win_amd64.whl.metadata (19 kB)
Collecting matplotlib>=3.0.0 (from mlxtend)
  Downloading matplotlib-3.10.1-cp313-cp313-win_amd64.whl.metadata (11 kB)
Collecting contourpy>=1.0.1 (from matplotlib>=3.0.0->mlxtend)
  Downloading contourpy-1.3.2-cp313-cp313-win_amd64.whl.metadata (5.5 kB)
Collecting cycler>=0.10 (from matplotlib>=3.0.0->mlxtend)
  Downloading cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib>=3.0.0->mlxtend)
  Downloading fonttools-4.57.0-cp313-cp313-win_amd64.whl.metadata (104 kB)
Collecting kiwisolver>=1.3.1 (from matplotlib>=3.0.0->mlxtend)
  Downloading kiwisolver-1.4.8-cp313-cp313-win_amd64.whl.metadata (6.3 kB)
Collecting pillow>=8 (from matplotlib>=3.0.0->mlxtend)
  Downloading pillow-11.2.1-cp313-cp313-win_amd64.whl.metadata (9.1 kB)
Collecting pyparsing>=2.3.1

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import pandas as pd
import logging

In [9]:
data=pd.read_csv('Preprocessed_data.csv')

#Dividing target variable from the main dataset

X = data.iloc[: , 0:-1]
Y = data.iloc[: , -1] 

In [10]:
# Splitting the dataset into the Training set and Test set
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state=1)

# Encoding categorical data
## Encoding the Independent Variable

In [11]:
ordinal_encoder = OrdinalEncoder()
X_train_cat_encoded = pd.DataFrame(ordinal_encoder.fit_transform(X_train.select_dtypes(exclude='number')))
X_train_cat_encoded.columns = X_train.select_dtypes(exclude='number').columns

X_test_cat_encoded = pd.DataFrame(ordinal_encoder.transform(X_test.select_dtypes(exclude='number')))
X_test_cat_encoded.columns = X_test.select_dtypes(exclude='number').columns

## Encoding the Independent Variable

In [12]:
label_encoder = LabelEncoder()
Y_train_cat_encoded= pd.DataFrame(label_encoder.fit_transform(Y_train))
Y_test_cat_encoded = pd.DataFrame(label_encoder.transform(Y_test))

## Standardization

In [13]:
sc = StandardScaler()
X_train_sc=pd.DataFrame(sc.fit_transform(X_train.select_dtypes(exclude='O')))
X_test_sc=pd.DataFrame(sc.transform(X_test.select_dtypes(exclude='O')))

X_train_sc.columns=X_train.select_dtypes(exclude='O').columns
X_test_sc.columns=X_test.select_dtypes(exclude='O').columns

## Combining data

In [14]:
X_train_final=pd.concat([X_train_sc,X_train_cat_encoded],axis=1)
X_test_final=pd.concat([X_test_sc,X_test_cat_encoded],axis=1)

## Handling imbalanced Dataset
### Since the dataset is small, will use over-sampling: SMOTE technique to balance the data

In [15]:
X_train_resample,Y_train_resample=SMOTE(random_state=0,k_neighbors=1).fit_resample(X_train_final,Y_train_cat_encoded)
X_test_resample,Y_test_resample=SMOTE(random_state=0,k_neighbors=1).fit_resample(X_test_final,Y_test_cat_encoded)

X_train_resample.shape,X_test_resample.shape,Y_train_resample.shape,Y_test_resample.shape

((7852, 28), (1476, 28), (7852, 1), (1476, 1))

## Feature Selection

In [16]:
print('Training dataset shape:', X_train_resample.shape, Y_train_resample.shape)
print('Testing dataset shape:', X_test_resample.shape, Y_test_resample.shape)

Y_train_resample_flat = Y_train_resample.to_numpy().ravel()
Y_test_resample_flat = Y_test_resample.to_numpy().ravel()

print('Training dataset shape:', X_train_resample.shape, Y_train_resample_flat.shape)
print('Testing dataset shape:', X_test_resample.shape, Y_test_resample_flat.shape)

Training dataset shape: (7852, 28) (7852, 1)
Testing dataset shape: (1476, 28) (1476, 1)
Training dataset shape: (7852, 28) (7852,)
Testing dataset shape: (1476, 28) (1476,)


### Forward selection approach

In [17]:
rf = RandomForestClassifier(n_estimators=100, max_depth=5)
forward_fs = sfs(rf , k_features=10,forward=True,floating=False,verbose=2,scoring='accuracy',cv=5)
forward_fs = forward_fs.fit(X_train_resample, Y_train_resample_flat)


[2025-04-28 16:09:14] Features: 1/10 -- score: 0.7949548950524434
[2025-04-28 16:10:07] Features: 2/10 -- score: 0.9687971067963528
[2025-04-28 16:10:55] Features: 3/10 -- score: 0.9859912344362591
[2025-04-28 16:11:57] Features: 4/10 -- score: 0.9949055938243726
[2025-04-28 16:13:00] Features: 5/10 -- score: 0.996943323859605
[2025-04-28 16:13:56] Features: 6/10 -- score: 0.9978349625172818
[2025-04-28 16:14:44] Features: 7/10 -- score: 0.9979621888772211
[2025-04-28 16:15:26] Features: 8/10 -- score: 0.998089577412253
[2025-04-28 16:16:17] Features: 9/10 -- score: 0.99770757398225
[2025-04-28 16:17:03] Features: 10/10 -- score: 0.998089577412253

In [18]:
# Create the Handler for logging records/messages to a file
file_handler = logging.FileHandler("log_file.log")

In [19]:
#set the format of the log records and the logging level to DEBUG
logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s', level=logging.DEBUG)

In [20]:
# a function  to create and save logs in the log files
def log(path, file):
    """[Create a log file to record the experiment's logs]
    
    Arguments:
        path {string} -- path to the directory
        file {string} -- file name
    
    Returns:
        [obj] -- [logger that record logs]
    """

    # check if the file exist
    log_file = os.path.join(path, file)

    if not os.path.isfile(log_file):
        open(log_file, "w+").close()

    console_logging_format = "%(levelname)s %(message)s"
    file_logging_format = "%(levelname)s: %(asctime)s: %(message)s"

    # configure logger
    logging.basicConfig(level=logging.INFO, format=console_logging_format)
    logger = logging.getLogger()
    
    # create a file handler for output file
    handler = logging.FileHandler(log_file)

    # set the logging level for log file
    handler.setLevel(logging.INFO)
    
    # create a logging format
    formatter = logging.Formatter(file_logging_format)
    handler.setFormatter(formatter)

    # add the handlers to the logger
    logger.addHandler(handler)

    return logger

In [21]:
import os

In [22]:
logger = log(path=".",file="log_file.log")

In [23]:
logger.info("Feature Selection")

2025-04-28 16:18:18,080 INFO:Feature Selection


In [24]:
feat_names = list(forward_fs.k_feature_names_)
logger.info("Features {}".format(feat_names))
X_train_new=X_train_resample[['age','sex','TSH', 'TT4', 'FTI', 'on_thyroxine', 'on_antithyroid_medication', 'goitre', 'hypopituitary', 'psych', 'T3_measured']]
X_test_new=X_test_resample[['age','sex','TSH', 'TT4', 'FTI', 'on_thyroxine', 'on_antithyroid_medication', 'goitre', 'hypopituitary', 'psych', 'T3_measured']]

2025-04-28 16:18:19,294 INFO:Features ['age', 'TSH', 'TT4', 'FTI', 'on_thyroxine', 'query_on_thyroxine', 'hypopituitary', 'TSH_measured', 'T3_measured', 'TBG_measured']


In [25]:
#Fitting the Random Forest model
rf_model=rf.fit(X_train_new,Y_train_resample_flat)

logger.info("Traininig Random Forest Model")

#Checking the metrics of Random Forest
def print_Score(clf,x_train,x_test,y_train,y_test,train=True):
    if train:
        pred=clf.predict(x_train)
        clf_report=pd.DataFrame(classification_report(y_train,pred,output_dict=True))
        print("Train Result:\n===============")
        print(f"Accuracy Score:{accuracy_score(y_train,pred)*100:.2f}%")
        print("---------------------------------")
        print(f"Classification Report:\n{clf_report}")
        print("-----------------------------------")
        print(f"Confusion Matrix:\n{confusion_matrix(y_train,pred)}\n")
        logger.info("Train Result:\n==============="+"\n"+
                    f"Accuracy Score:{accuracy_score(y_train,pred)*100:.2f}%""\n"+
        "---------------------------------"+"\n"+
        f"Classification Report:\n{clf_report}"+"\n"+
        "-----------------------------------"+"\n"+
        f"Confusion Matrix:\n{confusion_matrix(y_train,pred)}\n")
    elif train==False:
        pred=clf.predict(x_test)
        clf_report=pd.DataFrame(classification_report(y_test,pred,output_dict=True))
        print("Test Result:\n===============")
        print(f"Accuracy Score:{accuracy_score(y_test,pred)*100:.2f}%")
        print("---------------------------------")
        print(f"Classification Report:\n{clf_report}")
        print("---------------------------------")
        print(f"Confusion Matrix:\n{confusion_matrix(y_test,pred)}\n")
        
        logger.info("Test Result:\n==============="+"\n"+
        f"Accuracy Score:{accuracy_score(y_test,pred)*100:.2f}%"+"\n"+
        "---------------------------------"+"\n"+
        f"Classification Report:\n{clf_report}"+"\n"+
        "---------------------------------"+"\n"+
        f"Confusion Matrix:\n{confusion_matrix(y_test,pred)}\n")
        
print_Score(rf_model,X_train_new,X_test_new,Y_train_resample_flat,Y_test_resample_flat,train=True)
print_Score(rf_model,X_train_new,X_test_new,Y_train_resample_flat,Y_test_resample_flat,train=False)

2025-04-28 16:18:21,650 INFO:Traininig Random Forest Model
2025-04-28 16:18:21,876 INFO:Train Result:
Accuracy Score:99.83%
---------------------------------
Classification Report:
                     0            1            2       3  accuracy  \
precision     0.995434     1.000000     0.997966     1.0  0.998344   
recall        0.999491     0.993887     1.000000     1.0  0.998344   
f1-score      0.997458     0.996934     0.998982     1.0  0.998344   
support    1963.000000  1963.000000  1963.000000  1963.0  0.998344   

             macro avg  weighted avg  
precision     0.998350      0.998350  
recall        0.998344      0.998344  
f1-score      0.998344      0.998344  
support    7852.000000   7852.000000  
-----------------------------------
Confusion Matrix:
[[1962    0    1    0]
 [   9 1951    3    0]
 [   0    0 1963    0]
 [   0    0    0 1963]]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize(

Train Result:
Accuracy Score:99.83%
---------------------------------
Classification Report:
                     0            1            2       3  accuracy  \
precision     0.995434     1.000000     0.997966     1.0  0.998344   
recall        0.999491     0.993887     1.000000     1.0  0.998344   
f1-score      0.997458     0.996934     0.998982     1.0  0.998344   
support    1963.000000  1963.000000  1963.000000  1963.0  0.998344   

             macro avg  weighted avg  
precision     0.998350      0.998350  
recall        0.998344      0.998344  
f1-score      0.998344      0.998344  
support    7852.000000   7852.000000  
-----------------------------------
Confusion Matrix:
[[1962    0    1    0]
 [   9 1951    3    0]
 [   0    0 1963    0]
 [   0    0    0 1963]]

Test Result:
Accuracy Score:97.83%
---------------------------------
Classification Report:
                    0           1           2    3  accuracy    macro avg  \
precision    0.940727    1.000000    1.00000

In [26]:
##Hyper parameter tuning
RF=RandomForestClassifier()
model=RF.fit(X_train_new,Y_train_resample_flat)

print_Score(model,X_train_new,X_test_new,Y_train_resample_flat,Y_test_resample_flat,train=True)
print_Score(model,X_train_new,X_test_new,Y_train_resample_flat,Y_test_resample_flat,train=False)

2025-04-28 16:18:23,856 INFO:Train Result:
Accuracy Score:99.99%
---------------------------------
Classification Report:
                     0            1       2       3  accuracy    macro avg  \
precision     0.999491     1.000000     1.0     1.0  0.999873     0.999873   
recall        1.000000     0.999491     1.0     1.0  0.999873     0.999873   
f1-score      0.999745     0.999745     1.0     1.0  0.999873     0.999873   
support    1963.000000  1963.000000  1963.0  1963.0  0.999873  7852.000000   

           weighted avg  
precision      0.999873  
recall         0.999873  
f1-score       0.999873  
support     7852.000000  
-----------------------------------
Confusion Matrix:
[[1963    0    0    0]
 [   1 1962    0    0]
 [   0    0 1963    0]
 [   0    0    0 1963]]

2025-04-28 16:18:23,898 INFO:Test Result:
Accuracy Score:98.10%
---------------------------------
Classification Report:
                    0           1           2  accuracy    macro avg  \
precision    0.9

Train Result:
Accuracy Score:99.99%
---------------------------------
Classification Report:
                     0            1       2       3  accuracy    macro avg  \
precision     0.999491     1.000000     1.0     1.0  0.999873     0.999873   
recall        1.000000     0.999491     1.0     1.0  0.999873     0.999873   
f1-score      0.999745     0.999745     1.0     1.0  0.999873     0.999873   
support    1963.000000  1963.000000  1963.0  1963.0  0.999873  7852.000000   

           weighted avg  
precision      0.999873  
recall         0.999873  
f1-score       0.999873  
support     7852.000000  
-----------------------------------
Confusion Matrix:
[[1963    0    0    0]
 [   1 1962    0    0]
 [   0    0 1963    0]
 [   0    0    0 1963]]

Test Result:
Accuracy Score:98.10%
---------------------------------
Classification Report:
                    0           1           2  accuracy    macro avg  \
precision    0.946154    1.000000    1.000000   0.98103     0.982051   
re

In [27]:
import numpy as np

In [28]:
## Randomized Search CV

#No of trees in Random Forest
n_estimators=[int(x) for x in np.linspace(start=200,stop=2000,num=10)]
#No of features consider at every split
max_features=['auto','sqrt','log2']
#maximum no of levels in trees
max_depth=[int(x) for x in np.linspace(10,1000,10)]
#minimum no of samples required to split a node
min_samples_split=[1,3,4,5,7,9]
#minimum samples leafs required at each leaf node
min_sample_leafs=[1,2,4,6,8]

#create random gird
random_grid={'n_estimators':n_estimators,
'max_features':max_features,
'max_depth':max_depth,
'min_samples_split':min_samples_split,
'min_samples_leaf':min_sample_leafs,
'criterion':['entropy','gini']}
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [10, 120, 230, 340, 450, 560, 670, 780, 890, 1000], 'min_samples_split': [1, 3, 4, 5, 7, 9], 'min_samples_leaf': [1, 2, 4, 6, 8], 'criterion': ['entropy', 'gini']}


In [29]:
rcv=RandomizedSearchCV(estimator=RF,param_distributions=random_grid,n_iter=100,cv=3,verbose=2,random_state=0,n_jobs=-1)

rcv.fit(X_train_new,Y_train_resample_flat)

logger.info("Use Randomized Search CV")

Fitting 3 folds for each of 100 candidates, totalling 300 fits


111 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
52 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\bhara\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\bhara\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\base.py", line 1382, in wrapper
    estimator._validate_params()
    ~~~~~~~~~~~~~~~~~~~~~~~~~~^^
  File "c:\Users\bhara\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\base.py", line 436, in _validate_params
    validate_param

In [30]:
rcv.best_estimator_
logger.info(rcv.best_estimator_)

2025-04-28 16:25:48,266 INFO:RandomForestClassifier(max_depth=230, min_samples_split=4, n_estimators=600)


In [31]:
best_random_grid=rcv.best_estimator_
logger.info("Result with best estimetors")
print_Score(best_random_grid,X_train_new,X_test_new,Y_train_resample_flat,Y_test_resample_flat,train=False)

2025-04-28 16:25:55,464 INFO:Result with best estimetors
2025-04-28 16:25:55,571 INFO:Test Result:
Accuracy Score:98.10%
---------------------------------
Classification Report:
                    0           1           2  accuracy    macro avg  \
precision    0.946154    1.000000    1.000000   0.98103     0.982051   
recall       1.000000    0.995935    0.947154   0.98103     0.981030   
f1-score     0.972332    0.997963    0.972860   0.98103     0.981052   
support    492.000000  492.000000  492.000000   0.98103  1476.000000   

           weighted avg  
precision      0.982051  
recall         0.981030  
f1-score       0.981052  
support     1476.000000  
---------------------------------
Confusion Matrix:
[[492   0   0]
 [  2 490   0]
 [ 26   0 466]]



Test Result:
Accuracy Score:98.10%
---------------------------------
Classification Report:
                    0           1           2  accuracy    macro avg  \
precision    0.946154    1.000000    1.000000   0.98103     0.982051   
recall       1.000000    0.995935    0.947154   0.98103     0.981030   
f1-score     0.972332    0.997963    0.972860   0.98103     0.981052   
support    492.000000  492.000000  492.000000   0.98103  1476.000000   

           weighted avg  
precision      0.982051  
recall         0.981030  
f1-score       0.981052  
support     1476.000000  
---------------------------------
Confusion Matrix:
[[492   0   0]
 [  2 490   0]
 [ 26   0 466]]



In [32]:
param_grid = {
    'criterion': [rcv.best_params_['criterion']],
    'max_depth': [rcv.best_params_['max_depth']],
    'max_features': [rcv.best_params_['max_features']],
    'min_samples_leaf': [rcv.best_params_['min_samples_leaf'], 
                         rcv.best_params_['min_samples_leaf']+2, 
                         rcv.best_params_['min_samples_leaf'] + 4],
    'min_samples_split': [rcv.best_params_['min_samples_split'] - 2,
                          rcv.best_params_['min_samples_split'] - 1,
                          rcv.best_params_['min_samples_split'], 
                          rcv.best_params_['min_samples_split'] +1,
                          rcv.best_params_['min_samples_split'] + 2],
    'n_estimators': [rcv.best_params_['n_estimators'] - 200, rcv.best_params_['n_estimators'] - 100, 
                     rcv.best_params_['n_estimators'], 
                     rcv.best_params_['n_estimators'] + 100, rcv.best_params_['n_estimators'] + 200]
}

In [33]:
print(param_grid)
logger.info(f"Parameter Grid: {param_grid}")

2025-04-28 16:25:58,153 INFO:Parameter Grid: {'criterion': ['gini'], 'max_depth': [230], 'max_features': ['sqrt'], 'min_samples_leaf': [1, 3, 5], 'min_samples_split': [2, 3, 4, 5, 6], 'n_estimators': [400, 500, 600, 700, 800]}


{'criterion': ['gini'], 'max_depth': [230], 'max_features': ['sqrt'], 'min_samples_leaf': [1, 3, 5], 'min_samples_split': [2, 3, 4, 5, 6], 'n_estimators': [400, 500, 600, 700, 800]}


In [34]:
grid_search=GridSearchCV(estimator=RF,param_grid=param_grid,cv=10,n_jobs=1,verbose=2)
grid_search.fit(X_train_new,Y_train_resample_flat)
logger.info("Grid Search CV: ")

Fitting 10 folds for each of 75 candidates, totalling 750 fits
[CV] END criterion=gini, max_depth=230, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=400; total time=   2.5s
[CV] END criterion=gini, max_depth=230, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=400; total time=   2.5s
[CV] END criterion=gini, max_depth=230, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=400; total time=   2.6s
[CV] END criterion=gini, max_depth=230, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=400; total time=   2.7s
[CV] END criterion=gini, max_depth=230, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=400; total time=   2.7s
[CV] END criterion=gini, max_depth=230, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=400; total time=   2.7s
[CV] END criterion=gini, max_depth=230, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=40

2025-04-28 17:11:58,049 INFO:Grid Search CV: 


In [35]:
best_grid=grid_search.best_estimator_
best_grid
logger.info(f"Best Grid: {best_grid}")

2025-04-28 19:56:07,181 INFO:Best Grid: RandomForestClassifier(max_depth=230, min_samples_split=5, n_estimators=500)


In [51]:
logger.info("Result with Best Grid: ")
print_Score(best_grid,X_train_new,X_test_new,Y_train_resample_flat,Y_test_resample_flat,train=False)

2025-04-28 21:33:10,035 INFO:Result with Best Grid: 
2025-04-28 21:33:10,142 INFO:Test Result:
Accuracy Score:98.04%
---------------------------------
Classification Report:
                    0           1           2  accuracy    macro avg  \
precision    0.944338    1.000000    1.000000  0.980352     0.981446   
recall       1.000000    0.995935    0.945122  0.980352     0.980352   
f1-score     0.971372    0.997963    0.971787  0.980352     0.980374   
support    492.000000  492.000000  492.000000  0.980352  1476.000000   

           weighted avg  
precision      0.981446  
recall         0.980352  
f1-score       0.980374  
support     1476.000000  
---------------------------------
Confusion Matrix:
[[492   0   0]
 [  2 490   0]
 [ 27   0 465]]



Test Result:
Accuracy Score:98.04%
---------------------------------
Classification Report:
                    0           1           2  accuracy    macro avg  \
precision    0.944338    1.000000    1.000000  0.980352     0.981446   
recall       1.000000    0.995935    0.945122  0.980352     0.980352   
f1-score     0.971372    0.997963    0.971787  0.980352     0.980374   
support    492.000000  492.000000  492.000000  0.980352  1476.000000   

           weighted avg  
precision      0.981446  
recall         0.980352  
f1-score       0.980374  
support     1476.000000  
---------------------------------
Confusion Matrix:
[[492   0   0]
 [  2 490   0]
 [ 27   0 465]]



In [55]:
import pickle
import os


# Save the model inside 'src' folder
with open('Thyroid_model.pkl', 'wb') as f:
    pickle.dump(grid_search, f)

print("✅ Model saved successfully in src/Thyroid_model.pkl!")

✅ Model saved successfully in src/Thyroid_model.pkl!
