In [1]:
import os 
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, LabelEncoder,OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import optuna
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV,StratifiedKFold
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.utils.class_weight import compute_sample_weight


In [2]:
df = pd.read_csv('data/cleaned_data.csv')

In [3]:
df.head()

Unnamed: 0,age,sex,TSH,T3,TT4,T4U,FTI,target
0,63,Female,68.0,0.0,48.0,1.02,47.0,hypothyroid
1,36,Female,1.5,2.4,90.0,1.06,85.0,negative
2,40,Female,1.2,2.3,104.0,1.08,96.0,negative
3,40,Female,5.9,2.1,88.0,0.84,105.0,negative
4,77,Female,0.05,2.4,107.0,1.13,95.0,negative


In [4]:
df.columns.tolist()

['age', 'sex', 'TSH', 'T3', 'TT4', 'T4U', 'FTI', 'target']

In [5]:
features_to_be_scaled = ['age','TSH','T3','TT4','T4U','FTI']
features_to_be_encoded = ['sex']

In [6]:
num_pipeline = Pipeline(steps=[("scaler",StandardScaler())])
cat_pipeline = Pipeline(steps=[("one_hot_encoder",OneHotEncoder()),("scaler",StandardScaler(with_mean=False))])
preprocessor = ColumnTransformer([
                ("num_pipeline",num_pipeline,features_to_be_scaled),
                ("cat_pipeline",cat_pipeline,features_to_be_encoded)
                ])
# cat_pipeline = Pipeline(steps=[("label_encoder", LabelEncoder()), ("scaler", StandardScaler(with_mean=False))])


In [7]:
le= LabelEncoder()
df['target'] = le.fit_transform(df['target'])

In [8]:
X = df.drop(['target'],axis=1)
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify= y)
# train_set, test_set=train_test_split(df,test_size=0.2,random_state=42,stratify=y)

In [9]:
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [10]:
# le = LabelEncoder()
# df[final_cat_cols] = df[features_to_be_encoded].apply(le.fit_transform)
# scaler = StandardScaler()
# df[final_num_cols] = scaler.fit_transform(df[final_num_cols])
# df = pd.concat([df['age'], df[final_cat_cols], df[final_num_cols]], axis=1)

In [11]:
sample_weights = compute_sample_weight(
    class_weight='balanced',
    y=y_train)
xgb=XGBClassifier(objective='multi:softmax', 
                            num_class=3, 
                            missing=1,
                            gamma=0, # default gamma value
                            learning_rate=0.1,
                            max_depth=5, # re-optimized from v2
                            reg_lambda=1, # default L2 value
                            #subsample=0.8, # tried but not ideal
                            #colsample_bytree=0.3, # tried but not ideal
                            early_stopping_rounds=10,
                            eval_metric=['merror','mlogloss'],
                            seed=42)

xgb.fit(X_train, 
            y_train,
            verbose=0, # set to 1 to see xgb training round intermediate results
            sample_weight=sample_weights, # class weights to combat unbalanced 'target'
            eval_set=[(X_train, y_train), (X_test, y_test)])

results = xgb.evals_result()
results

{'validation_0': OrderedDict([('merror',
               [0.01651794190241124,
                0.01765711030947409,
                0.01613821910005696,
                0.01613821910005696,
                0.01651794190241124,
                0.01670780330358838,
                0.01651794190241124,
                0.01575849629770268,
                0.0153787734953484,
                0.0153787734953484,
                0.01461932789063983,
                0.01442946648946269,
                0.01385988228593127,
                0.01385988228593127,
                0.01385988228593127,
                0.01385988228593127,
                0.01385988228593127,
                0.01367002088475413,
                0.01348015948357699,
                0.01348015948357699,
                0.01348015948357699,
                0.01348015948357699,
                0.01310043668122271,
                0.01310043668122271,
                0.01348015948357699,
                0.01329029808239985,

In [12]:
y_pred = xgb.predict(X_test)
accuracy_score(y_test,y_pred)

0.9908883826879271

In [18]:
#random forest 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

def objective(trial):
    # Define parameters to tune
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000, step=100),
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
        # "max_features": trial.suggest_categorical("max_features", ["auto", "sqrt", "log2"]),
        "bootstrap": trial.suggest_categorical("bootstrap", [True, False]),
    }
    
    # Initialize the Random Forest Classifier with the parameters
    rf_model = RandomForestClassifier(**params)
    rf_model.fit(X_train, y_train)
    accuracy = rf_model.score(X_train, y_train)

    return accuracy

In [19]:
study = optuna.create_study(direction = "maximize")
study.optimize(objective, n_trials = 50)

[I 2024-08-08 01:00:50,478] A new study created in memory with name: no-name-6c253aff-4738-4290-b8d0-a13da6025a2d
[I 2024-08-08 01:00:52,142] Trial 0 finished with value: 0.9863299791152459 and parameters: {'n_estimators': 100, 'max_depth': 6, 'min_samples_split': 14, 'min_samples_leaf': 4, 'bootstrap': True}. Best is trial 0 with value: 0.9863299791152459.
[I 2024-08-08 01:00:59,112] Trial 1 finished with value: 0.9827226124928802 and parameters: {'n_estimators': 900, 'max_depth': 3, 'min_samples_split': 3, 'min_samples_leaf': 7, 'bootstrap': False}. Best is trial 0 with value: 0.9863299791152459.
[I 2024-08-08 01:01:04,428] Trial 2 finished with value: 0.985190810708183 and parameters: {'n_estimators': 900, 'max_depth': 4, 'min_samples_split': 9, 'min_samples_leaf': 2, 'bootstrap': False}. Best is trial 0 with value: 0.9863299791152459.
[I 2024-08-08 01:01:05,438] Trial 3 finished with value: 0.9794949686728688 and parameters: {'n_estimators': 200, 'max_depth': 3, 'min_samples_split'

In [20]:
print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

Number of finished trials: 50
Best trial:
  Value: 1.0
  Params: 
    n_estimators: 800
    max_depth: 14
    min_samples_split: 2
    min_samples_leaf: 1
    bootstrap: True


In [23]:
# rf_model = RandomForestClassifier(n_estimators= 800,max_depth= 14,min_samples_split= 2,min_samples_leaf= 1,bootstrap= False)
# rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
accuracy_score(y_test,y_pred)

0.9931662870159453

In [24]:
y_pred = le.inverse_transform(y_pred)

In [25]:
y_pred

array(['negative', 'negative', 'negative', ..., 'hypothyroid', 'negative',
       'negative'], dtype=object)

In [29]:
#predicting a sample instance 
new_instance = {
    'age': 45,
    'sex': 'Male',
    'TSH': 2.5,
    'T3': 1.0,
    'TT4': 100.0,
    'T4U': 1.1,
    'FTI': 120.0
}

new_data = pd.DataFrame([new_instance])
preprocessed_data = preprocessor.transform(new_data)
predicted_target = rf_model.predict(preprocessed_data)
decoded_target = le.inverse_transform(predicted_target)

In [30]:
decoded_target[0]

'negative'