<a href="https://colab.research.google.com/github/kevin01157007/hypothyroid-classifier/blob/main/classifier_tensorflow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from scipy.io import arff
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.metrics import classification_report, accuracy_score

# Load the training and test splits; each loadarff call yields a (records, metadata) pair.
(train_data, train_meta), (test_data, test_meta) = [
    arff.loadarff(arff_path)
    for arff_path in (
        "hypothyroid_cjlin2025_training.arff",
        "hypothyroid_cjlin2025_test.arff",
    )
]

print(train_data)

[(62., b'F', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b't', b'f', b'f', 0.035, 2.6, 128., 1.03, 124., b'SVI', b'negative')
 (72., b'F', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', 1.4  , 1.4, 115., 0.97, 118., b'SVHC', b'negative')
 (40., b'F', b't', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', 1.6  , 1.9, 142., 0.91, 156., b'other', b'negative')
 ...
 (15., b'F', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', 0.04 , 1.4, 103., 0.85, 120., b'SVI', b'negative')
 (59., b'F', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b't', 1.3  , 3.2, 149., 1.17, 127., b'SVHC', b'negative')
 (65., b'F', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', 1.7  , 2.4,  73., 0.89,  82., b'SVI', b'negative')]


In [2]:
def _decode_bytes_columns(frame):
    """Return a copy of `frame` with every object-dtype column decoded as UTF-8."""
    decoded = frame.copy()
    for name in decoded.select_dtypes([object]):
        decoded[name] = decoded[name].str.decode('utf-8')
    return decoded


# Build DataFrames from the loaded ARFF records and decode their byte-string columns.
train_df = _decode_bytes_columns(pd.DataFrame(train_data))
test_df = _decode_bytes_columns(pd.DataFrame(test_data))

# Sanity checks: column names of both splits, the number of 'f' values in
# 'hypopituitary' for the training split, and the row count of each split.
for frame in (train_df, test_df):
    print(frame.columns.tolist())
print(train_df['hypopituitary'].value_counts()['f'])
for frame in (train_df, test_df):
    print(len(frame))

['age', 'sex', 'on thyroxine', 'query on thyroxine', 'on antithyroid medication', 'sick', 'pregnant', 'thyroid surgery', 'I131 treatment', 'query hypothyroid', 'query hyperthyroid', 'lithium', 'goitre', 'tumor', 'hypopituitary', 'psych', 'TSH', 'T3', 'TT4', 'T4U', 'FTI', 'referral source', 'Class']
['age', 'sex', 'on thyroxine', 'query on thyroxine', 'on antithyroid medication', 'sick', 'pregnant', 'thyroid surgery', 'I131 treatment', 'query hypothyroid', 'query hyperthyroid', 'lithium', 'goitre', 'tumor', 'hypopituitary', 'psych', 'TSH', 'T3', 'TT4', 'T4U', 'FTI', 'referral source', 'Class']
3057
3057
341


In [3]:
# The last column is the target variable; every column before it is a feature.
X_train, y_train = train_df.iloc[:, :-1], train_df.iloc[:, -1]
X_test, y_test = test_df.iloc[:, :-1], test_df.iloc[:, -1]
print(y_train)

0       negative
1       negative
2       negative
3       negative
4       negative
          ...   
3052    negative
3053    negative
3054    negative
3055    negative
3056    negative
Name: Class, Length: 3057, dtype: object


In [4]:
# One-hot encoding: keep the training and test feature columns consistent.
X_train_encoded = pd.get_dummies(X_train)
X_test_encoded = pd.get_dummies(X_test)

# Dummy columns that exist only in the test split come from category values
# that never occur in the training split.
extra_cols = set(X_test_encoded.columns) - set(X_train_encoded.columns)
print(extra_cols)

# Remove the test rows that have any of those test-only dummy columns set.
invalid_index = X_test_encoded[X_test_encoded[list(extra_cols)].sum(axis=1) > 0].index
X_test_encoded = X_test_encoded.drop(index=invalid_index)
print(len(X_test_encoded))
print(X_test_encoded.columns.tolist())
print(X_train_encoded.columns.tolist())

# Align the test columns to the training layout: test-only columns are
# discarded and columns missing from the test split are added filled with 0.
X_test_encoded = X_test_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)
print(len(X_test_encoded))
print(X_test_encoded.columns.tolist())

# Keep only the labels whose rows survived the filtering above. Selecting by
# the surviving index (instead of dropping `invalid_index` from `y_test`) gives
# the same result on the first run and keeps this cell re-runnable: a second
# `drop` of the already-removed labels would raise KeyError.
y_test = y_test.loc[X_test_encoded.index]
print(len(y_test))

{'hypopituitary_t'}
340
['age', 'TSH', 'T3', 'TT4', 'T4U', 'FTI', 'sex_F', 'sex_M', 'on thyroxine_f', 'on thyroxine_t', 'query on thyroxine_f', 'query on thyroxine_t', 'on antithyroid medication_f', 'on antithyroid medication_t', 'sick_f', 'sick_t', 'pregnant_f', 'pregnant_t', 'thyroid surgery_f', 'thyroid surgery_t', 'I131 treatment_f', 'I131 treatment_t', 'query hypothyroid_f', 'query hypothyroid_t', 'query hyperthyroid_f', 'query hyperthyroid_t', 'lithium_f', 'lithium_t', 'goitre_f', 'goitre_t', 'tumor_f', 'tumor_t', 'hypopituitary_f', 'hypopituitary_t', 'psych_f', 'psych_t', 'referral source_STMW', 'referral source_SVHC', 'referral source_SVHD', 'referral source_SVI', 'referral source_other']
['age', 'TSH', 'T3', 'TT4', 'T4U', 'FTI', 'sex_F', 'sex_M', 'on thyroxine_f', 'on thyroxine_t', 'query on thyroxine_f', 'query on thyroxine_t', 'on antithyroid medication_f', 'on antithyroid medication_t', 'sick_f', 'sick_t', 'pregnant_f', 'pregnant_t', 'thyroid surgery_f', 'thyroid surgery_t'

In [29]:

import numpy as np

# Convert both encoded feature tables to float32 NumPy arrays in a single step.
X_train_encoded = np.array(X_train_encoded, dtype=np.float32)
X_test_encoded = np.array(X_test_encoded, dtype=np.float32)

# Report the (rows, columns) shape of each matrix.
for feature_matrix in (X_train_encoded, X_test_encoded):
    print(feature_matrix.shape)

(3057, 40)
(340, 40)


In [30]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# Step 1: map the string class labels to integer ids. The encoder is fitted on
# the training labels and then reused unchanged for the test labels.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Step 2: expand the integer ids into one-hot rows with four class slots.
y_train_onehot, y_test_onehot = (
    to_categorical(class_ids, num_classes=4)
    for class_ids in (y_train_encoded, y_test_encoded)
)
print(y_train_onehot)

[[0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 ...
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]]


In [31]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models

class Classifier(tf.keras.Model):
    """Fully connected network mapping a feature vector to raw class scores (logits).

    The network stacks four ``Dense -> LeakyReLU -> BatchNormalization`` stages
    (256, 128, 128 and 64 units) followed by a linear ``Dense`` output layer.
    No softmax is applied here, so the outputs are logits.

    Args:
        input_dim: Number of input features per sample. Defaults to 40.
        num_classes: Number of output logits. Defaults to 4.
    """

    def __init__(self, input_dim=40, num_classes=4):
        super().__init__()
        self.net = models.Sequential([
            # Declare the input with an explicit Input layer; passing
            # `input_shape=` to the first Dense layer of a Sequential model is
            # deprecated and emits a warning.
            layers.Input(shape=(input_dim,)),
            layers.Dense(256),
            layers.LeakyReLU(),
            layers.BatchNormalization(),
            layers.Dense(128),
            layers.LeakyReLU(),
            layers.BatchNormalization(),
            layers.Dense(128),
            layers.LeakyReLU(),
            layers.BatchNormalization(),
            layers.Dense(64),
            layers.LeakyReLU(),
            layers.BatchNormalization(),
            layers.Dense(num_classes)  # linear output: one logit per class
        ])

    def call(self, inputs, training=False):
        """Run `inputs` through the layer stack.

        Args:
            inputs: Batch of feature vectors.
            training: Forwarded to the inner Sequential model as its
                `training` flag.

        Returns:
            The output of the final Dense layer (one logit per class).
        """
        return self.net(inputs, training=training)


In [32]:
def get_device():
    """Return 'GPU' when TensorFlow lists at least one physical GPU, else 'CPU'."""
    gpus = tf.config.list_physical_devices('GPU')
    if gpus:
        return 'GPU'
    return 'CPU'

In [33]:
# get device (only reported here; it is not passed to the model)
device = get_device()
print(f'DEVICE: {device}')

# Seed the Python, NumPy and TensorFlow random generators before the model is
# built so that weight initialisation and batch shuffling are repeatable
# between runs. NOTE: some GPU kernels may still be non-deterministic.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# training parameters
num_epoch = 250              # number of training epochs

# create the model, then define the loss function and the optimizer
model = Classifier()
model.compile(
    # from_logits=True: the loss treats the model outputs as raw logits.
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
    optimizer=keras.optimizers.SGD(learning_rate=0.01),
    metrics=['accuracy']
)

DEVICE: GPU


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [34]:
# Train on the encoded training set; the value returned by fit() is kept in `history`.
history = model.fit(
    x=X_train_encoded,
    y=y_train_onehot,
    batch_size=64,
    epochs=num_epoch,
    shuffle=True,
    verbose=2,
)

Epoch 1/250
48/48 - 4s - 89ms/step - accuracy: 0.7399 - loss: 0.9419
Epoch 2/250
48/48 - 0s - 3ms/step - accuracy: 0.9264 - loss: 0.5040
Epoch 3/250
48/48 - 0s - 3ms/step - accuracy: 0.9509 - loss: 0.3666
Epoch 4/250
48/48 - 0s - 3ms/step - accuracy: 0.9522 - loss: 0.3002
Epoch 5/250
48/48 - 0s - 7ms/step - accuracy: 0.9614 - loss: 0.2427
Epoch 6/250
48/48 - 0s - 3ms/step - accuracy: 0.9591 - loss: 0.2186
Epoch 7/250
48/48 - 0s - 6ms/step - accuracy: 0.9614 - loss: 0.1866
Epoch 8/250
48/48 - 0s - 3ms/step - accuracy: 0.9594 - loss: 0.1765
Epoch 9/250
48/48 - 0s - 3ms/step - accuracy: 0.9607 - loss: 0.1612
Epoch 10/250
48/48 - 0s - 7ms/step - accuracy: 0.9621 - loss: 0.1525
Epoch 11/250
48/48 - 0s - 6ms/step - accuracy: 0.9591 - loss: 0.1447
Epoch 12/250
48/48 - 0s - 3ms/step - accuracy: 0.9604 - loss: 0.1283
Epoch 13/250
48/48 - 0s - 3ms/step - accuracy: 0.9647 - loss: 0.1220
Epoch 14/250
48/48 - 0s - 3ms/step - accuracy: 0.9614 - loss: 0.1284
Epoch 15/250
48/48 - 0s - 7ms/step - accur

In [35]:
# Evaluate on the filtered test set; the result unpacks into the loss and the accuracy metric.
evaluation = model.evaluate(X_test_encoded, y_test_onehot, verbose=2)
test_loss, test_acc = evaluation
print(f'Test Loss: {test_loss:.4f}')
print(f'Test Accuracy: {test_acc:.4f}')

11/11 - 1s - 103ms/step - accuracy: 0.9735 - loss: 0.0957
Test Loss: 0.0957
Test Accuracy: 0.9735
