<a href="https://colab.research.google.com/github/kevin01157007/hypothyroid-classifier/blob/main/sklearn_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from scipy.io import arff
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.metrics import classification_report, accuracy_score

# 讀取訓練與測試資料
train_data, train_meta = arff.loadarff("hypothyroid_cjlin2025_training.arff")
test_data, test_meta = arff.loadarff("hypothyroid_cjlin2025_test.arff")

print(train_data)


[(62., b'F', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b't', b'f', b'f', 0.035, 2.6, 128., 1.03, 124., b'SVI', b'negative')
 (72., b'F', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', 1.4  , 1.4, 115., 0.97, 118., b'SVHC', b'negative')
 (40., b'F', b't', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', 1.6  , 1.9, 142., 0.91, 156., b'other', b'negative')
 ...
 (15., b'F', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', 0.04 , 1.4, 103., 0.85, 120., b'SVI', b'negative')
 (59., b'F', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b't', 1.3  , 3.2, 149., 1.17, 127., b'SVHC', b'negative')
 (65., b'F', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', b'f', 1.7  , 2.4,  73., 0.89,  82., b'SVI', b'negative')]


In [2]:
# 轉為 DataFrame 並解碼
train_df = pd.DataFrame(train_data)
test_df = pd.DataFrame(test_data)
for col in train_df.select_dtypes([object]):
    train_df[col] = train_df[col].str.decode('utf-8')
for col in test_df.select_dtypes([object]):
    test_df[col] = test_df[col].str.decode('utf-8')
print(train_df.columns.tolist())
print(test_df.columns.tolist())
print(train_df['hypopituitary'].value_counts()['f'])
print(len(train_df))
print(len(test_df))

['age', 'sex', 'on thyroxine', 'query on thyroxine', 'on antithyroid medication', 'sick', 'pregnant', 'thyroid surgery', 'I131 treatment', 'query hypothyroid', 'query hyperthyroid', 'lithium', 'goitre', 'tumor', 'hypopituitary', 'psych', 'TSH', 'T3', 'TT4', 'T4U', 'FTI', 'referral source', 'Class']
['age', 'sex', 'on thyroxine', 'query on thyroxine', 'on antithyroid medication', 'sick', 'pregnant', 'thyroid surgery', 'I131 treatment', 'query hypothyroid', 'query hyperthyroid', 'lithium', 'goitre', 'tumor', 'hypopituitary', 'psych', 'TSH', 'T3', 'TT4', 'T4U', 'FTI', 'referral source', 'Class']
3057
3057
341


In [3]:
# 最後一欄是目標變數
X_train = train_df.iloc[:, :-1]
y_train = train_df.iloc[:, -1]
X_test = test_df.iloc[:, :-1]
y_test = test_df.iloc[:, -1]
print(y_train)

0       negative
1       negative
2       negative
3       negative
4       negative
          ...   
3052    negative
3053    negative
3054    negative
3055    negative
3056    negative
Name: Class, Length: 3057, dtype: object


In [4]:
# One-hot encoding：保持訓練與測試欄位一致
X_train_encoded = pd.get_dummies(X_train)
X_test_encoded = pd.get_dummies(X_test)
extra_cols = set(X_test_encoded.columns) - set(X_train_encoded.columns)
print(extra_cols)
invalid_index = X_test_encoded[X_test_encoded[list(extra_cols)].sum(axis=1) > 0].index
X_test_encoded = X_test_encoded.drop(index=invalid_index)
print(len(X_test_encoded))
print(X_test_encoded.columns.tolist())
X_test_encoded = X_test_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)
print(len(X_test_encoded))
print(X_test_encoded.columns.tolist())
y_test = y_test.drop(index=invalid_index)
print(len(y_test))

{'hypopituitary_t'}
340
['age', 'TSH', 'T3', 'TT4', 'T4U', 'FTI', 'sex_F', 'sex_M', 'on thyroxine_f', 'on thyroxine_t', 'query on thyroxine_f', 'query on thyroxine_t', 'on antithyroid medication_f', 'on antithyroid medication_t', 'sick_f', 'sick_t', 'pregnant_f', 'pregnant_t', 'thyroid surgery_f', 'thyroid surgery_t', 'I131 treatment_f', 'I131 treatment_t', 'query hypothyroid_f', 'query hypothyroid_t', 'query hyperthyroid_f', 'query hyperthyroid_t', 'lithium_f', 'lithium_t', 'goitre_f', 'goitre_t', 'tumor_f', 'tumor_t', 'hypopituitary_f', 'hypopituitary_t', 'psych_f', 'psych_t', 'referral source_STMW', 'referral source_SVHC', 'referral source_SVHD', 'referral source_SVI', 'referral source_other']
340
['age', 'TSH', 'T3', 'TT4', 'T4U', 'FTI', 'sex_F', 'sex_M', 'on thyroxine_f', 'on thyroxine_t', 'query on thyroxine_f', 'query on thyroxine_t', 'on antithyroid medication_f', 'on antithyroid medication_t', 'sick_f', 'sick_t', 'pregnant_f', 'pregnant_t', 'thyroid surgery_f', 'thyroid surger

In [5]:
# 訓練 Naive Bayes 模型
model = MultinomialNB()
model.fit(X_train_encoded, y_train)

# 預測與評估
y_pred = model.predict(X_test_encoded)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, output_dict=True)

accuracy, pd.DataFrame(report).transpose()

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


(0.9352941176470588,
                          precision    recall  f1-score     support
 compensated_hypothyroid   0.468750  0.789474  0.588235   19.000000
 negative                  0.989933  0.948553  0.968801  311.000000
 primary_hypothyroid       0.800000  0.888889  0.842105    9.000000
 secondary_hypothyroid     0.000000  0.000000  0.000000    1.000000
 accuracy                  0.935294  0.935294  0.935294    0.935294
 macro avg                 0.564671  0.656729  0.599785  340.000000
 weighted avg              0.952869  0.935294  0.941331  340.000000)

In [12]:
# 訓練 SVM 模型
model=svm.SVC(kernel='linear', C=1)
model.fit(X_train_encoded, y_train)

# 預測與評估
y_pred = model.predict(X_test_encoded)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, output_dict=True)

accuracy, pd.DataFrame(report).transpose()

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


(0.9676470588235294,
                          precision    recall  f1-score     support
 compensated_hypothyroid   1.000000  0.526316  0.689655   19.000000
 negative                  0.968847  1.000000  0.984177  311.000000
 primary_hypothyroid       0.888889  0.888889  0.888889    9.000000
 secondary_hypothyroid     0.000000  0.000000  0.000000    1.000000
 accuracy                  0.967647  0.967647  0.967647    0.967647
 macro avg                 0.714434  0.603801  0.640680  340.000000
 weighted avg              0.965622  0.967647  0.962302  340.000000)

In [6]:
# 訓練 SVM 模型
model=svm.SVC(kernel='poly', degree=3, gamma='auto', C=1)
model.fit(X_train_encoded, y_train)

# 預測與評估
y_pred = model.predict(X_test_encoded)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, output_dict=True)

accuracy, pd.DataFrame(report).transpose()

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


(0.9764705882352941,
                          precision    recall  f1-score     support
 compensated_hypothyroid   0.772727  0.894737  0.829268   19.000000
 negative                  0.990291  0.983923  0.987097  311.000000
 primary_hypothyroid       1.000000  1.000000  1.000000    9.000000
 secondary_hypothyroid     0.000000  0.000000  0.000000    1.000000
 accuracy                  0.976471  0.976471  0.976471    0.976471
 macro avg                 0.690755  0.719665  0.704091  340.000000
 weighted avg              0.975478  0.976471  0.975715  340.000000)

In [7]:
# 訓練 SVM 模型
model=svm.SVC(kernel='rbf', gamma=0.7, C=1)
model.fit(X_train_encoded, y_train)

# 預測與評估
y_pred = model.predict(X_test_encoded)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, output_dict=True)

accuracy, pd.DataFrame(report).transpose()

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


(0.9147058823529411,
                          precision    recall  f1-score     support
 compensated_hypothyroid   0.000000  0.000000  0.000000   19.000000
 negative                  0.914706  1.000000  0.955453  311.000000
 primary_hypothyroid       0.000000  0.000000  0.000000    9.000000
 secondary_hypothyroid     0.000000  0.000000  0.000000    1.000000
 accuracy                  0.914706  0.914706  0.914706    0.914706
 macro avg                 0.228676  0.250000  0.238863  340.000000
 weighted avg              0.836687  0.914706  0.873959  340.000000)

In [8]:
from sklearn.tree import DecisionTreeClassifier

# 訓練 DecisionTreeClassifier 模型
DecisionTreeModel=DecisionTreeClassifier(criterion = 'entropy', max_depth=5, random_state=42)
DecisionTreeModel.fit(X_train_encoded, y_train)

# 預測與評估
y_pred = DecisionTreeModel.predict(X_test_encoded)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, output_dict=True)

accuracy, pd.DataFrame(report).transpose()

(0.9911764705882353,
                          precision    recall  f1-score     support
 compensated_hypothyroid   1.000000  1.000000  1.000000   19.000000
 negative                  0.996774  0.993569  0.995169  311.000000
 primary_hypothyroid       0.900000  1.000000  0.947368    9.000000
 secondary_hypothyroid     0.000000  0.000000  0.000000    1.000000
 accuracy                  0.991176  0.991176  0.991176    0.991176
 macro avg                 0.724194  0.748392  0.735634  340.000000
 weighted avg              0.991461  0.991176  0.991247  340.000000)

In [9]:
from sklearn.tree import export_graphviz
import graphviz
dot_data = export_graphviz(DecisionTreeModel, out_file=None,
                  feature_names=['age', 'TSH', 'T3', 'TT4', 'T4U', 'FTI', 'sex_F', 'sex_M', 'on thyroxine_f', 'on thyroxine_t', 'query on thyroxine_f', 'query on thyroxine_t', 'on antithyroid medication_f', 'on antithyroid medication_t', 'sick_f', 'sick_t', 'pregnant_f', 'pregnant_t', 'thyroid surgery_f', 'thyroid surgery_t', 'I131 treatment_f', 'I131 treatment_t', 'query hypothyroid_f', 'query hypothyroid_t', 'query hyperthyroid_f', 'query hyperthyroid_t', 'lithium_f', 'lithium_t', 'goitre_f', 'goitre_t', 'tumor_f', 'tumor_t', 'hypopituitary_f', 'psych_f', 'psych_t', 'referral source_STMW', 'referral source_SVHC', 'referral source_SVHD', 'referral source_SVI', 'referral source_other'],
                  class_names=['compensated_hypothyroid', 'negative', 'primary_hypothyroid', 'secondary_hypothyroid'],
                  filled=True, rounded=True,
                  special_characters=True)
graph = graphviz.Source(dot_data)
save_path = 'decision_tree_graph'
graph.render(save_path, format='png')

'decision_tree_graph.png'

In [22]:
from sklearn.ensemble import RandomForestClassifier
# 建立 Random Forest Classifier 模型
randomForestModel = RandomForestClassifier(n_estimators=100, criterion = 'gini')
randomForestModel.fit(X_train_encoded, y_train)

# 預測與評估
y_pred = randomForestModel.predict(X_test_encoded)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, output_dict=True)

accuracy, pd.DataFrame(report).transpose()

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


(0.9882352941176471,
                          precision    recall  f1-score     support
 compensated_hypothyroid   0.950000  1.000000  0.974359   19.000000
 negative                  0.993569  0.993569  0.993569  311.000000
 primary_hypothyroid       0.888889  0.888889  0.888889    9.000000
 secondary_hypothyroid     0.000000  0.000000  0.000000    1.000000
 accuracy                  0.988235  0.988235  0.988235    0.988235
 macro avg                 0.708115  0.720615  0.714204  340.000000
 weighted avg              0.985441  0.988235  0.986802  340.000000)

In [27]:
from xgboost import XGBClassifier


# 對訓練和測試目標變數進行編碼
y_train_encoded = pd.get_dummies(y_train)
y_test_encoded = pd.get_dummies(y_test)
# 建立 XGBClassifier 模型
xgboostModel = XGBClassifier(n_estimators=200, learning_rate= 0.01)
# 使用訓練資料訓練模型
xgboostModel.fit(X_train_encoded, y_train_encoded)
# 使用訓練資料預測分類
y_pred = xgboostModel.predict(X_test_encoded)
accuracy = accuracy_score(y_test_encoded, y_pred)
report = classification_report(y_test_encoded, y_pred, output_dict=True, zero_division=1)

accuracy, pd.DataFrame(report).transpose()

(0.9852941176470589,
               precision    recall  f1-score  support
 0              1.000000  0.894737  0.944444     19.0
 1              0.987261  0.996785  0.992000    311.0
 2              0.900000  1.000000  0.947368      9.0
 3              1.000000  0.000000  0.000000      1.0
 micro avg      0.985337  0.988235  0.986784    340.0
 macro avg      0.971815  0.722880  0.720953    340.0
 weighted avg   0.985701  0.988235  0.985243    340.0
 samples avg    0.986765  0.988235  0.987255    340.0)