# 科系分類

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, optimizers, losses, initializers, callbacks
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import load_model
import matplotlib.pyplot as plt
import pandas as pd

# 資料前處理

讀取資料集

In [None]:
# Read the raw CSV and discard the "Unnamed: 0" column in one chained step.
dataset_path = "dataset.csv"
dataset = pd.read_csv(dataset_path).drop(columns=["Unnamed: 0"])
# Trailing expression: the first rows of the frame are the cell's value.
dataset.head()

隨機排序

In [None]:
# Shuffle the rows: draw a random permutation of 0..data_num-1 and select the
# rows by those labels (the frame still carries the default index produced by
# read_csv, so labels and positions coincide).
data_num = len(dataset)
indexes = np.random.permutation(data_num)
dataset = dataset.loc[indexes, :]
dataset.head()

標籤編碼與正規化

In [None]:
# Encode the label column as integer codes. sort=True assigns the codes by the
# sorted order of the distinct labels. The default (order of first appearance)
# would depend on the random shuffle applied to the rows beforehand, so the
# meaning of code 0 / code 1 could flip from one run to the next.
# NOTE(review): run_model reports the model output as the "science" share --
# confirm that the science label sorts last among the "group" values.
dataset["group"] = pd.factorize(dataset["group"], sort=True)[0]

# Divide every column after the first by 100 (presumably scores on a 0-100
# scale -- TODO confirm). Assigning by column name replaces the columns, so
# integer columns become float without relying on an in-place upcast through
# .iloc, which newer pandas releases deprecate.
feature_columns = dataset.columns[1:]
dataset[feature_columns] = dataset[feature_columns] / 100
dataset.head()

分割成訓練、驗證、測試資料

In [None]:
# Cut the (already shuffled) rows into 60% train / 20% validation / 20% test.
train_end = int(data_num * 0.6)
val_end = int(data_num * 0.8)

train_data = dataset.iloc[:train_end]
val_data = dataset.iloc[train_end:val_end]
test_data = dataset.iloc[val_end:]

# Report the size of each split.
for split_name, split_frame in (
  ("train_data", train_data),
  ("val_data", val_data),
  ("test_data", test_data),
):
  print(f"{split_name} = {split_frame.shape}")

拆分成 X、Y 資料

In [None]:
# Column 0 is the label; every remaining column is an input feature.
X_train, Y_train = train_data.iloc[:, 1:], train_data.iloc[:, 0]
X_val, Y_val = val_data.iloc[:, 1:], val_data.iloc[:, 0]
X_test, Y_test = test_data.iloc[:, 1:], test_data.iloc[:, 0]

# Print the shape of each feature/label pair, in the same order as above.
for part_name, part in (
  ("X_train", X_train),
  ("Y_train", Y_train),
  ("X_val", X_val),
  ("Y_val", Y_val),
  ("X_test", X_test),
  ("Y_test", Y_test),
):
  print(f"{part_name} = {part.shape}")

# 模型訓練

模型建立

In [None]:
# Binary classifier: six input features feeding a single output unit.
model = keras.Sequential([
  keras.Input(shape=(6,)),
  # Sigmoid bounds the output to (0, 1), so thresholding it at 0.5 and
  # reporting it as a class share is meaningful; a bare Dense(1) is linear
  # and can produce values below 0 or above 1.
  layers.Dense(1, activation="sigmoid")
])
model.compile(
  optimizer="adam",
  # Cross-entropy is the matching loss for a sigmoid output with 0/1 labels.
  loss="binary_crossentropy",
  # Passed as a list: Keras 3 rejects a bare string for `metrics`.
  metrics=["acc"]
)
model.summary()

In [None]:
# Draw a diagram of the model's layers, annotated with their shapes.
plot_model(model, show_shapes=True)

開始訓練

In [None]:
# Callbacks used during training:
#  - stop once val_loss has not improved for 10 consecutive epochs,
#  - save the full model to model.h5 whenever val_loss reaches a new best,
#  - write TensorBoard logs to the "logs" directory.
stop_early = callbacks.EarlyStopping(monitor="val_loss", patience=10, verbose=True, mode="auto")
keep_best = callbacks.ModelCheckpoint(
  "model.h5",
  monitor='val_loss',
  verbose=False,
  save_best_only=True,
  save_weights_only=False,
  mode='auto',
  save_freq='epoch',
)
tensorboard_log = callbacks.TensorBoard(log_dir="logs")

# Train for at most 30 epochs in batches of 64, validating after each epoch.
trainHistory = model.fit(
  X_train, Y_train,
  batch_size=64,
  epochs=30,
  validation_data=(X_val, Y_val),
  callbacks=[stop_early, keep_best, tensorboard_log],
)

畫出訓練結果

In [None]:
# Training loss in red, validation loss in blue, one point per epoch.
for curve_key, curve_color in (("loss", "r"), ("val_loss", "b")):
  plt.plot(trainHistory.history[curve_key], color=curve_color, label=curve_key)
plt.legend()
plt.show()

In [None]:
# Training accuracy in red, validation accuracy in blue, one point per epoch.
for curve_key, curve_color in (("acc", "r"), ("val_acc", "b")):
  plt.plot(trainHistory.history[curve_key], color=curve_color, label=curve_key)
plt.legend()
plt.show()

# 測試結果

載入最佳模型

In [None]:
# Reload the checkpoint written to "model.h5" during training; ModelCheckpoint
# was configured with save_best_only=True while monitoring val_loss.
best_model = load_model("model.h5")

模型評估(使用測試資料)

In [None]:
# Score the reloaded checkpoint on the held-out test split. The first entry
# of the result is the loss, the second the accuracy metric.
result = best_model.evaluate(X_test, Y_test)
test_loss, test_acc = result[0], result[1]
print(f"Loss = {test_loss}, ACC = {test_acc * 100}%")

模型預測

In [None]:
# Raw model outputs for the test split, then a boolean mask that is True
# wherever the output reaches the 0.5 decision threshold.
predict = best_model.predict(X_test)
Y_pred = np.greater_equal(predict, 0.5)

畫出混淆矩陣

In [None]:
# `utils` is not defined in this file; it is imported here as a separate module.
import utils
# Compare the true test labels with the thresholded predictions.
# NOTE(review): the third argument is presumably the number of classes --
# confirm against utils.plot_confusion_matrix.
utils.plot_confusion_matrix(Y_test, Y_pred, 2)

# 實際應用

自行設計模型預測API

In [None]:
def run_model(data, predictor=None):
  """Estimate the literature/science split for one set of subject scores.

  Args:
    data: Mapping with the keys "chinese", "english", "math", "nature",
      "society" and "physics". Each value is divided by 100 before it is
      passed to the model.
    predictor: Optional object exposing ``predict(batch)``. When omitted,
      the checkpointed ``best_model`` is used.

  Returns:
    dict with the keys "literature" and "science"; both values are plain
    floats in [0, 1] and "literature" is ``1 - science``.

  Raises:
    KeyError: If one of the six subject keys is missing from ``data``.
  """
  if predictor is None:
    # Use the checkpoint that was reloaded and evaluated on the test split,
    # not `model`, which still holds the weights of the last training epoch.
    predictor = best_model

  subjects = ("chinese", "english", "math", "nature", "society", "physics")
  features = np.array([data[subject] for subject in subjects], dtype=float) / 100

  # The model expects a batch axis: shape (6,) -> (1, 6).
  prediction = predictor.predict(features[np.newaxis, :])

  # Clamp so the reported shares can never be negative or exceed 100%, and
  # convert to a built-in float so the result is easy to serialize.
  science = min(max(float(prediction[0][0]), 0.0), 1.0)
  return {
    "literature": 1 - science,
    "science": science
  }

使用範例

In [None]:
# Example request: one student's score in each of the six subjects.
input_data = dict(
  chinese=50,
  english=60,
  math=80,
  nature=50,
  society=50,
  physics=50,
)

result = run_model(input_data)

# Print each share as a whole-number percentage (truncated by int()).
for share_key, share_label in (("literature", "Literature"), ("science", "Science")):
  print(f"{int(result[share_key] * 100)}% is {share_label}")