In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.inspection import permutation_importance
import onnx
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType
import json
from datetime import datetime
import numpy as np
import onnxruntime as ort

In [2]:
# โหลดชุดข้อมูล
dataset_path = "mushrooms.csv"
df = pd.read_csv(dataset_path)

In [3]:
# ตรวจสอบค่าที่หายไป (Missing Values)
print(df.isnull().sum())

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64


In [4]:
# ตรวจสอบข้อมูลก่อนแปลงข้อมูล
df.head(5)

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [5]:
# แปลงข้อความเป็นตัวเลข เช่น (edible: 0, poisonous: 1)
df["class"] = df["class"].map({"e": 0, "p": 1})
df["cap-shape"] = df["cap-shape"].map({"b": 0, "c": 1, "x": 2, "f": 3, "k": 4, "s": 5})
df["cap-surface"] = df["cap-surface"].map({"f": 0, "g": 1, "y": 2, "s": 3})
df["cap-color"] = df["cap-color"].map({"n": 0, "b": 1, "c": 2, "g": 3, "r": 4, "p": 5, "u": 6, "e": 7, "w": 8, "y": 9})
df["bruises"] = df["bruises"].map({"t": 1, "f": 0})
df["odor"] = df["odor"].map({"a": 0, "l": 1, "c": 2, "y": 3, "f": 4, "m": 5, "n": 6, "p": 7, "s": 8})
df["gill-attachment"] = df["gill-attachment"].map({"a": 0, "d": 1, "f": 2, "n": 3})
df["gill-spacing"] = df["gill-spacing"].map({"c": 0, "w": 1, "d": 2})
df["gill-size"] = df["gill-size"].map({"b": 0, "n": 1})
df["gill-color"] = df["gill-color"].map({"k": 0, "n": 1, "b": 2, "h": 3, "g": 4, "r": 5, "o": 6, "p": 7, "u": 8, "e": 9, "w": 10, "y": 11})
df["stalk-shape"] = df["stalk-shape"].map({"e": 0, "t": 1})
df["stalk-root"] = df["stalk-root"].map({"b": 0, "c": 1, "u": 2, "e": 3, "z": 4, "r": 5, "?": 6})
df["stalk-surface-above-ring"] = df["stalk-surface-above-ring"].map({"f": 0, "y": 1, "k": 2, "s": 3})
df["stalk-surface-below-ring"] = df["stalk-surface-below-ring"].map({"f": 0, "y": 1, "k": 2, "s": 3})
df["stalk-color-above-ring"] = df["stalk-color-above-ring"].map({"n": 0, "b": 1, "c": 2, "g": 3, "o": 4, "p": 5, "e": 6, "w": 7, "y": 8})
df["stalk-color-below-ring"] = df["stalk-color-below-ring"].map({"n": 0, "b": 1, "c": 2, "g": 3, "o": 4, "p": 5, "e": 6, "w": 7, "y": 8})
df["veil-type"] = df["veil-type"].map({"p": 0, "u": 1})
df["veil-color"] = df["veil-color"].map({"n": 0, "o": 1, "w": 2, "y": 3})
df["ring-number"] = df["ring-number"].map({"n": 0, "o": 1, "t": 2})
df["ring-type"] = df["ring-type"].map({"c": 0, "e": 1, "f": 2, "l": 3, "n": 4, "p": 5, "s": 6, "z": 7})
df["spore-print-color"] = df["spore-print-color"].map({"k": 0, "n": 1, "b": 2, "h": 3, "r": 4, "o": 5, "u": 6, "w": 7, "y": 8})
df["population"] = df["population"].map({"a": 0, "c": 1, "n": 2, "s": 3, "v": 4, "y": 5})
df["habitat"] = df["habitat"].map({"g": 0, "l": 1, "m": 2, "p": 3, "u": 4, "w": 5, "d": 6})

In [6]:
# ตรวจสอบว่าการแยกข้อมูลถูกต้อง
df.head(5)

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,2,3,0,1,7,2,0,1,0,...,3,7,7,0,2,1,5,0,3,4
1,0,2,3,9,1,0,2,0,0,0,...,3,7,7,0,2,1,5,1,2,0
2,0,0,3,8,1,1,2,0,0,1,...,3,7,7,0,2,1,5,1,2,2
3,1,2,2,8,1,7,2,0,1,1,...,3,7,7,0,2,1,5,0,3,4
4,0,2,3,3,0,6,2,1,0,0,...,3,7,7,0,2,1,1,1,0,0


In [7]:
# แยกคุณสมบัติ (Features) และเป้าหมาย (Target)
X = df.drop(columns=["class"], axis=1)  # คุณสมบัติ (Features)
y = df["class"]  # เป้าหมาย (Target)

# แบ่งชุดข้อมูลเป็นชุดฝึก (Training) และชุดทดสอบ (Testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:

param_grid_lr = {'C': [0.1, 1, 10], 'penalty': ['l1', 'l2'], 'solver': ['liblinear']}
param_grid_knn = {'n_neighbors': [3, 5, 7, 10]}
param_grid_dt = {'max_depth': [3, 5, 10, None], 'min_samples_split': [2, 5, 10]}
param_grid_gb = {'learning_rate': [0.01, 0.1, 1], 'n_estimators': [50, 100, 200]}
param_grid_rf = {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20]}
param_grid_ab = {'n_estimators': [50, 100, 200], 'learning_rate': [0.1, 1]}

grid_search_lr = GridSearchCV(LogisticRegression(max_iter=1000), param_grid_lr, cv=5)
grid_search_knn = GridSearchCV(KNeighborsClassifier(), param_grid_knn, cv=5)
grid_search_dt = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid_dt, cv=5)
grid_search_gb = GridSearchCV(GradientBoostingClassifier(random_state=42), param_grid_gb, cv=5)
grid_search_rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid_rf, cv=5)
grid_search_ab = GridSearchCV(AdaBoostClassifier(random_state=42), param_grid_ab, cv=5)

grid_search_lr.fit(X_train, y_train)
grid_search_knn.fit(X_train, y_train)
grid_search_dt.fit(X_train, y_train)
grid_search_gb.fit(X_train, y_train)
grid_search_rf.fit(X_train, y_train)
grid_search_ab.fit(X_train, y_train)

print(f'Best parameters for Logistic Regression: {grid_search_lr.best_params_}')
print(f'Best parameters for kNN: {grid_search_knn.best_params_}')
print(f'Best parameters for Decision Tree: {grid_search_dt.best_params_}')
print(f'Best parameters for Gradient Boosting: {grid_search_gb.best_params_}')
print(f'Best parameters for Random Forest: {grid_search_rf.best_params_}')
print(f'Best parameters for AdaBoost: {grid_search_ab.best_params_}')

Best parameters for Logistic Regression: {'C': 10, 'penalty': 'l1', 'solver': 'liblinear'}
Best parameters for kNN: {'n_neighbors': 3}
Best parameters for Decision Tree: {'max_depth': 10, 'min_samples_split': 2}
Best parameters for Gradient Boosting: {'learning_rate': 0.1, 'n_estimators': 100}
Best parameters for Random Forest: {'max_depth': None, 'n_estimators': 50}
Best parameters for AdaBoost: {'learning_rate': 1, 'n_estimators': 200}


In [9]:
# สร้างและกำหนดโมเดลต่าง ๆ
lr = LogisticRegression(max_iter=20000)  # โมเดลโลจิสติกรีเกรสชัน
knn = KNeighborsClassifier(n_neighbors=5)  # โมเดล K-Nearest Neighbors (KNN)
dt = DecisionTreeClassifier(random_state=42)  # โมเดลต้นไม้ตัดสินใจ (Decision Tree)
gb = GradientBoostingClassifier(random_state=42)  # โมเดล Gradient Boosting
rf = RandomForestClassifier(random_state=42)  # โมเดล Random Forest
ab = AdaBoostClassifier(random_state=42)  # โมเดล AdaBoost

models = {
    "Logistic Regression": lr,
    "KNN": knn,
    "Decision Tree": dt,
    "Gradient Boosting": gb,
    "Random Forest": rf,
    "AdaBoost": ab,
}

for name, model in models.items():
    model.fit(X_train, y_train)  # Train โมเดลแต่ละตัว
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {acc:.6f}")

Logistic Regression Accuracy: 0.953231
KNN Accuracy: 1.000000
Decision Tree Accuracy: 1.000000
Gradient Boosting Accuracy: 1.000000
Random Forest Accuracy: 1.000000
AdaBoost Accuracy: 0.996923


In [10]:
# สร้างโมเดลชุดรวม (Ensemble) โดยใช้การลงคะแนนแบบ Hard Voting
voting_clf = VotingClassifier(
    estimators=[
        ("lr", lr),  # โมเดลโลจิสติกรีเกรสชัน
        ("knn", knn),  # โมเดล K-Nearest Neighbors (KNN)
        ("dt", dt),  # โมเดลต้นไม้ตัดสินใจ
        ("gb", gb),  # โมเดล Gradient Boosting
        ("rf", rf),  # โมเดล Random Forest
        ("ab", ab),  # โมเดล AdaBoost
    ],
    voting="hard",  # ใช้การลงคะแนนแบบ Hard Voting
    flatten_transform=False
)

# ฝึกโมเดลชุดรวม
voting_clf.fit(X_train, y_train)

# คำนวณ Permutation Importance เพื่อวิเคราะห์ผลกระทบของแต่ละฟีเจอร์
perm_importance = permutation_importance(voting_clf, X_test, y_test, scoring="accuracy")

# สร้าง DataFrame สำหรับค่าความสำคัญของฟีเจอร์
feature_importance = pd.Series(perm_importance.importances_mean, index=X_test.columns)
print(feature_importance)

cap-shape                   0.000000
cap-surface                 0.001354
cap-color                   0.000000
bruises                     0.000000
odor                        0.019938
gill-attachment             0.000000
gill-spacing                0.005292
gill-size                   0.007508
gill-color                  0.000000
stalk-shape                 0.000000
stalk-root                  0.005538
stalk-surface-above-ring    0.000000
stalk-surface-below-ring    0.000246
stalk-color-above-ring      0.000000
stalk-color-below-ring      0.000000
veil-type                   0.000000
veil-color                  0.000000
ring-number                 0.000000
ring-type                   0.000000
spore-print-color           0.033108
population                  0.000123
habitat                     0.000246
dtype: float64


In [11]:
# กำหนด Threshold สำหรับตัดฟีเจอร์ที่ไม่สำคัญ
threshold = 0.0001  # สามารถปรับค่าได้ (ค่าต่ำกว่านี้จะถูกลบ)
low_importance_features = feature_importance[feature_importance < threshold].index.tolist()

# ลบฟีเจอร์ที่มีความสำคัญต่ำ
print(f"\nRemoving {len(low_importance_features)} low-importance features: {low_importance_features}")
X_train = X_train.drop(columns=low_importance_features)
X_test = X_test.drop(columns=low_importance_features)

# แสดงฟีเจอร์ที่เหลืออยู่
print(f"\nRemaining {len(X_train.columns.tolist())} high-importance features: {X_train.columns.tolist()}")

# ฝึกโมเดลใหม่หลังจากลบฟีเจอร์ที่ไม่สำคัญ
voting_clf.fit(X_train, y_train)

# ทำนายผลลัพธ์
predictions = voting_clf.predict(X_test)

# ประเมินผลโมเดล
accuracy = accuracy_score(y_test, predictions)  # คำนวณความแม่นยำ
print(f"\nความแม่นยำของโมเดล: {accuracy}")

# แสดงผลรายงานการจำแนกประเภท
print("รายงานการจำแนกประเภทของโมเดล:")
print(classification_report(y_test, predictions))


Removing 13 low-importance features: ['cap-shape', 'cap-color', 'bruises', 'gill-attachment', 'gill-color', 'stalk-shape', 'stalk-surface-above-ring', 'stalk-color-above-ring', 'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number', 'ring-type']

Remaining 9 high-importance features: ['cap-surface', 'odor', 'gill-spacing', 'gill-size', 'stalk-root', 'stalk-surface-below-ring', 'spore-print-color', 'population', 'habitat']

ความแม่นยำของโมเดล: 1.0
รายงานการจำแนกประเภทของโมเดล:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       843
           1       1.00      1.00      1.00       782

    accuracy                           1.00      1625
   macro avg       1.00      1.00      1.00      1625
weighted avg       1.00      1.00      1.00      1625



In [12]:
# แปลงโมเดลที่ฝึกแล้วเป็นรูปแบบ ONNX
onnx_model = convert_sklearn(voting_clf, initial_types=[("input", FloatTensorType([None, X_train.shape[1]]))])

# บันทึกโมเดล ONNX
onnx_model_path = "ensemble_model.onnx"
onnx.save_model(onnx_model, onnx_model_path)  # บันทึกโมเดล ONNX
print("โมเดล ONNX บันทึกเรียบร้อยแล้ว.")

โมเดล ONNX บันทึกเรียบร้อยแล้ว.


In [13]:
# เตรียมข้อมูลอินพุต JSON
input_data = {
    "cap-surface": 2,  # scaly
    "odor": 5,  # musty
    "gill-spacing": 0,  # close
    "gill-size": 1,  # narrow
    "stalk-root": 1,  # club
    "stalk-surface-below-ring": 3,  # smooth
    "spore-print-color": 7,  # white
    "population": 2,  # numerous
    "habitat": 5  # waste
}

# แปลง JSON เป็นอาร์เรย์ NumPy
input_array = np.array([list(input_data.values())], dtype=np.float32)

In [14]:
# ทดสอบโมเดล ONNX
ort_session = ort.InferenceSession(onnx_model_path)
inputs = {ort_session.get_inputs()[0].name: input_array.reshape(1, -1)}
prediction = ort_session.run(None, inputs)

if prediction[0] == 0:
    print("สามารถกินได้")
elif prediction[0] == 1:
    print("ไม่สามารถกินได้ เพราะมีพิษ!")

ไม่สามารถกินได้ เพราะมีพิษ!
