In [44]:
import pandas as pd

# Load dataset
df = pd.read_csv('data/csv/data_milknew.csv')

print(df.describe(include='all').to_string())

                 pH   Temprature        Taste         Odor         Fat     Turbidity       Colour Grade
count   1059.000000  1059.000000  1059.000000  1059.000000  1059.000000  1059.000000  1059.000000  1059
unique          NaN          NaN          NaN          NaN          NaN          NaN          NaN     3
top             NaN          NaN          NaN          NaN          NaN          NaN          NaN   low
freq            NaN          NaN          NaN          NaN          NaN          NaN          NaN   429
mean       6.630123    44.226629     0.546742     0.432483     0.671388     0.491029   251.840415   NaN
std        1.399679    10.098364     0.498046     0.495655     0.469930     0.500156     4.307424   NaN
min        3.000000    34.000000     0.000000     0.000000     0.000000     0.000000   240.000000   NaN
25%        6.500000    38.000000     0.000000     0.000000     0.000000     0.000000   250.000000   NaN
50%        6.700000    41.000000     1.000000     0.000000     1

In [45]:
print(df.head())

    pH  Temprature  Taste  Odor  Fat   Turbidity  Colour   Grade
0  6.6          35      1     0     1          0     254    high
1  6.6          36      0     1     0          1     253    high
2  8.5          70      1     1     1          1     246     low
3  9.5          34      1     1     0          1     255     low
4  6.6          37      0     0     0          0     255  medium


In [46]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder() 

print('Grade [high = 0 / low = 1 / medium = 2] :')
df["Grade"] = label_encoder.fit_transform(df["Grade"])
mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
mapping = {'Grade': mapping}
print(mapping)

print("  ")

print(df.head().to_string())

# Check class balance
print('\n', df['Grade'].value_counts().to_string(), '\n')

Grade [high = 0 / low = 1 / medium = 2] :
{'Grade': {'high': np.int64(0), 'low': np.int64(1), 'medium': np.int64(2)}}
  
    pH  Temprature  Taste  Odor  Fat   Turbidity  Colour  Grade
0  6.6          35      1     0     1          0     254      0
1  6.6          36      0     1     0          1     253      0
2  8.5          70      1     1     1          1     246      1
3  9.5          34      1     1     0          1     255      1
4  6.6          37      0     0     0          0     255      2

 Grade
1    429
2    374
0    256 



In [47]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [48]:
# # # Splitting the dataset into features (input) and target (output, label)
# X = df.loc[:, df.columns != 'Grade']
# y = df['Grade']

# # # Classifier training
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)

In [49]:
# # Prepare the data for modeling
# Split the dataset into features and target variable
X = df.drop('Grade', axis=1)
y = df['Grade']

# K-fold cross-validation with stratification
from sklearn.model_selection import StratifiedKFold

# Instantiate the StratifiedKFold object
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

# Split the dataset into training and testing sets
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

# Display the shape of the training and testing sets
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape, '\n')

X_train shape: (848, 7)
X_test shape: (211, 7)
y_train shape: (848,)
y_test shape: (211,) 



In [50]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

In [51]:
# Scoring metrics
scoring = {
    'accuracy': 'accuracy',
    'precision': 'precision_weighted',
    'recall': 'recall_weighted',
    'f1': 'f1_weighted'
}

# Create the hyperparameter grid for Logistic Regression
param_grid_lr = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'lbfgs'],
    'penalty': ['l2']
}

# Create the hyperparameter grid for K-Nearest Neighbors (KNN)
param_grid_knn = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'p': [1, 2]  # 1 for Manhattan distance, 2 for Euclidean
}

# Create the hyperparameter grid for Support Vector Machine (SVM)
param_grid_svm = {
    'C': [0.01, 0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto'],
}

# Instantiate the models
logreg = LogisticRegression()
knn = KNeighborsClassifier()
svm = SVC()

# Instantiate the GridSearchCV objects for each model
grid_search_lr = GridSearchCV(estimator=logreg, param_grid=param_grid_lr, cv=5, n_jobs=-1, 
                              verbose=2, scoring=scoring, refit='accuracy')

grid_search_knn = GridSearchCV(estimator=knn, param_grid=param_grid_knn, cv=5, n_jobs=-1, 
                               verbose=2, scoring=scoring, refit='accuracy')

grid_search_svm = GridSearchCV(estimator=svm, param_grid=param_grid_svm, cv=5, n_jobs=-1, 
                               verbose=2, scoring=scoring, refit='accuracy')

# Fit the GridSearchCV objects
grid_search_lr.fit(X_train, y_train)
grid_search_knn.fit(X_train, y_train)
grid_search_svm.fit(X_train, y_train)

# Predict on the test set
y_pred_lr = grid_search_lr.predict(X_test)
y_pred_knn = grid_search_knn.predict(X_test)
y_pred_svm = grid_search_svm.predict(X_test)

print("_____________________________________________________________", '\n')
# Get and display the best parameters and classification report for each model
print('Logistic Regression Best Parameters:', grid_search_lr.best_params_)
print('Logistic Regression Classification Report:\n', classification_report(y_test, y_pred_lr))
print("_____________________________________________________________", '\n')
print('KNN Best Parameters:', grid_search_knn.best_params_)
print('KNN Classification Report:\n', classification_report(y_test, y_pred_knn))
print("_____________________________________________________________", '\n')
print('SVM Best Parameters:', grid_search_svm.best_params_)
print('SVM Classification Report:\n', classification_report(y_test, y_pred_svm))
print("_____________________________________________________________", '\n')

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits
_____________________________________________________________ 

Logistic Regression Best Parameters: {'C': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.76      0.76        51
           1       0.90      0.71      0.79        86
           2       0.68      0.85      0.76        74

    accuracy                           0.77       211
   macro avg       0.78      0.78      0.77       211
weighted avg       0.79      0.77      0.77       211

_____________________________________________________________ 

KNN Best Parameters: {'n_neighbors': 7, 'p': 1, 'weights': 'distance'}
KNN Classification Report:
               precision    recall  f1-score   support

           0       1.

In [52]:
# Import the necessary library for saving models
import joblib

# Dictionary to map the models to their respective GridSearchCV objects
models = {
    'logistic_regression': grid_search_lr,
    'knn': grid_search_knn,
    'svm': grid_search_svm
}

# Loop through the models and save each best model
for model_name, grid_search in models.items():
    best_model = grid_search.best_estimator_
    joblib.dump(best_model, f'models/best_{model_name}.pkl')
    joblib.dump(mapping, 'models/mapping.pkl')
    joblib.dump(X.columns, 'models/columns.pkl')
    print(f"Saved best {model_name} model to 'models/best_{model_name}.pkl'")


Saved best logistic_regression model to 'models/best_logistic_regression.pkl'
Saved best knn model to 'models/best_knn.pkl'
Saved best svm model to 'models/best_svm.pkl'


In [53]:
mapping_jl = joblib.load('models/mapping.pkl')

# พิมพ์ค่าของ mapping
print(mapping_jl)

{'Grade': {'high': np.int64(0), 'low': np.int64(1), 'medium': np.int64(2)}}
