Chức năng của File: Huấn luyện mô hình trên các tập dữ liệu giữ nguyên chiều có tỷ lệ train/validation khác nhau, không áp dụng hiệu chỉnh L2 và tính toán các chỉ số trên tập validation.

In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
import time
import seaborn as sns
from sklearn.decomposition import PCA
import tensorflow as tf
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
import joblib

In [13]:
# Load Fashion-MNIST through Keras: (X, y) is the training portion that is
# later split into train/validation, (X_test, y_test) is the held-out test set.
fashion_mnist = tf.keras.datasets.fashion_mnist
(X, y), (X_test, y_test) = fashion_mnist.load_data()

# Flatten every image into a single feature row and rescale the values
# by 1/255 as float32.
X = X.reshape(len(X), -1).astype('float32') / 255.0
X_test = X_test.reshape(len(X_test), -1).astype('float32') / 255.0

In [17]:
# Human-readable class names, passed as target_names to classification_report in evaluate().
# NOTE(review): the order is assumed to follow Fashion-MNIST's integer labels 0-9 — confirm.
fashion_mnist_labels = [
    "T-shirt/top", "Trouser", "Pullover", "Dress", "Coat",
    "Sandal", "Shirt", "Sneaker", "Bag", "Ankle boot"
]

In [19]:
def evaluate(y_true, y_pred, data_set_name):
    """Print classification metrics for one data split.

    Writes, in order: a banner containing ``data_set_name``, the accuracy,
    the weighted precision, the weighted recall, the confusion matrix and a
    per-class classification report whose rows are named after the
    module-level ``fashion_mnist_labels`` list.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        data_set_name: Title shown in the banner line.

    Returns:
        None. Everything is written to standard output.
    """
    print(f'----------------{data_set_name}----------------')

    # Scalar metrics share one output format; precision and recall are
    # averaged with the 'weighted' strategy.
    weighted = {'average': 'weighted'}
    scalar_metrics = (
        ('Accuracy', accuracy_score, {}),
        ('Precision', precision_score, weighted),
        ('Recall', recall_score, weighted),
    )
    for label, metric_fn, extra in scalar_metrics:
        print(f'{label}: {metric_fn(y_true, y_pred, **extra):.3f}')

    print('Confusion Matrix:\n', confusion_matrix(y_true, y_pred))

    per_class_report = classification_report(
        y_true, y_pred, target_names=fashion_mnist_labels
    )
    print(per_class_report)

def create_data(X, y, X_test, n_components=100, test_size=0.2):
    """Split the data into train/validation parts and optionally apply PCA.

    The split is stratified on ``y`` and seeded (``random_state=42``), so the
    same inputs always produce the same partition.

    Args:
        X: Feature matrix to split.
        y: Labels matching ``X``; also used for stratification.
        X_test: Held-out test features; only transformed, never split.
        n_components: Forwarded to ``PCA`` when it is below 100. Any value of
            100 or more disables PCA and keeps the original dimensionality.
        test_size: Share of ``X`` assigned to the validation part.

    Returns:
        Tuple ``(X_train, X_val, y_train, y_val, X_test)``. The three feature
        matrices are PCA-projected when ``n_components < 100`` and returned
        unchanged otherwise.
    """
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=test_size, random_state=42, stratify=y
    )

    if n_components < 100:
        # Seed the PCA like the split: without a fixed random_state the
        # projection can vary between runs whenever scikit-learn selects a
        # randomized SVD solver.
        pca = PCA(n_components=n_components, random_state=42)
        # The projection is fitted on the training part only and then reused
        # for the validation and test parts.
        X_train = pca.fit_transform(X_train)
        X_val = pca.transform(X_val)
        X_test = pca.transform(X_test)
    return X_train, X_val, y_train, y_val, X_test

def train_predict_evaluate_each_parameter(X_train, y_train, X_test, y_test, solver='lbfgs', max_iter=10000, penalty=None, tol=1e-4, verbose=0, C=1):
    """Fit a multinomial logistic regression and print its metrics.

    The model is trained on ``(X_train, y_train)``. Its predictions are then
    reported through ``evaluate`` twice: first on the training data, then on
    ``(X_test, y_test)``, which is printed under the 'Validation set' banner.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: Features of the second split to evaluate.
        y_test: Labels matching ``X_test``.
        solver: Forwarded to ``LogisticRegression``.
        max_iter: Forwarded to ``LogisticRegression``.
        penalty: Forwarded to ``LogisticRegression``; the default ``None``
            requests no regularisation.
        tol: Forwarded to ``LogisticRegression``.
        verbose: Forwarded to ``LogisticRegression``.
        C: Forwarded to ``LogisticRegression``.

    Returns:
        None. The fitted model is discarded; results are only printed.
    """
    # NOTE(review): `multi_class` is reported as deprecated in recent
    # scikit-learn releases — confirm against the installed version.
    classifier = LogisticRegression(
        multi_class='multinomial',
        solver=solver,
        max_iter=max_iter,
        penalty=penalty,
        tol=tol,
        verbose=verbose,
        C=C,
        n_jobs=-1,
    )
    classifier.fit(X_train, y_train)

    # Same order as the printed report: training split first, then the
    # second split.
    splits = (
        (X_train, y_train, 'Trainning set'),
        (X_test, y_test, 'Validation set'),
    )
    for features, targets, title in splits:
        predictions = classifier.predict(features)
        evaluate(targets, predictions, title)

***Không sử dụng hiệu chỉnh L2***

Tỷ lệ train/validation: 4/1

In [21]:
# Train/validation ratio 4/1 (test_size=0.2).
# n_components=100 makes create_data skip PCA, so the original dimensionality is kept.
X_train, X_val, y_train, y_val, X_test = create_data(X=X, y=y, X_test=X_test, n_components=100, test_size=0.2)
X_train.shape, X_val.shape, X_test.shape

((48000, 784), (12000, 784), (10000, 784))

In [23]:
# 'lbfgs' solver on the 4/1 train/validation split (default penalty=None, i.e. no regularisation)
train_predict_evaluate_each_parameter(X_train, y_train, X_val, y_val, solver='lbfgs', verbose=0)



----------------Trainning set----------------
Accuracy: 0.889
Precision: 0.888
Recall: 0.889
Confusion Matrix:
 [[4106    9   66  177   21    1  382    0   38    0]
 [   2 4722    5   60    6    0    5    0    0    0]
 [  59    3 3766   35  531    0  390    0   16    0]
 [ 127   28   35 4383  115    0   97    0   15    0]
 [  21    3  372  117 3940    0  332    0   15    0]
 [   0    0    0    0    0 4666    0  118    0   16]
 [ 607    7  485  131  386    0 3129    0   55    0]
 [   0    0    0    0    0  112    0 4593    1   94]
 [  14    1   12   18    7    0   49    2 4697    0]
 [   0    0    0    0    0   25    0  123    0 4652]]
              precision    recall  f1-score   support

 T-shirt/top       0.83      0.86      0.84      4800
     Trouser       0.99      0.98      0.99      4800
    Pullover       0.79      0.78      0.79      4800
       Dress       0.89      0.91      0.90      4800
        Coat       0.79      0.82      0.80      4800
      Sandal       0.97      0.9

In [None]:
# 'saga' solver on the 4/1 train/validation split (default penalty=None, i.e. no regularisation)
train_predict_evaluate_each_parameter(X_train, y_train, X_val, y_val, solver='saga', verbose=1)



convergence after 1029 epochs took 2344 seconds
----------------Trainning set----------------
Accuracy: 0.885
Precision: 0.884
Recall: 0.885
Confusion Matrix:
 [[4100   12   70  178   16    4  378    0   41    1]
 [   5 4705    5   69    6    0    8    0    2    0]
 [  66    6 3773   33  526    0  376    0   20    0]
 [ 135   27   32 4367  118    0  105    0   16    0]
 [  20    3  382  120 3926    0  337    0   12    0]
 [   0    0    0    0    0 4636    0  132    5   27]
 [ 630    6  496  136  381    0 3098    0   52    1]
 [   0    0    0    0    0  115    0 4570    4  111]
 [  17    3   16   21   11    9   54   10 4658    1]
 [   0    0    0    0    0   37    0  131    5 4627]]
              precision    recall  f1-score   support

 T-shirt/top       0.82      0.85      0.84      4800
     Trouser       0.99      0.98      0.98      4800
    Pullover       0.79      0.79      0.79      4800
       Dress       0.89      0.91      0.90      4800
        Coat       0.79      0.82     

In [None]:
# 'sag' solver on the 4/1 train/validation split (default penalty=None, i.e. no regularisation)
train_predict_evaluate_each_parameter(X_train, y_train, X_val, y_val, solver='sag', verbose=1)



convergence after 946 epochs took 1639 seconds
----------------Trainning set----------------
Accuracy: 0.887
Precision: 0.886
Recall: 0.887
Confusion Matrix:
 [[4104   10   70  179   19    3  375    0   40    0]
 [   3 4716    6   62    6    0    6    0    1    0]
 [  60    7 3773   33  526    0  382    0   19    0]
 [ 133   26   32 4377  118    0  101    0   13    0]
 [  21    3  386  116 3925    0  337    0   12    0]
 [   0    0    0    0    0 4653    0  122    1   24]
 [ 622    5  491  133  385    0 3114    0   49    1]
 [   0    0    0    0    0  110    0 4582    4  104]
 [  15    2   15   21   10    3   50    7 4676    1]
 [   0    0    0    0    0   29    0  131    2 4638]]
              precision    recall  f1-score   support

 T-shirt/top       0.83      0.85      0.84      4800
     Trouser       0.99      0.98      0.99      4800
    Pullover       0.79      0.79      0.79      4800
       Dress       0.89      0.91      0.90      4800
        Coat       0.79      0.82      

Tỷ lệ train/validation: 7/3

In [29]:
# Train/validation ratio 7/3 (test_size=0.3).
# n_components=100 makes create_data skip PCA, so the original dimensionality is kept.
X_train1, X_val1, y_train1, y_val1, X_test1 = create_data(X=X, y=y, X_test=X_test, n_components=100, test_size=0.3)
X_train1.shape, X_val1.shape, X_test1.shape

((42000, 784), (18000, 784), (10000, 784))

In [32]:
# 'lbfgs' solver on the 7/3 train/validation split (default penalty=None, i.e. no regularisation)
train_predict_evaluate_each_parameter(X_train1, y_train1, X_val1, y_val1, solver='lbfgs', verbose=1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


----------------Trainning set----------------
Accuracy: 0.892
Precision: 0.891
Recall: 0.892
Confusion Matrix:
 [[3607    6   65  144   19    1  330    0   28    0]
 [   1 4147    1   41    5    0    5    0    0    0]
 [  51    1 3317   34  439    0  341    0   17    0]
 [ 107   18   30 3858  102    0   76    0    9    0]
 [  19    2  316  109 3448    0  293    0   13    0]
 [   0    0    0    0    0 4096    0   95    0    9]
 [ 510    5  427  123  337    0 2757    0   41    0]
 [   0    0    0    0    0   96    0 4031    1   72]
 [   9    0    9   11   11    0   36    1 4123    0]
 [   0    0    0    0    0   13    0   99    0 4088]]
              precision    recall  f1-score   support

 T-shirt/top       0.84      0.86      0.85      4200
     Trouser       0.99      0.99      0.99      4200
    Pullover       0.80      0.79      0.79      4200
       Dress       0.89      0.92      0.91      4200
        Coat       0.79      0.82      0.81      4200
      Sandal       0.97      0.9

In [41]:
# 'saga' solver on the 7/3 train/validation split (default penalty=None, i.e. no regularisation)
train_predict_evaluate_each_parameter(X_train1, y_train1, X_val1, y_val1, solver='saga', verbose=1)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.


convergence after 1018 epochs took 1101 seconds
----------------Trainning set----------------
Accuracy: 0.887
Precision: 0.886
Recall: 0.887
Confusion Matrix:
 [[3604   10   63  146   14    4  324    0   35    0]
 [   3 4127    5   51    6    0    7    0    1    0]
 [  58    3 3299   31  459    0  333    0   17    0]
 [ 116   22   26 3828  109    0   90    0    9    0]
 [  13    3  324  106 3439    0  303    0   12    0]
 [   0    0    0    0    0 4068    0  109    3   20]
 [ 519    5  443  127  341    0 2726    0   38    1]
 [   0    0    0    0    0   99    0 4005    4   92]
 [  13    1   12   18    8    5   49    6 4087    1]
 [   0    0    0    0    0   26    0  113    1 4060]]
              precision    recall  f1-score   support

 T-shirt/top       0.83      0.86      0.85      4200
     Trouser       0.99      0.98      0.99      4200
    Pullover       0.79      0.79      0.79      4200
       Dress       0.89      0.91      0.90      4200
        Coat       0.79      0.82     

In [None]:
# 'sag' solver on the 7/3 train/validation split (default penalty=None, i.e. no regularisation)
train_predict_evaluate_each_parameter(X_train1, y_train1, X_val1, y_val1, solver='sag', verbose=1)



convergence after 1024 epochs took 1586 seconds
----------------Trainning set----------------
Accuracy: 0.889
Precision: 0.888
Recall: 0.889
Confusion Matrix:
 [[3607    8   63  145   15    3  323    0   36    0]
 [   2 4129    5   50    6    0    7    0    1    0]
 [  51    2 3303   31  454    0  342    0   17    0]
 [ 113   22   25 3841  105    0   86    0    8    0]
 [  14    3  319  107 3443    0  302    0   12    0]
 [   0    0    0    0    0 4076    0  107    1   16]
 [ 519    5  436  126  334    0 2740    0   40    0]
 [   0    0    0    0    0   97    0 4012    3   88]
 [  11    1   13   15    8    1   43    5 4102    1]
 [   0    0    0    0    0   22    0  110    0 4068]]
              precision    recall  f1-score   support

 T-shirt/top       0.84      0.86      0.85      4200
     Trouser       0.99      0.98      0.99      4200
    Pullover       0.79      0.79      0.79      4200
       Dress       0.89      0.91      0.90      4200
        Coat       0.79      0.82     

Tỷ lệ train/validation: 6/4

In [36]:
# Train/validation ratio 6/4 (test_size=0.4).
# n_components=100 makes create_data skip PCA, so the original dimensionality is kept.
X_train2, X_val2, y_train2, y_val2, X_test2 = create_data(X=X, y=y, X_test=X_test, n_components=100, test_size=0.4)
X_train2.shape, X_val2.shape, X_test2.shape

((36000, 784), (24000, 784), (10000, 784))

In [None]:
# 'lbfgs' solver on the 6/4 train/validation split (default penalty=None, i.e. no regularisation)
train_predict_evaluate_each_parameter(X_train2, y_train2, X_val2, y_val2, solver='lbfgs', verbose=1)



----------------Trainning set----------------
Accuracy: 0.898
Precision: 0.897
Recall: 0.898
Confusion Matrix:
 [[3108    4   52  126   14    0  280    0   16    0]
 [   0 3559    2   31    3    0    5    0    0    0]
 [  49    2 2862   30  367    0  278    0   12    0]
 [  84   13   22 3323   83    0   68    0    7    0]
 [  12    3  265   88 2959    0  265    0    8    0]
 [   0    0    0    0    0 3526    0   71    0    3]
 [ 408    2  340  103  282    0 2435    0   30    0]
 [   0    0    0    0    0   65    0 3486    0   49]
 [   7    1    4    6    5    0   26    1 3550    0]
 [   0    0    0    0    0    7    0   62    0 3531]]
              precision    recall  f1-score   support

 T-shirt/top       0.85      0.86      0.86      3600
     Trouser       0.99      0.99      0.99      3600
    Pullover       0.81      0.80      0.80      3600
       Dress       0.90      0.92      0.91      3600
        Coat       0.80      0.82      0.81      3600
      Sandal       0.98      0.9

In [None]:
# 'saga' solver on the 6/4 train/validation split (default penalty=None, i.e. no regularisation)
train_predict_evaluate_each_parameter(X_train2, y_train2, X_val2, y_val2, solver='saga', verbose=1)



convergence after 1309 epochs took 2245 seconds
----------------Trainning set----------------
Accuracy: 0.892
Precision: 0.891
Recall: 0.892
Confusion Matrix:
 [[3098    7   57  119   15    3  280    0   21    0]
 [   3 3535    6   43    6    0    7    0    0    0]
 [  52    3 2855   25  381    0  270    0   14    0]
 [  92   17   23 3301   91    0   69    0    7    0]
 [  15    2  278   86 2958    0  255    0    6    0]
 [   0    0    0    0    0 3492    0   89    4   15]
 [ 421    5  347  102  288    0 2402    0   35    0]
 [   0    0    0    0    0   74    0 3457    3   66]
 [   7    1    7   16    4    2   38    7 3517    1]
 [   0    0    0    0    0   25    0   86    0 3489]]
              precision    recall  f1-score   support

 T-shirt/top       0.84      0.86      0.85      3600
     Trouser       0.99      0.98      0.99      3600
    Pullover       0.80      0.79      0.80      3600
       Dress       0.89      0.92      0.91      3600
        Coat       0.79      0.82     

In [None]:
# 'sag' solver on the 6/4 train/validation split (default penalty=None, i.e. no regularisation)
train_predict_evaluate_each_parameter(X_train2, y_train2, X_val2, y_val2, solver='sag', verbose=1)



convergence after 1337 epochs took 1831 seconds
----------------Trainning set----------------
Accuracy: 0.893
Precision: 0.893
Recall: 0.893
Confusion Matrix:
 [[3104    3   56  117   16    3  281    0   20    0]
 [   3 3539    5   43    5    0    5    0    0    0]
 [  47    3 2851   26  381    0  278    0   14    0]
 [  90   16   24 3305   90    0   68    0    7    0]
 [  15    1  273   87 2956    0  260    0    8    0]
 [   0    0    0    0    0 3506    0   82    1   11]
 [ 423    4  348   99  287    0 2401    0   38    0]
 [   0    0    1    0    0   69    0 3469    1   60]
 [   5    1    7   14    4    0   35    4 3529    1]
 [   0    0    0    0    0   18    0   78    0 3504]]
              precision    recall  f1-score   support

 T-shirt/top       0.84      0.86      0.85      3600
     Trouser       0.99      0.98      0.99      3600
    Pullover       0.80      0.79      0.80      3600
       Dress       0.90      0.92      0.91      3600
        Coat       0.79      0.82     