# Overall idea
## Processing of the data.
* 1. Binarize the data (turn each image into a black-and-white image): set every pixel whose value is less than or equal to 0.4 to 0, and every pixel whose value is greater than 0.4 to 1
* 2. Expand the training set by flipping each image vertically and adding it to the training set

## Parameter selection.
Select the best regularization parameter C for the logistic regression classifier with a Bayesian optimizer, so as to achieve the highest accuracy

In [6]:
import pandas as pd
import numpy as np
# import cv2
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# from bayes_opt import BayesianOptimization

In [7]:
# Load the training features and labels as numpy arrays; the label column is flattened to 1-D
features_frame = pd.read_csv("./data_trouser_dress/troudress_train_x.csv")
labels_frame = pd.read_csv("./data_trouser_dress/troudress_train_y.csv")
data = features_frame.values
target = labels_frame.values.reshape(-1)
# Show the raw arrays together with their shapes as a quick sanity check
print(data, data.shape)
print(target, target.shape)

[[0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 1. 0. 1.]
 [0. 0. 0. ... 0. 1. 0.]
 ...
 [1. 0. 0. ... 0. 1. 1.]
 [1. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 0.]] (12000, 784)
[0 1 1 ... 1 0 0] (12000,)


In [13]:
# Hold out 30% of the samples as a test split; random_state is fixed so the split is repeatable
split_parts = train_test_split(data, target, test_size=0.3, random_state=0)
X_train, X_test, y_train, y_test = split_parts
print(X_train.shape)

(8400, 784)


First, without processing the data, fit the logistic regression classifier

In [5]:
# The regularization parameter C of the logistic regression is selected with a Bayesian optimizer.
# NOTE(review): the objective below is scored on (X_test, y_test), the same hold-out split that is
# used afterwards to report accuracy, so that reported accuracy is optimistically biased. Consider
# tuning on a separate validation split or with cross-validation.
# NOTE(review): BayesianOptimization must be in scope for this cell; the
# `from bayes_opt import BayesianOptimization` line in the imports cell is commented out.
def f_score(C):
    """Objective for the optimizer: accuracy obtained with regularization parameter ``C``.

    Fits ``LogisticRegression(max_iter=200, C=C)`` on the notebook-level ``X_train`` / ``y_train``
    and returns the accuracy of its predictions for ``X_test`` measured against ``y_test``.
    """
    lr = LogisticRegression(max_iter=200,C=C)
    lr.fit(X_train,y_train)
    acc = accuracy_score(y_test,lr.predict(X_test))
    return acc     # The optimizer maximizes this value

# Search interval for C. LogisticRegression requires a strictly positive C and the optimizer
# can probe the end points of the interval (the recorded run evaluated C = 1.0), so the lower
# bound is a small positive number rather than 0.
pbounds = {'C': (1e-6,1)}

# Constructing a Bayesian Optimizer
opt = BayesianOptimization(
    f=f_score,
    pbounds=pbounds,
    verbose=2,  # verbose = 2 prints all, verbose = 1 prints the maximum value found in the run, verbose = 0 prints nothing
    random_state=1
)

# Start running
opt.maximize(
    init_points=10,  # Steps of random search
    n_iter=130  # Number of iterations to perform Bayesian optimization
)

|   iter    |  target   |     C     |
-------------------------------------
| [0m 1       [0m | [0m 0.9306  [0m | [0m 0.417   [0m |
| [0m 2       [0m | [0m 0.9253  [0m | [0m 0.7203  [0m |
| [95m 3       [0m | [95m 0.9419  [0m | [95m 0.000114[0m |
| [0m 4       [0m | [0m 0.9344  [0m | [0m 0.3023  [0m |
| [0m 5       [0m | [0m 0.9414  [0m | [0m 0.1468  [0m |
| [95m 6       [0m | [95m 0.9442  [0m | [95m 0.09234 [0m |
| [0m 7       [0m | [0m 0.9397  [0m | [0m 0.1863  [0m |
| [0m 8       [0m | [0m 0.9331  [0m | [0m 0.3456  [0m |
| [0m 9       [0m | [0m 0.9311  [0m | [0m 0.3968  [0m |
| [0m 10      [0m | [0m 0.9278  [0m | [0m 0.5388  [0m |
| [0m 11      [0m | [0m 0.9242  [0m | [0m 1.0     [0m |
| [95m 12      [0m | [95m 0.9475  [0m | [95m 0.05673 [0m |
| [95m 13      [0m | [95m 0.9492  [0m | [95m 0.04103 [0m |
| [0m 14      [0m | [0m 0.9242  [0m | [0m 0.8578  [0m |
| [0m 15      [0m | [0m 0.9414  [0m | [

| [0m 132     [0m | [0m 0.9369  [0m | [0m 0.2547  [0m |
| [0m 133     [0m | [0m 0.9278  [0m | [0m 0.5469  [0m |
| [0m 134     [0m | [0m 0.9453  [0m | [0m 0.07937 [0m |
| [0m 135     [0m | [0m 0.9272  [0m | [0m 0.5632  [0m |
| [0m 136     [0m | [0m 0.9272  [0m | [0m 0.5793  [0m |
| [0m 137     [0m | [0m 0.9267  [0m | [0m 0.5956  [0m |
| [0m 138     [0m | [0m 0.9356  [0m | [0m 0.2753  [0m |
| [0m 139     [0m | [0m 0.9369  [0m | [0m 0.2447  [0m |
| [0m 140     [0m | [0m 0.9414  [0m | [0m 0.1434  [0m |


In [6]:
# Read the best-scoring value of C back from the optimizer's result record
best_params = opt.max['params']
C1 = best_params["C"]
print("best C:",C1)

best C: 0.0059162059945443435


In [14]:
# Fit the model used for the submission file. C is hard-coded to the "best C" value printed
# by the search above, so this cell does not need the optimizer to have run in the current kernel.
best_c_value = 0.0059162059945443435
model = LogisticRegression(max_iter=200, C=best_c_value)
model.fit(X_train, y_train)

# Load the unlabeled test features (the first row of the file is skipped), keep the second
# column of predict_proba for every row, and write those probabilities to a text file.
x_test = np.loadtxt('data_trouser_dress/troudress_test_x.csv', delimiter=',', skiprows=1)
class_probabilities = model.predict_proba(x_test)
yproba1_test = class_probabilities[:, 1]
np.savetxt('yproba1_test.txt', yproba1_test)

In [12]:
# Train on the training split with the C value selected by the search
lr1 = LogisticRegression(max_iter=200,C=C1)
lr1.fit(X_train,y_train)


# Accuracy on the hold-out split. Arguments are passed in accuracy_score's documented
# (y_true, y_pred) order, matching the call inside f_score; the value itself is unchanged.
pre = lr1.predict(X_test)
acc = accuracy_score(y_test,pre)
print("Accuracy on the test set:",acc)

Accuracy on the test set: 0.9552777777777778


Next, the data is binarized (each image is turned into a black-and-white image): every pixel whose value is less than or equal to 0.4 is set to 0, and every pixel whose value is greater than 0.4 is set to 1. Then, again, a Bayesian optimizer is used to select the best regularization parameter C for the logistic regression classifier to achieve the highest accuracy

In [8]:
# Binarization of all training images: pixels <= 0.4 become 0, every other pixel becomes 1.
# np.where thresholds the whole array in one vectorized step instead of visiting each pixel in
# a Python double loop. It returns a new array, so X_train itself is left untouched; the result
# is cast back to X_train's dtype so it matches what an in-place edit of a copy would give.
X_train_copy = np.where(X_train <= 0.4, 0, 1).astype(X_train.dtype)
X_train_copy

array([[0., 0., 0., ..., 1., 0., 1.],
       [0., 1., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [9]:
# Binarization of all test images with the same rule as the training images:
# pixels <= 0.4 become 0, every other pixel becomes 1.
# np.where thresholds the whole array in one vectorized step instead of visiting each pixel in
# a Python double loop. It returns a new array, so X_test itself is left untouched; the result
# is cast back to X_test's dtype so it matches what an in-place edit of a copy would give.
X_test_copy = np.where(X_test <= 0.4, 0, 1).astype(X_test.dtype)
X_test_copy

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       ...,
       [1., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 1.]])

In [10]:
# The regularization parameter C of the logistic regression is selected with a Bayesian optimizer,
# this time on the binarized images.
# NOTE(review): the objective below is scored on (X_test_copy, y_test), the same hold-out split
# that is used afterwards to report accuracy, so that reported accuracy is optimistically biased.
# NOTE(review): the recorded output of this cell contains an "ITERATIONS REACHED LIMIT" message
# suggesting a larger max_iter; some fits apparently stopped before converging.
# NOTE(review): BayesianOptimization must be in scope for this cell; the
# `from bayes_opt import BayesianOptimization` line in the imports cell is commented out.
def f_score(C):
    """Objective for the optimizer: accuracy obtained with regularization parameter ``C``.

    Fits ``LogisticRegression(max_iter=200, C=C)`` on the binarized ``X_train_copy`` with
    ``y_train`` and returns the accuracy of its predictions for the binarized ``X_test_copy``
    measured against ``y_test``.
    """
    lr = LogisticRegression(max_iter=200,C=C)
    lr.fit(X_train_copy,y_train)
    acc = accuracy_score(y_test,lr.predict(X_test_copy))
    return acc     # The optimizer maximizes this value


# Search interval for C. LogisticRegression requires a strictly positive C and the optimizer
# can probe the end points of the interval (the recorded run evaluated C = 1.0), so the lower
# bound is a small positive number rather than 0.
pbounds = {'C': (1e-6,1)}

# Constructing a Bayesian Optimizer
opt = BayesianOptimization(
    f=f_score,
    pbounds=pbounds,
    verbose=2,  # verbose = 2 prints all, verbose = 1 prints the maximum value found in the run, verbose = 0 prints nothing
    random_state=1
)

# Start running
opt.maximize(
    init_points=10,  # Steps of random search
    n_iter=130  # Number of iterations to perform Bayesian optimization
)

C2 = opt.max['params']["C"]     # Take the best parameters
print("best C:",C2)


|   iter    |  target   |     C     |
-------------------------------------
| [0m 1       [0m | [0m 0.9269  [0m | [0m 0.417   [0m |
| [0m 2       [0m | [0m 0.9219  [0m | [0m 0.7203  [0m |
| [95m 3       [0m | [95m 0.9356  [0m | [95m 0.000114[0m |
| [0m 4       [0m | [0m 0.9283  [0m | [0m 0.3023  [0m |
| [0m 5       [0m | [0m 0.9319  [0m | [0m 0.1468  [0m |
| [95m 6       [0m | [95m 0.9361  [0m | [95m 0.09234 [0m |
| [0m 7       [0m | [0m 0.9308  [0m | [0m 0.1863  [0m |
| [0m 8       [0m | [0m 0.9278  [0m | [0m 0.3456  [0m |
| [0m 9       [0m | [0m 0.9272  [0m | [0m 0.3968  [0m |
| [0m 10      [0m | [0m 0.9244  [0m | [0m 0.5388  [0m |
| [0m 11      [0m | [0m 0.9194  [0m | [0m 1.0     [0m |
| [95m 12      [0m | [95m 0.9442  [0m | [95m 0.04805 [0m |
| [95m 13      [0m | [95m 0.9444  [0m | [95m 0.03535 [0m |
| [0m 14      [0m | [0m 0.9208  [0m | [0m 0.8556  [0m |
| [0m 15      [0m | [0m 0.9347  [0m | [

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


| [0m 127     [0m | [0m 0.92    [0m | [0m 0.9716  [0m |
| [0m 128     [0m | [0m 0.9194  [0m | [0m 0.9906  [0m |
| [0m 129     [0m | [0m 0.9206  [0m | [0m 0.9354  [0m |
| [0m 130     [0m | [0m 0.9203  [0m | [0m 0.9532  [0m |
| [0m 131     [0m | [0m 0.9253  [0m | [0m 0.4796  [0m |
| [0m 132     [0m | [0m 0.9261  [0m | [0m 0.4634  [0m |
| [0m 133     [0m | [0m 0.9217  [0m | [0m 0.7434  [0m |
| [0m 134     [0m | [0m 0.9317  [0m | [0m 0.1515  [0m |
| [0m 135     [0m | [0m 0.9311  [0m | [0m 0.1614  [0m |
| [0m 136     [0m | [0m 0.9367  [0m | [0m 0.08973 [0m |
| [0m 137     [0m | [0m 0.9219  [0m | [0m 0.7281  [0m |
| [0m 138     [0m | [0m 0.9217  [0m | [0m 0.7741  [0m |
| [0m 139     [0m | [0m 0.9217  [0m | [0m 0.7587  [0m |
| [0m 140     [0m | [0m 0.9217  [0m | [0m 0.7892  [0m |
best C: 0.0057077545877518565


In [13]:
# Train on the binarized training split with the C value selected by the search
lr2 = LogisticRegression(max_iter=200,C=C2)
lr2.fit(X_train_copy,y_train)


# Accuracy on the binarized hold-out split. Arguments are passed in accuracy_score's documented
# (y_true, y_pred) order, matching the call inside f_score; the value itself is unchanged.
pre = lr2.predict(X_test_copy)
acc = accuracy_score(y_test,pre)
print("Accuracy on the test set:",acc)

Accuracy on the test set: 0.9494444444444444


Next, the training set is expanded by flipping each image vertically, adding it to the training set, and using a Bayesian optimizer to select the best regularization parameter C for the logistic regression classifier to achieve the highest accuracy

In [19]:
# Flip all the images in the training set vertically and append the flipped copies to it.
# Each 784-value row is viewed as a 28x28 image and its row order is reversed, which is what
# cv2.flip(img, 0) does; numpy is used because the cv2 import in the imports cell is commented
# out, so the cv2 call fails on a fresh kernel. Working on the whole array at once also avoids
# the per-image loop and the list round-trips.
# The flipped images follow the originals in the same order, and the labels are repeated to match.
# This cell no longer reassigns X_train_copy, so the binarized array of that name stays intact.
# NOTE: this cell rebinds X_train / y_train (later cells train on the augmented set), so running
# it more than once doubles the training set again.
original_images = X_train.reshape(-1, 28, 28)
flipped_images = np.flip(original_images, axis=1).reshape(X_train.shape)
X_train = np.concatenate([X_train, flipped_images], axis=0)
y_train = np.concatenate([y_train, y_train], axis=0)
print(X_train.shape)  # Print the shape of X_train
print(y_train.shape)  # Print the shape of y_train


(16800, 784)
(16800,)


You can see that both X_train and y_train now contain twice as many samples as before

In [20]:
# The regularization parameter C of the logistic regression is selected with a Bayesian optimizer,
# this time on the training set that was expanded with flipped images.
# NOTE(review): the objective below is scored on (X_test, y_test), the same hold-out split that is
# used afterwards to report accuracy, so that reported accuracy is optimistically biased.
# NOTE(review): BayesianOptimization must be in scope for this cell; the
# `from bayes_opt import BayesianOptimization` line in the imports cell is commented out.
def f_score(C):
    """Objective for the optimizer: accuracy obtained with regularization parameter ``C``.

    Fits ``LogisticRegression(max_iter=200, C=C)`` on the notebook-level ``X_train`` / ``y_train``
    (the augmented training set at this point in the notebook) and returns the accuracy of its
    predictions for ``X_test`` measured against ``y_test``.
    """
    lr = LogisticRegression(max_iter=200,C=C)
    lr.fit(X_train,y_train)
    acc = accuracy_score(y_test,lr.predict(X_test))
    return acc     # The optimizer maximizes this value

# Search interval for C. LogisticRegression requires a strictly positive C and the optimizer
# can probe the end points of the interval, so the lower bound is a small positive number
# rather than 0.
pbounds = {'C': (1e-6,1)}

# Constructing a Bayesian Optimizer
opt = BayesianOptimization(
    f=f_score,
    pbounds=pbounds,
    verbose=2,  # verbose = 2 prints all, verbose = 1 prints the maximum value found in the run, verbose = 0 prints nothing
    random_state=1
)

# Start running
opt.maximize(
    init_points=10,  # Steps of random search
    n_iter=130  # Number of iterations to perform Bayesian optimization
)

C3 = opt.max['params']["C"]     # Take the best parameters
print("best C:",C3)

|   iter    |  target   |     C     |
-------------------------------------
| [0m 1       [0m | [0m 0.9072  [0m | [0m 0.417   [0m |
| [0m 2       [0m | [0m 0.9072  [0m | [0m 0.7203  [0m |
| [0m 3       [0m | [0m 0.8931  [0m | [0m 0.000114[0m |
| [95m 4       [0m | [95m 0.9081  [0m | [95m 0.3023  [0m |
| [95m 5       [0m | [95m 0.9083  [0m | [95m 0.1468  [0m |
| [95m 6       [0m | [95m 0.9094  [0m | [95m 0.09234 [0m |
| [0m 7       [0m | [0m 0.9081  [0m | [0m 0.1863  [0m |
| [0m 8       [0m | [0m 0.9078  [0m | [0m 0.3456  [0m |
| [0m 9       [0m | [0m 0.9072  [0m | [0m 0.3968  [0m |
| [0m 10      [0m | [0m 0.9072  [0m | [0m 0.5388  [0m |
| [0m 11      [0m | [0m 0.9069  [0m | [0m 0.8739  [0m |
| [0m 12      [0m | [0m 0.9072  [0m | [0m 0.9999  [0m |
| [0m 13      [0m | [0m 0.9072  [0m | [0m 0.6299  [0m |
| [0m 14      [0m | [0m 0.9094  [0m | [0m 0.09788 [0m |
| [0m 15      [0m | [0m 0.9094  [0m | [0m 

| [0m 132     [0m | [0m 0.9072  [0m | [0m 0.9924  [0m |
| [0m 133     [0m | [0m 0.9119  [0m | [0m 0.05123 [0m |
| [0m 134     [0m | [0m 0.9072  [0m | [0m 0.4893  [0m |
| [0m 135     [0m | [0m 0.9072  [0m | [0m 0.5037  [0m |
| [0m 136     [0m | [0m 0.9075  [0m | [0m 0.3897  [0m |
| [0m 137     [0m | [0m 0.9072  [0m | [0m 0.5179  [0m |
| [0m 138     [0m | [0m 0.9072  [0m | [0m 0.5319  [0m |
| [0m 139     [0m | [0m 0.9078  [0m | [0m 0.3768  [0m |
| [0m 140     [0m | [0m 0.9078  [0m | [0m 0.3642  [0m |
best C: 0.032347468638519006


In [21]:
# Train on the augmented training set with the C value selected by the search
lr3 = LogisticRegression(max_iter=200,C=C3)
lr3.fit(X_train,y_train)


# Accuracy on the hold-out split. Arguments are passed in accuracy_score's documented
# (y_true, y_pred) order, matching the call inside f_score; the value itself is unchanged.
pre = lr3.predict(X_test)
acc = accuracy_score(y_test,pre)
print("Accuracy on the test set:",acc)

Accuracy on the test set: 0.9141666666666667


As you can see, the accuracy of fitting the logistic regression classifier without any data processing is the highest, so we only save the logistic regression model without any data processing here

In [22]:
import os

import joblib

# Persist lr1, the classifier trained on the unprocessed data.
# The target directory is created first so the dump does not fail when ./model is missing
# (e.g. on a fresh checkout); exist_ok=True makes this a no-op when it already exists.
model_path = "./model/logistic_regression_classifier.m"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(lr1, model_path)

['./model/logistic_regression_classifier.m']