세번째 제출 23.11.21
###### 전처리 : /255.0 
###### pca :100
###### svm (c = 20) + xgb  -> voting ensemble
###### 이미지 증강(상하좌우 쉬프트 >> 약 100000장 학습)

결과 78.38

In [1]:
import pandas as pd
import numpy as np

from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier

import os
from tqdm import tqdm
import cv2
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.preprocessing import image
from sklearn.metrics import accuracy_score


In [2]:
# Read the Fashion-MNIST training CSV: one row per image, with a 'label'
# column alongside the feature columns.
training_data = pd.read_csv('../archive/fashion-mnist_train.csv')

# Split into features (every column except 'label') and the target column.
train_X = training_data.drop(columns='label')
train_y = training_data['label']


In [3]:
# Augmenter configured for small random translations only: the shift ranges
# are given as a fraction (0.05) of the image width and height.
datagen = ImageDataGenerator(width_shift_range=0.05, height_shift_range=0.05)

# Seed NumPy's global RNG so the keep/augment draws below are repeatable.
np.random.seed(42)

# Accumulators for the originals plus their shifted copies.
augmented_images = []
augmented_labels = []

for row_label, pixel_row in tqdm(train_X.iterrows(), total=len(train_X)):
    # One uniform draw per row, taken before anything else touches the RNG.
    coin = np.random.random()
    label = train_y[row_label]

    # Rebuild the flat pixel row as a (28, 28, 1) image and wrap it in a
    # batch of one, i.e. shape (1, 28, 28, 1), which is what flow() consumes.
    single_batch = np.expand_dims(
        image.img_to_array(pixel_row.values.reshape((28, 28, 1))), axis=0
    )

    # The untouched original is always kept.
    augmented_images.append(single_batch[0])
    augmented_labels.append(label)

    # Rows whose draw exceeds 0.33 (about two in three) also contribute one
    # randomly shifted copy carrying the same label.
    if coin > 0.33:
        shifted = next(datagen.flow(single_batch, batch_size=1)).squeeze(axis=0)
        augmented_images.append(shifted)
        augmented_labels.append(label)

# Stack the collected samples into arrays for the downstream cells.
aug_train_X = np.array(augmented_images)
aug_train_y = np.array(augmented_labels)

# Report the resulting dataset size.
print("Shape of aug_train_X:", aug_train_X.shape)
print("Shape of aug_train_y:", aug_train_y.shape)


100%|██████████| 60000/60000 [00:08<00:00, 6855.03it/s]


Shape of aug_train_X: (100123, 28, 28, 1)
Shape of aug_train_y: (100123,)


In [4]:
# Flatten every image into a single feature row (one row per sample), then
# rescale the values by 1/255 in place -- the same scaling the test images get.
# NOTE(review): this cell is not idempotent; running it a second time divides
# the already-scaled data by 255 again.
aug_train_X = aug_train_X.reshape(len(aug_train_X), -1)
aug_train_X /= 255.0



In [5]:
# Reduce the flattened pixel vectors to their top 100 principal components.
# random_state is pinned explicitly: with svd_solver left on 'auto', PCA may
# select the randomized SVD solver for data of this size, and without a fixed
# state its result would depend on how much of NumPy's global RNG earlier cells
# happened to consume (e.g. when cells are re-run individually).
# NOTE(review): aug_train_X is overwritten with the projected data, so this
# cell must not be re-run without first re-running the cells above it.
pca = PCA(n_components=100, random_state=42)
aug_train_X = pca.fit_transform(aug_train_X)


In [6]:
# Base learner 1: RBF-kernel SVM with C=20 and the 'scale' gamma heuristic.
svc = SVC(C=20, kernel='rbf', gamma='scale')

# Base learner 2: gradient-boosted trees predicting a class label directly
# (multi:softmax), evaluated with the multiclass error rate.
xgb = XGBClassifier(
    use_label_encoder=False,
    objective="multi:softmax",
    eval_metric="merror",
)

# Majority-vote ensemble over the two learners.
# NOTE(review): with only two hard voters, every disagreement is a tie, and
# scikit-learn breaks ties by ascending class-label order rather than by
# confidence -- confirm this is the intended behaviour.
base_estimators = [('svm', svc), ('xgb', xgb)]
ensemble_model = VotingClassifier(estimators=base_estimators, voting='hard')

ensemble_model.fit(aug_train_X, aug_train_y)

In [7]:
test_data_folder = "../dataset/private_test_dataset/data/"


def _png_sort_key(name):
    """Sort key giving a deterministic, numeric-aware file order.

    Names whose stem is purely numeric ("00012.png", "12.png") are ordered by
    their integer value; any other name sorts after them, lexicographically.
    """
    stem = os.path.splitext(name)[0]
    if stem.isdecimal():
        return (0, int(stem), name)
    return (1, 0, name)


# Load and preprocess the test data
test_X = []
file_names = []

# os.listdir returns entries in arbitrary order, while the predictions are
# later matched to files purely by position -- so fix the order up front.
test_file_list = sorted(os.listdir(test_data_folder), key=_png_sort_key)
print(len(test_file_list))

# Wrapped in tqdm for a progress bar
for file_name in tqdm(test_file_list, desc="Loading and preprocessing"):
    if not file_name.endswith(".png"):
        continue
    file_path = os.path.join(test_data_folder, file_name)
    try:
        # Deliberately not named `image`: that would shadow the keras `image`
        # module imported at the top and break a re-run of the augmentation cell.
        test_img = cv2.imread(file_path, cv2.IMREAD_GRAYSCALE)
        if test_img is None:
            # cv2.imread reports an unreadable file by returning None, not by raising.
            raise ValueError("cv2.imread could not read the image")
        scaled_pixels = test_img / 255.0  # values between 0 and 1
        test_X.append(scaled_pixels.flatten())  # flatten the 2D array to 1D

        file_names.append(file_name)
    except Exception as e:
        # Best-effort loading: report the file and carry on with the rest.
        print(f"Error processing file {file_name}: {str(e)}")


# Convert to a NumPy array
test_X = np.array(test_X)
print(test_X.shape)

if len(test_X) == 0:
    print("No valid test images found.")
else:
    print("pca 적용")
    # Project with the PCA fitted on the training data, then classify.
    test_X_pca = pca.transform(test_X)
    print(test_X_pca.shape)

    predictions = ensemble_model.predict(test_X_pca)
    

15000


Loading and preprocessing:   0%|          | 0/15000 [00:00<?, ?it/s]

Loading and preprocessing: 100%|██████████| 15000/15000 [00:01<00:00, 9542.31it/s] 


(15000, 784)
pca 적용
(15000, 100)


In [8]:

# Write the submission file: one "<5-digit row index> <predicted label>" line
# per prediction.
# NOTE(review): the id written is the position in `predictions`, which assumes
# the i-th loaded test image is file number i -- confirm against the loader.
with open('testResult4(다반1조).txt', 'w') as file:
    file.writelines(
        f"{i:05d} {pred}\n" for i, pred in enumerate(predictions)
    )

In [9]:
# Preview the first predictions as a quick sanity check. Iterating over a
# slice (instead of indexing 0..9) avoids an IndexError when fewer than ten
# predictions exist.
for pred in predictions[:10]:
    print(pred)

9
6
1
1
7
1
6
8
7
1
