In [None]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn import ensemble
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

from sklearn.model_selection import train_test_split
import time
import os

In [None]:
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning)

In [None]:
# Names of the regular files found directly inside the private-data directory
# (sub-directories are filtered out by the isfile check).
# NOTE(review): os.listdir() returns entries in arbitrary order; any later
# pairing of file_names with rows of Private_data.csv assumes the two orders
# agree -- confirm against whatever produced the CSV.
data_dir = "./private_data/merged_result_data_shift_half_1/data"
file_names = [
    entry
    for entry in os.listdir(data_dir)
    if os.path.isfile(os.path.join(data_dir, entry))
]

# Raw tables: train / test splits and the private evaluation set.
train_df = pd.read_csv('./archive/fashion-mnist_train.csv', sep=',')
test_df = pd.read_csv('./archive/fashion-mnist_test.csv', sep=',')
private_df = pd.read_csv('./Private_data_csv/Private_data/Private_data.csv', sep=',')

# Convert each table to a float32 matrix.
train_data, test_data, private_data = (
    np.array(frame, dtype='float32')
    for frame in (train_df, test_df, private_df)
)

# In the train/test matrices column 0 is the ground-truth label and the
# remaining columns are the features.
# NOTE(review): private_data keeps every CSV column as a feature -- this
# assumes Private_data.csv has no label column; TODO confirm.
y_train, X_train = train_data[:, 0], train_data[:, 1:]  # training labels / training features
y_test, X_test = test_data[:, 0], test_data[:, 1:]      # test labels / test features

In [None]:
# Standardise the features. The scaler is fitted on the training features only
# and then re-used, unchanged, for the test and private sets.
stdscaler = StandardScaler()
X_train_scale = stdscaler.fit_transform(X_train)
X_test_scale, private_scale = (
    stdscaler.transform(features) for features in (X_test, private_data)
)

# Project onto 187 principal components; the PCA is likewise fitted on the
# (scaled) training set and applied as-is to the other two sets.
# NOTE(review): the reason for choosing 187 components is not shown here.
pca = PCA(n_components=187, random_state=42)
X_train_pca = pca.fit_transform(X_train_scale)
X_test_pca, private_pca = (
    pca.transform(scaled) for scaled in (X_test_scale, private_scale)
)

In [None]:
# Shared seed for every stochastic estimator below, so that repeated runs give
# the same fitted models, accuracy and prediction files. The value matches the
# random_state already used by the PCA step.
RANDOM_STATE = 42

# Base classifiers, all seeded with RANDOM_STATE.
logi = LogisticRegression(max_iter=200, solver='liblinear', random_state=RANDOM_STATE)
# probability=True makes the SVC expose predict_proba, which soft voting needs.
svm_rbf = svm.SVC(C=13, kernel='rbf', gamma='auto', probability=True,
                  random_state=RANDOM_STATE)
RF = ensemble.RandomForestClassifier(criterion='entropy', max_depth=70, n_estimators=100,
                                     random_state=RANDOM_STATE)
Gboost = ensemble.GradientBoostingClassifier(n_estimators=100, random_state=RANDOM_STATE)
XGB = XGBClassifier(use_label_encoder=False, objective='multi:softmax', eval_metric='merror',
                    random_state=RANDOM_STATE)

In [None]:
# Soft-voting ensemble over the four base estimators: the predicted class is
# the one with the largest summed class probability across the estimators.
vote4_soft = ensemble.VotingClassifier(
    [
        ('logistic', logi),
        ('RandomForest', RF),
        ('Gradient_B', Gboost),
        ('svm_rbf', svm_rbf),
    ],
    voting='soft',
    n_jobs=3,
    verbose=True,
)

# Fit on the PCA-reduced training set and score on the PCA-reduced test set.
vote4_soft.fit(X_train_pca, y_train)
vote4_soft_y_pred = vote4_soft.predict(X_test_pca)
vote4_soft_acc = accuracy_score(y_test, vote4_soft_y_pred)
print("voting(logistic,RF,GradientBoost,최적svm) 정확도", vote4_soft_acc)

# Save the prediction for every test sample to a text file, one
# "<row index> <predicted label>" pair per line.
# The test samples are rows of the test CSV, so they are identified by their
# row index. `file_names` lists the private-data directory and must not be
# paired with these predictions: doing so attaches private image names to
# unrelated test predictions and silently truncates the output.
result_file_path = 'testRsult_Vote4_soft_y.txt'

with open(result_file_path, "w") as f:
    for row_idx, prediction in enumerate(vote4_soft_y_pred):
        # Predictions are floats because the labels were loaded as float32.
        f.write(f"{row_idx} {int(prediction)}\n")

voting(logistic,RF,GradientBoost,최적svm) 정확도 0.8927


In [None]:
# Save the prediction for every private sample to a text file, one
# "<file name without .png> <predicted label>" pair per line.
vote4_soft_pivate = vote4_soft.predict(private_pca)
result_file_path = 'testRsult_Vote4_soft_private.txt'

# zip() stops at the shorter sequence, which would silently drop samples or
# file names; fail loudly if the directory listing and the predictions differ
# in length instead of writing an incomplete result file.
if len(file_names) != len(vote4_soft_pivate):
    raise ValueError(
        f"{len(file_names)} files found in {data_dir!r} but "
        f"{len(vote4_soft_pivate)} private predictions were produced"
    )

# NOTE(review): file_names comes from os.listdir(), whose order is arbitrary.
# This pairing assumes it matches the row order of Private_data.csv -- confirm.
with open(result_file_path, "w") as f:
    for f_name, prediction in zip(file_names, vote4_soft_pivate):
        sample_id = f_name.replace(".png", "")
        f.write(f"{sample_id} {int(prediction)}\n")