In [2]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import cv2
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score
from scipy.ndimage import rotate

In [3]:
# 데이터 로드
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Feature(X)와 Target(y) 분리
X = train.iloc[:, 2:].values  # 이미지 데이터 (1024 픽셀)
y = train["label"].values     # 정답 라벨
X_test = test.iloc[:, 1:].values  # 테스트 데이터

# 이미지 형태로 변환
X = X.reshape(-1, 32, 32)
X_test = X_test.reshape(-1, 32, 32)

# 데이터 증강 함수
def augment_image(image):
    aug_images = []
    
    # 원본
    aug_images.append(image)
    
    # 좌우 반전
    aug_images.append(np.fliplr(image))
    
    # 밝기 조절
    brightness_factor = np.random.uniform(0.85, 1.15)
    aug_images.append(np.clip(image * brightness_factor, 0, 255).astype(np.uint8))
    
    # 가우시안 노이즈 추가
    noise = np.random.normal(0, 15, image.shape)
    aug_images.append(np.clip(image + noise, 0, 255).astype(np.uint8))
    
    # 회전 (15도, -15도)
    aug_images.append(rotate(image, angle=15, reshape=False))
    aug_images.append(rotate(image, angle=-15, reshape=False))
    
    return [img.flatten() for img in aug_images]

# 데이터 증강 적용
augmented_images = []
augmented_labels = []
for img, label in zip(X, y):
    augmented_imgs = augment_image(img)
    augmented_images.extend(augmented_imgs)
    augmented_labels.extend([label] * len(augmented_imgs))

X_augmented = np.array(augmented_images)
y_augmented = np.array(augmented_labels)

# 2D 데이터로 변환
X_test = X_test.reshape(X_test.shape[0], -1)

# 라벨을 숫자로 변환
label_encoder = LabelEncoder()
y_augmented = label_encoder.fit_transform(y_augmented)

# 데이터 정규화
scaler = StandardScaler()
X_augmented = scaler.fit_transform(X_augmented)
X_test = scaler.transform(X_test)

# Stratified K-Fold (5-Fold 적용)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracies = []


In [None]:

for train_idx, valid_idx in skf.split(X_augmented, y_augmented):
    X_train, X_valid = X_augmented[train_idx], X_augmented[valid_idx]
    y_train, y_valid = y_augmented[train_idx], y_augmented[valid_idx]
    
    model = lgb.LGBMClassifier(
        num_leaves=100,
        learning_rate=0.03,
        n_estimators=2500,
        max_depth=10,
        feature_fraction=0.85,
        bagging_fraction=0.85,
        bagging_freq=5,
        min_child_samples=30,
        lambda_l1=0.2,
        lambda_l2=0.2,
        colsample_bytree=0.85,
        objective="multiclass",
        num_class=len(np.unique(y_augmented))
    )
    
    model.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric="multi_logloss",
        callbacks=[lgb.early_stopping(100), lgb.log_evaluation(50)]
    )
    
    y_valid_pred = model.predict(X_valid)
    accuracies.append(accuracy_score(y_valid, y_valid_pred))
    
print("Cross-validation Accuracy:", np.mean(accuracies))

# 최적 모델로 전체 데이터 학습
final_model = lgb.LGBMClassifier(
    num_leaves=100,
    learning_rate=0.03,
    n_estimators=2500,
    max_depth=10,
    feature_fraction=0.85,
    bagging_fraction=0.85,
    bagging_freq=5,
    min_child_samples=30,
    lambda_l1=0.2,
    lambda_l2=0.2,
    colsample_bytree=0.85,
    objective="multiclass",
    num_class=len(np.unique(y_augmented))
)

final_model.fit(X_augmented, y_augmented)

# 테스트 데이터 예측
y_pred = final_model.predict(X_test)
y_pred_labels = label_encoder.inverse_transform(y_pred)

# 제출 파일 생성
submission = pd.read_csv("sample_submission.csv")
submission["label"] = y_pred_labels
submission.to_csv("final_submission.csv", index=False, encoding="utf-8-sig")

print("Final submission saved as 'final_submission.csv'")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.054939 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 217558
[LightGBM] [Info] Number of data points in the train set: 3691, number of used features: 1024
[LightGBM] [Info] Start training from score -2.091160
[LightGBM] [Info] Start training from score -2.712394
[LightGBM] [Info] Start training from score -3.037503
[LightGBM] [Info] Start training from score -1.866263
[LightGBM] [Info] Start training from score -1.963677
[LightGBM] [Info] Start training from score -2.145227
[LightGBM] [Info] Start training from score -2.168647
[LightGBM] [Info] Start training from score -3.598532
[LightGBM] [Info] Start training from score -1.792030
[LightGBM] [Info] Start training from score -3.176700
Training until validation scores don't improve for 100 rounds
[50]	valid_0's multi_logloss: 0.694092
[100]	valid_0's multi_logloss: 0.396335
[150]	valid_0's multi_loglo

Exception ignored on calling ctypes callback function: <function _log_callback at 0x000001B91704C3A0>
Traceback (most recent call last):
  File "c:\Users\sanna\miniconda3\envs\myenv\lib\site-packages\lightgbm\basic.py", line 257, in _log_callback
    def _log_callback(msg: bytes) -> None:
KeyboardInterrupt: 


No further splits with positive gain, best gain: -inf
[300]	valid_0's multi_logloss: 0.210054
[350]	valid_0's multi_logloss: 0.202685
[400]	valid_0's multi_logloss: 0.199048
[450]	valid_0's multi_logloss: 0.197481
[500]	valid_0's multi_logloss: 0.196429
[550]	valid_0's multi_logloss: 0.195768
[600]	valid_0's multi_logloss: 0.195171
[650]	valid_0's multi_logloss: 0.195026
[700]	valid_0's multi_logloss: 0.194926
[750]	valid_0's multi_logloss: 0.194781
[800]	valid_0's multi_logloss: 0.194799
Early stopping, best iteration is:
[748]	valid_0's multi_logloss: 0.194775
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.037837 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 217512
[LightGBM] [Info] Number of data points in the train set: 3691, number of used features: 1024
[LightGBM] [Info] Start training from score -2.091160
[LightGBM] [Info] Start training from score -2.712394
[LightGBM] [Info] Start trainin