In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score

In [None]:
# 1. 데이터 로드 및 전처리
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

X = train.iloc[:, 2:].values.reshape(-1, 1, 32, 32)  # 1채널 (Grayscale)
y = train['label'].values
X_test = test.iloc[:, 1:].values.reshape(-1, 1, 32, 32)

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [None]:
# 2. CNN 모델 정의
class CNNFeatureExtractor(nn.Module):
    def __init__(self):
        super(CNNFeatureExtractor, self).__init__()
        self.conv_layers = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        self.fc = nn.Linear(64 * 8 * 8, 256)  # Feature Vector (256차원)

    def forward(self, x):
        x = self.conv_layers(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return x


In [None]:
# 3. CNN을 이용한 Feature 추출
cnn = CNNFeatureExtractor()
def extract_features(model, X_data):
    model.eval()
    X_tensor = torch.tensor(X_data, dtype=torch.float32)
    with torch.no_grad():
        features = model(X_tensor).numpy()
    return features

X_train_features = extract_features(cnn, X_train)
X_valid_features = extract_features(cnn, X_valid)
X_test_features = extract_features(cnn, X_test)

In [None]:
# 4. LightGBM 분류 모델 학습
model = LGBMClassifier(n_estimators=200, learning_rate=0.05, max_depth=10, random_state=42)


from lightgbm import early_stopping, log_evaluation

model.fit(
    X_train_features, y_train,
    eval_set=[(X_valid_features, y_valid)],
    eval_metric='multi_logloss',
    callbacks=[log_evaluation(10)]  # 10번째 에포크마다 로그 출력
)




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002974 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 52657
[LightGBM] [Info] Number of data points in the train set: 615, number of used features: 256
[LightGBM] [Info] Start training from score -2.090889
[LightGBM] [Info] Start training from score -2.708050
[LightGBM] [Info] Start training from score -3.020425
[LightGBM] [Info] Start training from score -1.867745
[LightGBM] [Info] Start training from score -1.967275
[LightGBM] [Info] Start training from score -2.144956
[LightGBM] [Info] Start training from score -2.173127
[LightGBM] [Info] Start training from score -3.588409
[LightGBM] [Info] Start training from score -1.796649
[LightGBM] [Info] Start training from score -3.163526
[10]	valid_0's multi_logloss: 1.50218
[20]	valid_0's multi_logloss: 1.22901
[30]	valid_0's multi_logloss: 1.07607
[40]	valid_0's multi_logloss: 0.969168
[50]	valid_0's mul

In [None]:
# 5. 예측 및 결과 저장
y_valid_pred = model.predict(X_valid_features)
valid_accuracy = accuracy_score(y_valid, y_valid_pred)
print("Validation Accuracy:", valid_accuracy)

y_pred = model.predict(X_test_features)
y_pred_labels = label_encoder.inverse_transform(y_pred)

submission = pd.read_csv('sample_submission.csv')
submission['label'] = y_pred_labels
submission.to_csv('submission_cnn_lgb_2.csv', index=False, encoding='utf-8-sig')

Validation Accuracy: 0.7597402597402597




2회차

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore")  # 모든 Warning 제거

# 1. 데이터 로드 및 전처리
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

X = train.iloc[:, 2:].values.reshape(-1, 1, 32, 32)  # 1채널 (Grayscale)
y = train['label'].values
X_test = test.iloc[:, 1:].values.reshape(-1, 1, 32, 32)

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# 2. CNN 모델 정의
class CNNFeatureExtractor(nn.Module):
    def __init__(self):
        super(CNNFeatureExtractor, self).__init__()
        self.conv_layers = nn.Sequential(
            nn.Conv2d(1, 64, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d((4, 4))
        )
        self.fc = nn.Linear(256 * 4 * 4, 512)  # Feature Vector (512차원)

    def forward(self, x):
        x = self.conv_layers(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return x

# 3. CNN을 이용한 Feature 추출
cnn = CNNFeatureExtractor()
def extract_features(model, X_data):
    model.eval()
    X_tensor = torch.tensor(X_data, dtype=torch.float32)
    with torch.no_grad():
        features = model(X_tensor).numpy()
    return features

X_train_features = extract_features(cnn, X_train)
X_valid_features = extract_features(cnn, X_valid)
X_test_features = extract_features(cnn, X_test)

# 4. LightGBM 분류 모델 학습
from lightgbm import early_stopping, log_evaluation

model = LGBMClassifier(
    n_estimators=500,  # 더 많은 트리 사용
    learning_rate=0.03,  # 학습률 감소
    max_depth=15,  # 더 깊은 트리
    num_leaves=128,  # 더 많은 리프 노드 사용
    colsample_bytree=0.8,  # 컬럼 샘플링
    subsample=0.8,  # 데이터 샘플링
    random_state=42
)

model.fit(
    X_train_features, y_train,
    eval_set=[(X_valid_features, y_valid)],
    eval_metric='multi_logloss',
    callbacks=[early_stopping(20), log_evaluation(10)]  # 조기 종료, 로그 출력
)

# 5. 예측 및 결과 저장
y_valid_pred = model.predict(X_valid_features)
valid_accuracy = accuracy_score(y_valid, y_valid_pred)
print("Validation Accuracy:", valid_accuracy)

y_pred = model.predict(X_test_features)
y_pred_labels = label_encoder.inverse_transform(y_pred)

submission = pd.read_csv('sample_submission.csv')
submission['label'] = y_pred_labels
submission.to_csv('submission_cnn_lgbm.csv', index=False, encoding='utf-8-sig')

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004848 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 105383
[LightGBM] [Info] Number of data points in the train set: 615, number of used features: 512
[LightGBM] [Info] Start training from score -2.090889
[LightGBM] [Info] Start training from score -2.708050
[LightGBM] [Info] Start training from score -3.020425
[LightGBM] [Info] Start training from score -1.867745
[LightGBM] [Info] Start training from score -1.967275
[LightGBM] [Info] Start training from score -2.144956
[LightGBM] [Info] Start training from score -2.173127
[LightGBM] [Info] Start training from score -3.588409
[LightGBM] [Info] Start training from score -1.796649
[LightGBM] [Info] Start training from score -3.163526
Training until validation scores don't improve for 20 rounds
[10]	valid_0's multi_logloss: 1.62566
[20]	valid_0's multi_logloss: 1.35156
[30]	valid_0's multi_logloss: 1.1

3회차

In [2]:
import warnings
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Warning 제거
warnings.filterwarnings("ignore")

# 1. 데이터 로드
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

X = train.iloc[:, 2:].values  # 이미지 데이터 (32x32 = 1024 픽셀)
y = train["label"].values

X_test = test.iloc[:, 1:].values  # 테스트 데이터

# Label Encoding
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# 데이터 분할
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 2. CNN Feature Extractor 정의
class CNNFeatureExtractor(nn.Module):
    def __init__(self):
        super(CNNFeatureExtractor, self).__init__()
        self.conv_layers = nn.Sequential(
            nn.Conv2d(1, 64, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),

            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),

            nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),

            nn.Conv2d(256, 512, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d((4, 4))
        )
        self.fc = nn.Sequential(
            nn.Linear(512 * 4 * 4, 1024),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(1024, 512)
        )

    def forward(self, x):
        x = x.view(-1, 1, 32, 32)  # 이미지 형태로 변환
        x = self.conv_layers(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return x

# 3. CNN 학습 함수
def train_cnn(model, X_train, y_train, epochs=30, batch_size=64, lr=0.001):
    model.train()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    dataset = data.TensorDataset(torch.tensor(X_train, dtype=torch.float32),
                                 torch.tensor(y_train, dtype=torch.long))
    dataloader = data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

    for epoch in range(epochs):
        running_loss = 0.0
        for batch_X, batch_y in dataloader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        print(f"Epoch [{epoch+1}/{epochs}], Loss: {running_loss / len(dataloader):.4f}")

# 4. CNN을 이용한 Feature 추출
def extract_features(model, X_data):
    model.eval()
    with torch.no_grad():
        features = model(torch.tensor(X_data, dtype=torch.float32))
    return features.numpy()

# CNN 학습 실행
cnn = CNNFeatureExtractor()
train_cnn(cnn, X_train, y_train, epochs=30)


# 1. CNN Feature 추출
X_train_features = extract_features(cnn, X_train)
X_valid_features = extract_features(cnn, X_valid)
X_test_features = extract_features(cnn, X_test)

# 2. LightGBM 학습
model = LGBMClassifier(
    n_estimators=1500,
    learning_rate=0.01,
    max_depth=20,
    num_leaves=256,
    colsample_bytree=0.7,
    subsample=0.7,
    random_state=42
)

model.fit(
    X_train_features, y_train,
    eval_set=[(X_valid_features, y_valid)],
    eval_metric='multi_logloss'
)

# 3. 모델 성능 평가
y_valid_pred = model.predict(X_valid_features)
valid_accuracy = accuracy_score(y_valid, y_valid_pred)
print("\nValidation Accuracy:", valid_accuracy)

# 4. 최종 예측 및 제출 파일 생성
y_pred = model.predict(X_test_features)
y_pred_labels = label_encoder.inverse_transform(y_pred)

submission = pd.read_csv('sample_submission.csv')
submission['label'] = y_pred_labels
submission.to_csv('./final_submission.csv', index=False, encoding='utf-8-sig')

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m

Validation Accuracy: 0.8896103896103896
