In [52]:
import pandas as pd

In [53]:
train = pd.read_csv("../data/raw_data/train.csv")
test = pd.read_csv("../data/raw_data/test.csv")

In [54]:
train = train.drop(columns = ['ID'])
test = test.drop(columns = ['ID'])

In [55]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
train['SUBCLASS'] = le.fit_transform(train['SUBCLASS'])

In [259]:
X = pd.read_csv("../data/preprocessing/X_003.csv")
y = train['SUBCLASS']

In [260]:
test = pd.read_csv("../data/preprocessing/Y_003.csv")

In [262]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [306]:
from sklearn.decomposition import PCA

In [307]:
# PCA로 차원 축소 (예: 500개의 주요 구성 요소로 축소)
pca = PCA(n_components=500)
x_train = pca.fit_transform(x_train)
x_test = pca.transform(x_test)
test_pca = pca.transform(test)

In [331]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.regularizers import l2
from tensorflow.keras.layers import BatchNormalization

In [352]:
# 모델 정의
model = Sequential()
model.add(Dense(256, input_dim=x_train.shape[1], activation='relu'))  # 입력 레이어 kernel_regularizer=l2(0.001)
model.add(Dropout(0.2))
model.add(BatchNormalization())
model.add(Dense(128, activation='relu'))  # 은닉층 kernel_regularizer=l2(0.001)
model.add(Dropout(0.2))
model.add(BatchNormalization()) 
model.add(Dense(64, activation='relu'))   # 은닉층 kernel_regularizer=l2(0.001)
model.add(Dropout(0.2))
model.add(BatchNormalization()) 
model.add(Dense(26, activation='softmax'))  # 출력 레이어 (다중 클래스)

In [353]:
from tensorflow.keras.optimizers import Adam

In [354]:
# 모델 컴파일
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [355]:
from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

In [356]:
from tensorflow.keras.callbacks import ReduceLROnPlateau

reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=0.00001)

In [357]:
# 모델 학습
history = model.fit(x_train, y_train, 
                    validation_data=(x_test, y_test),
                    epochs=50, batch_size=32,
                    callbacks=[reduce_lr, early_stopping])

Epoch 1/50
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.0616 - loss: 3.6710 - val_accuracy: 0.1757 - val_loss: 3.0412 - learning_rate: 0.0010
Epoch 2/50
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.1655 - loss: 2.9532 - val_accuracy: 0.1961 - val_loss: 2.8566 - learning_rate: 0.0010
Epoch 3/50
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.2161 - loss: 2.7498 - val_accuracy: 0.2020 - val_loss: 2.7944 - learning_rate: 0.0010
Epoch 4/50
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.2491 - loss: 2.5564 - val_accuracy: 0.2160 - val_loss: 2.7305 - learning_rate: 0.0010
Epoch 5/50
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.2881 - loss: 2.4151 - val_accuracy: 0.2176 - val_loss: 2.7077 - learning_rate: 0.0010
Epoch 6/50
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m

In [358]:
import numpy as np

In [359]:
# 테스트 데이터 예측
predictions = model.predict(test_pca)
predicted_classes = np.argmax(predictions, axis=1)

# 예측 결과 출력
print(predicted_classes)

[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
[ 2 21  8 ... 21 19 11]


In [360]:
np.unique(predicted_classes)

array([ 0,  1,  2,  3,  4,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 18,
       19, 20, 21, 22, 23, 25], dtype=int64)

In [361]:
submission = pd.read_csv("../data/raw_data/sample_submission.csv")

In [362]:
test_predict = le.inverse_transform(predicted_classes)

In [364]:
submission['SUBCLASS'] = test_predict

In [367]:
submission['SUBCLASS'].value_counts()

SUBCLASS
BRCA      634
KIPAN     424
GBMLGG    282
STES      225
COAD      184
OV         96
LAML       93
SKCM       86
HNSC       76
THCA       68
KIRC       51
PRAD       50
LUAD       47
UCEC       46
LUSC       39
CESC       29
ACC        27
SARC       24
LGG        23
TGCT       21
BLCA        8
LIHC        8
PAAD        5
Name: count, dtype: int64

In [368]:
submission.to_csv("../data/submission/submission_13.csv", index=False)