# 1

In [1]:
import pandas as pd

In [2]:
train = pd.read_csv("../data/raw_data/train.csv")
test = pd.read_csv("../data/raw_data/test.csv")

In [3]:
train = train.drop(columns = ['ID'])
test = test.drop(columns = ['ID'])

In [4]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
train['SUBCLASS'] = le.fit_transform(train['SUBCLASS'])

In [5]:
X = pd.read_csv("../data/preprocessing/X_003.csv")
y = train['SUBCLASS']

In [6]:
test = pd.read_csv("../data/preprocessing/Y_003.csv")

## PCA

In [7]:
from sklearn.decomposition import PCA

pca = PCA(n_components=100)
X = pca.fit_transform(X)
test_pca = pca.transform(test)

In [8]:
from sklearn.preprocessing import MinMaxScaler

In [9]:
scaler = MinMaxScaler()
scaler.fit(X)

In [10]:
X = scaler.transform(X)
test_pca = scaler.transform(test_pca)

## train_test_split

In [219]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

## SVC

In [12]:
from sklearn.svm import SVC

In [13]:
model = SVC(C=1, gamma='scale',random_state=42)

In [14]:
model.fit(x_train, y_train)

In [15]:
model.score(x_train, y_train)

0.30806451612903224

In [16]:
model.score(x_test, y_test)

0.17033852767329394

## DL

In [220]:
from tensorflow.keras.utils import to_categorical

In [221]:
y_train = np.array(y_train)
y_test = np.array(y_test)

In [204]:
y_train = to_categorical(y_train, 26)
y_test = to_categorical(y_test, 26)

In [223]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.regularizers import l2
from tensorflow.keras.layers import BatchNormalization

In [246]:
len(x_train.columns)

13152

In [248]:
# 모델 정의
model = Sequential()
model.add(Dense(256, input_shape=(13152, ), activation='relu'))  # 입력 레이어 kernel_regularizer=l2(0.001)
model.add(Dropout(0.2))
model.add(BatchNormalization())
model.add(Dense(128, activation='relu'))  # 은닉층 kernel_regularizer=l2(0.001)
model.add(Dropout(0.2))
model.add(BatchNormalization()) 
model.add(Dense(64, activation='relu'))   # 은닉층 kernel_regularizer=l2(0.001)
model.add(Dropout(0.2))
model.add(BatchNormalization()) 
model.add(Dense(26, activation='softmax'))  # 출력 레이어 (다중 클래스)

In [249]:
from tensorflow.keras.optimizers import Adam

In [250]:
# 모델 컴파일
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [251]:
from tensorflow.keras.callbacks import Callback

In [252]:
class F1ScoreCallback(Callback):
    def __init__(self, validation_data):
        super().__init__()
        self.validation_data = validation_data  # validation_data를 직접 전달

    def on_epoch_end(self, epoch, logs=None):
        if self.validation_data is not None:  # validation_data가 None인지 확인
            X_val, y_val = self.validation_data
            y_pred = np.argmax(self.model.predict(X_val), axis=1)
            y_true = np.argmax(y_val, axis=1)
            
            # Micro F1 Score 계산
            micro_f1 = f1_score(y_true, y_pred, average='micro')
            print(f"Epoch {epoch+1} - Micro F1 Score: {micro_f1:.4f}")
        else:
            print("Validation data is not available.")


In [253]:
from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

In [254]:
from tensorflow.keras.callbacks import ReduceLROnPlateau

reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=0.00001)

In [255]:
import numpy as np

In [256]:
f1_score_callback = F1ScoreCallback(validation_data=(x_test, y_test))

In [None]:
# 모델 학습
history = model.fit(x_train, y_train, 
                    validation_data=(x_test, y_test),
                    epochs=50, batch_size=32,
                    callbacks=[f1_score_callback]) # reduce_lr, early_stopping, 

Epoch 1/50


# 2

In [7]:
from tqdm import tqdm

X_change = pd.DataFrame()

In [14]:
import warnings
warnings.filterwarnings('ignore')

In [15]:
for i in tqdm(X.columns):
    if '_position' in i:
        X_change[i] = X[i]

100%|████████████████████████████████████████████████████████| 13152/13152 [00:02<00:00, 4730.25it/s]


In [28]:
for i in tqdm(range(len(X.columns))):
    if i % 3 == 0:
        if -1 in X.iloc[:, i+2].values:
            X_change[X.columns[i].split('_')[0] + '_change'] = X.iloc[:,i+2] * X.iloc[:, i]
        else :
            X_change[X.columns[i].split('_')[0] + '_change'] = X.iloc[:,i+2] - X.iloc[:, i]

100%|████████████████████████████████████████████████████████| 13152/13152 [00:10<00:00, 1300.88it/s]


In [33]:
test_change = pd.DataFrame()

In [34]:
for i in tqdm(test.columns):
    if '_position' in i:
        test_change[i] = test[i]

100%|████████████████████████████████████████████████████████| 13152/13152 [00:02<00:00, 5227.72it/s]


In [35]:
for i in tqdm(range(len(test.columns))):
    if i % 3 == 0:
        if -1 in test.iloc[:, i+2].values:
            test_change[test.columns[i].split('_')[0] + '_change'] = test.iloc[:,i+2] * test.iloc[:, i]
        else :
            test_change[test.columns[i].split('_')[0] + '_change'] = test.iloc[:,i+2] - test.iloc[:, i]

100%|████████████████████████████████████████████████████████| 13152/13152 [00:08<00:00, 1598.16it/s]


In [37]:
print(X_change.shape, test_change.shape)

(6201, 8768) (2546, 8768)


In [38]:
X_change.to_csv("../data/preprocessing/X_change_001.csv", index=False)
test_change.to_csv("../data/preprocessing/test_change_001.csv", index=False)

## PCA

In [90]:
from sklearn.decomposition import PCA

pca = PCA(n_components=200)
X_pca = pca.fit_transform(X)
test_pca = pca.transform(test)

In [91]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(X_pca)

In [92]:
X_pca = scaler.transform(X_pca)
test_pca = scaler.transform(test_pca)

In [93]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.3, random_state=42, stratify=y)

## SVC

In [94]:
from sklearn.svm import SVC

In [95]:
model = SVC(C=1, gamma='scale',random_state=42)

In [96]:
model.fit(x_train, y_train)

In [97]:
model.score(x_train, y_train)

0.3741935483870968

In [98]:
model.score(x_test, y_test)

0.21601289629231596

## DL

In [109]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X_change, y, test_size=0.3, random_state=42, stratify=y)

In [153]:
from tensorflow.keras.utils import to_categorical

In [154]:
import numpy as np

In [155]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.regularizers import l2
from tensorflow.keras.layers import BatchNormalization

In [172]:
# 모델 정의
model = Sequential()
model.add(Dense(1000, input_shape=(len(x_train.columns), ), activation='relu', kernel_regularizer=l2(0.1)))  # 입력 레이어 kernel_regularizer=l2(0.001)
model.add(Dropout(0.5))
model.add(BatchNormalization())
model.add(Dense(500, activation='relu', kernel_regularizer=l2(0.1)))  # 은닉층 kernel_regularizer=l2(0.001)
model.add(Dropout(0.5))
model.add(BatchNormalization()) 
model.add(Dense(100, activation='relu', kernel_regularizer=l2(0.1)))   # 은닉층 kernel_regularizer=l2(0.001)
model.add(Dropout(0.5))
model.add(BatchNormalization()) 
model.add(Dense(26, activation='softmax'))  # 출력 레이어 (다중 클래스)

In [173]:
from tensorflow.keras.optimizers import Adam

In [174]:
# 모델 컴파일
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [175]:
from tensorflow.keras.callbacks import Callback

In [176]:
from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

In [177]:
from tensorflow.keras.callbacks import ReduceLROnPlateau

reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=0.00001)

In [178]:
import numpy as np

In [179]:
# 모델 학습
history = model.fit(x_train, y_train, 
                    validation_data=(x_test, y_test),
                    epochs=50, batch_size=32,
                    callbacks=[reduce_lr, early_stopping]) # reduce_lr, early_stopping, 

Epoch 1/50
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 47ms/step - accuracy: 0.0685 - loss: 88.1415 - val_accuracy: 0.1660 - val_loss: 12.8936 - learning_rate: 0.0010
Epoch 2/50
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 42ms/step - accuracy: 0.1611 - loss: 11.1856 - val_accuracy: 0.1908 - val_loss: 8.1043 - learning_rate: 0.0010
Epoch 3/50
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 42ms/step - accuracy: 0.1839 - loss: 8.0114 - val_accuracy: 0.1838 - val_loss: 7.9422 - learning_rate: 0.0010
Epoch 4/50
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 42ms/step - accuracy: 0.1995 - loss: 8.0222 - val_accuracy: 0.1929 - val_loss: 7.9832 - learning_rate: 0.0010
Epoch 5/50
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 43ms/step - accuracy: 0.2059 - loss: 7.8686 - val_accuracy: 0.1961 - val_loss: 7.6118 - learning_rate: 0.0010
Epoch 6/50
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

In [184]:
x_train.shape

(4340, 8768)

In [183]:
test.shape

(2546, 13152)

In [188]:
# 테스트 데이터
predictions = model.predict(test)
predicted_classes = np.argmax(predictions, axis=1)

# 예측 결과 출력
print(predicted_classes)

[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
[ 7 21  8 ... 20 10  6]


In [189]:
np.unique(predicted_classes)

array([ 0,  1,  2,  3,  4,  6,  7,  8, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19, 20, 21, 22, 23, 24, 25], dtype=int64)

In [190]:
submission = pd.read_csv("../data/raw_data/sample_submission.csv")

In [191]:
test_predict = le.inverse_transform(predicted_classes)

In [192]:
submission['SUBCLASS'] = test_predict

In [193]:
submission['SUBCLASS'].value_counts()

SUBCLASS
BRCA      732
STES      387
KIPAN     329
GBMLGG    254
HNSC      120
SKCM      111
THCA       83
PRAD       81
OV         80
LIHC       62
COAD       57
UCEC       51
LUAD       46
SARC       38
CESC       28
TGCT       22
LUSC       16
ACC        13
PAAD       11
LAML       11
PCPG        6
LGG         4
BLCA        3
THYM        1
Name: count, dtype: int64

In [194]:
submission.to_csv("../data/submission/submission_14.csv", index=False)