In [119]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
from tqdm import tqdm

In [120]:
train = pd.read_csv("../data/raw_data/train.csv")
test = pd.read_csv("../data/raw_data/test.csv")

In [121]:
train = train.drop(columns = ['ID'])
test = test.drop(columns = ['ID'])

In [122]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
train['SUBCLASS'] = le.fit_transform(train['SUBCLASS'])

In [123]:
train[test.columns] = train[test.columns].map(lambda x: 0 if x == 'WT' else x)
test[test.columns] = test[test.columns].map(lambda x: 0 if x == 'WT' else x)

In [124]:
train.head()

Unnamed: 0,SUBCLASS,A2M,AAAS,AADAT,AARS1,ABAT,ABCA1,ABCA2,ABCA3,ABCA4,...,ZNF292,ZNF365,ZNF639,ZNF707,ZNFX1,ZNRF4,ZPBP,ZW10,ZWINT,ZYX
0,8,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,19,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,20,R895R,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [125]:
len(train.loc[:, 'A2M'].value_counts())

141

In [126]:
for i in tqdm(test.columns):
    if len(train.loc[:, i].value_counts()) == 1:
        train.drop(columns=i, axis=1, inplace=True)
        test.drop(columns=i, axis=1, inplace=True)
    elif len(test.loc[:, i].value_counts()) == 1:
        train.drop(columns=i, axis=1, inplace=True)
        test.drop(columns=i, axis=1, inplace=True)

100%|██████████████████████████████████████████████████████████████████████████████| 4384/4384 [03:07<00:00, 23.32it/s]


train.to_csv("../data/preprocessing/train.csv", encoding='utf8', index=False)  
test.to_csv("../data/preprocessing/test.csv", encoding='utf8', index=False)

In [177]:
train = pd.read_csv("../data/preprocessing/train.csv")
test = pd.read_csv("../data/preprocessing/test.csv")

In [178]:
import re

def extract_numbers(s):
    # 입력값이 문자열이 아닐 경우 문자열로 변환
    if not isinstance(s, str):
        s = str(s)
    match = re.search(r'\d+', s)
    if match:
        return match.group()
    else:
        return None

In [179]:
for i in tqdm(test.columns):
    A = train.loc[:, i].str.split(expand=True).fillna(0)
    for j in range(len(A.columns)):
        A[j] = A[j].apply(extract_numbers)
    A = A.astype('int')
    
    train.loc[:, i] = A.sum(axis=1)

100%|██████████████████████████████████████████████████████████████████████████████| 4225/4225 [03:44<00:00, 18.85it/s]


In [180]:
for i in tqdm(test.columns):
    A = test.loc[:, i].str.split(expand=True).fillna(0)
    for j in range(len(A.columns)):
        A[j] = A[j].apply(extract_numbers)
    A = A.astype('int')
    
    test.loc[:, i] = A.sum(axis=1)

100%|██████████████████████████████████████████████████████████████████████████████| 4225/4225 [00:51<00:00, 82.21it/s]


In [190]:
train = train.astype('int')
test = test.astype('int')

train.to_csv("../data/preprocessing/train_001.csv", encoding='utf8', index=False)  
test.to_csv("../data/preprocessing/test_001.csv", encoding='utf8', index=False)

In [263]:
train = pd.read_csv("../data/preprocessing/train_001.csv")
test = pd.read_csv("../data/preprocessing/test_001.csv")

In [264]:
X = train[test.columns]
y = train['SUBCLASS']

from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=400)
X_train_reduced = svd.fit_transform(X)

---

In [265]:
from sklearn.model_selection import train_test_split

In [266]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [267]:
import matplotlib.pyplot as plt

# Xgboost

In [268]:
import xgboost as xgb

In [269]:
dtrain = xgb.DMatrix(data = x_train, label = y_train)
dtest = xgb.DMatrix(data = x_test, label = y_test)

In [278]:
params = {
    'objective': 'multi:softmax',  # 다중 분류
    'num_class': 26,               # 클래스 수
    'eval_metric': 'mlogloss',     # 평가 지표
    'seed' : 42,
    'alpha': 0.5,  # L1 정규화
    'lambda': 1.5,  # L2 정규화,
    'max_depth': 4,  # 트리 깊이 줄이기
    # 'learning_rate': 0.01,  # 학습률 조정
}

In [279]:
wlist = [(dtrain, "train"), (dtest, "eval")]

xgb_model = xgb.train(params = params, dtrain = dtrain, num_boost_round = 400,
                      evals = wlist, early_stopping_rounds=10)

[0]	train-mlogloss:2.67415	eval-mlogloss:2.76080
[1]	train-mlogloss:2.47218	eval-mlogloss:2.60565
[2]	train-mlogloss:2.33162	eval-mlogloss:2.50858
[3]	train-mlogloss:2.22572	eval-mlogloss:2.43550
[4]	train-mlogloss:2.13961	eval-mlogloss:2.38414
[5]	train-mlogloss:2.06800	eval-mlogloss:2.33987
[6]	train-mlogloss:2.00752	eval-mlogloss:2.30733
[7]	train-mlogloss:1.95437	eval-mlogloss:2.27812
[8]	train-mlogloss:1.90948	eval-mlogloss:2.25491
[9]	train-mlogloss:1.86886	eval-mlogloss:2.23608
[10]	train-mlogloss:1.83339	eval-mlogloss:2.21844
[11]	train-mlogloss:1.80010	eval-mlogloss:2.20175
[12]	train-mlogloss:1.77194	eval-mlogloss:2.19092
[13]	train-mlogloss:1.74441	eval-mlogloss:2.17982
[14]	train-mlogloss:1.71994	eval-mlogloss:2.16982
[15]	train-mlogloss:1.69866	eval-mlogloss:2.16167
[16]	train-mlogloss:1.67714	eval-mlogloss:2.15375
[17]	train-mlogloss:1.65661	eval-mlogloss:2.14568
[18]	train-mlogloss:1.63777	eval-mlogloss:2.14079
[19]	train-mlogloss:1.61924	eval-mlogloss:2.13677
[20]	train

In [280]:
from sklearn.metrics import accuracy_score

In [281]:
accuracy_score(y_train, xgb_model.predict(dtrain))

0.6986175115207374

In [282]:
accuracy_score(y_test, xgb_model.predict(dtest))

0.3530360021493821

---

In [284]:
test_DM = xgb.DMatrix(data=test)

In [285]:
test_predict = xgb_model.predict(test_DM).astype('int')

In [286]:
test_predict = le.inverse_transform(test_predict)

In [287]:
submission = pd.read_csv("../data/raw_data/sample_submission.csv")

In [288]:
submission['SUBCLASS'] = test_predict

In [289]:
submission.to_csv("../data/submission/submission_10.csv", index=False)