In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
train = pd.read_csv("../data/raw_data/train.csv")
test = pd.read_csv("../data/raw_data/test.csv")

In [3]:
train = train.drop(columns = ['ID'])
test = test.drop(columns = ['ID'])

In [4]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
train['SUBCLASS'] = le.fit_transform(train['SUBCLASS'])

In [5]:
X = train[test.columns]
y = train['SUBCLASS']

In [6]:
import re

In [7]:
def extract_numbers(s):
    # 입력값이 문자열이 아닐 경우 문자열로 변환

    if not isinstance(s, str):
        s = str(s)
    match = re.match(r"([A-Za-z]+)(\d+)([A-Za-z]+)", s)
    if match:
        original = match.group(1)  # 첫 번째 그룹: 원래 아미노산
        position = match.group(2)  # 두 번째 그룹: 변이 위치
        new = match.group(3)       # 세 번째 그룹: 새로운 아미노산

        return [original, position, new]
    else:
        None

In [37]:
amino_acid_dict = {
    'A': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'H': 7, 'I': 8, 'K': 9, 'L': 10, 
    'M': 11, 'N': 12, 'P': 13, 'Q': 14, 'R': 15, 'S': 16, 'T': 17, 'V': 18, 'W': 19, 'Y': 20, 'fs' : -1
}

def encode_sequence(sequence):
    return [amino_acid_dict[aa] for aa in sequence]

In [139]:
X = X.map(lambda x : 0 if x == 'WT' else x)

In [140]:
for i in tqdm(X.columns):
    X[i + '_original'] = 0
    X[i + '_position'] = 0
    X[i + '_new'] = 0

    A = X[i].apply(extract_numbers)
    A = A.fillna(0)

    for j in range(len(X)):
        try:
            X.loc[j, i + '_original'], X.loc[j, i + '_position'], X.loc[j, i + '_new'] = A.loc[j][0], A.loc[j][1], A.loc[j][2]
        except:
            pass

    X[i + '_position'] = X[i + '_position'].astype('int')
    X.drop(columns=i, axis=1, inplace=True)

100%|██████████████████████████████████████████████████████████| 4384/4384 [1:38:13<00:00,  1.34s/it]


X.to_csv("../data/preprocessing/X_002.csv", index=False, encoding='utf-8')

In [160]:
X = pd.read_csv("../data/preprocessing/X_002.csv")

In [161]:
X.head()

Unnamed: 0,A2M_original,A2M_position,A2M_new,AAAS_original,AAAS_position,AAAS_new,AADAT_original,AADAT_position,AADAT_new,AARS1_original,...,ZPBP_new,ZW10_original,ZW10_position,ZW10_new,ZWINT_original,ZWINT_position,ZWINT_new,ZYX_original,ZYX_position,ZYX_new
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,R,895,R,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [162]:
amino_acid_array = np.array([amino_acid_dict[aa] for aa in amino_acid_dict])

In [163]:
# 빠른 벡터화된 매핑 적용 (문자만 변환, 정수는 그대로 유지)
def encode_dataframe(df):
    def encode_cell(cell):
        # 문자열일 경우에만 amino_acid_dict 변환 적용
        if isinstance(cell, str):
            return amino_acid_dict.get(cell, cell)  # 해당하지 않는 경우 원래 값을 반환
        return cell  # 정수는 그대로 반환
    
    # applymap으로 셀별로 함수 적용
    return df.applymap(encode_cell)

In [164]:
X = encode_dataframe(X)

In [165]:
a = []

for i in tqdm(X.columns):
    try:
        X[i].astype('int')
    except:
        a.append(i)

100%|████████████████████████████████████████████████████████| 13152/13152 [00:02<00:00, 5229.61it/s]


In [166]:
len(a)

627

In [218]:
tmp = X[a]

In [224]:
for i in tqdm(tmp.columns):
    for j in range(len(tmp)):
        try:
            tmp.loc[j, i] = int(tmp.loc[j, i])
        except:
            tmp.loc[j, i] = sum(encode_sequence(list(tmp.loc[j,i].upper())))
            pass

100%|██████████████████████████████████████████████████████████████| 627/627 [07:24<00:00,  1.41it/s]


In [229]:
tmp = tmp.astype('int')

In [232]:
X[a] = tmp

In [241]:
X = X.astype('int')

X.to_csv("../data/preprocessing/X_003.csv", index=False, encoding='utf-8')

In [245]:
from sklearn.model_selection import train_test_split

In [246]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Xgboost

In [247]:
import xgboost as xgb

In [248]:
dtrain = xgb.DMatrix(data = x_train, label = y_train)
dtest = xgb.DMatrix(data = x_test, label = y_test)

In [254]:
params = {
    'objective': 'multi:softmax',  # 다중 분류
    'num_class': 26,               # 클래스 수
    'eval_metric': 'mlogloss',     # 평가 지표
    'seed' : 42,
    'alpha': 0.1,  # L1 정규화
    'lambda': 1,  # L2 정규화,
    'max_depth': 4,  # 트리 깊이 줄이기
    'learning_rate': 0.1,  # 학습률 조정
}

In [255]:
wlist = [(dtrain, "train"), (dtest, "eval")]

xgb_model = xgb.train(params = params, dtrain = dtrain, num_boost_round = 400,
                      evals = wlist, early_stopping_rounds=10)

[0]	train-mlogloss:3.05580	eval-mlogloss:3.08886
[1]	train-mlogloss:2.92268	eval-mlogloss:2.98063
[2]	train-mlogloss:2.81794	eval-mlogloss:2.89984
[3]	train-mlogloss:2.73307	eval-mlogloss:2.83348
[4]	train-mlogloss:2.65898	eval-mlogloss:2.77814
[5]	train-mlogloss:2.59316	eval-mlogloss:2.73181
[6]	train-mlogloss:2.53566	eval-mlogloss:2.69070
[7]	train-mlogloss:2.48357	eval-mlogloss:2.65663
[8]	train-mlogloss:2.43646	eval-mlogloss:2.62594
[9]	train-mlogloss:2.39416	eval-mlogloss:2.59825
[10]	train-mlogloss:2.35417	eval-mlogloss:2.57227
[11]	train-mlogloss:2.31819	eval-mlogloss:2.54971
[12]	train-mlogloss:2.28347	eval-mlogloss:2.52948
[13]	train-mlogloss:2.25156	eval-mlogloss:2.51032
[14]	train-mlogloss:2.22233	eval-mlogloss:2.49266
[15]	train-mlogloss:2.19361	eval-mlogloss:2.47630
[16]	train-mlogloss:2.16710	eval-mlogloss:2.46100
[17]	train-mlogloss:2.14259	eval-mlogloss:2.44760
[18]	train-mlogloss:2.11931	eval-mlogloss:2.43485
[19]	train-mlogloss:2.09658	eval-mlogloss:2.42488
[20]	train

In [256]:
from sklearn.metrics import accuracy_score

In [257]:
accuracy_score(y_train, xgb_model.predict(dtrain))

0.7043778801843318

In [258]:
accuracy_score(y_test, xgb_model.predict(dtest))

0.31649650725416445

---

In [260]:
test = test.map(lambda x : 0 if x == 'WT' else x)

In [261]:
for i in tqdm(test.columns):
    test[i + '_original'] = 0
    test[i + '_position'] = 0
    test[i + '_new'] = 0

    A = test[i].apply(extract_numbers)
    A = A.fillna(0)

    for j in range(len(test)):
        try:
            test.loc[j, i + '_original'], test.loc[j, i + '_position'], test.loc[j, i + '_new'] = A.loc[j][0], A.loc[j][1], A.loc[j][2]
        except:
            pass

    test[i + '_position'] = test[i + '_position'].astype('int')
    test.drop(columns=i, axis=1, inplace=True)

100%|████████████████████████████████████████████████████████████| 4384/4384 [27:26<00:00,  2.66it/s]


In [262]:
test = encode_dataframe(test)

In [289]:
a = []

for i in tqdm(test.columns):
    try:
        test[i].astype('int')
    except:
        a.append(i)

100%|███████████████████████████████████████████████████████| 13152/13152 [00:00<00:00, 18406.11it/s]


In [290]:
len(a)

3764

In [291]:
tmp = test[a]

test.to_csv("../data/preprocessing/Y_003.csv", index=False, encoding='utf-8')

In [None]:
for i in tqdm(tmp.columns):
    for j in range(len(tmp)):
        try:
            tmp.loc[j, i] = int(tmp.loc[j, i])
        except:
            try :
                tmp.loc[j, i] = sum(encode_sequence(list(tmp.loc[j,i].upper())))
            except:
                tmp.loc[j, i] = -1
                pass
            pass

In [None]:
tmp = tmp.astype('int')

In [None]:
test[a] = tmp

In [None]:
test = test.astype('int')

In [None]:
test.to_csv("../data/preprocessing/Y_003.csv", index=False, encoding='utf-8')

In [None]:
test_DM = xgb.DMatrix(data=test)

In [None]:
test_predict = xgb_model.predict(test_DM).astype('int')

In [None]:
test_predict = le.inverse_transform(test_predict)

In [None]:
submission = pd.read_csv("../data/raw_data/sample_submission.csv")

In [None]:
submission['SUBCLASS'] = test_predict

In [None]:
submission.to_csv("../data/submission/submission_12.csv", index=False)