In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from tqdm import tqdm

In [92]:
# Load the raw train and test tables from the project's raw_data folder.
train = pd.read_csv("../data/raw_data/train.csv")
test = pd.read_csv("../data/raw_data/test.csv")

In [93]:
# Remove the 'ID' column from both frames before any further processing.
# NOTE: the names are rebound, so re-running this cell without reloading
# the CSVs raises KeyError because 'ID' is already gone.
train = train.drop(columns = ['ID'])
test = test.drop(columns = ['ID'])

In [94]:
from sklearn.preprocessing import LabelEncoder

# Fit a label encoder on the SUBCLASS target and overwrite the column with
# the resulting integer codes. `le` is used again later to turn predicted
# codes back into the original labels.
# NOTE: because the column is overwritten, re-running this cell without
# reloading `train` would fit the encoder on the integer codes themselves.
le = LabelEncoder()
train['SUBCLASS'] = le.fit_transform(train['SUBCLASS'])

In [5]:
# Feature matrix: the training columns that also exist in `test`.
# .copy() makes X an independent frame, so the per-column assignments that
# follow do not write into a slice of `train` (SettingWithCopyWarning).
X = train[test.columns].copy()
# Target: the label-encoded SUBCLASS column.
y = train['SUBCLASS']

---

In [6]:
import hashlib

In [7]:
# Hash helper
def hash_variation(variation):
    """Return the MD5 hex digest of a cell value.

    Args:
        variation: Value to hash. Strings are hashed as-is; any other value
            (e.g. a float NaN from a missing cell) is converted with ``str()``
            first instead of raising ``AttributeError`` on ``.encode()``.

    Returns:
        str: 32-character hexadecimal MD5 digest of the UTF-8 encoded text.
    """
    # Coerce to text so callers do not have to pre-cast with astype('str').
    encoded = str(variation).encode("utf-8")
    # Build the MD5 hash and return it in hexadecimal form.
    return hashlib.md5(encoded).hexdigest()

In [8]:
# Encode every feature column as one number per cell: stringify the value,
# MD5-hash it, then average the code points of the hex digest characters.
# astype('str') matches the preprocessing applied to `test`, so a non-string
# cell is hashed the same way in both frames instead of failing in .encode().
for i in tqdm(X.columns):
    X[i] = X[i].astype('str').apply(
        lambda x: np.mean([ord(char) for char in hash_variation(x)])
    )

100%|██████████████████████████████████████████████████████████████████████████████| 4384/4384 [05:04<00:00, 14.40it/s]


# Persist the encoded feature matrix (without the index) so the slow
# hashing loop does not have to be repeated.
X.to_csv("../data/preprocessing/X_001.csv", index=False)

In [33]:
# Reload the encoded feature matrix from the CSV cache.
X = pd.read_csv("../data/preprocessing/X_001.csv")

In [35]:
# Replace one specific encoded value with 0 in every column; all other
# values are kept unchanged.
# NOTE(review): 68.9375 is presumably the encoding of a dominant placeholder
# category — confirm and give the constant a name.
for i in tqdm(X.columns):
    X[i] = X[i].map(lambda x: x if x != 68.9375 else 0)

100%|█████████████████████████████████████████████████████████████████████████████| 4384/4384 [00:07<00:00, 625.04it/s]


In [37]:
from sklearn.model_selection import train_test_split

In [38]:
# Hold out 30% of the rows for evaluation; random_state fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# XGBoost

In [39]:
import xgboost as xgb

In [40]:
# Wrap both splits in XGBoost's DMatrix container together with their labels.
dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_test, label=y_test)

In [41]:
# Booster hyper-parameters for the multi-class model.
params = {
    'objective': 'multi:softmax',   # multi-class classification
    # Number of classes, taken from the fitted label encoder rather than a
    # hard-coded 26 so it always matches the encoded target.
    'num_class': len(le.classes_),
    'eval_metric': 'mlogloss',      # evaluation metric
    'seed': 42,
    'alpha': 0.1,                   # L1 regularization
    'lambda': 1,                    # L2 regularization
    'max_depth': 4,                 # keep trees shallow
    'learning_rate': 0.1,           # learning-rate setting
}

In [42]:
# Evaluation sets logged after every boosting round. With more than one
# entry, xgb.train bases early stopping on the last one ("eval").
wlist = [(dtrain, "train"), (dtest, "eval")]

# Up to 400 rounds; stop once the "eval" metric has not improved for 10 rounds.
xgb_model = xgb.train(
    params,
    dtrain,
    num_boost_round=400,
    evals=wlist,
    early_stopping_rounds=10,
)

[0]	train-mlogloss:3.03879	eval-mlogloss:3.07514
[1]	train-mlogloss:2.89577	eval-mlogloss:2.95931
[2]	train-mlogloss:2.78560	eval-mlogloss:2.87193
[3]	train-mlogloss:2.69474	eval-mlogloss:2.80117
[4]	train-mlogloss:2.61558	eval-mlogloss:2.74330
[5]	train-mlogloss:2.54742	eval-mlogloss:2.69439
[6]	train-mlogloss:2.48746	eval-mlogloss:2.64915
[7]	train-mlogloss:2.43377	eval-mlogloss:2.61040
[8]	train-mlogloss:2.38520	eval-mlogloss:2.57598
[9]	train-mlogloss:2.34074	eval-mlogloss:2.54536
[10]	train-mlogloss:2.30045	eval-mlogloss:2.51900
[11]	train-mlogloss:2.26258	eval-mlogloss:2.49456
[12]	train-mlogloss:2.22834	eval-mlogloss:2.47247
[13]	train-mlogloss:2.19553	eval-mlogloss:2.45042
[14]	train-mlogloss:2.16505	eval-mlogloss:2.43320
[15]	train-mlogloss:2.13678	eval-mlogloss:2.41680
[16]	train-mlogloss:2.10985	eval-mlogloss:2.40102
[17]	train-mlogloss:2.08367	eval-mlogloss:2.38559
[18]	train-mlogloss:2.05996	eval-mlogloss:2.37206
[19]	train-mlogloss:2.03717	eval-mlogloss:2.35891
[20]	train

In [43]:
from sklearn.metrics import accuracy_score

In [44]:
# Accuracy on the rows the booster was trained on.
accuracy_score(y_train, xgb_model.predict(dtrain))

0.6988479262672811

In [45]:
# Accuracy on the 30% hold-out split, which also served as the
# early-stopping evaluation set during training.
accuracy_score(y_test, xgb_model.predict(dtest))

0.33422890918860826

---

In [95]:
# Encode every column of `test`: cast to string, MD5-hash each cell, then
# average the code points of the hex digest characters.
for i in tqdm(test.columns):
    test[i] = (
        test[i]
        .astype('str')
        .apply(hash_variation)
        .apply(lambda x: np.mean([ord(char) for char in x]))
    )

100%|██████████████████████████████████████████████████████████████████████████████| 4384/4384 [01:50<00:00, 39.68it/s]


In [98]:
# Replace the same encoded value that was zeroed in the training features.
# Iterate over test's own columns: the loop only modifies `test`, so it
# should not depend on whatever columns `X` currently holds.
# NOTE(review): 68.9375 is presumably the encoding of a dominant placeholder
# category — confirm and give the constant a name.
for i in tqdm(test.columns):
    test[i] = test[i].map(lambda x : 0 if x == 68.9375 else x)

100%|████████████████████████████████████████████████████████████████████████████| 4384/4384 [00:03<00:00, 1396.88it/s]


In [101]:
# Wrap the encoded test features in a DMatrix (no labels) for prediction.
test_DM = xgb.DMatrix(data=test)

In [104]:
# Predict on the test set and cast the result to int so the values can be
# passed to the label encoder's inverse_transform.
test_predict = xgb_model.predict(test_DM).astype('int')

In [105]:
# Map the integer class codes back to the original SUBCLASS labels.
test_predict = le.inverse_transform(test_predict)

In [106]:
# Load the sample submission file to use as the output template.
submission = pd.read_csv("../data/raw_data/sample_submission.csv")

In [107]:
# Write the decoded predictions into the SUBCLASS column.
# NOTE(review): assumes the template rows are in the same order as test.csv — confirm.
submission['SUBCLASS'] = test_predict

In [108]:
# Save the submission file without the index column.
submission.to_csv("../data/submission/submission_11.csv", index=False)