In [1]:
import pandas as pd
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the raw training and test tables from the raw-data directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/raw_data/train.csv", "../data/raw_data/test.csv")
)

In [3]:
# Remove the 'ID' column from both frames; the remaining test columns are
# later used as the feature list, so 'ID' must not be among them.
train = train.drop('ID', axis='columns')
test = test.drop('ID', axis='columns')

In [4]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the SUBCLASS labels, then overwrite the column with the
# resulting integer codes. `le` is kept so predictions can be decoded later.
# NOTE(review): the column is replaced in place, so re-running this cell would
# refit the encoder on the integer codes instead of the original labels.
le = LabelEncoder().fit(train['SUBCLASS'])
train['SUBCLASS'] = le.transform(train['SUBCLASS'])

In [5]:
# Binarise every feature cell: 0 where the value equals the string 'WT',
# 1 for anything else (including missing values, which never equal 'WT').
# A vectorised comparison replaces the per-cell Python lambda: it yields the
# same 0/1 integers, runs in compiled code, and does not depend on
# `DataFrame.map`, which only exists in recent pandas releases.
# NOTE(review): this overwrites the frames in place; running the cell a second
# time would turn every feature into 1, because 0/1 never equals 'WT'.
train[test.columns] = (train[test.columns] != 'WT').astype('int64')
test[test.columns] = (test[test.columns] != 'WT').astype('int64')

---

In [6]:
# Feature matrix: the training columns that also exist in the test frame.
X = train.loc[:, test.columns]
# Target vector: the SUBCLASS column of the training frame.
y = train.loc[:, 'SUBCLASS']

In [7]:
from sklearn.decomposition import TruncatedSVD
# Project the feature matrix onto 1000 SVD components.
# TruncatedSVD uses a randomized solver by default, so the seed is fixed to
# make the projection (and everything trained on it) reproducible; 42 matches
# the seed used for the train/validation split and the booster.
# NOTE(review): the SVD is fitted on all training rows before the hold-out
# split, so the validation rows influence the projection - confirm that this
# is acceptable for the reported validation score.
svd = TruncatedSVD(n_components=1000, random_state=42)
X_train_reduced = svd.fit_transform(X)

---

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
# Hold out 30% of the reduced training rows for validation; the seed is fixed
# so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X_train_reduced,
    y,
    test_size=0.3,
    random_state=42,
)

# XGBoost

In [10]:
import xgboost as xgb

In [11]:
# Wrap the training and validation splits (features + labels) in DMatrix
# objects for the native xgboost training API.
dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_test, label=y_test)

In [12]:
# Booster configuration for the multi-class model.
params = {
    'objective': 'multi:softmax',      # multi-class classification
    # Number of classes, taken from the fitted label encoder so it always
    # matches the encoded targets instead of relying on a hard-coded count.
    'num_class': len(le.classes_),
    'eval_metric': 'mlogloss',         # evaluation metric
    'seed': 42,
    'alpha': 1.5,                      # L1 regularisation
    'lambda': 2,                       # L2 regularisation
    'max_depth': 3,                    # shallow trees
    'learning_rate': 0.01,             # learning-rate setting
}

In [13]:
# Evaluation sets reported after every boosting round: the training split and
# the held-out split, labelled "train" and "eval" in the log.
wlist = [(dtrain, "train"), (dtest, "eval")]

# Train for at most 25 rounds, with a 10-round early-stopping patience.
# NOTE(review): the captured log shows eval-mlogloss still decreasing at
# round 19, so the 25-round budget looks like the limiting factor - confirm
# whether a larger num_boost_round was intended.
xgb_model = xgb.train(
    params,
    dtrain,
    num_boost_round=25,
    evals=wlist,
    early_stopping_rounds=10,
)

[0]	train-mlogloss:3.23498	eval-mlogloss:3.23879
[1]	train-mlogloss:3.21298	eval-mlogloss:3.22053
[2]	train-mlogloss:3.19193	eval-mlogloss:3.20344
[3]	train-mlogloss:3.17154	eval-mlogloss:3.18683
[4]	train-mlogloss:3.15192	eval-mlogloss:3.17096
[5]	train-mlogloss:3.13283	eval-mlogloss:3.15560
[6]	train-mlogloss:3.11449	eval-mlogloss:3.14051
[7]	train-mlogloss:3.09675	eval-mlogloss:3.12633
[8]	train-mlogloss:3.07955	eval-mlogloss:3.11226
[9]	train-mlogloss:3.06283	eval-mlogloss:3.09882
[10]	train-mlogloss:3.04660	eval-mlogloss:3.08595
[11]	train-mlogloss:3.03077	eval-mlogloss:3.07320
[12]	train-mlogloss:3.01548	eval-mlogloss:3.06071
[13]	train-mlogloss:3.00056	eval-mlogloss:3.04871
[14]	train-mlogloss:2.98584	eval-mlogloss:3.03675
[15]	train-mlogloss:2.97131	eval-mlogloss:3.02514
[16]	train-mlogloss:2.95722	eval-mlogloss:3.01386
[17]	train-mlogloss:2.94348	eval-mlogloss:3.00305
[18]	train-mlogloss:2.93000	eval-mlogloss:2.99223
[19]	train-mlogloss:2.91672	eval-mlogloss:2.98195
[20]	train

In [15]:
from sklearn.metrics import accuracy_score

In [17]:
# Accuracy of the booster on the rows it was trained on.
accuracy_score(y_true=y_train, y_pred=xgb_model.predict(dtrain))

0.4094470046082949

In [18]:
# Accuracy of the booster on the held-out 30% split.
accuracy_score(y_true=y_test, y_pred=xgb_model.predict(dtest))

0.3202579258463192

In [21]:
# Project the test features with the SVD that was fitted on the training data.
test_reducedsvd = svd.transform(X=test)

In [25]:
# Wrap the reduced test features in a DMatrix for prediction.
# The name is rebound from an array to a DMatrix, so the guard keeps this cell
# safe to re-run: once the wrapping has happened it is not attempted again.
if not isinstance(test_reducedsvd, xgb.DMatrix):
    test_reducedsvd = xgb.DMatrix(data=test_reducedsvd)

In [30]:
# Predict a class for every test row and cast the result to integers so it
# can be passed to the label encoder for decoding.
test_predict = xgb_model.predict(test_reducedsvd).astype(int)

In [31]:
# Map the integer predictions back to the original SUBCLASS labels.
# NOTE(review): `test_predict` is rebound from codes to labels here, so this
# cell should only run once after the prediction cell.
test_predict = le.inverse_transform(y=test_predict)

In [32]:
# Load the submission template from the raw-data directory.
submission = pd.read_csv(filepath_or_buffer="../data/raw_data/sample_submission.csv")

In [33]:
# Put the decoded predictions into the template's SUBCLASS column.
submission = submission.assign(SUBCLASS=test_predict)

In [34]:
# Write the filled-in submission to disk without the index column.
submission.to_csv(path_or_buf="../data/submission/submission_09.csv", index=False)