In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [17]:
# 데이터 불러오기
data = pd.read_csv('./otto_train.csv')
data.head()

Unnamed: 0,id,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,...,feat_85,feat_86,feat_87,feat_88,feat_89,feat_90,feat_91,feat_92,feat_93,target
0,1,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,Class_1
1,2,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,Class_1
2,3,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,Class_1
3,4,1,0,0,1,6,1,5,0,0,...,0,1,2,0,0,0,0,0,0,Class_1
4,5,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,Class_1


In [18]:
# 의미 없는 변수 제거
new_data = data.drop(['id'], axis = 1, inplace = False)

In [19]:
# 타겟 변수 인코딩
mapping_dict = {'Class_1' : 1,
               'Class_2' : 2,
               'Class_3' : 3,
               'Class_4' : 4,
               'Class_5' : 5,
               'Class_6' : 6,
               'Class_7' : 7,
               'Class_8' : 8,
               'Class_9' : 9
               }

In [20]:
after_mapping_target = new_data['target'].apply(lambda x : mapping_dict[x])

In [21]:
# 학습, 검증 데이터 분할

feature_columns = list(new_data.columns.difference(['target']))
X = new_data[feature_columns]
y = after_mapping_target
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape)

(49502, 93) (12376, 93) (49502,) (12376,)


# 1. XGBoost

In [7]:
# !pip install xgboost

Collecting xgboost
  Downloading xgboost-1.1.1-py3-none-win_amd64.whl (54.4 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.1.1


In [8]:
import xgboost as xgb
import time

In [11]:
start = time.time()
xgb_dtrain = xgb.DMatrix(data=train_x, label=train_y)  # 학습 데이터를 XGBoost 모델에 맞게 변환
xgb_dtest = xgb.DMatrix(data=test_x)  # 평가 데이터를 XGBoost 모델에 맞게 변환
xgb_param = {
    'max_depth' : 10,
    'learning_rate' : 0.01,
    'objective' : 'multi:softmax',  # 목적 함수
    'num_class' : len(set(train_y)) + 1  # 1보다 큰 값이어야 한다.
}
xgb_model = xgb.train(params=xgb_param, dtrain=xgb_dtrain, num_boost_round=100)  # 학습 진행
xgb_model_predict = xgb_model.predict(xgb_dtest)  # 평가 데이터 예측
print('Accuracy: %2f' % (accuracy_score(test_y, xgb_model_predict) * 100), '%')
print("Time: %2f" % (time.time() - start), 'seconds')

Accuracy: 78.539108 %
Time: 144.000236 seconds


In [12]:
xgb_model_predict

array([5., 7., 6., ..., 9., 2., 7.], dtype=float32)

# 2. LightGBM

In [13]:
# !pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-2.3.1-py2.py3-none-win_amd64.whl (544 kB)
Installing collected packages: lightgbm
Successfully installed lightgbm-2.3.1


In [13]:
import lightgbm as lgb
import time

In [24]:
start = time.time()
lgb_dtrain = lgb.Dataset(data=train_x, label=train_y)  # 학습 데이터를 XGBoost 모델에 맞게 변환
lgb_param = {
    'max_depth' : 10,
    'learning_rate' : 0.01,
    'objective' : 'multiclass',  # 목적 함수
    'num_class' : len(set(train_y)) + 1  # 1보다 큰 값이어야 한다.
#     'num_class' : len(set(train_y)) # Label must be in [0, 9), but found 9 in label
}
lgb_model = lgb.train(params=lgb_param, train_set=lgb_dtrain, num_boost_round=100)  # 학습 진행
lgb_model_predict = np.argmax(lgb_model.predict(test_x), axis=1)  # 평가 데이터 예측
print('Accuracy: %2f' % (accuracy_score(test_y, lgb_model_predict) * 100), '%')
print("Time: %2f" % (time.time() - start), 'seconds')
print(lgb_model.predict(test_x))
print(lgb_model.predict(test_x).shape)

Accuracy: 73.569813 %
Time: 7.330419 seconds
[[9.14195396e-16 2.27101123e-02 3.85432853e-01 ... 3.25763637e-02
  7.61804989e-02 4.64683926e-02]
 [1.16264426e-15 3.77262272e-02 2.26739830e-01 ... 1.92812290e-01
  1.01826669e-01 8.11988908e-02]
 [8.03061518e-16 1.74144816e-02 1.18616633e-01 ... 2.57008295e-02
  6.69196402e-02 4.08194769e-02]
 ...
 [8.35633463e-16 5.03899675e-02 1.67414994e-01 ... 4.81925245e-02
  1.02188157e-01 3.95359380e-01]
 [1.01560525e-15 2.13472780e-02 4.60509516e-01 ... 3.32864538e-02
  8.46310485e-02 5.16230380e-02]
 [8.92775603e-16 2.04756016e-02 1.31867900e-01 ... 4.13697353e-01
  1.44381292e-01 4.53796283e-02]]
(12376, 10)


# 3. Catboost

In [7]:
# !pip install catboost

Collecting catboost
  Downloading catboost-0.24-cp37-none-win_amd64.whl (65.1 MB)
Collecting plotly
  Downloading plotly-4.9.0-py2.py3-none-any.whl (12.9 MB)
Processing c:\users\user\appdata\local\pip\cache\wheels\d7\a9\33\acc7b709e2a35caa7d4cae442f6fe6fbf2c43f80823d46460c\retrying-1.3.3-cp37-none-any.whl
Installing collected packages: retrying, plotly, catboost
Successfully installed catboost-0.24 plotly-4.9.0 retrying-1.3.3


In [10]:
import catboost as cb
import time

In [12]:
start = time.time()
cb_dtrain = cb.Pool(data=train_x, label=train_y)  # 학습 데이터를 XGBoost 모델에 맞게 변환
cb_param = {
    'max_depth' : 10,
    'learning_rate' : 0.01,
    'eval_metric' : 'Accuracy',
    'loss_function' : 'MultiClass',  # 손실 함수
}
cb_model = cb.train(params=cb_param, pool=cb_dtrain, num_boost_round=100)  # 학습 진행
cb_model_predict = np.argmax(cb_model.predict(test_x), axis=1) + 1 # 평가 데이터 예측, Softmax의 결과값 중 가장 큰 값의 Label로 예측
print('Accuracy: %2f' % (accuracy_score(test_y, cb_model_predict) * 100), '%')
print("Time: %2f" % (time.time() - start), 'seconds')
print(cb_model.predict(test_x))
print(cb_model.predict(test_x).shape)

0:	learn: 0.5907034	total: 829ms	remaining: 1m 22s
1:	learn: 0.6356107	total: 1.64s	remaining: 1m 20s
2:	learn: 0.6411256	total: 2.37s	remaining: 1m 16s
3:	learn: 0.6480344	total: 3.1s	remaining: 1m 14s
4:	learn: 0.6508222	total: 3.83s	remaining: 1m 12s
5:	learn: 0.6499939	total: 4.57s	remaining: 1m 11s
6:	learn: 0.6507818	total: 5.34s	remaining: 1m 10s
7:	learn: 0.6548422	total: 6.09s	remaining: 1m 10s
8:	learn: 0.6559533	total: 6.81s	remaining: 1m 8s
9:	learn: 0.6560947	total: 7.55s	remaining: 1m 7s
10:	learn: 0.6568421	total: 8.34s	remaining: 1m 7s
11:	learn: 0.6588219	total: 9.14s	remaining: 1m 7s
12:	learn: 0.6592259	total: 9.89s	remaining: 1m 6s
13:	learn: 0.6611248	total: 10.6s	remaining: 1m 5s
14:	learn: 0.6625591	total: 11.4s	remaining: 1m 4s
15:	learn: 0.6631853	total: 12.2s	remaining: 1m 3s
16:	learn: 0.6639328	total: 13s	remaining: 1m 3s
17:	learn: 0.6668821	total: 13.8s	remaining: 1m 2s
18:	learn: 0.6669630	total: 14.5s	remaining: 1m 1s
19:	learn: 0.6675286	total: 15.2s	re

# 4. Ensemble의 ensemble

In [25]:
import random
from sklearn.metrics import mean_squared_error

bagging_predict_result = list()
data_index = [data_index for data_index in range(train_x.shape[0])]
for _ in range(10):
    random_data_index = np.random.choice(data_index, train_x.shape[0])  # 복원 추출
    lgb_dtrain = lgb.Dataset(data=train_x.iloc[random_data_index], label=train_y.iloc[random_data_index])  # 학습 데이터를 XGBoost 모델에 맞게 변환
    lgb_param = {
        'max_depth' : 10,
        'learning_rate' : 0.01,
        'objective' : 'multiclass',  # 목적 함수
        'num_class' : len(set(train_y)) + 1  # 1보다 큰 값이어야 한다.
    }
    lgb_model = lgb.train(params=lgb_param, train_set=lgb_dtrain, num_boost_round=100)  # 학습 진행
    
    pred = lgb_model.predict(test_x)    
    bagging_predict_result.append(pred)
# print(bagging_predict_result)   # 각 클래스에 대한 확률 값

# Soft Voting 방식을 활용한 다중 클래스 분류 예측 실시
# max_prob_idx_list = list()
# for row_idx in len(bagging_predict_result.shape[1]):
    
    

[array([[9.01088818e-16, 2.03405543e-02, 3.66139602e-01, ...,
        3.22851739e-02, 7.52512255e-02, 4.57880726e-02],
       [1.00911072e-15, 3.05391445e-02, 2.18727083e-01, ...,
        2.93475530e-01, 8.42722898e-02, 6.33445823e-02],
       [7.97931177e-16, 1.77992994e-02, 1.16744305e-01, ...,
        2.67466160e-02, 6.66363822e-02, 4.05462036e-02],
       ...,
       [8.08403417e-16, 3.82042936e-02, 1.69560404e-01, ...,
        3.65186592e-02, 1.31099539e-01, 3.94328260e-01],
       [9.94137557e-16, 2.07685080e-02, 4.48767475e-01, ...,
        3.37931010e-02, 8.30218597e-02, 5.05162662e-02],
       [8.66917696e-16, 1.96643564e-02, 1.26837636e-01, ...,
        4.26330525e-01, 1.42368542e-01, 4.40516957e-02]]), array([[9.54627556e-16, 2.08635359e-02, 3.51557847e-01, ...,
        3.45963257e-02, 8.00576808e-02, 4.85740328e-02],
       [1.04189449e-15, 2.99537475e-02, 2.17093608e-01, ...,
        2.59758239e-01, 9.04618339e-02, 6.65619409e-02],
       [8.03917948e-16, 1.73942585e-02, 1

AttributeError: 'list' object has no attribute 'shape'