In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# 현재 경로 확인
os.getcwd()

'/Users/imchanghun/Documents/machine_learning'

In [3]:
# 데이터 불러오기
data = pd.read_csv("./data/otto_train.csv") # product category
data.head() # 데이터 확인

Unnamed: 0,id,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,...,feat_85,feat_86,feat_87,feat_88,feat_89,feat_90,feat_91,feat_92,feat_93,target
0,1,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,Class_1
1,2,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,Class_1
2,3,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,Class_1
3,4,1,0,0,1,6,1,5,0,0,...,0,1,2,0,0,0,0,0,0,Class_1
4,5,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,Class_1


In [4]:
'''
id: 고유 아이디
feat_1 ~ feat_93: 설명변수
target: 타겟변수 (1~9)
'''

'\nid: 고유 아이디\nfeat_1 ~ feat_93: 설명변수\ntarget: 타겟변수 (1~9)\n'

In [5]:
nCar = data.shape[0] # 데이터 개수
nVar = data.shape[1] # 변수 개수
print("nCar: %d" % nCar, " nVar: %d" % nVar)

nCar: 61878  nVar: 95


## 의미가 없다고 판단되는 변수 제거

In [6]:
data = data.drop(['id'], axis = 1) # id 제거

## 타겟 변수의 문자열을 숫자로 변환

In [7]:
mapping_dict = {"Class_1": 1,
               "Class_2": 2,
               "Class_3": 3,
               "Class_4": 4,
               "Class_5": 5,
               "Class_6": 6,
               "Class_7": 7,
               "Class_8": 8,
               "Class_9": 9}
after_mapping_target = data['target'].apply(lambda x: mapping_dict[x])

## 설명변수와 타겟변수를 분리, 학습데이터와 평가데이터 분리

In [8]:
feature_columns = list(data.columns.difference(['target'])) # target을 제외한 모든 행
X = data[feature_columns] # 설명변수
y = after_mapping_target # 타겟변수
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size = 0.2, random_state = 42) # 학습데이터와 평가데이터의 비율을 8:2로 분할
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape) # 데이터 개수 확인

(49502, 93) (12376, 93) (49502,) (12376,)


## 1. XGBoost

In [9]:
# !pip install xgboost

In [10]:
import xgboost as xgb
import time
start = time.time() # 시작 시간 지정
xgb_dtrain = xgb.DMatrix(data = train_x, label = train_y) # 학습 데이터를 XGBoost 모델에 맞게 변환
xgb_dtest = xgb.DMatrix(data = test_x)  # 평가 데이터를 XGBoost 모델에 맞게 변환
xgb_param = {"max_depth": 10, # 트리 깊이
            "learning_rate": 0.01, # Step Size
            "n_estimators": 100, # Number of trees, 트리의 개수
            "objective": "multi:softmax", # 목적 함수
            "num_class" : len(set(train_y)) + 1} # 파라미터 추가 , Label must be in [0, num_class) -> number_class보다 1 커야한다.
xgb_model = xgb.train(params = xgb_param, dtrain = xgb_dtrain) # 학습 진행
xgb_model_predict = xgb_model.predict(xgb_dtest) # 평가 데이터 예측
print("Accuracy: %.2f" % (accuracy_score(test_y, xgb_model_predict) * 100), "%") # 정확도 % 계산
print("Time: %.2f" % (time.time() - start), "seconds") # 코드 실행 시간 계산

Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Accuracy: 76.67 %
Time: 4.71 seconds


In [11]:
xgb_model_predict

array([5., 3., 6., ..., 9., 2., 7.], dtype=float32)

## 2. LightGBM

In [12]:
# !pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-3.0.0-py2.py3-none-macosx_10_13_x86_64.macosx_10_14_x86_64.macosx_10_15_x86_64.whl (1.0 MB)
[K     |████████████████████████████████| 1.0 MB 829 kB/s eta 0:00:01
Installing collected packages: lightgbm
Successfully installed lightgbm-3.0.0
You should consider upgrading via the '/Users/imchanghun/opt/anaconda3/bin/python -m pip install --upgrade pip' command.[0m


In [13]:
import lightgbm as lgb
start = time.time() # 시작 시간 지정
lgb_dtrain = lgb.Dataset(data = train_x, label = train_y) # 학습 데이터를 LigthGBM 모델에 맞게 변환
lgb_param = {'max_depth': 10, # 트리 깊이
            'learning_rate': 0.01, # Step Size
            "n_estimators": 100, # Number of trees, 트리 생성 개수
            "objective": "multiclass", # 목적 함수
            "num_class": len(set(train_y)) + 1} # 파라미터 추가, Label must be in [0,num_class) -> num_class보다 1 커야한다.
lgb_model = lgb.train(params=lgb_param, train_set = lgb_dtrain) # 학습 진행
lgb_model_predict = np.argmax(lgb_model.predict(test_x), axis=1) # 평가 데이터 예측, Softmax의 결과값 중 가장 큰 값의 Label로 예측
print("Accuracy: %.2f" % (accuracy_score(test_y, lgb_model_predict) * 100), "%") # 정확도 % 계산
print("Time: %.2f" % (time.time() - start), "Seconds") # 코드 |실행 시간 계산



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3110
[LightGBM] [Info] Number of data points in the train set: 49502, number of used features: 93
[LightGBM] [Info] Start training from score -34.538776
[LightGBM] [Info] Start training from score -3.476745
[LightGBM] [Info] Start training from score -1.341381
[LightGBM] [Info] Start training from score -2.039019
[LightGBM] [Info] Start training from score -3.135151
[LightGBM] [Info] Start training from score -3.125444
[LightGBM] [Info] Start training from score -1.481556
[LightGBM] [Info] Start training from score -3.074772
[LightGBM] [Info] Start training from score -1.986562
[LightGBM] [Info] Start training from score -2.533374
Accuracy: 76.28 %
Time: 1.83 Seconds


In [14]:
lgb_model.predict(test_x)

array([[1.01734061e-15, 2.25081693e-02, 3.62193933e-01, ...,
        3.24234521e-02, 5.82126692e-02, 3.67722414e-02],
       [1.14084116e-15, 5.36978636e-02, 1.90687128e-01, ...,
        3.25081119e-01, 9.38028846e-02, 6.50463131e-02],
       [5.94595781e-16, 9.66842220e-03, 5.82817482e-02, ...,
        1.42318289e-02, 3.40230275e-02, 2.14919364e-02],
       ...,
       [7.09105769e-16, 4.63740004e-02, 1.08297559e-01, ...,
        5.46934960e-02, 7.24513712e-02, 5.74635996e-01],
       [9.88127136e-16, 1.54895684e-02, 5.45515599e-01, ...,
        2.45870954e-02, 5.65410617e-02, 3.62344513e-02],
       [7.59617500e-16, 1.49480877e-02, 7.44570300e-02, ...,
        5.76695793e-01, 1.43227106e-01, 2.74567219e-02]])

## 3. Catboost

In [16]:
!pip install catboost

Collecting catboost
  Downloading catboost-0.24.1-cp37-none-macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl (11.4 MB)
[K     |████████████████████████████████| 11.4 MB 711 kB/s eta 0:00:01
[?25hCollecting plotly
  Downloading plotly-4.11.0-py2.py3-none-any.whl (13.1 MB)
[K     |████████████████████████████████| 13.1 MB 8.1 MB/s eta 0:00:01
Collecting graphviz
  Downloading graphviz-0.14.1-py2.py3-none-any.whl (18 kB)
Collecting retrying>=1.3.3
  Using cached retrying-1.3.3.tar.gz (10 kB)
Building wheels for collected packages: retrying
  Building wheel for retrying (setup.py) ... [?25ldone
[?25h  Created wheel for retrying: filename=retrying-1.3.3-py3-none-any.whl size=11429 sha256=f2fe1ace109f79fb2a6739e8270dc289c14a7b520cf55e6fc12e354a6c75e930
  Stored in directory: /Users/imchanghun/Library/Caches/pip/wheels/f9/8d/8d/f6af3f7f9eea3553bc2fe6d53e4b287dad18b06a861ac56ddf
Successfully built retrying
Installing collected packages: ret

In [17]:
import catboost as cb
start = time.time() # 시작 시간 지정
cb_dtrain = cb.Pool(data = train_x, label = train_y) # 학습 데이터를 Catboost 모델에 맞게 변환
cb_param = {"max_depth": 10, # 트리 깊이
           "learning_rate": 0.01, # Step Size
           "n_estimators": 100, # Number of trees, 트리 생성 개수
           "eval_metric": "Accuracy", # 평가 척도
           "loss_function": "MultiClass"} # 손실 함수, 목적 함수
cb_model = cb.train(pool = cb_dtrain, params = cb_param) # 학습 진행
cb_model_predict = np.argmax(cb_model.predict(test_x), axis = 1) + 1 # 평가 데이터 예측, Softmax의 결과값 중 가장 큰 값의 Label로 예측, 인덱스의 순서를 맞추기 위해 +1
print("Accuracy: %.2f" % (accuracy_score(test_y, cb_model_predict) * 100), "%") # 정확도 % rPtks
print("Time: %.2f" % (time.time() - start), "seconds") # 코드 실행 시간 계산

0:	learn: 0.5907034	total: 360ms	remaining: 35.6s
1:	learn: 0.6356107	total: 594ms	remaining: 29.1s
2:	learn: 0.6411256	total: 906ms	remaining: 29.3s
3:	learn: 0.6480344	total: 1.16s	remaining: 27.8s
4:	learn: 0.6508222	total: 1.44s	remaining: 27.4s
5:	learn: 0.6499939	total: 1.76s	remaining: 27.6s
6:	learn: 0.6507818	total: 2.11s	remaining: 28.1s
7:	learn: 0.6548422	total: 2.51s	remaining: 28.9s
8:	learn: 0.6559533	total: 2.84s	remaining: 28.7s
9:	learn: 0.6560947	total: 3.27s	remaining: 29.5s
10:	learn: 0.6568421	total: 3.55s	remaining: 28.7s
11:	learn: 0.6588219	total: 3.88s	remaining: 28.4s
12:	learn: 0.6592259	total: 4.16s	remaining: 27.8s
13:	learn: 0.6611248	total: 4.64s	remaining: 28.5s
14:	learn: 0.6625591	total: 4.98s	remaining: 28.2s
15:	learn: 0.6631853	total: 5.42s	remaining: 28.5s
16:	learn: 0.6639328	total: 5.71s	remaining: 27.9s
17:	learn: 0.6668821	total: 6.01s	remaining: 27.4s
18:	learn: 0.6669630	total: 6.41s	remaining: 27.3s
19:	learn: 0.6675286	total: 6.78s	remaini

In [18]:
cb_model.predict(test_x)

array([[-0.35426047,  1.22109587,  0.44230101, ..., -0.1698448 ,
        -0.02059177, -0.2130643 ],
       [-0.07235138,  0.42535181,  0.20060428, ...,  0.21863604,
         0.2719157 ,  0.25089315],
       [-0.3315885 , -0.31862353, -0.31279765, ..., -0.29798357,
        -0.24018767, -0.32984969],
       ...,
       [ 0.05304325,  0.02500267, -0.14752573, ..., -0.20741963,
         0.12789417,  1.51166757],
       [-0.55093666,  1.7691278 ,  0.99746884, ..., -0.3420542 ,
        -0.49799871, -0.38136323],
       [-0.3033724 ,  0.09352675, -0.11808658, ...,  0.65825036,
         1.05515787, -0.20799899]])

In [19]:
# 데이터 불러오기
data = pd.read_csv("./data/kc_house_data.csv")
data.head() # 데이터 확인

Unnamed: 0,id,date,price,bedrooms,bathrooms,floors,waterfront,condition,grade,yr_built,yr_renovated,zipcode,lat,long
0,7129300520,20141013T000000,221900.0,3,1.0,1.0,0,3,7,1955,0,98178,47.5112,-122.257
1,6414100192,20141209T000000,538000.0,3,2.25,2.0,0,3,7,1951,1991,98125,47.721,-122.319
2,5631500400,20150225T000000,180000.0,2,1.0,1.0,0,3,6,1933,0,98028,47.7379,-122.233
3,2487200875,20141209T000000,604000.0,4,3.0,1.0,0,5,7,1965,0,98136,47.5208,-122.393
4,1954400510,20150218T000000,510000.0,3,2.0,1.0,0,3,8,1987,0,98074,47.6168,-122.045


In [20]:
data = data.drop(['id','date','zipcode','lat','long'], axis= 1) # id, date, zipcode, lat, long 제거

In [21]:
feature_columns = list(data.columns.difference(['price']))
X = data[feature_columns]
y = data['price']
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size = 0.3, random_state = 42) # 학습데이터와 평가데이터의 비율을 7:3
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape) # 데이터 개수 확인

(15129, 8) (6484, 8) (15129,) (6484,)


In [22]:
start = time.time()
lgb_dtrain = lgb.Dataset(data = train_x, label = train_y) # 학습 데이터를 LightGBM 모델에 맞게 변환
lgb_param = {'max_depth': 10, # 트리 깊이
            "learning_rate": 0.01, # Step Size
            "n_estimators": 500, # Number of trees, 트리 생성 개수
            'objective': 'regression'} # 파라미터 추가, Label must be in [0, num_class) -> num_class보다 1 커야한다.
lgb_model = lgb.train(params = lgb_param, train_set = lgb_dtrain) # 학습 진행

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 237
[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8
[LightGBM] [Info] Start training from score 537729.263666


In [23]:
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt

sqrt(mean_squared_error(lgb_model.predict(test_x),test_y))

210904.17249451784

## Ensemble의 Ensemble

In [25]:
import random
bagging_predict_result = [] # 빈 리스트 생성
for _ in range(10):
    data_index = [data_index for data_index in range(train_x.shape[0])] # 학습 데이터의 인덱스를 리스트로 변환
    random_data_index = np.random.choice(data_index, train_x.shape[0]) # 데이터의 1/10 크기만큼 랜덤 샘플링, //는 소수점을 무시하기 위함
    print(len(set(random_data_index)))
    lgb_dtrain = lgb.Dataset(data = train_x.iloc[random_data_index,], label = train_y.iloc[random_data_index,]) # 학습 데이터를 LightGBM 모델에 맞게 변환
    lgb_param = {'max_depth': 14, # 트리 깊이
                'learning_rate': 0.01, # Step Size
                 'n_estimators': 500, # Number of trees, 트리 생성 개수
                 'objective': 'regression'} # 파라미터 추가, Label must be in [0, num_class) -> num_class보다 1 커야한다
    lgb_model = lgb.train(params=lgb_param, train_set = lgb_dtrain) # 학습 진행
    predict1 = lgb_model.predict(test_x) # 테스트 데이터 예측
    bagging_predict_result.append(predict1)
    

9538
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 232
[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8
[LightGBM] [Info] Start training from score 541371.137550




9528
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 228
[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8
[LightGBM] [Info] Start training from score 535918.010906
9567
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 235
[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8
[LightGBM] [Info] Start training from score 543280.162602
9688
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 233
[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8
[LightGBM] [Info] Start training from score 538976.934034
9581
You can set `force_row_wise=true` to remove the overhead.
And if 

In [26]:
bagging_predict_result

[array([502255.88524611, 668654.8345624 , 891936.81934501, ...,
        336425.72652745, 865785.06542248, 459020.31273736]),
 array([521103.37136853, 626580.69405982, 933027.50351642, ...,
        337612.50292701, 850386.40091157, 451720.48973972]),
 array([508237.08598847, 661298.37697849, 971284.39841148, ...,
        361992.26813021, 896544.53520603, 465507.50957056]),
 array([497321.56268039, 597943.05301243, 984751.04583033, ...,
        347505.67268301, 969788.88281171, 456599.85711046]),
 array([ 479535.87497738,  598943.66054933, 1008738.94579951, ...,
         333284.36390326,  964171.75657004,  482462.60616543]),
 array([507819.69787987, 634143.6932939 , 967154.84726532, ...,
        333348.83636334, 891392.05275484, 475397.70274258]),
 array([507415.91418662, 647563.11029373, 978704.7969378 , ...,
        342272.33044608, 891408.97329989, 463173.21573961]),
 array([521227.14228919, 633994.51441139, 984911.79169643, ...,
        330756.55576194, 880439.54657869, 472348.363302

In [28]:
# Bagging을 바탕으로 예측한 결과값에 대한 평균을 계산
bagging_predict = [] # 빈 리스트 생성
for lst2_index in range(test_x.shape[0]): # 테스트 데이터 개수만큼의 반복
    temp_predict = [] # 임시 빈 리스트 생성 (반복문 내 결과값 저장)
    for lst_index in range(len(bagging_predict_result)):  # Bagging 결과 리스트 반복
        temp_predict.append(bagging_predict_result[lst_index][lst2_index]) # 각 Bagging 결과 예측한 값 중 같은 인덱스를 리스트에 저장
    bagging_predict.append(np.mean(temp_predict)) # 해당 인덱스의 30개의 결과값에 대한 평균을 최종 리스트에 추가

In [29]:
# 예측한 결과값들의 평균을 계산하여 실제 테스트 데이터의 타겟변수와 비교하여 성능 평가

print("RMSE: {}".format(sqrt(mean_squared_error(bagging_predict, test_y)))) # RMSE

RMSE: 210682.699009588


In [30]:
bagging_predict

[509147.1007773053,
 633232.4875692094,
 956674.1206549844,
 1643963.8795113303,
 644435.0893324817,
 368328.28812954423,
 696320.9292997416,
 435851.3671626147,
 462703.8790795698,
 488839.1152137094,
 631678.5839093649,
 392210.80990142433,
 301444.5792662418,
 358317.1669741544,
 338410.8506648115,
 1313422.36679262,
 357143.5096889053,
 1022856.2881740012,
 318749.62461327895,
 522173.15029920964,
 375742.25705323403,
 1843315.2454337825,
 663868.4833597793,
 538751.9279366222,
 504819.27282307856,
 487723.03219143173,
 299261.9705630215,
 257205.21647815648,
 480939.14672892186,
 539223.5184532373,
 487047.38749407337,
 476615.8365943123,
 459335.88381398114,
 574702.2681337569,
 378111.99166699185,
 1035833.541072169,
 889128.5620355582,
 520636.4078182505,
 355566.60268343793,
 1495381.9438899937,
 396307.9718521495,
 274978.5195188256,
 513058.2040646197,
 340129.182571915,
 255089.7265422743,
 243355.63448639648,
 329983.6280255836,
 332404.8044288701,
 353518.71809674177,
 56