In [19]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

from sklearn.linear_model import Ridge, Lasso, LinearRegression, ElasticNet
from sklearn.svm import LinearSVR, SVR
from catboost import CatBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold, LeaveOneOut
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [2]:
# Shuffled 15-fold splitter reused by every model cell below; the fixed seed keeps the fold assignment reproducible.
kf = KFold(n_splits=15, shuffle=True, random_state=42)

#### LeaveOneOut도 시도했지만 KFold보다 성능이 좋지 않아 15-Fold KFold를 사용했습니다.

In [3]:
# Read the training and test splits from CSV files located in the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ("iris_train.csv", "iris_test.csv"))

In [4]:
# Preview the first five training rows (bare last expression so the notebook renders the frame).
train.head(n=5)

Unnamed: 0,id,species,sepal length (cm),petal length (cm),sepal width (cm),petal width (cm)
0,0,setosa,4.4,1.4,2.9,0.2
1,1,versicolor,6.4,4.5,3.2,1.5
2,2,virginica,6.2,4.8,2.8,1.8
3,3,virginica,7.2,6.1,3.6,2.5
4,4,setosa,4.9,1.4,3.0,0.2


In [5]:
# Preview the first five test rows; the two width columns to be predicted are absent here.
test.head(n=5)

Unnamed: 0,id,species,sepal length (cm),petal length (cm)
0,0,setosa,5.4,1.7
1,1,setosa,5.7,1.5
2,2,setosa,5.3,1.5
3,3,setosa,5.1,1.9
4,4,virginica,6.0,4.8


In [6]:
# Min-max scaler with the default output range spelled out; fitted in the cells that follow.
scaler = MinMaxScaler(feature_range=(0, 1))

#### Scaling 기법 중 MinMaxScaling이 성능이 좋아 사용했습니다.
- 학습 데이터의 분포를 학습한 scaler를 적용하기 위해 test 데이터에 대해서는 transform만 취해줍니다.

In [7]:
# Scale the two measured input features, addressed by name rather than by position so the
# step keeps working if columns are reordered or extra columns are added to either frame.
# The scaler is fitted on the training data only; the test data is transformed with the
# statistics learned from train.
feature_cols = ['sepal length (cm)', 'petal length (cm)']
train[feature_cols] = scaler.fit_transform(train[feature_cols])
test[feature_cols] = scaler.transform(test[feature_cols])

In [8]:
# Ordinal species code used as an input feature for the sepal-width model.
# Per the notebook's note, the ordering follows each species' mean sepal width.
sw_rank = {'setosa' : 3, 'versicolor' : 1, 'virginica' : 2}
for frame in (train, test):
    frame['sw'] = frame['species'].map(sw_rank)

#### Sepal width (cm)을 예측할 때 target encoding을 위해 species별 sepal width (cm)의 평균을 기준으로 랭크를 생성했습니다.

In [9]:
# Ordinal species code used as an input feature for the petal-width model.
# Per the notebook's note, the ordering is derived from each species' mean petal width.
pw_rank = {'setosa' : 3, 'versicolor' : 2, 'virginica' : 1}
for frame in (train, test):
    frame['pw'] = frame['species'].map(pw_rank)

#### Petal width (cm)을 예측할 때 target encoding을 위해 species별 petal width (cm)의 평균을 기준으로 랭크를 생성했습니다.

In [10]:
# Columns that are never model inputs: identifiers, the raw species label and both targets.
non_feature_cols = ['id', 'species', 'sepal width (cm)', 'petal width (cm)']

# Dataset 1 (sepal width) keeps the 'sw' rank feature; dataset 2 (petal width) keeps 'pw'.
X1 = train.drop(columns = non_feature_cols + ['pw'])
X2 = train.drop(columns = non_feature_cols + ['sw'])
y1, y2 = train['sepal width (cm)'], train['petal width (cm)']

# Test-time inputs with exactly the same columns, in the same order, as the training inputs.
target1, target2 = test[X1.columns], test[X2.columns]

#### 최종적으로 2개의 데이터셋을 구성했습니다.
- sepal width (cm) 용
- petal width (cm) 용
***
### Linear Regression

In [12]:
# Linear regression for sepal width: fit one model per fold, record its validation MAE,
# and accumulate the fold-averaged prediction for the test rows in `lr_pred1`.
n_folds = kf.get_n_splits()  # taken from the splitter so the averaging weight always matches it
lr_pred1 = np.zeros(target1.shape[0])
vm_list = []
for tr_idx, val_idx in kf.split(X1, y1):

    tr_x, tr_y = X1.iloc[tr_idx], y1.iloc[tr_idx]
    val_x, val_y = X1.iloc[val_idx], y1.iloc[val_idx]

    ### fitting
    # The former `normalize=True` argument is omitted: it was removed from LinearRegression
    # in scikit-learn 1.2 (passing it raises TypeError), and for an unregularised
    # least-squares fit with an intercept it does not change the predictions.
    lr = LinearRegression()
    lr.fit(tr_x, tr_y)

    ### validation
    val_pred = lr.predict(val_x)
    val_mae = mean_absolute_error(val_y, val_pred)
    vm_list.append(val_mae)

    ### prediction
    # Each fold contributes 1/n_folds of the final test prediction.
    lr_pred1 += lr.predict(target1) / n_folds

print(f'{n_folds} FOLD 평균 MAE = {np.mean(vm_list)}')

15 FOLD 평균 MAE = 0.23283224802305144


In [13]:
# Linear regression for petal width: fit one model per fold, record its validation MAE,
# and accumulate the fold-averaged prediction for the test rows in `lr_pred2`.
n_folds = kf.get_n_splits()  # taken from the splitter so the averaging weight always matches it
lr_pred2 = np.zeros(target2.shape[0])
vm_list = []
for tr_idx, val_idx in kf.split(X2, y2):

    tr_x, tr_y = X2.iloc[tr_idx], y2.iloc[tr_idx]
    val_x, val_y = X2.iloc[val_idx], y2.iloc[val_idx]

    ### fitting
    lr = LinearRegression()
    lr.fit(tr_x, tr_y)

    ### validation
    val_pred = lr.predict(val_x)
    val_mae = mean_absolute_error(val_y, val_pred)
    vm_list.append(val_mae)

    ### prediction
    # Each fold contributes 1/n_folds of the final test prediction.
    lr_pred2 += lr.predict(target2) / n_folds

print(f'{n_folds} FOLD 평균 MAE = {np.mean(vm_list)}')

15 FOLD 평균 MAE = 0.14261988003428838


***
### SVR

In [14]:
# Support vector regression for sepal width: fit one SVR (C=5, other settings default)
# per fold, record its validation MAE, and accumulate the fold-averaged test prediction
# in `svr_pred1`.
n_folds = kf.get_n_splits()  # taken from the splitter so the averaging weight always matches it
svr_pred1 = np.zeros(target1.shape[0])
vm_list = []
for tr_idx, val_idx in kf.split(X1, y1):

    tr_x, tr_y = X1.iloc[tr_idx], y1.iloc[tr_idx]
    val_x, val_y = X1.iloc[val_idx], y1.iloc[val_idx]

    ### fitting
    svr = SVR(C = 5)
    svr.fit(tr_x, tr_y)

    ### validation
    val_pred = svr.predict(val_x)
    val_mae = mean_absolute_error(val_y, val_pred)
    vm_list.append(val_mae)

    ### prediction
    # Each fold contributes 1/n_folds of the final test prediction.
    svr_pred1 += svr.predict(target1) / n_folds

print(f'{n_folds} FOLD 평균 MAE = {np.mean(vm_list)}')

15 FOLD 평균 MAE = 0.2142095202162351


In [17]:
# Support vector regression for petal width: fit one SVR (C=5, other settings default)
# per fold, record its validation MAE, and accumulate the fold-averaged test prediction
# in `svr_pred2`.
n_folds = kf.get_n_splits()  # taken from the splitter so the averaging weight always matches it
svr_pred2 = np.zeros(target2.shape[0])
vm_list = []
for tr_idx, val_idx in kf.split(X2, y2):
    tr_x, tr_y = X2.iloc[tr_idx], y2.iloc[tr_idx]
    val_x, val_y = X2.iloc[val_idx], y2.iloc[val_idx]

    ### fitting
    svr = SVR(C = 5)
    svr.fit(tr_x, tr_y)

    ### validation
    val_pred = svr.predict(val_x)
    val_mae = mean_absolute_error(val_y, val_pred)
    vm_list.append(val_mae)

    ### prediction
    # Each fold contributes 1/n_folds of the final test prediction.
    svr_pred2 += svr.predict(target2) / n_folds

print(f'{n_folds} FOLD 평균 MAE = {np.mean(vm_list)}')

15 FOLD 평균 MAE = 0.13891373747298283


***
### ExtraTreesRegressor

In [20]:
# Extra-trees regression for sepal width: fit a small, shallow ensemble (15 trees, depth 3,
# fixed seed) per fold, record its validation MAE, and accumulate the fold-averaged test
# prediction in `etr_pred1`.
n_folds = kf.get_n_splits()  # taken from the splitter so the averaging weight always matches it
etr_pred1 = np.zeros(target1.shape[0])
vm_list = []
for tr_idx, val_idx in kf.split(X1, y1):

    tr_x, tr_y = X1.iloc[tr_idx], y1.iloc[tr_idx]
    val_x, val_y = X1.iloc[val_idx], y1.iloc[val_idx]

    ### fitting
    etr = ExtraTreesRegressor(random_state = 42, n_estimators = 15, max_depth = 3)
    etr.fit(tr_x, tr_y)

    ### validation
    val_pred = etr.predict(val_x)
    val_mae = mean_absolute_error(val_y, val_pred)
    vm_list.append(val_mae)

    ### prediction
    # Each fold contributes 1/n_folds of the final test prediction.
    etr_pred1 += etr.predict(target1) / n_folds

print(f'{n_folds} FOLD 평균 MAE = {np.mean(vm_list)}')

15 FOLD 평균 MAE = 0.23058993733356375


In [21]:
# Extra-trees regression for petal width: fit a small, shallow ensemble (15 trees, depth 3,
# fixed seed) per fold, record its validation MAE, and accumulate the fold-averaged test
# prediction in `etr_pred2`.
n_folds = kf.get_n_splits()  # taken from the splitter so the averaging weight always matches it
etr_pred2 = np.zeros(target2.shape[0])
vm_list = []
for tr_idx, val_idx in kf.split(X2, y2):

    tr_x, tr_y = X2.iloc[tr_idx], y2.iloc[tr_idx]
    val_x, val_y = X2.iloc[val_idx], y2.iloc[val_idx]

    ### fitting
    etr = ExtraTreesRegressor(random_state = 42, n_estimators = 15, max_depth = 3)
    etr.fit(tr_x, tr_y)

    ### validation
    val_pred = etr.predict(val_x)
    val_mae = mean_absolute_error(val_y, val_pred)
    vm_list.append(val_mae)

    ### prediction
    # Each fold contributes 1/n_folds of the final test prediction.
    etr_pred2 += etr.predict(target2) / n_folds

print(f'{n_folds} FOLD 평균 MAE = {np.mean(vm_list)}')

15 FOLD 평균 MAE = 0.1462207498576399


***

### CatBoost

In [22]:
# CatBoost regression for sepal width: train one MAE-loss model per fold with early
# stopping, record its validation MAE, and accumulate the fold-averaged test prediction
# in `cb_pred1`.
n_folds = kf.get_n_splits()  # taken from the splitter so the averaging weight always matches it
cb_pred1 = np.zeros(target1.shape[0])
vm_list = []
for tr_idx, val_idx in kf.split(X1, y1):

    tr_x, tr_y = X1.iloc[tr_idx], y1.iloc[tr_idx]
    val_x, val_y = X1.iloc[val_idx], y1.iloc[val_idx]

    ### fitting
    cb = CatBoostRegressor(iterations = 5000, learning_rate = 0.1, max_depth = 3, loss_function = 'MAE')

    # Both the training fold and the validation fold are monitored; in the captured log the
    # reported "best" value follows the second set (the validation fold).
    # NOTE(review): the same validation fold picks the stopping iteration and is scored below,
    # so the reported MAE is presumably somewhat optimistic - confirm if it is used for model selection.
    cb.fit(tr_x, tr_y, eval_set = [(tr_x, tr_y), (val_x, val_y)], early_stopping_rounds = 500, verbose = 1000)

    ### validation
    val_pred = cb.predict(val_x)
    val_mae = mean_absolute_error(val_y, val_pred)
    vm_list.append(val_mae)

    ### prediction
    # Each fold contributes 1/n_folds of the final test prediction.
    cb_pred1 += cb.predict(target1) / n_folds

print(f'{n_folds} FOLD 평균 MAE = {np.mean(vm_list)}')

0:	learn: 0.2748562	test: 0.2748562	test1: 0.3279981	best: 0.3279981 (0)	total: 61.5ms	remaining: 5m 7s
Stopped by overfitting detector  (500 iterations wait)

bestTest = 0.2041686452
bestIteration = 87

Shrink model to first 88 iterations.
0:	learn: 0.2755704	test: 0.2755704	test1: 0.3139988	best: 0.3139988 (0)	total: 104us	remaining: 522ms
Stopped by overfitting detector  (500 iterations wait)

bestTest = 0.2333379438
bestIteration = 34

Shrink model to first 35 iterations.
0:	learn: 0.2877132	test: 0.2877132	test1: 0.1719992	best: 0.1719992 (0)	total: 70us	remaining: 351ms
Stopped by overfitting detector  (500 iterations wait)

bestTest = 0.0880096839
bestIteration = 91

Shrink model to first 92 iterations.
0:	learn: 0.2839989	test: 0.2839989	test1: 0.2619991	best: 0.2619991 (0)	total: 102us	remaining: 512ms
Stopped by overfitting detector  (500 iterations wait)

bestTest = 0.2144369068
bestIteration = 24

Shrink model to first 25 iterations.
0:	learn: 0.2822847	test: 0.2822847	test

In [23]:
# CatBoost regression for petal width: train one MAE-loss model per fold with early
# stopping, record its validation MAE, and accumulate the fold-averaged test prediction
# in `cb_pred2`.
n_folds = kf.get_n_splits()  # taken from the splitter so the averaging weight always matches it
cb_pred2 = np.zeros(target2.shape[0])
vm_list = []
for tr_idx, val_idx in kf.split(X2, y2):

    tr_x, tr_y = X2.iloc[tr_idx], y2.iloc[tr_idx]
    val_x, val_y = X2.iloc[val_idx], y2.iloc[val_idx]

    ### fitting
    cb = CatBoostRegressor(iterations = 5000, learning_rate = 0.2, max_depth = 3, loss_function = 'MAE')
    # Both the training fold and the validation fold are monitored; in the captured log the
    # reported "best" value follows the second set (the validation fold).
    # NOTE(review): the same validation fold picks the stopping iteration and is scored below,
    # so the reported MAE is presumably somewhat optimistic - confirm if it is used for model selection.
    cb.fit(tr_x, tr_y, eval_set = [(tr_x, tr_y), (val_x, val_y)], early_stopping_rounds = 500, verbose = 1000)

    ### validation
    val_pred = cb.predict(val_x)
    val_mae = mean_absolute_error(val_y, val_pred)
    vm_list.append(val_mae)

    ### prediction
    # Each fold contributes 1/n_folds of the final test prediction.
    cb_pred2 += cb.predict(target2) / n_folds

# The label previously said "5 FOLD" although the splitter uses 15 folds; it now reports the real count.
print(f'{n_folds} FOLD 평균 MAE = {np.mean(vm_list)}')

0:	learn: 0.4625704	test: 0.4625704	test1: 0.7239986	best: 0.7239986 (0)	total: 365us	remaining: 1.83s
Stopped by overfitting detector  (500 iterations wait)

bestTest = 0.07832421311
bestIteration = 23

Shrink model to first 24 iterations.
0:	learn: 0.4634277	test: 0.4634277	test1: 0.6199996	best: 0.6199996 (0)	total: 82us	remaining: 414ms
Stopped by overfitting detector  (500 iterations wait)

bestTest = 0.1945763624
bestIteration = 28

Shrink model to first 29 iterations.
0:	learn: 0.4737134	test: 0.4737134	test1: 0.3599992	best: 0.3599992 (0)	total: 79us	remaining: 398ms
Stopped by overfitting detector  (500 iterations wait)

bestTest = 0.04103778319
bestIteration = 208

Shrink model to first 209 iterations.
0:	learn: 0.4808564	test: 0.4808564	test1: 0.2600000	best: 0.2600000 (0)	total: 81us	remaining: 407ms
Stopped by overfitting detector  (500 iterations wait)

bestTest = 0.1408439539
bestIteration = 60

Shrink model to first 61 iterations.
0:	learn: 0.4911419	test: 0.4911419	tes

***
### DNN

- KFold를 통한 앙상블
- 각 Fold별 best 모델 저장 및 test 예측 데이터에 load 후 사용
- callbacks 활용

In [None]:
# Re-read both CSV files for the DNN section; this replaces the frames that the earlier
# cells scaled and extended with the 'sw' / 'pw' columns.
train, test = map(pd.read_csv, ("iris_train.csv", "iris_test.csv"))

In [None]:
# One-hot encode the non-numeric columns of each frame independently.
# NOTE(review): encoding the frames separately assumes every category seen in train also
# occurs in test; otherwise the later column selection on test would fail - confirm.
train, test = pd.get_dummies(train), pd.get_dummies(test)

In [None]:
# A single feature matrix serves both DNN targets: everything except the id and the two widths.
X = train.drop(columns = ['id', 'sepal width (cm)', 'petal width (cm)'])
y1, y2 = train['sepal width (cm)'], train['petal width (cm)']

# Test-time inputs restricted to, and ordered like, the training feature columns.
target = test[X.columns]

In [None]:
# Refit the shared scaler on the DNN training features, then apply the fitted transform to
# the test features. The results are wrapped back into DataFrames so column names survive.
dnn_feature_names = X.columns
X = pd.DataFrame(scaler.fit_transform(X), columns = dnn_feature_names)
target = pd.DataFrame(scaler.transform(target), columns = dnn_feature_names)

In [None]:
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [None]:
# Accumulator for the fold-averaged DNN sepal-width predictions, one slot per test row.
dnn_pred1 = np.zeros(len(target))

In [None]:
# DNN for sepal width: train a fresh network on every fold, keep the checkpoint with the
# lowest validation loss, and add that checkpoint's test prediction (weighted 1/n_folds)
# to `dnn_pred1`.
n_folds = kf.get_n_splits()  # taken from the splitter so the averaging weight always matches it
for tr_idx, val_idx in kf.split(X, y1) :
    tr_x, tr_y = X.iloc[tr_idx], y1.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y1.iloc[val_idx]

    # Fully connected regressor: swish hidden layers with two dropout stages and a linear output unit.
    model = Sequential()
    model.add(Dense(512, activation = 'swish'))
    model.add(Dense(256, activation = 'swish'))
    model.add(Dropout(.2))
    model.add(Dense(128, activation = 'swish'))
    model.add(Dense(64, activation = 'swish'))
    model.add(Dropout(.2))
    model.add(Dense(8, activation = 'swish'))
    model.add(Dense(1, activation = 'linear'))

    model.compile(optimizer = Adam(learning_rate = .0003), loss = 'mse')

    # Stop after 300 epochs without a val_loss improvement; the checkpoint file is overwritten
    # whenever val_loss improves, so it always holds this fold's best weights seen so far.
    es = EarlyStopping(monitor = 'val_loss', mode = 'min', patience = 300, verbose = 100)
    mc = ModelCheckpoint('best_dnn.h5', mode = 'min', monitor = 'val_loss', save_best_only = True, verbose = 100)

    history = model.fit(tr_x, tr_y, epochs = 25000, verbose = 100, batch_size = 16, callbacks = [es, mc], validation_data = (val_x, val_y))
    best_model = load_model('best_dnn.h5')

    # predict() returns one column per output unit; flatten the single column to 1-D
    # before adding it to the accumulator.
    fold_pred = best_model.predict(target) / n_folds

    dnn_pred1 += fold_pred.ravel()

In [None]:
# Accumulator for the fold-averaged DNN petal-width predictions, one slot per test row.
dnn_pred2 = np.zeros(len(target))

In [None]:
# DNN for petal width: train a fresh network on every fold, keep the checkpoint with the
# lowest validation loss, and add that checkpoint's test prediction (weighted 1/n_folds)
# to `dnn_pred2`.
n_folds = kf.get_n_splits()  # taken from the splitter so the averaging weight always matches it
for tr_idx, val_idx in kf.split(X, y2) :
    tr_x, tr_y = X.iloc[tr_idx], y2.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y2.iloc[val_idx]

    # Fully connected regressor: swish hidden layers with two dropout stages and a linear output unit.
    model = Sequential()
    model.add(Dense(512, activation = 'swish'))
    model.add(Dense(256, activation = 'swish'))
    model.add(Dropout(.2))
    model.add(Dense(128, activation = 'swish'))
    model.add(Dense(64, activation = 'swish'))
    model.add(Dropout(.2))
    model.add(Dense(8, activation = 'swish'))
    model.add(Dense(1, activation = 'linear'))

    model.compile(optimizer = Adam(learning_rate = .0003), loss = 'mse')

    # Stop after 300 epochs without a val_loss improvement; the checkpoint file is overwritten
    # whenever val_loss improves, so it always holds this fold's best weights seen so far.
    es = EarlyStopping(monitor = 'val_loss', mode = 'min', patience = 300, verbose = 100)
    mc = ModelCheckpoint('best_dnn.h5', mode = 'min', monitor = 'val_loss', save_best_only = True, verbose = 100)

    history = model.fit(tr_x, tr_y, epochs = 25000, verbose = 100, batch_size = 16, callbacks = [es, mc], validation_data = (val_x, val_y))
    best_model = load_model('best_dnn.h5')

    # predict() returns one column per output unit; flatten the single column to 1-D
    # before adding it to the accumulator.
    fold_pred = best_model.predict(target) / n_folds

    dnn_pred2 += fold_pred.ravel()

***
### Ensemble

In [15]:
# Equal-weight average of the four classical models (extra trees, linear regression, SVR,
# CatBoost) for each target. The first line previously lacked the `=` and therefore tried
# to call `ml_pred1` instead of assigning it.
ml_pred1 = (etr_pred1 + lr_pred1 + svr_pred1 + cb_pred1) / 4
ml_pred2 = (etr_pred2 + lr_pred2 + svr_pred2 + cb_pred2) / 4

In [24]:
# Load the submission template whose width columns are filled in below.
submission_template = 'sample_submission.csv'
submission = pd.read_csv(submission_template)

In [None]:
# Final blend: for each target, the classical-model average and the DNN average get equal weight.
blend_inputs = {
    'sepal width (cm)': (ml_pred1, dnn_pred1),
    'petal width (cm)': (ml_pred2, dnn_pred2),
}
for column, (ml_part, dnn_part) in blend_inputs.items():
    submission[column] = (ml_part + dnn_part) / 2

In [None]:
# Write the blended predictions to disk without the DataFrame index column.
output_path = "dnn_ensemble.csv"
submission.to_csv(output_path, index=False)