# 데이터 불러오기

In [1]:
import pandas as pd
train_x_df = pd.read_csv('data/train_x_df.csv')
train_y_df = pd.read_csv('data/train_y_df.csv')
test_x_df = pd.read_csv('data/test_x_df.csv')
train_x_df.head()

Unnamed: 0,sample_id,time,coin_index,open,high,low,close,volume,quote_av,trades,tb_base_av,tb_quote_av
0,0,0,9,0.983614,0.983614,0.983128,0.983246,0.001334,10.650987,0.009855,0.000848,6.771755
1,0,1,9,0.983245,0.983612,0.982453,0.982693,0.001425,11.375689,0.016137,0.000697,5.565188
2,0,2,9,0.982694,0.983612,0.982403,0.983002,0.001542,12.301942,0.014166,0.000905,7.225459
3,0,3,9,0.983009,0.984848,0.983009,0.984486,0.00252,20.134695,0.021557,0.001171,9.353
4,0,4,9,0.984233,0.984606,0.983612,0.984164,0.002818,22.515448,0.021434,0.001799,14.372534


In [2]:
# 코인 인덱스 별로 volume이랑 trades min max scale
for coin_idx in train_x_df.coin_index.unique():
    for col in ['volume', 'trades']:
        min_val = train_x_df.loc[train_x_df.coin_index==coin_idx, col].min()
        max_val = train_x_df.loc[train_x_df.coin_index==coin_idx, col].max()
        train_x_df.loc[train_x_df.coin_index==coin_idx, col] = train_x_df.loc[train_x_df.coin_index==coin_idx, col].apply(lambda x: (x - min_val) / (max_val - min_val))
        test_x_df.loc[test_x_df.coin_index==coin_idx, col] = test_x_df.loc[test_x_df.coin_index==coin_idx, col].apply(lambda x: (x - min_val) / (max_val - min_val))

In [3]:
import numpy as np
def df2d_to_array3d(df_2d):
    feature_size = len(df_2d.columns[3:]) # 9
    time_size = len(df_2d.time.unique()) # x는 1380, y는 120
    sample_size = len(df_2d.sample_id.unique()) # train은 7661, test는 535
    array_3d = df_2d.iloc[:,3:].values.reshape([sample_size, time_size, feature_size])
    return array_3d

train_x_array = df2d_to_array3d(train_x_df)
train_y_array = df2d_to_array3d(train_y_df)
test_x_array = df2d_to_array3d(test_x_df)

print(f'''
These shape stands for (sample_size, time_step, feature)
train_x_array {train_x_array.shape}
train_y_array {train_y_array.shape}
test_x_array {test_x_array.shape}
''')


These shape stands for (sample_size, time_step, feature)
train_x_array (7661, 1380, 9)
train_y_array (7661, 120, 9)
test_x_array (535, 1380, 9)



In [4]:
# X = train_x_array[:, :, [0, 4]]
# y = train_y_array[:, :, [0, 4]]
X = train_x_array[:, :, [0, 4, 6]]  # open, volume, trades
y = train_y_array[:, :, [3]]
X.shape, y.shape

((7661, 1380, 3), (7661, 120, 1))

In [5]:
y_argmax = []
y_max = []
for i in range(len(y)):
    y_argmax.append(y[i].argmax())
    y_max.append(y[i].max())
y_argmax = np.array(y_argmax)
y_max = np.array(y_max)
y_argmax.shape, y_max.shape

((7661,), (7661,))

In [6]:
def get_avg_10(array):
    new_array = []
    for i in range(0, len(array), 10):
        new_array.append(array[i:i+10, :].mean(axis=0))
    return np.array(new_array).flatten()

X = np.array(list(map(get_avg_10, X)))
X.shape

(7661, 414)

In [40]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_argmax_train, y_argmax_test, y_max_train, y_max_test = train_test_split(X, y_argmax, y_max, test_size=0.2, random_state=0)
X_train.shape, y_argmax_train.shape, y_max_train.shape

((6128, 414), (6128,), (6128,))

# 각종 회귀

## 택함받지 못한 백성

```python
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
```

```python
from sklearn.decomposition import PCA
pca = PCA(n_components=276)
pca.fit(X)
# pca.explained_variance_
X = pca.transform(X)
```

## Linear regression

선형회귀는 딱히 그리드서치할 게 없으므로 train_test_split 사용

In [49]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score as r2
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
print('mse:', mse(y_test, y_pred))
print('r2:', r2(y_test, y_pred))

NameError: name 'y_train' is not defined

## Ridge Regression

In [169]:
from sklearn.linear_model import RidgeCV
ri = RidgeCV(cv=5, alphas=np.logspace(-6, 6, 13))
ri.fit(X, y)

RidgeCV(alphas=array([1.e-06, 1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01,
       1.e+02, 1.e+03, 1.e+04, 1.e+05, 1.e+06]),
        cv=5)

In [170]:
y_pred = ri.predict(X_test)
print('mse:', mse(y_test, y_pred))
print('r2:', r2(y_test, y_pred))

mse: 1726.5273471325015
r2: 0.00849156606078616


## Lasso Regression

In [171]:
from sklearn.linear_model import LassoCV
la = LassoCV(cv=5, random_state=0, alphas=np.logspace(-6, 6, 13))
la.fit(X, y)

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(


LassoCV(alphas=array([1.e-06, 1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01,
       1.e+02, 1.e+03, 1.e+04, 1.e+05, 1.e+06]),
        cv=5, random_state=0)

In [172]:
y_pred = la.predict(X_test)
print('mse:', mse(y_test, y_pred))
print('r2:', r2(y_test, y_pred))

mse: 1724.8951044199403
r2: 0.009428928807126868


## random forest

In [173]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(max_depth=5)
rf.fit(X_train, y_train)

RandomForestRegressor(max_depth=5)

In [174]:
y_pred = rf.predict(X_test)
print('mse:', mse(y_test, y_pred))
print('r2:', r2(y_test, y_pred))

mse: 1724.3932187599962
r2: 0.00971715121236949


```python
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()
params = {
    'n_estimators': [100, 150, 200],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 4, 6, 10]
}
model_rf = GridSearchCV(rf, params)
model_rf.fit(X, y)
y_pred = model_rf.predict(X_test)
print('mse:', mse(y_test, y_pred))
print('r2:', r2(y_test, y_pred))
```

## Adaboost

In [175]:
from sklearn.ensemble import AdaBoostRegressor
ad = AdaBoostRegressor()
ad.fit(X_train, y_train)

AdaBoostRegressor()

In [176]:
y_pred = ad.predict(X_test)
print('mse:', mse(y_test, y_pred))
print('r2:', r2(y_test, y_pred))

mse: 1746.1692547979424
r2: -0.002788369435911875


```python
from sklearn.ensemble import AdaBoostRegressor
ad = AdaBoostRegressor()
params = {
    'n_estimators': [50, 100], 
    'learning_rate': [0.01, 0.05, 0.1, 0.3, 1.0], 
    'loss': ['linear', 'square', 'exponential']
}
model_ad = GridSearchCV(ad, params)
model_ad.fit(X, y)
y_pred = model_ad.predict(X_test)
print('mse:', mse(y_test, y_pred))
print('r2:', r2(y_test, y_pred))
```

## LightGBM

In [177]:
from lightgbm import LGBMRegressor
lgbm = LGBMRegressor()
lgbm.fit(X_train, y_train)

LGBMRegressor()

In [178]:
y_pred = lgbm.predict(X_test)
print('mse:', mse(y_test, y_pred))
print('r2:', r2(y_test, y_pred))

mse: 1771.045396175346
r2: -0.01707421554227362


```python
from lightgbm import LGBMRegressor
lgbm = LGBMRegressor()
params = {
    'learning_rate': [0.01, 0.05, 0.1, 0.3, 1.0], 
    'n_estimators': [100, 150, 200], 
    'max_depth': [3, 5, 7]
}
model_lgbm = GridSearchCV(lgbm, params)
model_lgbm.fit(X, y)
y_pred = model_lgbm.predict(X_test)
print('mse:', mse(y_test, y_pred))
print('r2:', r2(y_test, y_pred))
```

In [44]:
y_argmax

array([ 13,  28, 104, ...,  64,  38,  44])

In [47]:
%%time
from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV
lgbm_max = LGBMRegressor()
params = {
    'learning_rate': [0.01, 0.05, 0.1, 0.3, 1.0], 
    'n_estimators': [100, 150, 200], 
    'max_depth': [3, 5, 7]
}
model_lgbm = GridSearchCV(lgbm_max, params)
model_lgbm.fit(X_train, y_max_train)
y_pred = model_lgbm.predict(X_test)

NameError: name 'mse' is not defined

In [48]:
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score as r2
print('LGBM for y_max')
print('mse:', mse(y_max_test, y_pred))
print('r2:', r2(y_max_test, y_pred))

LGBM for y_max
mse: 0.000180139087574658
r2: 0.14141208579377873


In [54]:
model_lgbm.best_params_

{'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 200}

In [55]:
%%time
from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV
lgbm_argmax = LGBMRegressor()
params = {
    'learning_rate': [0.01, 0.05, 0.1, 0.3, 1.0], 
    'n_estimators': [100, 150, 200], 
    'max_depth': [3, 5, 7]
}
model_lgbm_argmax = GridSearchCV(lgbm_argmax, params)
model_lgbm_argmax.fit(X_train, y_argmax_train)
y_pred_argmax = model_lgbm.predict(X_test)
print('LGBM for y_argmax')
print('mse:', mse(y_argmax_test, y_pred_argmax))
print('r2:', r2(y_argmax_test, y_pred_argmax))

LGBM for y_argmax
mse: 4589.220905411424
r2: -1.641102299487923
CPU times: user 33min 40s, sys: 38.4 s, total: 34min 19s
Wall time: 9min 4s


In [65]:
y_pred_argmax

array([1.01352669, 1.01194308, 1.0111374 , ..., 1.01685459, 1.01532848,
       1.00690499])

In [66]:
y_pred_max

NameError: name 'y_pred_max' is not defined

In [61]:
model_lgbm_argmax.best_params_

{'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 150}

In [59]:
np.mean(np.square(y_argmax_test - y_argmax_test.mean()))

1737.615731999937

## xgboost

In [179]:
from xgboost import XGBRegressor
xgb = XGBRegressor()
xgb.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [180]:
y_pred = xgb.predict(X_test)
print('mse:', mse(y_test, y_pred))
print('r2:', r2(y_test, y_pred))

mse: 1934.0980457834132
r2: -0.11071193146437586


```python
from xgboost import XGBRegressor
xgb = XGBRegressor()
params = {
    'booster': ['gbtree', 'gblinear', 'dart'], 
    'max_depth': [3, 5, 7], 
    'n_estimators': [100, 150, 200], 
    'learning_rate': [0.01, 0.05, 0.1, 0.5, 1]}
model_xgb = GridSearchCV(xgb, params)
model_xgb.fit(X, y)
y_pred = model_xgb.predict(X_test)
print('mse:', mse(y_test, y_pred))
print('r2:', r2(y_test, y_pred))
```

## SVR

In [181]:
from sklearn.svm import SVR
svr = SVR()
svr.fit(X_train, y_train)

SVR()

In [182]:
y_pred = svr.predict(X_test)
print('mse:', mse(y_test, y_pred))
print('r2:', r2(y_test, y_pred))

mse: 1736.6987192324455
r2: 0.0026503604531380454


# Stacking

## argmax

In [65]:
X_train.shape, y_argmax_train.shape, X_test.shape, y_argmax_test.shape

((6128, 414), (6128,), (1533, 414), (1533,))

In [10]:
from sklearn.model_selection import KFold
def get_stacking_data(model, X_train, y_train, X_test, n_folds=5):
    kfold = KFold(n_splits=n_folds, shuffle=False)
    
    # 초기화
    train_fold_pred = np.zeros((X_train.shape[0], 1))
    test_pred = np.zeros((X_test.shape[0], n_folds))
    print(f'model: {model.__class__.__name__}')
    
    for idx, (train_idx, valid_idx) in enumerate(kfold.split(X_train)):
        X_train_ = X_train[train_idx]
        y_train_ = y_train[train_idx]
        X_val_ = X_train[valid_idx]
        
        model.fit(X_train_, y_train_)
        
        train_fold_pred[valid_idx, :] = model.predict(X_val_).reshape(-1, 1)
        test_pred[:, idx] = model.predict(X_test)

    test_pred_mean = np.mean(test_pred, axis=1).reshape(-1, 1)
    
    return train_fold_pred, test_pred_mean    

In [67]:
%%time
from sklearn.linear_model import Ridge, Lasso
# from sklearn.ensemble import RandomForestRegressor # 너무 오래 걸림 
from sklearn.ensemble import AdaBoostRegressor
from sklearn.svm import SVR

ridge_train, ridge_test = get_stacking_data(Ridge(alpha=10), X_train, y_argmax_train, X_test)
lasso_train, lasso_test = get_stacking_data(Lasso(alpha=0.01), X_train, y_argmax_train, X_test)
adboost_train, adboost_test = get_stacking_data(AdaBoostRegressor(), X_train, y_argmax_train, X_test)
# rf_train, rf_test = get_stacking_data(RandomForestRegressor(), X_train, y_argmax_train, X_test)
svm_train, svm_test = get_stacking_data(SVR(), X_train, y_argmax_train, X_test)

model: Ridge
model: Lasso
model: AdaBoostRegressor
model: SVR
CPU times: user 2min 40s, sys: 1.78 s, total: 2min 42s
Wall time: 2min 43s


In [68]:
new_X_train = np.concatenate((
    ridge_train, 
    lasso_train, 
    adboost_train, 
#     rf_train, 
    svm_train
), axis=1)
new_X_test = np.concatenate((
    ridge_test, 
    lasso_test, 
    adboost_test, 
#     rf_test, 
    svm_test
), axis=1)
new_X_train.shape, new_X_test.shape

((6128, 4), (1533, 4))

In [69]:
from lightgbm import LGBMRegressor
lgbm_argmax = LGBMRegressor()
lgbm_argmax.fit(new_X_train, y_argmax_train)

LGBMRegressor()

In [70]:
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score as r2

y_pred = lgbm_argmax.predict(new_X_test)
print('mse:', mse(y_argmax_test, y_pred))
print('r2:', r2(y_argmax_test, y_pred))

mse: 1763.5207793849422
r2: -0.014908386766957715


## max

In [71]:
X_train.shape, y_max_train.shape, X_test.shape, y_max_test.shape

((6128, 414), (6128,), (1533, 414), (1533,))

In [72]:
%%time
from sklearn.linear_model import Ridge, Lasso
# from sklearn.ensemble import RandomForestRegressor # 너무 오래 걸림 
from sklearn.ensemble import AdaBoostRegressor
from sklearn.svm import SVR

ridge_train, ridge_test = get_stacking_data(Ridge(alpha=10), X_train, y_max_train, X_test)
lasso_train, lasso_test = get_stacking_data(Lasso(alpha=0.01), X_train, y_max_train, X_test)
adboost_train, adboost_test = get_stacking_data(AdaBoostRegressor(), X_train, y_max_train, X_test)
# rf_train, rf_test = get_stacking_data(RandomForestRegressor(), X_train, y_max_train, X_test)
svm_train, svm_test = get_stacking_data(SVR(), X_train, y_max_train, X_test)

model: Ridge
model: Lasso
model: AdaBoostRegressor
model: SVR
CPU times: user 1min 31s, sys: 1.27 s, total: 1min 32s
Wall time: 1min 33s


In [73]:
max_X_train = np.concatenate((
    ridge_train, 
    lasso_train, 
    adboost_train, 
#     rf_train, 
    svm_train
), axis=1)
max_X_test = np.concatenate((
    ridge_test, 
    lasso_test, 
    adboost_test, 
#     rf_test, 
    svm_test
), axis=1)
max_X_train.shape, max_X_test.shape

((6128, 4), (1533, 4))

In [74]:
from lightgbm import LGBMRegressor
lgbm_max = LGBMRegressor()
lgbm_max.fit(max_X_train, y_max_train)

LGBMRegressor()

In [75]:
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score as r2

y_pred = lgbm_max.predict(max_X_test)
print('mse:', mse(y_max_test, y_pred))
print('r2:', r2(y_max_test, y_pred))

mse: 0.00021021411434051643
r2: -0.0019330085344735348


# 데이터 전체

In [8]:
X_test = test_x_array[:, :, [0, 4, 6]]
X_test = np.array(list(map(get_avg_10, X_test)))
X_test.shape

(535, 414)

In [11]:
%%time
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR

ridge_train, ridge_test = get_stacking_data(Ridge(alpha=10), X, y_argmax, X_test)
lasso_train, lasso_test = get_stacking_data(Lasso(alpha=0.01), X, y_argmax, X_test)
adboost_train, adboost_test = get_stacking_data(AdaBoostRegressor(), X, y_argmax, X_test)
xgb_train, xgb_test = get_stacking_data(XGBRegressor(), X, y_argmax, X_test)
svm_train, svm_test = get_stacking_data(SVR(), X, y_argmax, X_test)

model: Ridge
model: Lasso
model: AdaBoostRegressor
model: XGBRegressor
model: SVR
CPU times: user 13min 53s, sys: 4.9 s, total: 13min 58s
Wall time: 6min 25s


In [13]:
argmax_X_train = np.concatenate((
    ridge_train, 
    lasso_train, 
    adboost_train, 
    xgb_train,
    svm_train
), axis=1)
argmax_X_test = np.concatenate((
    ridge_test, 
    lasso_test, 
    adboost_test, 
    xgb_test,
    svm_test
), axis=1)
argmax_X_train.shape, argmax_X_test.shape

((7661, 5), (535, 5))

In [14]:
from lightgbm import LGBMRegressor
lgbm_argmax = LGBMRegressor()
lgbm_argmax.fit(argmax_X_train, y_argmax)

LGBMRegressor()

In [15]:
y_argmax_pred = lgbm_argmax.predict(argmax_X_test)
y_argmax_pred.shape

(535,)

-------

In [16]:
%%time
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR

ridge_train, ridge_test = get_stacking_data(Ridge(alpha=10), X, y_max, X_test)
lasso_train, lasso_test = get_stacking_data(Lasso(alpha=0.01), X, y_max, X_test)
adboost_train, adboost_test = get_stacking_data(AdaBoostRegressor(), X, y_max, X_test)
xgb_train, xgb_test = get_stacking_data(XGBRegressor(), X, y_max, X_test)
svm_train, svm_test = get_stacking_data(SVR(), X, y_max, X_test)

model: Ridge
model: Lasso
model: AdaBoostRegressor
model: XGBRegressor
model: SVR
CPU times: user 12min, sys: 4.05 s, total: 12min 5s
Wall time: 4min 51s


In [17]:
max_X_train = np.concatenate((
    ridge_train, 
    lasso_train, 
    adboost_train, 
    xgb_train, 
    svm_train
), axis=1)
max_X_test = np.concatenate((
    ridge_test, 
    lasso_test, 
    adboost_test, 
    xgb_test,
    svm_test
), axis=1)
max_X_train.shape, max_X_test.shape

((7661, 5), (535, 5))

In [18]:
from lightgbm import LGBMRegressor
lgbm_max = LGBMRegressor()
lgbm_max.fit(max_X_train, y_max)

LGBMRegressor()

In [19]:
y_max_pred = lgbm_max.predict(max_X_test)
y_max_pred.shape

(535,)

## 제출 파일 만들기

In [20]:
submission = pd.DataFrame(np.zeros([max_X_test.shape[0],2], np.int64),
                columns = ['buy_quantity', 'sell_time'])
submission = submission.reset_index()
submission.columns = ['sample_id','buy_quantity', 'sell_time']
submission

Unnamed: 0,sample_id,buy_quantity,sell_time
0,0,0,0
1,1,0,0
2,2,0,0
3,3,0,0
4,4,0,0
...,...,...,...
530,530,0,0
531,531,0,0
532,532,0,0
533,533,0,0


In [21]:
submission['sell_time'] = list(map(round, y_argmax_pred))
submission

Unnamed: 0,sample_id,buy_quantity,sell_time
0,0,0,45
1,1,0,52
2,2,0,44
3,3,0,63
4,4,0,54
...,...,...,...
530,530,0,52
531,531,0,55
532,532,0,35
533,533,0,62


In [34]:
submission['buy_quantity'] = (y_max_pred > 1.006) * 0.6 + (y_max_pred > 1.007) * 0.1 + (y_max_pred > 1.008) * 0.1 + (y_max_pred > 1.009) * 0.1 + (y_max_pred > 1.01) * 0.1
submission 


Unnamed: 0,sample_id,buy_quantity,sell_time
0,7661,1.0,45
1,7662,1.0,52
2,7663,1.0,44
3,7664,1.0,63
4,7665,1.0,54
...,...,...,...
530,8191,1.0,52
531,8192,1.0,55
532,8193,1.0,35
533,8194,0.7,62


In [35]:
submission.buy_quantity.value_counts()

1.0    284
0.9     71
0.7     65
0.6     63
0.8     29
0.0     23
Name: buy_quantity, dtype: int64

In [30]:
submission.sample_id = submission.sample_id + test_x_df.sample_id.min()
submission

Unnamed: 0,sample_id,buy_quantity,sell_time
0,7661,1.0,45
1,7662,1.0,52
2,7663,1.0,44
3,7664,1.0,63
4,7665,1.0,54
...,...,...,...
530,8191,1.0,52
531,8192,1.0,55
532,8193,1.0,35
533,8194,1.0,62


In [36]:
submission.to_csv('stacking_submission_v9.csv', index = False)

# LGBM 만 쓰기

In [None]:
from lightgbm import LGBMRegressor
lgbm_argmax = LGBMRegressor(learning_rate=0.01, n_estimators=150, max_depth=3)
lgbm_argmax.fit(X, y_argmax)
y_max_pred = model_lgbm.predict(X_test)

In [None]:
from lightgbm import LGBMRegressor
lgbm_max = LGBMRegressor()
params = {
    'learning_rate': [0.01, 0.05, 0.1, 0.3, 1.0], 
    'n_estimators': [100, 150, 200], 
    'max_depth': [3, 5, 7]
}
model_lgbm = GridSearchCV(lgbm, params)
model_lgbm.fit(X, y)
y_pred = model_lgbm.predict(X_test)
print('mse:', mse(y_test, y_pred))
print('r2:', r2(y_test, y_pred))

In [37]:
X

array([[9.83714575e-01, 3.89541057e-03, 6.71736429e-03, ...,
        9.99277771e-01, 6.29109149e-03, 7.50503205e-03],
       [1.00522205e+00, 1.37291187e-03, 5.08052513e-03, ...,
        1.00050629e+00, 1.54793655e-03, 4.81135819e-03],
       [1.12741548e+00, 4.60765956e-04, 1.41516558e-03, ...,
        9.98829192e-01, 3.51746666e-03, 7.14877467e-03],
       ...,
       [9.79079294e-01, 6.61539208e-03, 5.29369107e-04, ...,
        9.98222405e-01, 2.12466078e-03, 4.57416018e-04],
       [1.03342366e+00, 4.31699744e-03, 4.25066467e-03, ...,
        9.99198461e-01, 1.87641573e-03, 3.65346380e-03],
       [1.01041365e+00, 6.38630003e-03, 1.61686914e-02, ...,
        1.00071533e+00, 6.40234083e-03, 2.24223431e-02]])

In [39]:
y_max

array([1.00142026, 1.00141406, 1.01211095, ..., 1.01071095, 1.00247216,
       1.01181102])