# 데이터 불러오기 및 전처리

In [1]:
import pandas as pd
# Load the training inputs, training targets and test inputs from CSV files
# under the local `data/` directory.
train_x_df = pd.read_csv('data/train_x_df.csv')
train_y_df = pd.read_csv('data/train_y_df.csv')
test_x_df = pd.read_csv('data/test_x_df.csv')
# Notebook cell output: preview the first rows of the training inputs.
train_x_df.head()

Unnamed: 0,sample_id,time,coin_index,open,high,low,close,volume,quote_av,trades,tb_base_av,tb_quote_av
0,0,0,9,0.983614,0.983614,0.983128,0.983246,0.001334,10.650987,0.009855,0.000848,6.771755
1,0,1,9,0.983245,0.983612,0.982453,0.982693,0.001425,11.375689,0.016137,0.000697,5.565188
2,0,2,9,0.982694,0.983612,0.982403,0.983002,0.001542,12.301942,0.014166,0.000905,7.225459
3,0,3,9,0.983009,0.984848,0.983009,0.984486,0.00252,20.134695,0.021557,0.001171,9.353
4,0,4,9,0.984233,0.984606,0.983612,0.984164,0.002818,22.515448,0.021434,0.001799,14.372534


In [2]:
# Min-max scale `volume` and `trades` separately for every coin index.
# The min/max are taken from the training rows only and reused for the test
# rows, so both frames end up on the same per-coin scale.
# NOTE: only coin indices present in train_x_df are visited; test rows whose
# coin index never occurs in the training frame are left unscaled.
for coin_idx in train_x_df.coin_index.unique():
    # Build each boolean row mask once per coin instead of once per access.
    train_mask = train_x_df.coin_index == coin_idx
    test_mask = test_x_df.coin_index == coin_idx
    for col in ['volume', 'trades']:
        train_values = train_x_df.loc[train_mask, col]
        min_val = train_values.min()
        max_val = train_values.max()
        value_range = max_val - min_val
        if value_range == 0:
            # Constant column for this coin: dividing by zero would give
            # NaN/inf, so fall back to a unit range (values become x - min).
            value_range = 1
        # Vectorized arithmetic replaces the per-element apply(lambda ...).
        train_x_df.loc[train_mask, col] = (train_values - min_val) / value_range
        test_x_df.loc[test_mask, col] = (
            test_x_df.loc[test_mask, col] - min_val
        ) / value_range

import numpy as np
def df2d_to_array3d(df_2d):
    """Convert a long-format 2D frame into a (sample, time, feature) array.

    The first three columns are treated as key columns and skipped; every
    remaining column becomes one feature. Rows are put in (sample_id, time)
    order before reshaping, so each sample's time steps are contiguous even
    when the input rows are not already sorted.

    Args:
        df_2d: DataFrame with `sample_id` and `time` columns among its first
            three columns, holding one row per (sample_id, time) pair.

    Returns:
        Array of shape (n_samples, n_time_steps, n_features), with samples in
        ascending `sample_id` order and time steps in ascending `time` order.

    Raises:
        ValueError: if the row count is not n_samples * n_time_steps, i.e.
            the frame is not a complete sample-by-time grid.
    """
    feature_size = len(df_2d.columns[3:])  # 9
    time_size = len(df_2d.time.unique())  # 1380 for x, 120 for y
    sample_size = len(df_2d.sample_id.unique())  # 7661 for train, 535 for test
    if len(df_2d) != sample_size * time_size:
        raise ValueError(
            f'expected {sample_size} samples x {time_size} time steps = '
            f'{sample_size * time_size} rows, got {len(df_2d)}'
        )

    values = df_2d.iloc[:, 3:].values
    # lexsort treats the LAST key as primary: order by sample_id, then time.
    # It is stable, so already-sorted input maps to the identity order.
    order = np.lexsort((df_2d.time.values, df_2d.sample_id.values))
    if not np.array_equal(order, np.arange(len(order))):
        # Reorder only when needed to avoid copying a large, sorted frame.
        values = values[order]
    return values.reshape([sample_size, time_size, feature_size])

# Reshape each long-format frame into a 3D array via df2d_to_array3d.
train_x_array = df2d_to_array3d(train_x_df)
train_y_array = df2d_to_array3d(train_y_df)
test_x_array = df2d_to_array3d(test_x_df)

# Print the resulting shapes as a sanity check.
print(f'''
These shape stands for (sample_size, time_step, feature)
train_x_array {train_x_array.shape}
train_y_array {train_y_array.shape}
test_x_array {test_x_array.shape}
''')


These shape stands for (sample_size, time_step, feature)
train_x_array (7661, 1380, 9)
train_y_array (7661, 120, 9)
test_x_array (535, 1380, 9)



In [3]:
# Model inputs: keep feature indices 0, 4 and 6 of the input array.
X = train_x_array[:, :, [0, 4, 6]]  # open, volume, trades
# Target series: feature index 3 of the target array; indexing with a list
# keeps a trailing feature axis of size 1.
# NOTE(review): index 3 looks like `close` given the column order shown in
# the data preview — confirm.
y = train_y_array[:, :, [3]]
# Notebook cell output: shapes of X and y.
X.shape, y.shape

((7661, 1380, 3), (7661, 120, 1))

## y 데이터를 argmax와 max로 정리

In [4]:
# Reduce every target window to two scalars: the position of its maximum
# value (y_argmax) and the maximum value itself (y_max).
# Each sample is flattened to 1D first so that argmax is an index into the
# whole window, exactly like a per-sample `y[i].argmax()`.
y_flat = y.reshape(len(y), y.shape[1] * y.shape[2])
y_argmax = y_flat.argmax(axis=1)
y_max = y_flat.max(axis=1)
# Notebook cell output: both targets hold one value per sample.
y_argmax.shape, y_max.shape

((7661,), (7661,))

In [5]:
def get_avg_10(array):
    """Average a 2D (time, feature) array over consecutive 10-row chunks.

    Rows are grouped into chunks of 10 starting at row 0; a trailing chunk
    with fewer than 10 rows is averaged over the rows it has. The per-chunk
    feature means are returned flattened into one 1D array, chunk by chunk.
    """
    chunk_starts = range(0, len(array), 10)
    chunk_means = [
        array[start:start + 10, :].mean(axis=0) for start in chunk_starts
    ]
    return np.array(chunk_means).reshape(-1)

# Apply the 10-step averaging to every sample and stack the flattened
# per-sample vectors into a 2D (sample, feature) matrix.
X = np.array([get_avg_10(sample) for sample in X])
X.shape

(7661, 414)

In [6]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for validation. The features and both targets
# are split in one call so their rows stay aligned; random_state fixes the
# shuffle.
(
    X_train, X_test,
    y_argmax_train, y_argmax_test,
    y_max_train, y_max_test,
) = train_test_split(X, y_argmax, y_max, test_size=0.2, random_state=1)
X_train.shape, y_argmax_train.shape, y_max_train.shape

((6128, 414), (6128,), (6128,))

## y_max

In [9]:
%%time
# y_max
# Grid-search LightGBM regressor hyperparameters for predicting y_max.
from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV
lgbm_max = LGBMRegressor()
# Candidate values; the full grid has 5 * 3 * 3 = 45 combinations.
params = {
    'learning_rate': [0.01, 0.05, 0.1, 0.3, 1.0], 
    'n_estimators': [100, 150, 200], 
    'max_depth': [3, 5, 7]
}
# No cv/scoring arguments are passed, so GridSearchCV uses its defaults.
model_lgbm_max = GridSearchCV(lgbm_max, params)
model_lgbm_max.fit(X_train, y_max_train)

NameError: name 'model_lgbm' is not defined

In [10]:
# Predict y_max for the held-out split with the fitted grid search.
y_max_pred = model_lgbm_max.predict(X_test)

from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score as r2
# Report hold-out error metrics and the hyperparameters the search selected.
print('LGBM for y_max')
print('mse:', mse(y_max_test, y_max_pred))
print('r2:', r2(y_max_test, y_max_pred))
print('best_params: ', model_lgbm_max.best_params_)

LGBM for y_max
mse: 0.00017537568312495192
r2: 0.10313430652759226
best_params:  {'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 200}


## y_argmax

In [11]:
%%time
# y_argmax
# Grid-search LightGBM regressor hyperparameters for predicting y_argmax;
# the argmax position is fitted as a regression target.
lgbm_argmax = LGBMRegressor()
# Same candidate grid as the y_max search.
params = {
    'learning_rate': [0.01, 0.05, 0.1, 0.3, 1.0], 
    'n_estimators': [100, 150, 200], 
    'max_depth': [3, 5, 7]
}
model_lgbm_argmax = GridSearchCV(lgbm_argmax, params)
model_lgbm_argmax.fit(X_train, y_argmax_train)
# Evaluate on the held-out split; `mse` and `r2` are the sklearn.metrics
# aliases imported in the y_max evaluation cell.
y_argmax_pred = model_lgbm_argmax.predict(X_test)
print('mse:', mse(y_argmax_test, y_argmax_pred))
print('r2:', r2(y_argmax_test, y_argmax_pred))
print('best_params: ', model_lgbm_argmax.best_params_)

mse: 1603.268273126283
r2: 0.013968007974931917
best_params:  {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200}
CPU times: user 33min 45s, sys: 33.4 s, total: 34min 18s
Wall time: 8min 57s


# 전체 학습 데이터로 최종 모델 학습 및 테스트 예측

In [6]:
# Build the feature matrix for the test set with the same feature selection
# (indices 0, 4, 6) and 10-step averaging used for the training features.
# NOTE: this rebinds X_test, replacing the validation split produced by
# train_test_split.
X_test = np.array(
    [get_avg_10(sample) for sample in test_x_array[:, :, [0, 4, 6]]]
)
X_test.shape

(535, 414)

In [8]:
from lightgbm import LGBMRegressor
# Final models with fixed hyperparameters; the values match the best_params
# printed by the two grid searches.
final_lgbm_max = LGBMRegressor(learning_rate=0.01, max_depth=7, n_estimators=200)
final_lgbm_argmax = LGBMRegressor(learning_rate=0.01, max_depth=3, n_estimators=200)


# Fit on the full feature matrix X (no hold-out split).
final_lgbm_max.fit(X, y_max)
final_lgbm_argmax.fit(X, y_argmax)

# Predict both targets for X_test; this rebinds y_max_pred and y_argmax_pred.
y_max_pred = final_lgbm_max.predict(X_test)
y_argmax_pred = final_lgbm_argmax.predict(X_test)

# 제출 파일 만들기

In [9]:
# Assemble the submission table with one row per test sample.
# sample_id is taken from the ids actually present in the test frame, in
# ascending order, instead of being rebuilt as `row number + smallest id`,
# which is only correct when the ids are contiguous. If the number of ids
# and predictions differ, the DataFrame constructor raises instead of
# silently mislabelling rows.
sample_ids = np.sort(test_x_df.sample_id.unique())
submission = pd.DataFrame({
    'sample_id': sample_ids,
    # 1 when the predicted maximum exceeds 1.005, otherwise 0.
    'buy_quantity': (y_max_pred > 1.005).astype(np.int64),
    # Predicted argmax rounded to the nearest integer time step
    # (half-to-even, the same rule as the built-in round()).
    'sell_time': np.rint(y_argmax_pred).astype(np.int64),
})
# Notebook cell output: how many samples get each buy_quantity value.
submission.buy_quantity.value_counts()

1    535
Name: buy_quantity, dtype: int64

In [10]:
# Notebook cell output: display the submission table for a visual check.
submission

Unnamed: 0,sample_id,buy_quantity,sell_time
0,7661,1,50
1,7662,1,53
2,7663,1,51
3,7664,1,63
4,7665,1,49
...,...,...,...
530,8191,1,57
531,8192,1,54
532,8193,1,47
533,8194,1,57


In [11]:
# Write the submission to CSV; index=False leaves out the DataFrame index.
submission.to_csv('lgbm_submission_v3.csv', index = False)