In [107]:
import pandas as pd
train_x_df = pd.read_csv('data/train_x_df.csv')
train_y_df = pd.read_csv('data/train_y_df.csv')
test_x_df = pd.read_csv('data/test_x_df.csv')
train_x_df.head()

Unnamed: 0,sample_id,time,coin_index,open,high,low,close,volume,quote_av,trades,tb_base_av,tb_quote_av
0,0,0,9,0.983614,0.983614,0.983128,0.983246,0.001334,10.650987,0.009855,0.000848,6.771755
1,0,1,9,0.983245,0.983612,0.982453,0.982693,0.001425,11.375689,0.016137,0.000697,5.565188
2,0,2,9,0.982694,0.983612,0.982403,0.983002,0.001542,12.301942,0.014166,0.000905,7.225459
3,0,3,9,0.983009,0.984848,0.983009,0.984486,0.00252,20.134695,0.021557,0.001171,9.353
4,0,4,9,0.984233,0.984606,0.983612,0.984164,0.002818,22.515448,0.021434,0.001799,14.372534


In [108]:
import numpy as np
def df2d_to_array3d(df_2d):
    feature_size = len(df_2d.columns[3:]) # 9
    time_size = len(df_2d.time.unique()) # x는 1380, y는 120
    sample_size = len(df_2d.sample_id.unique()) # train은 7661, test는 535
    array_3d = df_2d.iloc[:,3:].values.reshape([sample_size, time_size, feature_size])
    return array_3d

train_x_array = df2d_to_array3d(train_x_df)
train_y_array = df2d_to_array3d(train_y_df)
test_x_array = df2d_to_array3d(test_x_df)

print(f'''
These shape stands for (sample_size, time_step, feature)
train_x_array {train_x_array.shape}
train_y_array {train_y_array.shape}
test_x_array {test_x_array.shape}
''')


These shape stands for (sample_size, time_step, feature)
train_x_array (7661, 1380, 9)
train_y_array (7661, 120, 9)
test_x_array (535, 1380, 9)



In [109]:
X = train_x_array[:, :, [0, 4]]
y = train_y_array[:, :, [0, 4]]
X.shape, y.shape

((7661, 1380, 2), (7661, 120, 2))

In [110]:
y_argmax = []
for i in range(len(y)):
    y_argmax.append(y[i].argmax())
y_argmax = np.array(y_argmax)
y = y_argmax

In [111]:
from sklearn.preprocessing import MinMaxScaler
def get_avg_5(array):
    new_array = []
    scaler = MinMaxScaler()
    scaler.fit(array[:, [1]])
    array[:, [1]] = scaler.transform(array[:, [1]])
    for i in range(0, len(array), 5):
        new_array.append(array[i:i+5, 0].mean())
        new_array.append(np.median(array[i:i+5, 1]))
        
    return np.array(new_array)

X = np.array(list(map(get_avg_5, X)))
X.shape

(7661, 552)

# 각종 회귀

```python
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
```

In [118]:
from sklearn.decomposition import PCA
pca = PCA(n_components=100)
pca.fit(X)

PCA(n_components=100)

In [119]:
X = pca.transform(X)
X.shape

(7661, 100)

## Linear regression

선형회귀는 딱히 그리드서치할 게 없으므로 train_test_split 사용

In [120]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
X_train.shape, y_train.shape

((6128, 100), (6128,))

In [121]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score as r2
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
print('mse:', mse(y_test, y_pred))
print('r2:', r2(y_test, y_pred))

mse: 5592.101198141021
r2: -0.011542756925796427


## Ridge Regression

In [122]:
from sklearn.linear_model import RidgeCV
ri = RidgeCV(cv=5, alphas=np.logspace(-6, 6, 13))
ri.fit(X, y)

RidgeCV(alphas=array([1.e-06, 1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01,
       1.e+02, 1.e+03, 1.e+04, 1.e+05, 1.e+06]),
        cv=5)

In [123]:
y_pred = ri.predict(X_test)
print('mse:', mse(y_test, y_pred))
print('r2:', r2(y_test, y_pred))

mse: 5457.026088202929
r2: 0.012890679497753021


## Lasso Regression

In [124]:
from sklearn.linear_model import LassoCV
la = LassoCV(cv=5, random_state=0, alphas=np.logspace(-6, 6, 13))
la.fit(X, y)

LassoCV(alphas=array([1.e-06, 1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01,
       1.e+02, 1.e+03, 1.e+04, 1.e+05, 1.e+06]),
        cv=5, random_state=0)

In [125]:
y_pred = la.predict(X_test)
print('mse:', mse(y_test, y_pred))
print('r2:', r2(y_test, y_pred))

mse: 5471.012212144087
r2: 0.010360760623100096


## random forest

In [126]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()
rf.fit(X_train, y_train)

RandomForestRegressor()

In [127]:
y_pred = rf.predict(X_test)
print('mse:', mse(y_test, y_pred))
print('r2:', r2(y_test, y_pred))

mse: 5597.132418134377
r2: -0.012452843128183178


```python
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()
params = {
    'n_estimators': [100, 150, 200],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 4, 6, 10]
}
model_rf = GridSearchCV(rf, params)
model_rf.fit(X, y)
y_pred = model_rf.predict(X_test)
print('mse:', mse(y_test, y_pred))
print('r2:', r2(y_test, y_pred))
```

## Adaboost

In [128]:
from sklearn.ensemble import AdaBoostRegressor
ad = AdaBoostRegressor()
ad.fit(X_train, y_train)

AdaBoostRegressor()

In [129]:
y_pred = ad.predict(X_test)
print('mse:', mse(y_test, y_pred))
print('r2:', r2(y_test, y_pred))

mse: 5542.447369166203
r2: -0.002560986161382006


```python
from sklearn.ensemble import AdaBoostRegressor
ad = AdaBoostRegressor()
params = {
    'n_estimators': [50, 100], 
    'learning_rate': [0.01, 0.05, 0.1, 0.3, 1.0], 
    'loss': ['linear', 'square', 'exponential']
}
model_ad = GridSearchCV(ad, params)
model_ad.fit(X, y)
y_pred = model_ad.predict(X_test)
print('mse:', mse(y_test, y_pred))
print('r2:', r2(y_test, y_pred))
```

## LightGBM

In [130]:
from lightgbm import LGBMRegressor
lgbm = LGBMRegressor()
lgbm.fit(X_train, y_train)

LGBMRegressor()

In [131]:
y_pred = lgbm.predict(X_test)
print('mse:', mse(y_test, y_pred))
print('r2:', r2(y_test, y_pred))

mse: 5726.578335932301
r2: -0.03586802749674067


```python
from lightgbm import LGBMRegressor
lgbm = LGBMRegressor()
params = {
    'learning_rate': [0.01, 0.05, 0.1, 0.3, 1.0], 
    'n_estimators': [100, 150, 200], 
    'max_depth': [3, 5, 7]
}
model_lgbm = GridSearchCV(lgbm, params)
model_lgbm.fit(X, y)
y_pred = model_lgbm.predict(X_test)
print('mse:', mse(y_test, y_pred))
print('r2:', r2(y_test, y_pred))
```

## xgboost

In [132]:
from xgboost import XGBRegressor
xgb = XGBRegressor()
xgb.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [133]:
y_pred = xgb.predict(X_test)
print('mse:', mse(y_test, y_pred))
print('r2:', r2(y_test, y_pred))

mse: 6289.293402587303
r2: -0.1376563052333999


In [134]:
y_pred

array([ 81.96342 , 130.18817 , 117.39328 , ..., 119.636444,  86.24676 ,
       112.33529 ], dtype=float32)

```python
from xgboost import XGBRegressor
xgb = XGBRegressor()
params = {
    'booster': ['gbtree', 'gblinear', 'dart'], 
    'max_depth': [3, 5, 7], 
    'n_estimators': [100, 150, 200], 
    'learning_rate': [0.01, 0.05, 0.1, 0.5, 1]}
model_xgb = GridSearchCV(xgb, params)
model_xgb.fit(X, y)
y_pred = model_xgb.predict(X_test)
print('mse:', mse(y_test, y_pred))
print('r2:', r2(y_test, y_pred))
```