In [None]:
import pandas as pd

# Read the three CSV files into DataFrames (same order as the names on the left).
train_x_df, train_y_df, test_x_df = map(
    pd.read_csv,
    ('data/train_x_df.csv', 'data/train_y_df.csv', 'data/test_x_df.csv'),
)
train_x_df.head()

In [None]:
import numpy as np
def df2d_to_array3d(df_2d):
    """Reshape a long-format frame into a (sample, time, feature) array.

    Parameters
    ----------
    df_2d : pandas.DataFrame
        One row per (sample_id, time) pair. Must contain ``sample_id`` and
        ``time`` columns; every column from position 3 onward is treated as
        a feature. Each sample is expected to have the same set of time
        steps, otherwise the reshape raises ``ValueError``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_samples, n_time_steps, n_features)`` with samples
        in ascending ``sample_id`` order and time steps in ascending ``time``
        order.
    """
    # The reshape below is only correct when rows are laid out sample-major,
    # time-minor. Sort explicitly so a shuffled or filtered frame does not
    # silently mix samples; an already-ordered frame is left unchanged.
    ordered = df_2d.sort_values(['sample_id', 'time'])

    feature_size = len(ordered.columns[3:])  # author's note: 9
    time_size = len(ordered.time.unique())  # author's note: 1380 for x, 120 for y
    sample_size = len(ordered.sample_id.unique())  # author's note: 7661 for train, 535 for test
    array_3d = ordered.iloc[:, 3:].values.reshape([sample_size, time_size, feature_size])
    return array_3d

# Turn each long-format frame into a 3-D array, then report the shapes.
train_x_array, train_y_array, test_x_array = map(
    df2d_to_array3d,
    (train_x_df, train_y_df, test_x_df),
)

print(f'''
These shape stands for (sample_size, time_step, feature)
train_x_array {train_x_array.shape}
train_y_array {train_y_array.shape}
test_x_array {test_x_array.shape}
''')

In [None]:
# Keep only the feature at index 0 of the last axis, for inputs and targets alike.
X, y = train_x_array[..., 0], train_y_array[..., 0]
X.shape, y.shape

In [None]:
# Regression target: for every sample, the time-step index at which feature 0
# of the target window is largest.
# It is computed from train_y_array rather than from `y` so the cell is
# idempotent: re-running the old `y = argmax(y)` form on the already-reduced
# 1-D `y` silently turned every target into 0.
y_argmax = train_y_array[:, :, 0].argmax(axis=1)
y = y_argmax

In [None]:
def get_avg_5(array):
    """Return the means of consecutive, non-overlapping windows of 5 elements.

    The last window is shorter when ``len(array)`` is not a multiple of 5.
    The result is a 1-D numpy array with one entry per window.
    """
    window_starts = range(0, len(array), 5)
    return np.array([array[start:start + 5].mean() for start in window_starts])

# Down-sample every input series by averaging blocks of 5 time steps.
# The source is train_x_array (feature 0) rather than `X` itself, so
# re-running this cell does not average the already-averaged series again.
X = np.array([get_avg_5(series) for series in train_x_array[:, :, 0]])
X.shape

# 각종 회귀

In [None]:
from sklearn.model_selection import GridSearchCV

## linear regression

선형회귀는 딱히 그리드서치할 게 없으므로 train_test_split 사용

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)
X_train.shape, y_train.shape

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as mse, r2_score as r2

# Fit on the training split, then report MSE and R^2 of the predictions on X_test.
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)
for label, metric in (('mse:', mse), ('r2:', r2)):
    print(label, metric(y_test, y_pred))

## Ridge Regression

In [None]:
from sklearn.linear_model import RidgeCV

# Cross-validated choice of alpha over 13 log-spaced values from 1e-6 to 1e6.
# Fit on the training split only: X_test is a subset of X, so fitting on the
# full X, y would leak the evaluation rows into training and inflate the
# scores reported for this model.
ri = RidgeCV(cv=5, alphas=np.logspace(-6, 6, 13))
ri.fit(X_train, y_train)

In [None]:
# Report MSE and R^2 of the ridge model's predictions on X_test.
y_pred = ri.predict(X_test)
for label, metric in (('mse:', mse), ('r2:', r2)):
    print(label, metric(y_test, y_pred))

## Lasso Regression

In [None]:
from sklearn.linear_model import LassoCV

# Cross-validated choice of alpha over 13 log-spaced values from 1e-6 to 1e6.
# Fit on the training split only: X_test is a subset of X, so fitting on the
# full X, y would leak the evaluation rows into training and inflate the
# scores reported for this model.
la = LassoCV(cv=5, random_state=0, alphas=np.logspace(-6, 6, 13))
la.fit(X_train, y_train)

In [None]:
# Report MSE and R^2 of the lasso model's predictions on X_test.
y_pred = la.predict(X_test)
for label, metric in (('mse:', mse), ('r2:', r2)):
    print(label, metric(y_test, y_pred))

## random forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Seed the forest so the grid search gives the same result on every run
# (0 matches the seed used for the train/test split).
rf = RandomForestRegressor(random_state=0)
params = {
    # Candidates left out of the search for now:
    #     'n_estimators': [100, 150, 200],
    #     'max_depth': [3, 5, 7],
    'min_samples_split': [2, 4, 6, 10]
}
model_rf = GridSearchCV(rf, params)
# Search on the training split only: X_test is a subset of X, so fitting on
# the full X, y would leak the evaluation rows into training.
model_rf.fit(X_train, y_train)

In [None]:
# Report MSE and R^2 of the tuned random forest's predictions on X_test.
y_pred = model_rf.predict(X_test)
for label, metric in (('mse:', mse), ('r2:', r2)):
    print(label, metric(y_test, y_pred))

## Adaboost

In [None]:
from sklearn.ensemble import AdaBoostRegressor

# Seed the booster so the grid search gives the same result on every run
# (0 matches the seed used for the train/test split).
ad = AdaBoostRegressor(random_state=0)
params = {
    # Candidate left out of the search for now:
    #     'n_estimators': [50, 100],
    'learning_rate': [0.01, 0.05, 0.1, 0.3, 1.0],
    'loss': ['linear', 'square', 'exponential']
}
model_ad = GridSearchCV(ad, params)
# Search on the training split only: X_test is a subset of X, so fitting on
# the full X, y would leak the evaluation rows into training.
model_ad.fit(X_train, y_train)

In [None]:
# Report MSE and R^2 of the tuned AdaBoost model's predictions on X_test.
y_pred = model_ad.predict(X_test)
for label, metric in (('mse:', mse), ('r2:', r2)):
    print(label, metric(y_test, y_pred))

## LightGBM

In [None]:
from lightgbm import LGBMRegressor

lgbm = LGBMRegressor()
params = {
    'learning_rate': [0.01, 0.05, 0.1, 0.3, 1.0],
    # Candidates left out of the search for now:
    #     'n_estimators': [100, 150, 200],
    #     'max_depth': [3, 5, 7]
}
model_lgbm = GridSearchCV(lgbm, params)
# Search on the training split only: X_test is a subset of X, so fitting on
# the full X, y would leak the evaluation rows into training.
model_lgbm.fit(X_train, y_train)

In [None]:
# Report MSE and R^2 of the tuned LightGBM model's predictions on X_test.
y_pred = model_lgbm.predict(X_test)
for label, metric in (('mse:', mse), ('r2:', r2)):
    print(label, metric(y_test, y_pred))

## xgboost

In [None]:
from xgboost import XGBRegressor

xgb = XGBRegressor()
# Full Cartesian grid: 3 boosters x 3 depths x 3 estimator counts x 5 learning rates.
# NOTE(review): max_depth is presumably ignored by the 'gblinear' booster, so
# those combinations may be redundant fits - confirm before trimming the grid.
params = {
    'booster': ['gbtree', 'gblinear', 'dart'],
    'max_depth': [3, 5, 7],
    'n_estimators': [100, 150, 200],
    'learning_rate': [0.01, 0.05, 0.1, 0.5, 1]}
model_xgb = GridSearchCV(xgb, params)
# Search on the training split only: X_test is a subset of X, so fitting on
# the full X, y would leak the evaluation rows into training.
model_xgb.fit(X_train, y_train)

In [None]:
# Report MSE and R^2 of the tuned XGBoost model's predictions on X_test.
y_pred = model_xgb.predict(X_test)
for label, metric in (('mse:', mse), ('r2:', r2)):
    print(label, metric(y_test, y_pred))