# Scratch

In [1]:
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from scipy import signal

from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.linear_model import Lars, LassoLars, OrthogonalMatchingPursuit
from sklearn.linear_model import BayesianRidge, ARDRegression, PassiveAggressiveRegressor
from sklearn.linear_model import RANSACRegressor, TheilSenRegressor, HuberRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, BaggingRegressor
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

import time
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

import argparse
# import wandb
# wandb.init(project="DACON_235927", name="scratch")

parser = argparse.ArgumentParser(description="scratch")
parser.add_argument('--lowpass', default=0.08, type=float)
parser.add_argument('--ewm_alpha', default=0.07, type=float)
parser.add_argument('--best_n', default=5, type=int)
parser.add_argument('--scaler', default="standard", type=str) # standard or minmax
parser.add_argument('--cv', default=10, type=int)
parser.add_argument('--seed', default=1011, type=int)
args = parser.parse_args('')

# wandb.config.update(args)

lowpass = args.lowpass
ewm_alpha = args.ewm_alpha
best_n = args.best_n
scaler = args.scaler
cv = args.cv
seed = args.seed

if scaler == "standard":
    scaler = StandardScaler()
elif scaler == "minmax":
    scaler = MinMaxScaler()

def set_seeds(seed=seed):
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)

set_seeds()

train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

train.head()

Unnamed: 0,ID,X_01,X_02,X_03,X_04,X_05,X_06,X_07,X_08,X_09,...,Y_05,Y_06,Y_07,Y_08,Y_09,Y_10,Y_11,Y_12,Y_13,Y_14
0,TRAIN_00001,70.544,103.32,67.47,1,101.892,74.983,29.45,62.38,245.71,...,29.632,16.083,4.276,-25.381,-25.529,-22.769,23.792,-25.47,-25.409,-25.304
1,TRAIN_00002,69.524,103.321,65.17,1,101.944,72.943,28.73,61.23,233.61,...,33.179,16.736,3.229,-26.619,-26.523,-22.574,24.691,-26.253,-26.497,-26.438
2,TRAIN_00003,72.583,103.32,64.07,1,103.153,72.943,28.81,105.77,272.2,...,31.801,17.08,2.839,-26.238,-26.216,-22.169,24.649,-26.285,-26.215,-26.37
3,TRAIN_00004,71.563,103.32,67.57,1,101.971,77.022,28.92,115.21,255.36,...,34.503,17.143,3.144,-25.426,-25.079,-21.765,24.913,-25.254,-25.021,-25.345
4,TRAIN_00005,69.524,103.32,63.57,1,101.981,70.904,29.68,103.38,241.46,...,32.602,17.569,3.138,-25.376,-25.242,-21.072,25.299,-25.072,-25.195,-24.974


In [2]:
x_feature_info = pd.read_csv("data/meta/x_feature_info.csv")
x_feature_info.head(len(x_feature_info))

Unnamed: 0,Feature,설명
0,X_01,PCB 체결 시 단계별 누름량(Step 1)
1,X_02,PCB 체결 시 단계별 누름량(Step 2)
2,X_03,방열 재료 1 무게
3,X_04,1차 검사 통과 여부
4,X_05,PCB 체결 시 단계별 누름량(Step 3)
5,X_06,PCB 체결 시 단계별 누름량(Step 4)
6,X_07,방열 재료 1 면적
7,X_08,방열 재료 2 면적
8,X_09,방열 재료 3 면적
9,X_10,방열 재료 2 무게


In [3]:
y_feature_info = pd.read_csv("data/meta/y_feature_info.csv")
y_feature_info.head(len(y_feature_info))

Unnamed: 0,Feature,설명
0,Y_01,안테나 Gain 평균 (각도1)
1,Y_02,안테나 1 Gain 편차
2,Y_03,안테나 2 Gain 편차
3,Y_04,평균 신호대 잡음비
4,Y_05,안테나 Gain 평균 (각도2)
5,Y_06,신호대 잡음비 (각도1)
6,Y_07,안테나 Gain 평균 (각도3)
7,Y_08,신호대 잡음비 (각도2)
8,Y_09,신호대 잡음비 (각도3)
9,Y_10,신호대 잡음비 (각도4)


In [4]:
y_feature_spec_info = pd.read_csv("data/meta/y_feature_spec_info.csv")
y_feature_spec_info.head(len(y_feature_spec_info))

Unnamed: 0,Feature,최소,최대
0,Y_01,0.2,2.0
1,Y_02,0.2,2.1
2,Y_03,0.2,2.1
3,Y_04,7.0,19.0
4,Y_05,22.0,36.5
5,Y_06,-19.2,19.0
6,Y_07,2.4,4.0
7,Y_08,-29.2,-24.0
8,Y_09,-29.2,-24.0
9,Y_10,-30.6,-20.0


In [5]:
train = train.drop(["ID"], axis=1)
test = test.drop(["ID"], axis=1)

X_train_df = train.filter(regex='X')
y_train_df = train.filter(regex='Y')
X_test_df = test.filter(regex='X')

X_train_df.shape, y_train_df.shape, X_test_df.shape

((39607, 56), (39607, 14), (39608, 56))

## Preprocessing

In [6]:
X_df = pd.concat([X_train_df, X_test_df], axis=0).reset_index(drop=True)
X_df.shape

(79215, 56)

In [7]:
drop_columns = [] # 검사 통과 여부
for col in X_df.columns:
    if X_df[col].nunique() == 1:
        drop_columns.append(col)
        
drop_columns

['X_04', 'X_23', 'X_47', 'X_48']

In [8]:
drop_columns = drop_columns + ["X_02", "X_10", "X_11"]
drop_columns

['X_04', 'X_23', 'X_47', 'X_48', 'X_02', 'X_10', 'X_11']

In [9]:
def feature_smoothing(df, lowpass):
    temp = df.copy()
    if lowpass < 1:
        b, a = signal.butter(1, lowpass, btype='lowpass')
        for col in temp.columns:
            temp[col] = signal.filtfilt(b, a, temp[col])
    return temp

In [10]:
X_train_df = X_train_df.drop(drop_columns, axis=1)
X_test_df = X_test_df.drop(drop_columns, axis=1)

X_train_df_smoothing = feature_smoothing(X_train_df, lowpass)
X_test_df_smoothing = feature_smoothing(X_test_df, lowpass)

X_train_df_ewm = X_train_df.ewm(alpha=ewm_alpha).mean()
X_test_df_ewm = X_test_df.ewm(alpha=ewm_alpha).mean()

y_train_df_smoothing = y_train_df

X_train_df_smoothing = pd.concat([X_train_df, X_train_df_smoothing, X_train_df_ewm], axis=1)
X_test_df_smoothing = pd.concat([X_test_df, X_test_df_smoothing, X_test_df_ewm], axis=1)

X_train_df_smoothing.shape, X_train_df_smoothing.shape, y_train_df_smoothing.shape

((39607, 147), (39607, 147), (39607, 14))

## Modelling

In [11]:
lr = LinearRegression(n_jobs=-1)
ridge = Ridge(random_state=seed)
lasso = Lasso(random_state=seed)
en = ElasticNet(random_state=seed)
lar = Lars(random_state=seed)
llar = LassoLars(random_state=seed)
omp = OrthogonalMatchingPursuit()
br = MultiOutputRegressor(BayesianRidge())
ard = MultiOutputRegressor(ARDRegression())
par = MultiOutputRegressor(PassiveAggressiveRegressor(random_state=seed))
ransac = RANSACRegressor(random_state=seed)
# tr = MultiOutputRegressor(TheilSenRegressor(n_jobs=-1, random_state=seed))
huber = MultiOutputRegressor(HuberRegressor())
# kr = KernelRidge()
# svm = MultiOutputRegressor(SVR())
knn = KNeighborsRegressor(n_jobs=-1)
dt = DecisionTreeRegressor(random_state=seed)
et = ExtraTreeRegressor(random_state=seed)
bagging = BaggingRegressor(n_jobs=-1, random_state=seed)
ets = ExtraTreesRegressor(n_jobs=-1, random_state=seed)
rf = RandomForestRegressor(n_jobs=-1, random_state=seed)
# ada = MultiOutputRegressor(AdaBoostRegressor(random_state=seed))
# gbr = MultiOutputRegressor(GradientBoostingRegressor(random_state=seed))
hgbr = MultiOutputRegressor(HistGradientBoostingRegressor(random_state=seed))
xgboost = XGBRegressor(tree_method='gpu_hist', gpu_id=0, n_jobs=-1, random_state=seed)
lightgbm = MultiOutputRegressor(LGBMRegressor(n_jobs=-1, random_state=seed))
catboost = MultiOutputRegressor(CatBoostRegressor(task_type="GPU", devices='0', verbose=False, random_state=seed))
# mlp = MLPRegressor(random_state=seed)

In [12]:
base_ml = [ets, rf, hgbr, lightgbm, catboost]

## Training

In [13]:
def get_stacking_ml_datasets(model, X_train_n, y_train_n, X_test_n, n_folds, fitting=True):
    
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=seed)
    
    train_fold_pred = np.zeros((X_train_n.shape[0], y_train_n.shape[1]))
    test_pred = np.zeros((X_test_n.shape[0], y_train_n.shape[1], n_folds))
    
    for folder_counter, (train_index, valid_index) in enumerate(kf.split(X_train_n, y_train_n)):
        X_tr = X_train_n[train_index]
        y_tr = y_train_n[train_index]
        X_te = X_train_n[valid_index]
        
        if fitting == True:
            model.fit(X_tr, y_tr)
        train_fold_pred[valid_index] = model.predict(X_te)
        test_pred[:, :, folder_counter] = model.predict(X_test_n)
        
    test_pred_mean = np.mean(test_pred, axis=2)
    
    return train_fold_pred, test_pred_mean

In [14]:
def lg_nrmse(gt, preds):
    all_nrmse = []
    for idx in range(gt.shape[1]):
        rmse = mean_squared_error(gt[:,idx], preds[:,idx], squared=False)
        nrmse = rmse/np.mean(np.abs(gt[:,idx]))
        all_nrmse.append(nrmse)
    score = 1.2 * np.sum(all_nrmse[:8]) + 1.0 * np.sum(all_nrmse[8:14])
    return score

In [15]:
X_train = X_train_df_smoothing
y_train = y_train_df_smoothing
X_test = X_test_df_smoothing

X_train = pd.get_dummies(X_train, drop_first=True)
X_test = pd.get_dummies(X_test, drop_first=True)

X_train=scaler.fit_transform(X_train)
X_test=scaler.transform(X_test)

X_train.shape, X_test.shape

((39607, 147), (39608, 147))

In [16]:
meta_ml_X_train = []
meta_ml_X_test = []
for estimator in base_ml:
    print(estimator)
    # start = time.time()
    temp_X_train, temp_X_test = get_stacking_ml_datasets(estimator, X_train, y_train.values, X_test, cv)
    meta_ml_X_train.append(temp_X_train)
    meta_ml_X_test.append(temp_X_test)
    # end = time.time()
    # print(f"{end - start:.2f} sec")
    
meta_ml_X_train = np.mean(meta_ml_X_train, axis=0)
meta_ml_X_test = np.mean(meta_ml_X_test, axis=0)

meta_ml_X_train.shape, meta_ml_X_test.shape

ExtraTreesRegressor(n_jobs=-1, random_state=1011)
RandomForestRegressor(n_jobs=-1, random_state=1011)
MultiOutputRegressor(estimator=HistGradientBoostingRegressor(random_state=1011))
MultiOutputRegressor(estimator=LGBMRegressor(random_state=1011))
MultiOutputRegressor(estimator=<catboost.core.CatBoostRegressor object at 0x000002B07C851760>)


((39607, 14), (39608, 14))

In [17]:
meta_clf = LinearRegression()
meta_clf.fit(meta_ml_X_train, y_train)
prediction = meta_clf.predict(meta_ml_X_test)

result = prediction.round(3)

for col in range(len(y_feature_spec_info)):
    result[:, col] = np.where(result[:, col] > y_feature_spec_info.iloc[col]["최대"],
                              y_feature_spec_info.iloc[col]["최대"], result[:, col])
    result[:, col] = np.where(result[:, col] < y_feature_spec_info.iloc[col]["최소"],
                              y_feature_spec_info.iloc[col]["최소"], result[:, col])
    
result.shape

(39608, 14)

## Inference

In [18]:
submission = pd.read_csv("data/sample_submission.csv")

for idx, col in enumerate(submission.columns):
    if col == 'ID':
        continue
    submission[col] = result[:, idx-1]
    
submission.to_csv("submission.csv", index=False)