# Import

In [None]:
import pandas as pd
import random
import os
import time
import numpy as np
from sklearn.metrics import mean_squared_error
from tqdm import tqdm
import math
from filterpy.kalman import KalmanFilter
from filterpy.common import Q_discrete_white_noise
import itertools
import math

In [None]:
def seed_everything(seed):
    """Seed the random sources this notebook uses so runs are repeatable.

    Stores ``seed`` (as a string) in the ``PYTHONHASHSEED`` environment
    variable and seeds both Python's ``random`` module and NumPy's global
    generator with it.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(1422)  # fix the seed

In [None]:
# Load the raw training table from CSV.
train_df = pd.read_csv('./data/train.csv')

In [None]:
# Split the table by column name: `filter(regex=...)` keeps every column
# whose name contains the pattern.
train_x = train_df.filter(regex='X') # Input: X features
train_y = train_df.filter(regex='Y') # Output: Y features

# Feature Engineering

## Train data

### Kalman Filter

In [None]:
# Remove X_10 and X_11 before any feature is derived.
train_x.drop(["X_10", "X_11"], axis=1, inplace=True)

# Snapshot of the base columns; the log-transform cell iterates over it.
basic_col = train_x.columns

# Run every base column through its own Kalman filter (state = position and
# velocity, only the position is measured) and store the filtered position
# estimates as a new column `kf_X_<i>`, where <i> is the column's index in
# `basic_col`. Iterating the snapshot keeps the loop independent of the
# columns being appended to `train_x`.
for i, col in enumerate(tqdm(basic_col)):
    my_filter = KalmanFilter(dim_x=2, dim_z=1)
    my_filter.x = np.array([[2.], [0.]])          # initial state (position, velocity)
    my_filter.F = np.array([[1., 1.], [0., 1.]])  # state transition matrix
    my_filter.H = np.array([[1., 0.]])            # measurement function
    my_filter.P *= 1000.                          # initial state covariance
    my_filter.R = 5                               # measurement noise
    # NOTE(review): F advances the position by one full velocity step per
    # sample while Q is built with dt=.1 -- confirm the mismatch is intended.
    my_filter.Q = Q_discrete_white_noise(dim=2, dt=.1, var=.1)  # process noise

    filtered = []  # position estimate after each measurement update
    for measurement in train_x[col].values:
        my_filter.predict()
        my_filter.update(measurement)
        filtered.append(my_filter.x[0, 0])
    train_x['kf_X_' + str(i)] = filtered
train_x

### 가설 기반 Feature Engineering

In [None]:
# Column groups used for the pairwise-difference features (also reused by the
# test-data cell further down).
temp_1 = ['X_14', 'X_15', 'X_16', 'X_17', 'X_18']
temp_2 = ['X_19', 'X_20', 'X_21', 'X_22']
temp_3 = ['X_30', 'X_31', 'X_32', 'X_33']

# Natural log of every base column except the ones that contain zeros
# (log1p is deliberately not used). math.log raises ValueError for a
# non-positive value, so an unexpected zero fails loudly instead of
# producing -inf.
for col in basic_col:
    if col in ['X_38', 'X_39', "X_40", "X_45"]:
        continue
    train_x[col + "_log"] = train_x[col].map(math.log)

# Hypothesis 1: differences between antenna pad positions (temp_1) affect
#               the process prediction.
# Hypothesis 2: differences between screw insertion depths (temp_2) affect
#               the process prediction.
# Hypothesis 3: same statement for temp_3.
# NOTE(review): the original comment for hypothesis 3 repeated hypothesis 2
# word for word -- confirm what distinguishes X_30..X_33 from X_19..X_22.
# Both signed differences (a - b and b - a) are stored for every pair.
for group in (temp_1, temp_2, temp_3):
    for a, b in itertools.combinations(group, 2):
        train_x['dif_' + a + b] = train_x[a] - train_x[b]
        train_x['dif_' + b + a] = train_x[b] - train_x[a]

# Hypothesis 4: the sum of step-wise press amounts 1 and 2 during PCB
#               fastening affects the process prediction.
train_x['PCB_sum_1'] = train_x['X_01'] + train_x['X_02']

# Hypothesis 5: the sum of step-wise press amounts 3 and 4 during PCB
#               fastening affects the process prediction.
train_x['PCB_sum_2'] = train_x['X_05'] + train_x['X_06']

# Hypothesis 6: the total area of the heat-dissipation material affects the
#               process prediction.
train_x['radi_sum_area'] = train_x['X_07'] + train_x['X_08'] + train_x['X_09']

# Hypothesis 7: the weight per area of the heat-dissipation material affects
#               the process prediction.
train_x['radi_1_weight_per_area'] = train_x['X_03'] / train_x['X_07']

# Hypothesis 8: the distance between a radome dimension and the matching
#               antenna position affects the process prediction.
for dim_col, pad_col in (('X_41', 'X_14'), ('X_42', 'X_15'),
                         ('X_43', 'X_16'), ('X_44', 'X_17')):
    train_x['abs_' + dim_col + pad_col] = (train_x[dim_col] - train_x[pad_col]).abs()

# Hypothesis 9: the spread (variance or standard deviation) of antenna pad
#               positions, screw insertion depths and connector pin
#               dimensions affects the process prediction.
# ddof=0 reproduces the population statistics that np.var / np.std return by
# default; the pandas default (ddof=1) would give different values.
train_x['var_antena_loc'] = train_x[['X_14', 'X_15', 'X_16', 'X_17', "X_18"]].var(axis=1, ddof=0)
train_x['std_nthscrew_depth'] = train_x[['X_19', 'X_20', 'X_21', 'X_22']].std(axis=1, ddof=0)
train_x['std_connector'] = train_x[['X_24', 'X_25', 'X_26', 'X_27', "X_28", "X_29"]].std(axis=1, ddof=0)
train_x['std_screw_depth'] = train_x[['X_30', 'X_31', 'X_32', 'X_33']].std(axis=1, ddof=0)

In [None]:
train_x.to_csv('./data/train_x_no_mean_median.csv', index=False) # intermediate save

## Test data

### Kalman Filter

In [None]:
# Load the test inputs and remove the ID column plus X_10 and X_11.
submit_x = pd.read_csv('./data/test.csv')
submit_x.drop(["ID", "X_10", "X_11"], axis=1, inplace=True)
submit_df = pd.read_csv('./data/sample_submission.csv')

# Snapshot of the base columns; the log-transform cell iterates over it.
basic_col = submit_x.columns

# Run every base column through its own Kalman filter (state = position and
# velocity, only the position is measured) and store the filtered position
# estimates as a new column `kf_X_<i>`, where <i> is the column's index in
# `basic_col`. Iterating the snapshot keeps the loop independent of the
# columns being appended to `submit_x`.
for i, col in enumerate(tqdm(basic_col)):
    my_filter = KalmanFilter(dim_x=2, dim_z=1)
    my_filter.x = np.array([[2.], [0.]])          # initial state (position, velocity)
    my_filter.F = np.array([[1., 1.], [0., 1.]])  # state transition matrix
    my_filter.H = np.array([[1., 0.]])            # measurement function
    my_filter.P *= 1000.                          # initial state covariance
    my_filter.R = 5                               # measurement noise
    # NOTE(review): F advances the position by one full velocity step per
    # sample while Q is built with dt=.1 -- confirm the mismatch is intended.
    my_filter.Q = Q_discrete_white_noise(dim=2, dt=.1, var=.1)  # process noise

    filtered = []  # position estimate after each measurement update
    for measurement in submit_x[col].values:
        my_filter.predict()
        my_filter.update(measurement)
        filtered.append(my_filter.x[0, 0])
    submit_x['kf_X_' + str(i)] = filtered

### 가설 기반 Feature Engineering

In [None]:

# Natural log of every base column except the ones that contain zeros.
# math.log raises ValueError for a non-positive value, so an unexpected zero
# fails loudly instead of producing -inf.
for col in basic_col:
    if col in ['X_38', 'X_39', "X_40", "X_45"]:
        continue
    submit_x[col + "_log"] = submit_x[col].map(math.log)

# Signed pairwise differences (a - b and b - a) inside each column group.
# temp_1 / temp_2 / temp_3 are the groups defined in the train-data cell.
for group in (temp_1, temp_2, temp_3):
    for a, b in itertools.combinations(group, 2):
        submit_x['dif_' + a + b] = submit_x[a] - submit_x[b]
        submit_x['dif_' + b + a] = submit_x[b] - submit_x[a]

# Sums and ratio features, built with the same formulas as for the train data.
submit_x['PCB_sum_1'] = submit_x['X_01'] + submit_x['X_02']
submit_x['PCB_sum_2'] = submit_x['X_05'] + submit_x['X_06']
submit_x['radi_sum_area'] = submit_x['X_07'] + submit_x['X_08'] + submit_x['X_09']
submit_x['radi_1_weight_per_area'] = submit_x['X_03'] / submit_x['X_07']

# Absolute difference between each of X_41..X_44 and its paired X_14..X_17.
for dim_col, pad_col in (('X_41', 'X_14'), ('X_42', 'X_15'),
                         ('X_43', 'X_16'), ('X_44', 'X_17')):
    submit_x['abs_' + dim_col + pad_col] = (submit_x[dim_col] - submit_x[pad_col]).abs()

# Row-wise spread of each column group. ddof=0 reproduces the population
# statistics that np.var / np.std return by default; the pandas default
# (ddof=1) would give different values.
submit_x['var_antena_loc'] = submit_x[['X_14', 'X_15', 'X_16', 'X_17', "X_18"]].var(axis=1, ddof=0)
submit_x['std_nthscrew_depth'] = submit_x[['X_19', 'X_20', 'X_21', 'X_22']].std(axis=1, ddof=0)
submit_x['std_connector'] = submit_x[['X_24', 'X_25', 'X_26', 'X_27', "X_28", "X_29"]].std(axis=1, ddof=0)
submit_x['std_screw_depth'] = submit_x[['X_30', 'X_31', 'X_32', 'X_33']].std(axis=1, ddof=0)

In [None]:
# Intermediate save of the engineered test features (without the index).
submit_x.to_csv('./data/test_x_no_mean_median.csv', index=False)

# 평균, 중간값 생성

In [None]:
class CFG:
    """Constants for the rolling mean/median stage of the notebook."""
    # NOTE(review): seed and fold_num are not referenced by the cells visible
    # here -- presumably consumed by later modelling code; confirm.
    seed = 42
    fold_num = 10

    # Intermediate CSV files to read the engineered train/test features from.
    trainPath = './data/train_x_no_mean_median.csv'
    testPath = './data/test_x_no_mean_median.csv'
    
# Read the intermediate feature tables from the paths configured in CFG.
train_df = pd.read_csv(CFG.trainPath)
test_df = pd.read_csv(CFG.testPath)

# Independent copies of the freshly loaded frames.
# NOTE(review): these copies are not used by the cells visible here.
train_new_df = train_df.copy()
test_new_df = test_df.copy()

# Trailing-window mean and median of one column.
def make_mean_median(column_list, set_amount):
    """Return trailing-window mean and median lists for ``column_list``.

    For every position ``i >= set_amount`` the statistic is computed over the
    ``set_amount`` values that come *before* position ``i``; the value at
    ``i`` itself is not part of its own window. The first ``set_amount``
    positions have no full window and keep their original values unchanged.

    Args:
        column_list: Sequence of numeric values (indexable, with ``len``).
        set_amount: Window length.

    Returns:
        A ``(mean_arr, median_arr)`` tuple of two new lists, each as long as
        ``column_list``. If ``column_list`` has fewer than ``set_amount``
        values, both lists are plain copies of the input.
    """
    n_values = len(column_list)
    # Clamp so that inputs shorter than the window do not raise IndexError.
    head = min(set_amount, n_values)

    # No full window yet: use the original values as they are.
    mean_arr = [column_list[i] for i in range(head)]
    median_arr = list(mean_arr)

    # Convert once so each window below is an array view instead of a
    # freshly converted list slice.
    values = np.asarray(column_list)
    for i in range(set_amount, n_values):
        window = values[i - set_amount:i]
        mean_arr.append(float(np.mean(window)))
        median_arr.append(float(np.median(window)))

    print(f'mean_arr:{len(mean_arr)}')
    print(f'median_arr:{len(median_arr)}')
    return mean_arr, median_arr

# Add trailing-window mean/median columns to a frame and save it as CSV.
def make_move_avg_median(df, save_file_name, type=None, set_amount=30, n_columns=54):
    """Append rolling mean/median features to ``df`` in place and write it out.

    For each of the first ``n_columns`` columns of ``df`` two new columns are
    added, ``<column>_mean_<set_amount>`` and ``<column>_median_<set_amount>``,
    holding the lists returned by ``make_mean_median``. The widened frame is
    then saved to ``save_file_name`` without the index.

    Args:
        df: DataFrame to extend; it is modified in place.
        save_file_name: Destination path of the CSV file.
        type: Unused; kept so existing calls that pass it keep working.
        set_amount: Window length forwarded to ``make_mean_median``.
        n_columns: How many leading columns get rolling features. ``None``
            processes every column present when the function is called.
            NOTE(review): the default of 54 was a hard-coded constant --
            presumably the number of base columns that precede the engineered
            ones; confirm against the input files.

    Returns:
        None.
    """
    # Fix the column list up front so the columns added below are not revisited.
    base_columns = list(df.columns[:n_columns])
    pbar = tqdm(base_columns)
    for column in pbar:
        pbar.set_description(f"The mean and median of {column} is making")
        mean_values, median_values = make_mean_median(df[column].to_list(), set_amount)

        df[f'{column}_mean_{set_amount}'] = mean_values
        df[f'{column}_median_{set_amount}'] = median_values

    df.to_csv(save_file_name, index=False)

# Produce the final datasets the model will train on: each call adds the
# rolling mean/median columns to the frame in place and writes it to CSV.
make_move_avg_median(train_df, save_file_name='./data/train_x_engineered.csv', type='train')
make_move_avg_median(test_df, save_file_name='./data/test_x_engineered.csv', type='test')

In [None]:
# Scratch cell: inline version of the trailing-window mean/median
# computation, applied to a single column.
# The cell originally read `column_list` and `set_amount` without defining
# them at notebook scope, which raises NameError on a fresh top-to-bottom run,
# so both are defined here.
set_amount = 30
# NOTE(review): the column to inspect is an assumption (first column of
# train_df) -- replace with the column of interest.
column_list = train_df[train_df.columns[0]].to_list()

# The first `set_amount` positions keep their original values (clamped so a
# short column does not raise IndexError).
head = min(set_amount, len(column_list))
mean_arr = list(column_list[:head])
median_arr = list(column_list[:head])

# From position `set_amount` on: mean/median of the preceding `set_amount` values.
for i in range(set_amount, len(column_list)):
    window = column_list[i - set_amount:i]
    mean_arr.append(float(np.mean(window)))
    median_arr.append(float(np.median(window)))