In [16]:
import numpy as np
# ============================================================
# 이상치/무응답 처리
# ============================================================
def _zero_to_mean(series):
    """Replace zeros in ``series`` with the mean of its non-zero values."""
    # ``mask`` builds a new series instead of writing NaN into the existing
    # (possibly integer) column, so no silent setitem upcasting is involved.
    observed = series.mask(series == 0)
    return observed.fillna(observed.mean())


def clean_data(df):
    """Return a cleaned copy of ``df``; the input frame is not modified.

    Every step is applied only to columns that are present in ``df``:

    * ``education``, ``engnat``, ``hand``, ``married``, ``urban`` and
      ``tp01`` .. ``tp10``: a value of 0 is treated as missing and replaced
      by the mean of the remaining values of that column.
    * ``familysize``: values equal to 0 or greater than 15 become NaN and
      are left as NaN (no imputation is done here).
    * ``QaE`` .. ``QtE``: values are clipped to the range [100, 60000].

    The means are computed from ``df`` itself, so calling this separately
    on two frames imputes each one with its own column means.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean.

    Returns
    -------
    pandas.DataFrame
        A new frame with the cleaning rules above applied.
    """
    df = df.copy()

    # No-response code (0) -> NaN -> column mean.
    for col in ['education', 'engnat', 'hand', 'married', 'urban']:
        if col in df.columns:
            df[col] = _zero_to_mean(df[col])

    # familysize: 0 and values above 15 are blanked out, not imputed.
    if 'familysize' in df.columns:
        size = df['familysize']
        df['familysize'] = size.mask((size == 0) | (size > 15))

    # tp01..tp10: 0 -> NaN -> column mean.
    for col in [f"tp{i:02d}" for i in range(1, 11)]:
        if col in df.columns:
            df[col] = _zero_to_mean(df[col])

    # Q?E columns: clip into [100, 60000].
    for col in [f"Q{c}E" for c in "abcdefghijklmnopqrst"]:
        if col in df.columns:
            df[col] = df[col].clip(lower=100, upper=60000)

    return df

In [17]:
import numpy as np
from itertools import combinations
from sklearn.preprocessing import LabelEncoder

# ============================================================
# 피처 엔지니어링
# ============================================================
def build_features(train_df, test_df):
    """Build the engineered feature frames for train and test.

    Both inputs are copied first, so the caller's frames are left alone.
    For each frame the function:

    * adds the signed item sums ``T``, ``V`` and ``M`` from the raw Q?A answers,
    * mirrors ten Q?A items (``6 - value``) and stores the row mean of all
      twenty Q?A columns as ``Mach_score``,
    * stores the tenth root of the row sum of all Q?E columns as ``delay``,
    * drops the original Q?A / Q?E columns,
    * adds five trait scores as differences of paired ``tp`` columns,
    * adds ``teenager_ox`` (1 where ``age_group`` equals ``'10s'``, else 0),
    * label-encodes ``age_group``, ``gender``, ``race`` and ``religion``
      (encoder fitted on train, then applied to test),
    * adds gender x trait interaction columns,
    * drops ``index``, ``hand`` and all ``wr_*`` columns except
      ``wr_01``, ``wr_03``, ``wr_06``, ``wr_09`` and ``wr_11``.

    Returns
    -------
    tuple
        ``(train, test)`` with the features above.
    """
    item_letters = "abcdefghijklmnopqrst"
    answer_cols = [f"Q{letter}A" for letter in item_letters]
    elapsed_cols = [f"Q{letter}E" for letter in item_letters]

    # Items mirrored as 6 - value: the reverse-scored ones, then the "secret" ones.
    mirrored_cols = ["QeA", "QfA", "QkA", "QqA", "QrA",
                     "QaA", "QdA", "QgA", "QiA", "QnA"]

    # (new column, minuend tp column, subtrahend tp column)
    trait_pairs = (
        ("extraversion", "tp01", "tp06"),
        ("agreeableness", "tp07", "tp02"),
        ("conscientiousness", "tp03", "tp08"),
        ("neuroticism", "tp04", "tp09"),
        ("openness", "tp05", "tp10"),
    )

    # (new column, trait column multiplied by the encoded gender)
    gender_interactions = (
        ("Es_gender", "neuroticism"),
        ("Con_gender", "conscientiousness"),
        ("Op_gender", "openness"),
    )

    kept_wr = ('wr_01', 'wr_03', 'wr_06', 'wr_09', 'wr_11')
    all_wr = [f"wr_{num:02d}" for num in range(1, 14)]
    discarded_cols = ['index', 'hand'] + [name for name in all_wr if name not in kept_wr]

    def _before_encoding(frame):
        # Signed sums are taken from the raw answers, i.e. before any mirroring.
        frame['T'] = frame['QcA'] - frame['QfA'] + frame['QoA'] - frame['QrA'] + frame['QsA']
        frame['V'] = (frame['QbA'] - frame['QeA'] + frame['QhA']
                      + frame['QjA'] + frame['QmA'] - frame['QqA'])
        frame['M'] = - frame['QkA']

        for item in mirrored_cols:
            frame[item] = 6 - frame[item]
        frame['Mach_score'] = frame[answer_cols].mean(axis=1)

        # Collapse all response-time columns into one indicator.
        frame['delay'] = frame[elapsed_cols].sum(axis=1) ** (1/10)

        # The raw questionnaire columns are no longer needed.
        frame = frame.drop(columns=answer_cols + elapsed_cols)

        for trait, plus_col, minus_col in trait_pairs:
            frame[trait] = frame[plus_col] - frame[minus_col]

        # Must run while age_group still holds its original labels.
        frame['teenager_ox'] = np.asarray(frame['age_group'] == '10s') * 1
        return frame

    def _after_encoding(frame):
        # Interaction terms use the label-encoded gender column.
        for product, trait in gender_interactions:
            frame[product] = frame[trait] * frame['gender']
        return frame.drop(columns=discarded_cols)

    train = _before_encoding(train_df.copy())
    test = _before_encoding(test_df.copy())

    for categorical in ('age_group', 'gender', 'race', 'religion'):
        label_encoder = LabelEncoder()
        train[categorical] = label_encoder.fit_transform(train[categorical])
        test[categorical] = label_encoder.transform(test[categorical])

    return _after_encoding(train), _after_encoding(test)

In [18]:
# 적용 예시
import pandas as pd
import os
DATA_PATH = "../../data/raw/"

# Step 1: read the raw train / test CSV files found under DATA_PATH.
train_csv = os.path.join(DATA_PATH, "train.csv")
test_csv = os.path.join(DATA_PATH, "test_x.csv")
train, test = pd.read_csv(train_csv), pd.read_csv(test_csv)

# Step 2: clean each frame on its own, then engineer features for both together.
train, test = build_features(clean_data(train), clean_data(test))

In [19]:
# Print the column labels of the engineered training frame.
print(train.columns)

Index(['age_group', 'education', 'engnat', 'familysize', 'gender', 'married',
       'race', 'religion', 'tp01', 'tp02', 'tp03', 'tp04', 'tp05', 'tp06',
       'tp07', 'tp08', 'tp09', 'tp10', 'urban', 'voted', 'wf_01', 'wf_02',
       'wf_03', 'wr_01', 'wr_03', 'wr_06', 'wr_09', 'wr_11', 'T', 'V', 'M',
       'Mach_score', 'delay', 'extraversion', 'agreeableness',
       'conscientiousness', 'neuroticism', 'openness', 'teenager_ox',
       'Es_gender', 'Con_gender', 'Op_gender'],
      dtype='object')
