# imports

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
%matplotlib inline
plt.rc('font', family='NanumBarunGothic')
import seaborn as sns
import os
import missingno as msno
import pickle
from glob import glob
from sklearn.model_selection import cross_val_score
from tqdm import tqdm, tqdm_notebook
import warnings
warnings.filterwarnings("ignore")
import gc
import joblib

# Preprocessing & Feature Engineering
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.feature_selection import SelectPercentile
# model
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
# from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier 

In [3]:
# Absolute path of the input directory; every CSV below is read from it.
path = os.path.abspath("../input")

# The four main tables are read explicitly as UTF-8.
panel, survey, response_train, response_test = (
    pd.read_csv(f"{path}/{table}.csv", encoding="utf-8")
    for table in ("panel", "survey", "response_train", "response_test")
)

# The sample submission is read with pandas' default encoding.
sub = pd.read_csv(f"{path}/sample_submission.csv")

# 1. Feature **Preprocessing**



> panel의 신상정보보다 SQ1~SQ3(성별, 생년월일, 지역) 설문 응답이 더 정확하다고 판단


> SQ1~SQ3의 결측값을 신상정보 feature(GENDER, BIRTH, REGION)로 메우고, 이렇게 보완한 SQ1~SQ3으로 기존 신상정보 feature를 대체



In [3]:
# Remove the one panelist (per the original note, present only in the training
# responses) whose GENDER and BIRTH are missing.
panel = panel.drop(index=panel.index[panel["userID"] == "p08142"])

# Fill missing SQ1 answers with the panel's GENDER value.
# Fix: the original loop selected rows with `SQ2.isnull()`, so an SQ1 gap on a
# row whose SQ2 was present was never filled, and a valid SQ1 answer on a row
# whose SQ2 was missing was overwritten by GENDER.
panel["SQ1"] = panel["SQ1"].fillna(panel["GENDER"])

# Fill missing SQ2 answers with the panel's BIRTH value.
panel["SQ2"] = panel["SQ2"].fillna(panel["BIRTH"])

# SQ2 values below 10 are treated as invalid answers and replaced by BIRTH.
idx = panel.index[panel["SQ2"] < 10]
panel.loc[idx, "SQ2"] = panel.loc[idx, "BIRTH"]

In [4]:
# Lookup table for the REGION column: the codes are the powers of two
# 2**1 .. 2**17, paired in order with the region names below.
region_lst = ["서울", "세종", "인천", "대전", "광주", "대구", "울산", "부산", "경기", "강원",
              "충북", "충남", "전북", "전남", "경북", "경남", "제주"]
pow_lst = list(2 ** np.arange(1, 18))
region_dict = dict(zip(pow_lst, region_lst))

# Lookup table for the SQ3 column: float codes 1.0 .. 17.0 plus 97.0, paired
# in order with the names below (97.0 is the "other / overseas" entry).
sq3_lst = ["서울", "부산", "대구", "울산", "광주", "대전", "인천", "경기", "충북", "충남",
           "경북", "경남", "전북", "전남", "강원", "제주", "세종", "기타/해외"]
idx_lst = [float(code) for code in [*range(1, 18), 97]]
sq3_dict = dict(zip(idx_lst, sq3_lst))

# Normalise SQ3 to a single dtype: go through strings first so that every
# value can be cleaned uniformly before the final conversion to float.
panel["SQ3"] = panel["SQ3"].astype(str)

def clean(x):
    """Collapse a comma-containing answer to the single code "1".

    Args:
        x: An answer as a string.

    Returns:
        "1" when `x` contains a comma, otherwise `x` unchanged.
    """
    return "1" if "," in x else x

# Collapse comma-containing answers, then convert the string codes to float so
# they line up with the float keys of `sq3_dict`.
panel["SQ3"] = panel["SQ3"].map(clean).astype("float64")

# Translate both region encodings into region names.
panel["REGION"] = panel["REGION"].map(region_dict)
panel["SQ3"] = panel["SQ3"].map(sq3_dict)

# Where SQ3 has no region name, fall back to the name derived from REGION.
panel["SQ3"] = panel["SQ3"].fillna(panel["REGION"])

# Drop the original profile columns and let the survey answers SQ1-SQ3 take
# over their names.
panel = (
    panel
    .drop(columns=["REGION", "GENDER", "BIRTH"])
    .rename(columns={"SQ1": "GENDER", "SQ2": "BIRTH", "SQ3": "REGION"})
)

In [5]:
# Label-based slice: every column from "SQ4" through "DQ7" (both inclusive).
survey_col = panel.loc[:, "SQ4":"DQ7"].columns

# Remove those columns from the panel table, then reclaim the freed memory.
panel = panel.drop(columns=survey_col)
gc.collect()

61



> Data Merge



In [6]:
# Join responses with panel and survey information. No join keys are passed,
# so pandas joins on the columns the two frames have in common.
train = pd.merge(response_train, panel).merge(survey)

# The test rows are put in ID order before the ID column is discarded.
test = (
    pd.merge(response_test, panel)
    .merge(survey)
    .sort_values(by='ID')
    .drop(columns="ID")
)

# The source tables are no longer needed; release them.
del panel, survey, response_test, response_train
gc.collect()

0



> features generation



In [7]:
# Date/time features derived from the TIME column.

# Names for pandas' `dayofweek` codes (0 = Monday ... 6 = Sunday).
WEEKDAY_NAMES = {0: "MON", 1: "TUE", 2: "WED", 3: "THU", 4: "FRI", 5: "SAT", 6: "SUN"}


def add_time_features(df):
    """Replace the TIME column with `hour` and `weekday` features.

    Args:
        df: DataFrame with a TIME column that pandas can parse as datetimes.

    Returns:
        A new DataFrame without TIME and with two columns appended:
        `hour` (hour of day) and `weekday` ("MON" ... "SUN").
    """
    # pd.to_datetime instead of astype("datetime64"): pandas 2.x rejects the
    # unit-less "datetime64" dtype in astype.
    timestamp = pd.to_datetime(df["TIME"])
    # Month and day features are intentionally not generated; they were
    # disabled (commented out) in the original version of this cell.
    return df.assign(
        hour=timestamp.dt.hour,
        weekday=timestamp.dt.dayofweek.map(WEEKDAY_NAMES),
    ).drop(columns="TIME")


train = add_time_features(train)
test = add_time_features(test)
train.head()

Unnamed: 0,userID,surveyID,STATUS,TYPE,GENDER,BIRTH,REGION,TITLE,IR,LOI,CATEGORIES,CPI,hour,weekday
0,p04802,s00004,0,B,2.0,1990.0,경기,해외 - 일반인 의견 조사 (DR 20200531-001)S,100,5,,275.0,2,MON
1,p04685,s00004,0,B,2.0,1984.0,인천,해외 - 일반인 의견 조사 (DR 20200531-001)S,100,5,,275.0,2,MON
2,p01206,s00004,1,D,2.0,1983.0,대전,해외 - 일반인 의견 조사 (DR 20200531-001)S,100,5,,275.0,2,MON
3,p00423,s00004,1,D,1.0,1994.0,서울,해외 - 일반인 의견 조사 (DR 20200531-001)S,100,5,,275.0,2,MON
4,p00328,s00004,1,D,2.0,1981.0,서울,해외 - 일반인 의견 조사 (DR 20200531-001)S,100,5,,275.0,2,MON


In [8]:
# Per-user response rate: the mean of STATUS over each user's rows in train.
# NOTE(review): for a training row this mean includes the row's own STATUS,
# which looks like target leakage - confirm this is intended.
res_rat = (
    train.groupby("userID", as_index=False)["STATUS"]
    .mean()
    .rename(columns={"STATUS": "res_rat"})
)

# Left joins keep every row; users without a rate in `res_rat` get NaN.
train = train.merge(res_rat, how="left")
test = test.merge(res_rat, how="left")

In [9]:
# Convert birth year into an age, using 2022 as the reference year.
for frame in (train, test):
    frame["age"] = 2022 - frame["BIRTH"]

In [10]:
# Title encoding: score a survey title by how often its Korean words occur in
# the titles of answered surveys.
import re

# Matches every character that is neither a space nor Hangul (jamo/syllable).
_NON_HANGUL = re.compile(r'[^ ㄱ-ㅣ가-힣]')

# word -> number of occurrences in titles of rows with STATUS == 1.
word_counts = {}


def _title_words(title):
    """Return the whitespace-separated Hangul words left in `title`."""
    return _NON_HANGUL.sub('', title).split()


def count_word(x):
    """Add the Korean words of an answered row's title to `word_counts`.

    Args:
        x: Mapping-like row with 'STATUS' and 'TITLE' entries.

    Returns:
        None. Rows whose STATUS is not 1 are ignored; otherwise the global
        `word_counts` dictionary is updated in place.
    """
    if x['STATUS'] != 1:
        return
    for word in _title_words(x['TITLE']):
        word_counts[word] = word_counts.get(word, 0) + 1


def score_word(x):
    """Encode a row's title as the summed frequency of its Korean words.

    Args:
        x: Mapping-like row with a 'TITLE' entry.

    Returns:
        Sum of `word_counts` values over the title's words; words that were
        never counted contribute 0.
    """
    return sum(word_counts.get(word, 0) for word in _title_words(x['TITLE']))
            
# Build the word frequencies from the training rows; the call is made only for
# its side effect on `word_counts`.
# NOTE(review): re-running this cell adds to `word_counts` again, and TITLE no
# longer holds text afterwards - run it once per fresh kernel.
train.apply(count_word, axis=1)

# Replace each title with its frequency-based score.
train["TITLE"] = train.apply(score_word, axis=1)
test["TITLE"] = test.apply(score_word, axis=1)

In [11]:
# Mean (target) encoding of `hour`: each hour is replaced by the mean STATUS
# observed for that hour in train. Hours not seen in train map to NaN.
hour_mean = train.groupby("hour")["STATUS"].mean()
train["hour"] = train["hour"].map(hour_mean)
test["hour"] = test["hour"].map(hour_mean)

In [12]:
# Mean (target) encoding of CPI, stored in a new CPI_mean column so the raw CPI
# value is kept as well. CPI values not seen in train map to NaN.
cpi_mean = train.groupby("CPI")["STATUS"].mean()
train["CPI_mean"] = train["CPI"].map(cpi_mean)
test["CPI_mean"] = test["CPI"].map(cpi_mean)

In [13]:
# Missing CPI_mean values in test are replaced by the average of train's CPI_mean column.
test["CPI_mean"] = test["CPI_mean"].fillna(train["CPI_mean"].mean())
#test.IR = test.IR.fillna(train.IR.mean())

In [14]:
# Separate the target from the predictors; the test frame has no target to remove.
y_train = train["STATUS"]
X_train = train.drop(columns="STATUS")
X_test = test

# Drop the old names (X_test still refers to the test frame) and collect garbage.
del train, test
gc.collect()

88

In [15]:
# Columns removed from both feature sets; per the original note, feature
# experiments showed they had no effect. Edit this list to try other subsets.
unused_features = ["userID", "surveyID", "REGION", "LOI", "BIRTH"]
X_train = X_train.drop(columns=unused_features)
X_test = X_test.drop(columns=unused_features)



>  Encoding



In [16]:
# Missing CATEGORIES values become their own explicit "unknown" class.
X_train["CATEGORIES"] = X_train["CATEGORIES"].fillna("unknown")
X_test["CATEGORIES"] = X_test["CATEGORIES"].fillna("unknown")


def label_encode_pair(train_values, test_values):
    """Label-encode a train/test column pair with one encoder fitted on train.

    Labels that occur only in the test values are appended to the encoder's
    classes (in `np.unique` order) so that `transform` does not raise
    ValueError on them; they receive the codes following the train classes.

    Args:
        train_values: Training column used to fit the encoder.
        test_values: Test column encoded with the same encoder.

    Returns:
        Tuple `(train_codes, test_codes, encoder)` where the codes are integer
        arrays in the row order of the inputs.
    """
    encoder = LabelEncoder()
    encoder.fit(train_values)
    train_codes = encoder.transform(train_values)

    for label in np.unique(test_values):
        if label not in encoder.classes_:
            # NOTE(review): appended labels leave `classes_` unsorted; this
            # presumably relies on the columns holding Python strings - confirm
            # before reusing the helper for numeric columns.
            encoder.classes_ = np.append(encoder.classes_, label)
    test_codes = encoder.transform(test_values)
    return train_codes, test_codes, encoder


# CATEGORIES and weekday are encoded the same way. The code arrays are assigned
# directly (by position): the earlier `pd.DataFrame(codes)` assignment aligned
# on index labels and was only correct for a default 0..n-1 index.
for column in ("CATEGORIES", "weekday"):
    train_codes, test_codes, encoder = label_encode_pair(X_train[column], X_test[column])
    X_train[column] = train_codes
    X_test[column] = test_codes

del train_codes, test_codes

In [17]:
# GENDER codes carry no numeric meaning, so they are turned into strings and
# one-hot encoded together with the remaining string columns.
X_train["GENDER"] = X_train["GENDER"].astype(str)
X_test["GENDER"] = X_test["GENDER"].astype(str)

# Encode train and test stacked together so both end up with the same dummy
# columns, then split them again by position.
n_train = y_train.shape[0]
features = pd.get_dummies(pd.concat([X_train, X_test]))
X_train, X_test = features.iloc[:n_train, :], features.iloc[n_train:, :]

del features



> features scaling



In [19]:
# Standardise the numeric columns: the scaler is fitted on the training data
# only and the same fitted scaler is then applied to both sets.
num = ["IR", "CPI", "age", "CATEGORIES", "TITLE", "weekday", "res_rat", "CPI_mean"]
scaler = StandardScaler().fit(X_train[num])

X_train[num] = scaler.transform(X_train[num])
X_test[num] = scaler.transform(X_test[num])

# to_csv

In [22]:
# Write the processed features and target into the input directory as
# <name>_lgbm.csv, without the index column.
for stem, frame in (("X_train", X_train), ("X_test", X_test), ("y_train", y_train)):
    frame.to_csv(f"{path}/{stem}_lgbm.csv", index=False)