In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Activate seaborn's default theme for the plots created later in this session.
sns.set()

In [2]:
# Base directories. Inputs are read from `filepath_home`; `filepath_out` is only
# defined in this cell.
filepath_home = 'D:/Dacon_Psychological_disposition/'
filepath_out = 'C:/임시/Dacon_Psychological_disposition/'

# Load the train / test tables (file names suggest missing values were already filled).
train_csv = filepath_home + 'train_na_filled.csv'
test_csv = filepath_home + 'test_na_filled.csv'
train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

In [3]:
# Remember where the training rows end, then stack train (without the `voted`
# target) on top of test so both go through the same preprocessing.
split_point = len(train)
train_without_target = train.drop(columns='voted')
data = pd.concat([train_without_target, test], ignore_index=True)

In [4]:
# Machiavellianism score.
# Reverse-key the listed answer columns (6 - answer) so they can be averaged
# together with the remaining items.
# NOTE(review): assumes answers are coded 1-5 — confirm against the data description.
# NOTE: this flips the columns in place, so re-running the cell flips them back.
total_negative_cols = ["QeA", "QfA", "QkA", "QqA", "QrA", "QaA", "QdA", "QgA", "QiA", "QnA"]
data[total_negative_cols] = 6-data[total_negative_cols]

# Per-respondent mean of the 20 answer columns (every other column among the
# first 40). Computed as a vectorised column-wise mean instead of a row-wise
# `apply`, which builds one Python-level Series per row and is far slower while
# producing the same values.
data['mach_score'] = data.iloc[:, 0:40:2].mean(axis=1)

In [5]:
# Add the five TIPI personality trait columns.
# The even-numbered items are reverse-keyed first (7 - answer); this overwrites
# the tp columns in place, so the cell must not be run twice.
reverse_keyed_items = ['tp02','tp04','tp06','tp08','tp10']
data[reverse_keyed_items] = 7 - data[reverse_keyed_items]

# Each trait is the mean of one item pair.
tipi_item_pairs = {
    'Extraversion': ('tp01', 'tp06'),
    'Agreeableness': ('tp02', 'tp07'),
    'Conscientiousness': ('tp03', 'tp08'),
    'Emotional Stability': ('tp04', 'tp09'),
    'Openness to Experiences': ('tp05', 'tp10'),
}
for trait, (first_item, second_item) in tipi_item_pairs.items():
    data[trait] = (data[first_item] + data[second_item]) / 2

In [6]:
# Binarise every answer time as fast / slow, then count the slow answers per
# respondent, giving a 0-20 score for the whole questionnaire.
time_cols = slice(1, 40, 2)  # odd positions among the first 40 columns

# Single global median over all answer-time cells of the training set.
total_median = np.median(train.iloc[:, time_cols].values)

# 1 where the answer took longer than the global median, else 0.
temp_df = np.where(data.iloc[:, time_cols].values > total_median, 1, 0)
data['Q_total_E'] = temp_df.sum(axis=1)  # 0~20

In [7]:
# Sanity check: show the frame's (rows, columns) shape together with its column index.
data.shape,data.columns

((56912, 83),
 Index(['QaA', 'QaE', 'QbA', 'QbE', 'QcA', 'QcE', 'QdA', 'QdE', 'QeA', 'QeE',
        'QfA', 'QfE', 'QgA', 'QgE', 'QhA', 'QhE', 'QiA', 'QiE', 'QjA', 'QjE',
        'QkA', 'QkE', 'QlA', 'QlE', 'QmA', 'QmE', 'QnA', 'QnE', 'QoA', 'QoE',
        'QpA', 'QpE', 'QqA', 'QqE', 'QrA', 'QrE', 'QsA', 'QsE', 'QtA', 'QtE',
        'age_group', 'education', 'engnat', 'familysize', 'gender', 'hand',
        'married', 'race', 'religion', 'tp01', 'tp02', 'tp03', 'tp04', 'tp05',
        'tp06', 'tp07', 'tp08', 'tp09', 'tp10', 'urban', 'wf_01', 'wf_02',
        'wf_03', 'wr_01', 'wr_02', 'wr_03', 'wr_04', 'wr_05', 'wr_06', 'wr_07',
        'wr_08', 'wr_09', 'wr_10', 'wr_11', 'wr_12', 'wr_13', 'mach_score',
        'Extraversion', 'Agreeableness', 'Conscientiousness',
        'Emotional Stability', 'Openness to Experiences', 'Q_total_E'],
       dtype='object'))

In [8]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [9]:
# Min-max normalise the question answer columns (even positions among the first 40).
# The columns are assigned by label so that they are replaced outright. A
# positional `.iloc[...] = <floats>` writes into the existing columns, and when
# those are integer-typed newer pandas versions warn about (and are slated to
# reject) the implicit upcast.
answer_cols = list(data.columns[0:40:2])
data[answer_cols] = MinMaxScaler().fit_transform(data[answer_cols])

In [10]:
# Min-max normalise the Machiavellianism score.
mach_score_2d = data[['mach_score']]  # single-column frame: the scaler needs 2-D input
data['mach_score'] = MinMaxScaler().fit_transform(mach_score_2d)

In [11]:
# Min-max normalise the five TIPI trait scores (each column scaled independently).
big_5_personality_list = [
    'Extraversion',
    'Agreeableness',
    'Conscientiousness',
    'Emotional Stability',
    'Openness to Experiences',
]
big_5_scaled = MinMaxScaler().fit_transform(data[big_5_personality_list])
data[big_5_personality_list] = big_5_scaled

In [12]:
# Min-max normalise the Q_total_E column.
q_total_e_2d = data[['Q_total_E']]  # single-column frame: the scaler needs 2-D input
data['Q_total_E'] = MinMaxScaler().fit_transform(q_total_e_2d)

In [13]:
# Standardise family size (StandardScaler: subtract the mean, divide by the standard deviation).
familysize_2d = data[['familysize']]  # single-column frame: the scaler needs 2-D input
data['familysize'] = StandardScaler().fit_transform(familysize_2d)

In [14]:
# Dummy-encode the categorical columns. For each one, the last indicator column
# is left out and the original column is removed.
object_cols = ['gender','race','religion','married','hand','urban']
for col in object_cols:
    indicators = pd.get_dummies(data[col], prefix=col)
    kept_indicators = indicators.iloc[:, :-1]  # every level except the last
    data = data.drop(columns=col).join(kept_indicators)

In [15]:
# Convert the age-group labels to ordinal codes, then min-max normalise them.
age_group_order = {
    '10s':1,'20s':2,'30s':3,
    '40s':4,'50s':5,'60s':6,'+70s':7
}
age_group_codes = data['age_group'].map(age_group_order)

# `.map` silently turns any label missing from the dictionary into NaN (this also
# happens when the cell is re-run on the already converted column). Fail loudly
# instead of writing NaNs into the preprocessed output.
unmapped_mask = age_group_codes.isna() & data['age_group'].notna()
if unmapped_mask.any():
    unexpected = data.loc[unmapped_mask, 'age_group'].unique()[:10].tolist()
    raise ValueError(
        'age_group contains values outside the expected labels '
        f'(was this cell already run?): {unexpected}'
    )

data['age_group'] = MinMaxScaler().fit_transform(age_group_codes.values.reshape(-1,1)) # min-max normalise the ordinal age code

In [16]:
# Min-max normalise the education level.
education_2d = data[['education']]  # single-column frame: the scaler needs 2-D input
data['education'] = MinMaxScaler().fit_transform(education_2d)

In [17]:
# Log-scale the answer-time columns (odd positions among the first 40), then
# min-max normalise them. The +1 keeps a zero time finite under log10.
# The columns are assigned by label so that they are replaced outright. A
# positional `.iloc[...] = <floats>` writes into the existing columns, and when
# those are integer-typed newer pandas versions warn about (and are slated to
# reject) the implicit upcast.
elapsed_cols = list(data.columns[1:40:2])
data[elapsed_cols] = MinMaxScaler().fit_transform(np.log10(data[elapsed_cols] + 1))

In [18]:
# Cut the combined frame back into its train / test parts at the saved row
# boundary, and re-attach the `voted` target to the training rows (joined on index).
train_rows = data.iloc[:split_point]
test_preprocessed = data.iloc[split_point:]
train_preprocessed = train_rows.join(train['voted'])

# Show both shapes as a quick check.
train_preprocessed.shape, test_preprocessed.shape

((45529, 102), (11383, 101))

In [19]:
# Write both preprocessed tables next to the input files, without the index column.
# NOTE(review): `filepath_out` is defined earlier but the files are written to
# `filepath_home` — confirm this is intended.
exports = (
    (train_preprocessed, 'train_preprocessed.csv'),
    (test_preprocessed, 'test_preprocessed.csv'),
)
for frame, file_name in exports:
    frame.to_csv(filepath_home+file_name,index=False)