In [255]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import os
from ipywidgets import interact
from datetime import datetime

In [253]:
# 환경설정
sns.set_style("whitegrid")
plt.rcParams['font.family'] = 'Malgun Gothic'
plt.rcParams['axes.unicode_minus'] = False
if not any([s == 'result' for s in os.listdir('.')]): os.mkdir('result')

In [249]:
# 데이터 불러오기
train = pd.read_csv('data/train.csv').set_index('index')
test_x = pd.read_csv('data/test_x.csv').set_index('index')
submission = pd.read_csv('data/sample_submission.csv').set_index('index')

* Qb : The biggest difference between most criminals and other people is that the criminals are stupid enough to get caught.
 - 대부분의 범죄자들과 다른 사람들의 가장 큰 차이점은 범죄자들은 잡힐 만큼 어리석다는 것이다.
* Qc : Anyone who completely trusts anyone else is asking for trouble.
 - 다른 사람을 너무 믿는 사람은 문제를 스스로 자초하는 것이다.
* Qe : P.T. Barnum was wrong when he said that there's a sucker born every minute.
 - P.T. Barnum(미국 정치인)은 매 순간 어리버리가 태어난다고 했는데 그것은 틀렸다.
* Qf : There is no excuse for lying to someone else.
 - 거짓말 하는 것은 변명의 여지가 없다.
* Qh : Most people forget more easily the death of their parents than the loss of their property.
 - 대부분 사람들은 재산상 손해를 입는 것보다 부모의 죽음을 더 쉽게 잊는다.
* Qj : It is safest to assume that all people have a vicious streak and it will come out when they are given a chance.
 - 모든 사람들은 악랄한 기질을 가지고 있으며, 기회가 되면 그 기질이 나타난다 보는게 맞다.
* Qk : All in all, it is better to be humble and honest than to be important and dishonest.
 - 대체로, 거만하거나 부정직한 것보다는 겸손하고 정직한 것이 낫다.
* Qm : It is hard to get ahead without cutting corners here and there.
 - 돈을 아끼지 않고서는 출세하기 어렵다.
* Qo : The best way to handle people is to tell them what they want to hear.
 - 사람들 다루는 가장 좋은 방법은 그들이 듣고 싶은 말을 하는 것이다.
* Qq : Most people are basically good and kind.
 - 대부분의 사람들은 기본적으로 착하고 친절하다.
* Qr : One should take action only when sure it is morally right.
 - 도덕적으로 옳다고 확실이 들 때만 행동을 취해야 한다.
* Qs : It is wise to flatter important people.
 - 중요한 사람들에게 아첨하는 것이 현명하다.

In [4]:
print(f'Columns: {train.columns}')

Columns: Index(['QaA', 'QaE', 'QbA', 'QbE', 'QcA', 'QcE', 'QdA', 'QdE', 'QeA', 'QeE',
       'QfA', 'QfE', 'QgA', 'QgE', 'QhA', 'QhE', 'QiA', 'QiE', 'QjA', 'QjE',
       'QkA', 'QkE', 'QlA', 'QlE', 'QmA', 'QmE', 'QnA', 'QnE', 'QoA', 'QoE',
       'QpA', 'QpE', 'QqA', 'QqE', 'QrA', 'QrE', 'QsA', 'QsE', 'QtA', 'QtE',
       'age_group', 'education', 'engnat', 'familysize', 'gender', 'hand',
       'married', 'race', 'religion', 'tp01', 'tp02', 'tp03', 'tp04', 'tp05',
       'tp06', 'tp07', 'tp08', 'tp09', 'tp10', 'urban', 'voted', 'wf_01',
       'wf_02', 'wf_03', 'wr_01', 'wr_02', 'wr_03', 'wr_04', 'wr_05', 'wr_06',
       'wr_07', 'wr_08', 'wr_09', 'wr_10', 'wr_11', 'wr_12', 'wr_13'],
      dtype='object')


In [5]:
# 전처리
# 일부 컬럼 영한 번역
train_prep = train.copy()
train_prep = train_prep.rename(columns={'age_group': '연령군',
    'education': '교육수준', 'familysize': '가족수', 'gender': '성별',
    'hand': '손잡이구분', 'married': '결혼여부', 'race': '인종', 
    'engnat': '영어모국어여부', 'religion': '종교', 'urban': '유년기거주지',
    'voted': '투표여부'})

In [6]:
@interact(column_name=['연령군', '교육수준', '영어모국어여부', '가족수', '성별', '손잡이구분', '결혼여부', '인종', '종교', '유년기거주지', '투표여부'])
def cnt_by_column(column_name):
    df = train_prep.groupby(column_name).size()
    df = df.reset_index(name='건수')
    fig, ax = plt.subplots(figsize=(12, 5))
    sns.barplot(x=column_name, y='건수', data=df, ax=ax)
    plt.show()
    return df.set_index(column_name).T

interactive(children=(Dropdown(description='column_name', options=('연령군', '교육수준', '영어모국어여부', '가족수', '성별', '손잡이…

In [7]:
@interact(column_name=['연령군', '교육수준', '영어모국어여부', '가족수', '성별', '손잡이구분', '결혼여부', '인종', '종교', '유년기거주지'])
def cnt_by_voted_column(column_name):
    df = train_prep.groupby(['투표여부', column_name]).size()
    df = df.reset_index(name='건수')
    fig, ax = plt.subplots(figsize=(12, 5))
    sns.barplot(x=column_name, y='건수', hue='투표여부', data=df, ax=ax)
    plt.show()

interactive(children=(Dropdown(description='column_name', options=('연령군', '교육수준', '영어모국어여부', '가족수', '성별', '손잡이…

In [15]:
train_not_teens = train_prep.query('연령군 != "10s"')

In [101]:
@interact(column_name=['연령군', '교육수준', '영어모국어여부', '가족수', '성별', '손잡이구분', '결혼여부', '인종', '종교', '유년기거주지'])
def cnt_by_voted_column(column_name):
    df = train_not_teens_not_white.groupby(['투표여부', column_name]).size()
    df = df.reset_index(name='건수')
    fig, ax = plt.subplots(figsize=(12, 5))
    sns.barplot(x=column_name, y='건수', hue='투표여부', data=df, ax=ax)
    plt.show()

interactive(children=(Dropdown(description='column_name', options=('연령군', '교육수준', '영어모국어여부', '가족수', '성별', '손잡이…

In [28]:
# 10대 투표 잘 안 함
# 인종 == "White" 투표 많이 함
train_not_teens_not_white = train_prep.query('연령군 != "10s"').query('인종 != "White"')
train_not_teens_white = train_prep.query('연령군 != "10s"').query('인종 == "White"')

In [65]:
@interact(column_name=['연령군', '교육수준', '영어모국어여부', '가족수', '성별', '손잡이구분', '결혼여부', '인종', '종교', '유년기거주지'])
def cnt_by_voted_column(column_name):
    df = train_not_teens_white.groupby(['투표여부', column_name]).size()
    df = df.reset_index(name='건수')
    fig, ax = plt.subplots(figsize=(12, 5))
    sns.barplot(x=column_name, y='건수', hue='투표여부', data=df, ax=ax)
    plt.show()

interactive(children=(Dropdown(description='column_name', options=('연령군', '교육수준', '영어모국어여부', '가족수', '성별', '손잡이…

In [155]:
# 전처리 2차
train_prep_2nd = train_prep.copy()
train_prep_2nd['연령군'] = train_prep_2nd.연령군.str.extract('(\d{2})').astype(int)
# train_prep_2nd['미성년자여부'] = (train_prep_2nd.연령군 == 10).astype(int) # Boolean
train_prep_2nd['결혼여부'] = train_prep_2nd.결혼여부.replace(0, 1).replace(3, 2).replace(2, 0).astype(int) # Boolean
train_prep_2nd['교육수준'] = train_prep_2nd.교육수준.replace(0, 3)
train_prep_2nd['유년기거주지'] = train_prep_2nd.교육수준.replace(0, 2)
train_prep_2nd['영어모국어여부'] = train_prep_2nd.영어모국어여부.replace(0, 1).replace(2, 0).astype(int) # Boolean
train_prep_2nd['손잡이구분'] = train_prep_2nd.손잡이구분.replace(0, 1).replace(3, 2).replace(2, 0).astype(int) # Boolean
train_prep_2nd['백인여부'] = (train_prep_2nd.인종 == 'White').astype(int) # Boolean
train_prep_2nd['아시안여부'] = (train_prep_2nd.인종 == 'Asian').astype(int) # Boolean
train_prep_2nd['가족수'] = np.fmin(train_prep_2nd.가족수, 10)
train_prep_2nd['기독교여부'] = (train_prep_2nd.종교.str.contains('Christian')).astype(int) # Boolean
train_prep_2nd['성별'] = (train_prep_2nd.성별 == 'Male').astype(int) # Boolean
train_prep_2nd['투표여부'] = train_prep_2nd.투표여부.replace(2, 0).astype(int) # Boolean

In [114]:
@interact(column_name=['연령군', '교육수준', '영어모국어여부', '가족수', '성별', '손잡이구분', '결혼여부', '백인여부', '아시안여부', '기독교여부', '유년기거주지'])
def cnt_by_voted_column(column_name):
    df = train_prep_2nd.groupby(['투표여부', column_name]).size()
    df = df.reset_index(name='건수')
    fig, ax = plt.subplots(figsize=(12, 5))
    sns.barplot(x=column_name, y='건수', hue='투표여부', data=df, ax=ax)
    plt.show()

interactive(children=(Dropdown(description='column_name', options=('연령군', '교육수준', '영어모국어여부', '가족수', '성별', '손잡이…

In [185]:
train_prep_2nd.columns

Index(['QaA', 'QaE', 'QbA', 'QbE', 'QcA', 'QcE', 'QdA', 'QdE', 'QeA', 'QeE',
       'QfA', 'QfE', 'QgA', 'QgE', 'QhA', 'QhE', 'QiA', 'QiE', 'QjA', 'QjE',
       'QkA', 'QkE', 'QlA', 'QlE', 'QmA', 'QmE', 'QnA', 'QnE', 'QoA', 'QoE',
       'QpA', 'QpE', 'QqA', 'QqE', 'QrA', 'QrE', 'QsA', 'QsE', 'QtA', 'QtE',
       '연령군', '교육수준', '영어모국어여부', '가족수', '성별', '손잡이구분', '결혼여부', '인종', '종교',
       'tp01', 'tp02', 'tp03', 'tp04', 'tp05', 'tp06', 'tp07', 'tp08', 'tp09',
       'tp10', '유년기거주지', '투표여부', 'wf_01', 'wf_02', 'wf_03', 'wr_01', 'wr_02',
       'wr_03', 'wr_04', 'wr_05', 'wr_06', 'wr_07', 'wr_08', 'wr_09', 'wr_10',
       'wr_11', 'wr_12', 'wr_13', '백인여부', '아시안여부', '기독교여부'],
      dtype='object')

In [243]:
columns_selected = ['연령군', '교육수준', '영어모국어여부', '가족수', '성별', '손잡이구분', '결혼여부', '백인여부', '아시안여부', '기독교여부', '유년기거주지']
X = train_prep_2nd[columns_selected]
y = train_prep_2nd['투표여부']

In [213]:
from sklearn.model_selection import train_test_split
idx_train, idx_test = train_test_split(train_prep_2nd.index, train_size=.8, random_state=20201025)
X_train, y_train, X_test, y_test = X.loc[idx_train], y.loc[idx_train], X.loc[idx_test], y.loc[idx_test]

In [241]:
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
model = LGBMClassifier(n_estimators=400, random_state=20201025)
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.6905676951795322

In [257]:
# 예측 데이터 전처리 1, 2차
test_prep = test_x.copy()
test_prep = test_prep.rename(columns={'age_group': '연령군',
    'education': '교육수준', 'familysize': '가족수', 'gender': '성별',
    'hand': '손잡이구분', 'married': '결혼여부', 'race': '인종', 
    'engnat': '영어모국어여부', 'religion': '종교', 'urban': '유년기거주지'})

test_prep_2nd = test_prep.copy()
test_prep_2nd['연령군'] = test_prep_2nd.연령군.str.extract('(\d{2})').astype(int)
test_prep_2nd['결혼여부'] = test_prep_2nd.결혼여부.replace(0, 1).replace(3, 2).replace(2, 0).astype(int) # Boolean
test_prep_2nd['교육수준'] = test_prep_2nd.교육수준.replace(0, 3)
test_prep_2nd['유년기거주지'] = test_prep_2nd.교육수준.replace(0, 2)
test_prep_2nd['영어모국어여부'] = test_prep_2nd.영어모국어여부.replace(0, 1).replace(2, 0).astype(int) # Boolean
test_prep_2nd['손잡이구분'] = test_prep_2nd.손잡이구분.replace(0, 1).replace(3, 2).replace(2, 0).astype(int) # Boolean
test_prep_2nd['백인여부'] = (test_prep_2nd.인종 == 'White').astype(int) # Boolean
test_prep_2nd['아시안여부'] = (test_prep_2nd.인종 == 'Asian').astype(int) # Boolean
test_prep_2nd['가족수'] = np.fmin(test_prep_2nd.가족수, 10)
test_prep_2nd['기독교여부'] = (test_prep_2nd.종교.str.contains('Christian')).astype(int) # Boolean
test_prep_2nd['성별'] = (test_prep_2nd.성별 == 'Male').astype(int) # Boolean

In [260]:
# 출력하기
now = datetime.now().strftime('%Y%m%d%H%M%S')
submission['voted'] = 1-model.predict(test_prep_2nd[columns_selected])
submission.to_csv(f'result/submission_{now}.csv')