In [283]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import os
from ipywidgets import interact
from datetime import datetime

In [284]:
# !pip install pycaret

In [285]:
# 환경설정
sns.set_style("whitegrid")
plt.rcParams['font.family'] = 'Malgun Gothic'
plt.rcParams['axes.unicode_minus'] = False
if not any([s == 'result' for s in os.listdir('.')]): os.mkdir('result')

In [286]:
# 데이터 불러오기
train = pd.read_csv('data/train.csv').set_index('index')
test = pd.read_csv('data/test_x.csv').set_index('index')
submission = pd.read_csv('data/sample_submission.csv').set_index('index')

* Qb : The biggest difference between most criminals and other people is that the criminals are stupid enough to get caught.
 - 대부분의 범죄자들과 다른 사람들의 가장 큰 차이점은 범죄자들은 잡힐 만큼 어리석다는 것이다.
* Qc : Anyone who completely trusts anyone else is asking for trouble.
 - 다른 사람을 너무 믿는 사람은 문제를 스스로 자초하는 것이다.
* Qe : P.T. Barnum was wrong when he said that there's a sucker born every minute.
 - P.T. Barnum(미국 정치인)은 매 순간 어리버리가 태어난다고 했는데 그것은 틀렸다.
* Qf : There is no excuse for lying to someone else.
 - 거짓말 하는 것은 변명의 여지가 없다.
* Qh : Most people forget more easily the death of their parents than the loss of their property.
 - 대부분 사람들은 재산상 손해를 입는 것보다 부모의 죽음을 더 쉽게 잊는다.
* Qj : It is safest to assume that all people have a vicious streak and it will come out when they are given a chance.
 - 모든 사람들은 악랄한 기질을 가지고 있으며, 기회가 되면 그 기질이 나타난다 보는게 맞다.
* Qk : All in all, it is better to be humble and honest than to be important and dishonest.
 - 대체로, 거만하거나 부정직한 것보다는 겸손하고 정직한 것이 낫다.
* Qm : It is hard to get ahead without cutting corners here and there.
 - 돈을 아끼지 않고서는 출세하기 어렵다.
* Qo : The best way to handle people is to tell them what they want to hear.
 - 사람들 다루는 가장 좋은 방법은 그들이 듣고 싶은 말을 하는 것이다.
* Qq : Most people are basically good and kind.
 - 대부분의 사람들은 기본적으로 착하고 친절하다.
* Qr : One should take action only when sure it is morally right.
 - 도덕적으로 옳다고 확실이 들 때만 행동을 취해야 한다.
* Qs : It is wise to flatter important people.
 - 중요한 사람들에게 아첨하는 것이 현명하다.

In [287]:
print(f'Columns: {train.columns}')

Columns: Index(['QaA', 'QaE', 'QbA', 'QbE', 'QcA', 'QcE', 'QdA', 'QdE', 'QeA', 'QeE',
       'QfA', 'QfE', 'QgA', 'QgE', 'QhA', 'QhE', 'QiA', 'QiE', 'QjA', 'QjE',
       'QkA', 'QkE', 'QlA', 'QlE', 'QmA', 'QmE', 'QnA', 'QnE', 'QoA', 'QoE',
       'QpA', 'QpE', 'QqA', 'QqE', 'QrA', 'QrE', 'QsA', 'QsE', 'QtA', 'QtE',
       'age_group', 'education', 'engnat', 'familysize', 'gender', 'hand',
       'married', 'race', 'religion', 'tp01', 'tp02', 'tp03', 'tp04', 'tp05',
       'tp06', 'tp07', 'tp08', 'tp09', 'tp10', 'urban', 'voted', 'wf_01',
       'wf_02', 'wf_03', 'wr_01', 'wr_02', 'wr_03', 'wr_04', 'wr_05', 'wr_06',
       'wr_07', 'wr_08', 'wr_09', 'wr_10', 'wr_11', 'wr_12', 'wr_13'],
      dtype='object')


In [288]:
# 전처리
# 일부 컬럼 영한 번역
train_prep = train.copy()
train_prep = train_prep.rename(columns={'age_group': '연령군',
    'education': '교육수준', 'familysize': '가족수', 'gender': '성별',
    'hand': '손잡이구분', 'married': '결혼여부', 'race': '인종', 
    'engnat': '영어모국어여부', 'religion': '종교', 'urban': '유년기거주지',
    'voted': '투표여부'})

In [289]:
@interact(column_name=['연령군', '교육수준', '영어모국어여부', '가족수', '성별', '손잡이구분', '결혼여부', '인종', '종교', '유년기거주지', '투표여부'])
def cnt_by_column(column_name):
    df = train_prep.groupby(column_name).size()
    df = df.reset_index(name='건수')
    fig, ax = plt.subplots(figsize=(12, 5))
    sns.barplot(x=column_name, y='건수', data=df, ax=ax)
    plt.show()
    return df.set_index(column_name).T

interactive(children=(Dropdown(description='column_name', options=('연령군', '교육수준', '영어모국어여부', '가족수', '성별', '손잡이…

In [290]:
@interact(column_name=['연령군', '교육수준', '영어모국어여부', '가족수', '성별', '손잡이구분', '결혼여부', '인종', '종교', '유년기거주지'])
def cnt_by_voted_column(column_name):
    df = train_prep.groupby(['투표여부', column_name]).size()
    df = df.reset_index(name='건수')
    fig, ax = plt.subplots(figsize=(12, 5))
    sns.barplot(x=column_name, y='건수', hue='투표여부', data=df, ax=ax)
    plt.show()

interactive(children=(Dropdown(description='column_name', options=('연령군', '교육수준', '영어모국어여부', '가족수', '성별', '손잡이…

In [301]:
# 전처리 2차
train_prep_2nd = train_prep.copy()
train_prep_2nd['연령군'] = train_prep_2nd.연령군.str.extract('(\d{2})').astype(int)
# train_prep_2nd['미성년자여부'] = (train_prep_2nd.연령군 == 10).astype(int) # Boolean
train_prep_2nd['결혼여부'] = train_prep_2nd.결혼여부.replace(0, 1).replace(3, 2).replace(2, 0).astype(int) # Boolean
train_prep_2nd['교육수준'] = train_prep_2nd.교육수준.replace(0, 3)
train_prep_2nd['유년기거주지'] = train_prep_2nd.교육수준.replace(0, 2)
train_prep_2nd['영어모국어여부'] = train_prep_2nd.영어모국어여부.replace(0, 1).replace(2, 0).astype(int) # Boolean
train_prep_2nd['손잡이구분'] = train_prep_2nd.손잡이구분.replace(0, 1).replace(3, 2).replace(2, 0).astype(int) # Boolean
train_prep_2nd['백인여부'] = (train_prep_2nd.인종 == 'White').astype(int) # Boolean
train_prep_2nd['아시안여부'] = (train_prep_2nd.인종 == 'Asian').astype(int) # Boolean
train_prep_2nd['가족수'] = np.fmin(train_prep_2nd.가족수, 10)
train_prep_2nd['기독교여부'] = (train_prep_2nd.종교.str.contains('Christian')).astype(int) # Boolean
train_prep_2nd['성별'] = (train_prep_2nd.성별 == 'Male').astype(int) # Boolean
train_prep_2nd['투표여부'] = train_prep_2nd.투표여부.replace(2, 0).astype(int) # Boolean

In [114]:
@interact(column_name=['연령군', '교육수준', '영어모국어여부', '가족수', '성별', '손잡이구분', '결혼여부', '백인여부', '아시안여부', '기독교여부', '유년기거주지'])
def cnt_by_voted_column(column_name):
    df = train_prep_2nd.groupby(['투표여부', column_name]).size()
    df = df.reset_index(name='건수')
    fig, ax = plt.subplots(figsize=(12, 5))
    sns.barplot(x=column_name, y='건수', hue='투표여부', data=df, ax=ax)
    plt.show()

interactive(children=(Dropdown(description='column_name', options=('연령군', '교육수준', '영어모국어여부', '가족수', '성별', '손잡이…

In [302]:
columns_selected = ['연령군', '교육수준', '영어모국어여부', '가족수', '성별', '손잡이구분', '결혼여부', '백인여부', '아시안여부', '기독교여부', '유년기거주지']
X = train_prep_2nd[columns_selected]
y = train_prep_2nd['투표여부']

In [303]:
from sklearn.model_selection import train_test_split
idx_train, idx_test = train_test_split(train_prep_2nd.index, train_size=.8, random_state=20201025)
X_train, y_train, X_test, y_test = X.loc[idx_train], y.loc[idx_train], X.loc[idx_test], y.loc[idx_test]

In [304]:
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
model = LGBMClassifier(n_estimators=400, random_state=20201025)
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.6905676951795322

In [305]:
# 예측 데이터 전처리 1, 2차
test_prep = test.copy()
test_prep = test_prep.rename(columns={'age_group': '연령군',
    'education': '교육수준', 'familysize': '가족수', 'gender': '성별',
    'hand': '손잡이구분', 'married': '결혼여부', 'race': '인종', 
    'engnat': '영어모국어여부', 'religion': '종교', 'urban': '유년기거주지'})

test_prep_2nd = test_prep.copy()
test_prep_2nd['연령군'] = test_prep_2nd.연령군.str.extract('(\d{2})').astype(int)
test_prep_2nd['결혼여부'] = test_prep_2nd.결혼여부.replace(0, 1).replace(3, 2).replace(2, 0).astype(int) # Boolean
test_prep_2nd['교육수준'] = test_prep_2nd.교육수준.replace(0, 3)
test_prep_2nd['유년기거주지'] = test_prep_2nd.교육수준.replace(0, 2)
test_prep_2nd['영어모국어여부'] = test_prep_2nd.영어모국어여부.replace(0, 1).replace(2, 0).astype(int) # Boolean
test_prep_2nd['손잡이구분'] = test_prep_2nd.손잡이구분.replace(0, 1).replace(3, 2).replace(2, 0).astype(int) # Boolean
test_prep_2nd['백인여부'] = (test_prep_2nd.인종 == 'White').astype(int) # Boolean
test_prep_2nd['아시안여부'] = (test_prep_2nd.인종 == 'Asian').astype(int) # Boolean
test_prep_2nd['가족수'] = np.fmin(test_prep_2nd.가족수, 10)
test_prep_2nd['기독교여부'] = (test_prep_2nd.종교.str.contains('Christian')).astype(int) # Boolean
test_prep_2nd['성별'] = (test_prep_2nd.성별 == 'Male').astype(int) # Boolean

In [306]:
# 출력하기
now = datetime.now().strftime('%Y%m%d%H%M%S')
submission['voted'] = 1-model.predict(test_prep_2nd[columns_selected])
submission.to_csv(f'result/submission_{now}.csv')

In [307]:
from pycaret.classification import *

In [265]:
clf = setup(data = train, target = 'voted')

Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,2143
1,Target Type,Binary
2,Label Encoded,"1: 0, 2: 1"
3,Original Data,"(45532, 77)"
4,Missing Values,False
5,Numeric Features,41
6,Categorical Features,35
7,Ordinal Features,False
8,High Cardinality Features,False
9,High Cardinality Method,


In [266]:
best_3 = compare_models(sort = 'AUC', n_select = 3)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
0,Gradient Boosting Classifier,0.6932,0.7634,0.6378,0.7624,0.6945,0.3913,0.3977,25.0453
1,Light Gradient Boosting Machine,0.6922,0.7633,0.6445,0.7566,0.696,0.3884,0.3936,0.861
2,CatBoost Classifier,0.6904,0.7631,0.6544,0.748,0.698,0.3834,0.387,19.3343
3,Linear Discriminant Analysis,0.6904,0.7604,0.6612,0.7441,0.7001,0.3824,0.3853,1.286
4,Extra Trees Classifier,0.6895,0.7603,0.643,0.7531,0.6937,0.3828,0.3878,4.724
5,Ada Boost Classifier,0.6885,0.7554,0.6521,0.7463,0.6959,0.3796,0.3833,6.3622
6,Extreme Gradient Boosting,0.6748,0.7436,0.6638,0.7197,0.6906,0.3491,0.3504,5.9077
7,Random Forest Classifier,0.6559,0.7099,0.6027,0.7222,0.657,0.3175,0.3227,0.4387
8,Decision Tree Classifier,0.6071,0.6036,0.6415,0.6406,0.641,0.2072,0.2073,2.0878
9,Naive Bayes,0.4538,0.5212,0.0104,0.5372,0.0205,-0.0007,-0.0029,0.2087


In [267]:
blended = blend_models(estimator_list = best_3, fold = 5, method = 'soft')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.6899,0.7636,0.6446,0.7528,0.6945,0.3834,0.3882
1,0.7031,0.7694,0.6543,0.7683,0.7067,0.4099,0.4153
2,0.6969,0.7693,0.6456,0.7635,0.6996,0.398,0.4037
3,0.6828,0.7538,0.6281,0.7509,0.6841,0.3706,0.3765
4,0.6978,0.769,0.65,0.7624,0.7018,0.3995,0.4047
Mean,0.6941,0.765,0.6445,0.7596,0.6973,0.3923,0.3977
SD,0.007,0.006,0.0089,0.0066,0.0077,0.0137,0.0137


In [268]:
pred_holdout = predict_model(blended)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.6996,0.7699,0.6522,0.764,0.7037,0.403,0.4082


In [269]:
final_model = finalize_model(blended)

In [279]:
predictions = predict_model(final_model, data = test)

In [314]:
best_3 = compare_models(sort = 'AUC', n_select = 3)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
0,Gradient Boosting Classifier,0.6926,0.7624,0.7517,0.6361,0.6891,0.3893,0.3947,23.7255
1,CatBoost Classifier,0.6907,0.7624,0.7368,0.6373,0.6835,0.3842,0.3881,22.6599
2,Light Gradient Boosting Machine,0.6923,0.7616,0.7525,0.6356,0.6891,0.3888,0.3943,0.9553
3,Ada Boost Classifier,0.6877,0.755,0.7333,0.6345,0.6803,0.3781,0.382,5.3774
4,Linear Discriminant Analysis,0.6838,0.7542,0.6588,0.649,0.6538,0.3629,0.363,1.9827
5,Extra Trees Classifier,0.6846,0.7531,0.6911,0.641,0.6651,0.3679,0.3688,4.299
6,Extreme Gradient Boosting,0.6727,0.7411,0.6877,0.6266,0.6557,0.3451,0.3465,7.0569
7,Random Forest Classifier,0.6431,0.7107,0.5382,0.6229,0.5774,0.2713,0.2735,0.2692
8,Naive Bayes,0.4538,0.6718,0.9857,0.4528,0.6206,-0.0013,-0.0056,0.1386
9,Decision Tree Classifier,0.6106,0.6078,0.5773,0.5695,0.5733,0.2153,0.2154,1.6801


In [282]:
now = datetime.now().strftime('%Y%m%d%H%M%S')
submission['voted'] = predictions['Score']
submission.to_csv(f'result/submission_{now}.csv')

In [None]:
# clf = setup(data = train_prep_2nd, target = '투표여부')
# best_3 = compare_models(sort = 'AUC', n_select = 3)
blended = blend_models(estimator_list = best_3, fold = 5, method = 'soft')
pred_holdout = predict_model(blended)
final_model = finalize_model(blended)
predictions = predict_model(final_model, data = test_prep_2nd)

In [None]:
now = datetime.now().strftime('%Y%m%d%H%M%S')
submission['voted'] = predictions['Score']
submission.to_csv(f'result/submission_{now}.csv')