In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd   
import matplotlib.pyplot as plt
import seaborn as sns
import itertools

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, log_loss

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

In [2]:
import os
os.chdir('./open')
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')
submission=pd.read_csv('sample_submission.csv')

In [3]:
# 직업유형 삭제
train=train.drop('occyp_type', axis=1)
test=test.drop('occyp_type', axis=1)

In [4]:
train['gender'] = train['gender'].replace(['F','M'],[0,1])
test['gender'] = test['gender'].replace(['F','M'],[0,1])
print('gender :')
print(train['gender'].value_counts())
print('--------------')

# 차량 유무
print('Having a car or not : ')
train['car'] = train['car'].replace(['N','Y'],[0,1])
test['car'] = test['car'].replace(['N','Y'],[0,1])
print(train['car'].value_counts())
print('--------------')

# 자가 유무
print('Having house reality or not: ')
train['reality'] = train['reality'].replace(['N','Y'],[0,1])
test['reality'] = test['reality'].replace(['N','Y'],[0,1])
print(train['reality'].value_counts())
print('--------------')

#개인 핸드폰 유무
print('Having a phone or not: ')
print(train['phone'].value_counts())
print('--------------')
      
#이메일 유무
print('Having a email or not: ')
print(train['email'].value_counts())
print('--------------')
      
# 회사용 핸드폰유무
print('Having a work phone or not: ')
print(train['work_phone'].value_counts())
print('--------------')

gender :
0    17697
1     8760
Name: gender, dtype: int64
--------------
Having a car or not : 
0    16410
1    10047
Name: car, dtype: int64
--------------
Having house reality or not: 
1    17830
0     8627
Name: reality, dtype: int64
--------------
Having a phone or not: 
0    18672
1     7785
Name: phone, dtype: int64
--------------
Having a email or not: 
0    24042
1     2415
Name: email, dtype: int64
--------------
Having a work phone or not: 
0    20511
1     5946
Name: work_phone, dtype: int64
--------------


In [5]:
train.loc[train['child_num'] >= 2,'child_num']=2
test.loc[test['child_num']>=2, 'child_num']=2

In [6]:
count, bin_dividers =np.histogram(train['income_total'], bins=7)
bin_names=['소득'+str(i) for i in range(7) ]
#bin_dividers는 train기준!!
train['income_total']=pd.cut(x=train['income_total'], bins=bin_dividers, labels=bin_names, include_lowest=True)
test['income_total']=pd.cut(x=test['income_total'], bins=bin_dividers, labels=bin_names, include_lowest=True)

In [7]:
test

Unnamed: 0,index,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,family_size,begin_month
0,26457,1,1,0,0,소득0,Pensioner,Secondary / secondary special,Civil marriage,House / apartment,-21990,365243,1,0,1,0,2.0,-60.0
1,26458,0,0,1,0,소득0,State servant,Higher education,Married,House / apartment,-18964,-8671,1,0,1,0,2.0,-36.0
2,26459,0,0,1,0,소득0,Working,Secondary / secondary special,Married,House / apartment,-15887,-217,1,1,1,0,2.0,-40.0
3,26460,1,1,0,0,소득0,Commercial associate,Secondary / secondary special,Married,House / apartment,-19270,-2531,1,1,0,0,2.0,-41.0
4,26461,0,1,1,0,소득0,State servant,Higher education,Married,House / apartment,-17822,-9385,1,1,0,0,2.0,-8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,36452,0,1,1,0,소득0,Working,Incomplete higher,Married,House / apartment,-18593,-5434,1,1,1,0,2.0,-19.0
9996,36453,1,1,1,0,소득0,Working,Secondary / secondary special,Civil marriage,House / apartment,-10886,-1315,1,1,0,0,2.0,-34.0
9997,36454,0,0,1,0,소득1,Working,Secondary / secondary special,Married,House / apartment,-21016,-14018,1,0,0,0,2.0,-55.0
9998,36455,0,1,0,0,소득0,Commercial associate,Secondary / secondary special,Married,House / apartment,-16541,-1085,1,0,1,0,2.0,-33.0


In [8]:
print(train['income_type'].unique())
print(train['edu_type'].unique())
print(train['family_type'].unique())
print(train['house_type'].unique())

['Commercial associate' 'Working' 'State servant' 'Pensioner' 'Student']
['Higher education' 'Secondary / secondary special' 'Incomplete higher'
 'Lower secondary' 'Academic degree']
['Married' 'Civil marriage' 'Separated' 'Single / not married' 'Widow']
['Municipal apartment' 'House / apartment' 'With parents'
 'Co-op apartment' 'Rented apartment' 'Office apartment']


In [9]:
from sklearn import preprocessing
label_encoder=preprocessing.LabelEncoder()
train['income_type']=label_encoder.fit_transform(train['income_type'])
test['income_type']=label_encoder.transform(test['income_type'])
########################################################################
train['edu_type']=label_encoder.fit_transform(train['edu_type'])
test['edu_type']=label_encoder.transform(test['edu_type'])
########################################################################
train['family_type']=label_encoder.fit_transform(train['family_type'])
test['family_type']=label_encoder.transform(test['family_type'])
########################################################################
train['house_type']=label_encoder.fit_transform(train['house_type'])
test['house_type']=label_encoder.transform(test['house_type'])
########################################################################
train['income_total']=label_encoder.fit_transform(train['income_total'])
test['income_total']=label_encoder.fit_transform(test['income_total'])

In [10]:
#minus 변경하고
#구간화 함수
def make_bin(variable, n):
    train[variable]=-train[variable]
    test[variable]=-test[variable]
    count, bin_dividers =np.histogram(train[variable], bins=n) #train의 구간화를 적용
    bin_names=[str(i) for i in range(n)]
    train[variable]=pd.cut(x=train[variable], bins=bin_dividers, labels=bin_names, include_lowest=True)
    test[variable]=pd.cut(x=test[variable], bins=bin_dividers, labels=bin_names, include_lowest=True)
    test[variable].fillna(str(0), inplace=True) #test에는 없는 것을 임의의 값으로 채움
    ##########################################################
    train[variable]=label_encoder.fit_transform(train[variable])
    test[variable]=label_encoder.transform(test[variable])

In [11]:
train.columns

Index(['index', 'gender', 'car', 'reality', 'child_num', 'income_total',
       'income_type', 'edu_type', 'family_type', 'house_type', 'DAYS_BIRTH',
       'DAYS_EMPLOYED', 'FLAG_MOBIL', 'work_phone', 'phone', 'email',
       'family_size', 'begin_month', 'credit'],
      dtype='object')

In [12]:
make_bin('DAYS_BIRTH', n=8)
make_bin('DAYS_EMPLOYED', n=6)
make_bin('begin_month', n=4)

In [13]:
test

Unnamed: 0,index,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,family_size,begin_month
0,26457,1,1,0,0,0,1,4,0,1,6,0,1,0,1,0,2.0,3
1,26458,0,0,1,0,0,2,1,1,1,5,1,1,0,1,0,2.0,2
2,26459,0,0,1,0,0,4,4,1,1,3,1,1,1,1,0,2.0,2
3,26460,1,1,0,0,0,0,4,1,1,5,1,1,1,0,0,2.0,2
4,26461,0,1,1,0,0,2,1,1,1,4,1,1,1,0,0,2.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,36452,0,1,1,0,0,4,2,1,1,4,1,1,1,1,0,2.0,1
9996,36453,1,1,1,0,0,4,4,0,1,1,1,1,1,0,0,2.0,2
9997,36454,0,0,1,0,1,4,4,1,1,6,1,1,0,0,0,2.0,3
9998,36455,0,1,0,0,0,0,4,1,1,4,1,1,0,1,0,2.0,2


In [14]:
train.shape

(26457, 19)

In [15]:
test.shape

(10000, 18)

In [16]:
train_x=train.drop('credit', axis=1)
train_y=train[['credit']]
test_x=test

In [17]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, log_loss
from sklearn.metrics import f1_score
from tensorflow.keras.utils import to_categorical

X_train, X_val, y_train, y_val = train_test_split(train_x, train_y, 
                                                    stratify=train_y, test_size=0.25,
                                                    random_state = 42)

print("Train set: ")
print(X_train.shape)
print(y_train.shape)
print("===========")
print("Validation set: ")
print(X_val.shape)
print(y_val.shape)


clf=XGBClassifier()
clf.fit(X_train, y_train)
y_pred=clf.predict_proba(X_val)

print(f"log_loss: {log_loss(to_categorical(y_val['credit']), y_pred)}")

Train set: 
(19842, 18)
(19842, 1)
Validation set: 
(6615, 18)
(6615, 1)
log_loss: 0.874686371140289


In [18]:
xgb_wrapper = XGBClassifier(n_estimators=400, learning_rate=0.1, max_depth=3)  
xgb_wrapper.fit(X_train, y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.1, max_delta_step=0,
              max_depth=3, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=400, n_jobs=2,
              num_parallel_tree=1, objective='multi:softprob', predictor='auto',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [19]:
w_preds = xgb_wrapper.predict(X_val)
w_pred_proba = xgb_wrapper.predict_proba(X_val)[:, 1]

In [20]:
from sklearn.metrics import accuracy_score # 정확도
from sklearn.metrics import precision_score, recall_score # 정밀도, 재현률
from sklearn.metrics import f1_score

def get_clf_eval(y_test, pred=None, pred_proba =None):
    accuracy = accuracy_score(y_val, pred)
    precision = precision_score(y_val, pred)
    recall = recall_score(y_val, pred)
    f1 = f1_score(y_val, pred)
    print('정확도: {0: .4f}, 정밀도:{1: .4f}, 재현율: {2: .4f}, F1:{3: .4f}'.format(accuracy, precision, recall, f1))

In [21]:
# 조기 종료(early stopping)을 제공
# verbose 학습의 결과값들을 logloss값을 출력해준다
# loss 값이 작아질수록 최적의 값을 찾아간다 어느순간에 그 값들이 변동이 없다 
# 100개까지 손실함수를 계산하면서 로스가 크게 변화가 없으면 더이상 학습하지않고 멈춰서 학습을 종료한다.
xgb_wrapper = XGBClassifier(n_estimators=1000, learning_rate=0.002, max_depth=7)  
evals = [(X_val, y_val)]
xgb_wrapper.fit(X_train, y_train, early_stopping_rounds=100, eval_set=evals, verbose=True)


ws100_preds = xgb_wrapper.predict(X_val)
ws100_pred_proba = xgb_wrapper.predict_proba(X_val)[:, 1]

[0]	validation_0-mlogloss:1.09787
[1]	validation_0-mlogloss:1.09713
[2]	validation_0-mlogloss:1.09639
[3]	validation_0-mlogloss:1.09566
[4]	validation_0-mlogloss:1.09493
[5]	validation_0-mlogloss:1.09420
[6]	validation_0-mlogloss:1.09347
[7]	validation_0-mlogloss:1.09275
[8]	validation_0-mlogloss:1.09203
[9]	validation_0-mlogloss:1.09130
[10]	validation_0-mlogloss:1.09059
[11]	validation_0-mlogloss:1.08987
[12]	validation_0-mlogloss:1.08916
[13]	validation_0-mlogloss:1.08845
[14]	validation_0-mlogloss:1.08775
[15]	validation_0-mlogloss:1.08704
[16]	validation_0-mlogloss:1.08634
[17]	validation_0-mlogloss:1.08563
[18]	validation_0-mlogloss:1.08494
[19]	validation_0-mlogloss:1.08424
[20]	validation_0-mlogloss:1.08355
[21]	validation_0-mlogloss:1.08286
[22]	validation_0-mlogloss:1.08217
[23]	validation_0-mlogloss:1.08148
[24]	validation_0-mlogloss:1.08080
[25]	validation_0-mlogloss:1.08012
[26]	validation_0-mlogloss:1.07944
[27]	validation_0-mlogloss:1.07876
[28]	validation_0-mlogloss:1.0

[224]	validation_0-mlogloss:0.98030
[225]	validation_0-mlogloss:0.97994
[226]	validation_0-mlogloss:0.97958
[227]	validation_0-mlogloss:0.97923
[228]	validation_0-mlogloss:0.97887
[229]	validation_0-mlogloss:0.97852
[230]	validation_0-mlogloss:0.97816
[231]	validation_0-mlogloss:0.97781
[232]	validation_0-mlogloss:0.97746
[233]	validation_0-mlogloss:0.97711
[234]	validation_0-mlogloss:0.97676
[235]	validation_0-mlogloss:0.97641
[236]	validation_0-mlogloss:0.97606
[237]	validation_0-mlogloss:0.97572
[238]	validation_0-mlogloss:0.97537
[239]	validation_0-mlogloss:0.97503
[240]	validation_0-mlogloss:0.97469
[241]	validation_0-mlogloss:0.97434
[242]	validation_0-mlogloss:0.97401
[243]	validation_0-mlogloss:0.97366
[244]	validation_0-mlogloss:0.97332
[245]	validation_0-mlogloss:0.97299
[246]	validation_0-mlogloss:0.97265
[247]	validation_0-mlogloss:0.97232
[248]	validation_0-mlogloss:0.97198
[249]	validation_0-mlogloss:0.97165
[250]	validation_0-mlogloss:0.97132
[251]	validation_0-mlogloss:

[452]	validation_0-mlogloss:0.92087
[453]	validation_0-mlogloss:0.92068
[454]	validation_0-mlogloss:0.92050
[455]	validation_0-mlogloss:0.92032
[456]	validation_0-mlogloss:0.92014
[457]	validation_0-mlogloss:0.91997
[458]	validation_0-mlogloss:0.91979
[459]	validation_0-mlogloss:0.91961
[460]	validation_0-mlogloss:0.91943
[461]	validation_0-mlogloss:0.91926
[462]	validation_0-mlogloss:0.91908
[463]	validation_0-mlogloss:0.91890
[464]	validation_0-mlogloss:0.91873
[465]	validation_0-mlogloss:0.91855
[466]	validation_0-mlogloss:0.91838
[467]	validation_0-mlogloss:0.91820
[468]	validation_0-mlogloss:0.91803
[469]	validation_0-mlogloss:0.91786
[470]	validation_0-mlogloss:0.91768
[471]	validation_0-mlogloss:0.91751
[472]	validation_0-mlogloss:0.91734
[473]	validation_0-mlogloss:0.91717
[474]	validation_0-mlogloss:0.91700
[475]	validation_0-mlogloss:0.91683
[476]	validation_0-mlogloss:0.91665
[477]	validation_0-mlogloss:0.91649
[478]	validation_0-mlogloss:0.91632
[479]	validation_0-mlogloss:

[680]	validation_0-mlogloss:0.89081
[681]	validation_0-mlogloss:0.89072
[682]	validation_0-mlogloss:0.89062
[683]	validation_0-mlogloss:0.89053
[684]	validation_0-mlogloss:0.89044
[685]	validation_0-mlogloss:0.89035
[686]	validation_0-mlogloss:0.89026
[687]	validation_0-mlogloss:0.89017
[688]	validation_0-mlogloss:0.89008
[689]	validation_0-mlogloss:0.88999
[690]	validation_0-mlogloss:0.88989
[691]	validation_0-mlogloss:0.88981
[692]	validation_0-mlogloss:0.88972
[693]	validation_0-mlogloss:0.88963
[694]	validation_0-mlogloss:0.88954
[695]	validation_0-mlogloss:0.88945
[696]	validation_0-mlogloss:0.88937
[697]	validation_0-mlogloss:0.88928
[698]	validation_0-mlogloss:0.88919
[699]	validation_0-mlogloss:0.88911
[700]	validation_0-mlogloss:0.88902
[701]	validation_0-mlogloss:0.88893
[702]	validation_0-mlogloss:0.88885
[703]	validation_0-mlogloss:0.88876
[704]	validation_0-mlogloss:0.88868
[705]	validation_0-mlogloss:0.88859
[706]	validation_0-mlogloss:0.88851
[707]	validation_0-mlogloss:

[908]	validation_0-mlogloss:0.87563
[909]	validation_0-mlogloss:0.87558
[910]	validation_0-mlogloss:0.87553
[911]	validation_0-mlogloss:0.87549
[912]	validation_0-mlogloss:0.87544
[913]	validation_0-mlogloss:0.87539
[914]	validation_0-mlogloss:0.87535
[915]	validation_0-mlogloss:0.87530
[916]	validation_0-mlogloss:0.87526
[917]	validation_0-mlogloss:0.87521
[918]	validation_0-mlogloss:0.87517
[919]	validation_0-mlogloss:0.87512
[920]	validation_0-mlogloss:0.87508
[921]	validation_0-mlogloss:0.87503
[922]	validation_0-mlogloss:0.87498
[923]	validation_0-mlogloss:0.87494
[924]	validation_0-mlogloss:0.87489
[925]	validation_0-mlogloss:0.87485
[926]	validation_0-mlogloss:0.87480
[927]	validation_0-mlogloss:0.87476
[928]	validation_0-mlogloss:0.87472
[929]	validation_0-mlogloss:0.87467
[930]	validation_0-mlogloss:0.87463
[931]	validation_0-mlogloss:0.87459
[932]	validation_0-mlogloss:0.87454
[933]	validation_0-mlogloss:0.87450
[934]	validation_0-mlogloss:0.87446
[935]	validation_0-mlogloss:

In [22]:
os.chdir('../baseline')

In [23]:
submission.to_csv('submission.csv', index=False)