# 신용카드 사용자 연체 예측 (1.Data Engineering)

In [3]:
#!pip install vecstack

# 결측치 채우기 (occyp feature)

In [22]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn import model_selection, linear_model
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve, auc
from sklearn import ensemble
from vecstack import StackingTransformer
from sklearn.svm import SVC, SVR
from sklearn import ensemble
from sklearn import cluster
from sklearn.metrics import silhouette_score
from sklearn.metrics import classification_report

from xgboost import XGBClassifier, plot_importance
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier 
from vecstack import stacking

import matplotlib.pyplot as plt
plt.rc("font", family="Malgun Gothic")
plt.rc("axes", unicode_minus = False)

In [23]:
# Load the training set, use its 'index' column as the row index and
# discard the 'FLAG_MOBIL' column.
train_data = (
    pd.read_csv('train.csv')
    .set_index('index')
    .drop(columns='FLAG_MOBIL')
)

In [24]:
# Rows with a positive DAYS_EMPLOYED are labelled 'Unemployed'; whatever is
# still missing afterwards is set aside as the rows to impute.
is_unemployed = train_data['DAYS_EMPLOYED'].gt(0)
train_data.loc[is_unemployed, 'occyp_type'] = 'Unemployed'
occyp_type_null = train_data.loc[train_data['occyp_type'].isna()]

In [37]:
# Occupation names listed in code order: the position in this list is the
# integer code used as the classification target.
occyp_type_names = [
    'Laborers', 'Core staff', 'Sales staff', 'Managers',
    'Drivers', 'High skill tech staff', 'Accountants',
    'Medicine staff', 'Cooking staff', 'Security staff',
    'Cleaning staff', 'Private service staff', 'Low-skill Laborers',
    'Waiters/barmen staff', 'Secretaries', 'Realty agents',
    'HR staff', 'IT staff', 'Unemployed',
]
occyp_type_dict = {name: code for code, name in enumerate(occyp_type_names)}

# Replace occupation names by their codes (missing values stay missing).
train_data['occyp_type'] = train_data['occyp_type'].map(occyp_type_dict)

In [38]:
# Features of the rows to impute: the target itself and 'credit' are excluded.
null_x = occyp_type_null.drop(columns=['occyp_type', 'credit'])
# Their (missing) occupation column, displayed as a sanity check.
null_y = occyp_type_null.loc[:, 'occyp_type']
null_y

index
0        NaN
8        NaN
19       NaN
20       NaN
23       NaN
        ... 
26433    NaN
26435    NaN
26437    NaN
26449    NaN
26453    NaN
Name: occyp_type, Length: 3733, dtype: object

In [39]:
# Labelled rows: everything whose occupation code is not missing.
occyp_type_not_null = train_data.loc[~train_data['occyp_type'].isna()]

### 하이퍼파라미터 튜닝

In [40]:
from xgboost import XGBClassifier, plot_importance
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier 
from vecstack import stacking
import hyperopt
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [41]:
# Features / target for the occupation classifier (labelled rows only).
x = occyp_type_not_null.drop(['occyp_type', 'credit'], axis=1)
y = occyp_type_not_null['occyp_type']

# 70/30 hold-out split; the hold-out part scores the hyperparameter search.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    x, y, test_size=0.3, random_state=0)

# Continuous columns are standardised.
numeric_features = ['income_total', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'begin_month']
numeric_transformer = StandardScaler()

# String / flag columns are one-hot encoded.
categorical_features = ['gender', 'car', 'reality', 'income_type', 'edu_type',
                        'family_type', 'house_type', 'work_phone', 'phone', 'email']
# handle_unknown='ignore': the encoder is fitted on x_train only, so a category
# that is absent from that split must not make transform() raise on x_test,
# x or null_x (same setting as the final preprocessing cell of this notebook).
categorical_transformer = OneHotEncoder(categories='auto', handle_unknown='ignore')

# Count columns are forwarded unchanged.
pass_through = ['child_num', 'family_size']

preprocessor = ColumnTransformer(
    transformers=[('num', numeric_transformer, numeric_features),
                  ('cat', categorical_transformer, categorical_features),
                  ('passthrough', 'passthrough', pass_through)])

preprocessor_pipe = Pipeline(steps=[('preprocessor', preprocessor)])

# Fit on the training split only, then apply the same transform everywhere.
preprocessor_pipe.fit(x_train)

x_train_transformed = preprocessor_pipe.transform(x_train)
x_test_transformed = preprocessor_pipe.transform(x_test)
x_transformed = preprocessor_pipe.transform(x)
null_x_transformed = preprocessor_pipe.transform(null_x)

In [42]:
# Peek at two of the rows to impute (raw values, before preprocessing).
null_x.head(2)

Unnamed: 0_level_0,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,work_phone,phone,email,family_size,begin_month
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,F,N,N,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,-13899,-4709,0,0,0,2.0,-6.0
8,M,Y,Y,1,180000.0,Commercial associate,Higher education,Married,House / apartment,-15131,-1466,0,0,1,3.0,-38.0


In [43]:
# Candidate values shared by the reg_alpha and reg_lambda penalties.
# hp.choice reports the *index* of the selected candidate in fmin's result,
# which is why the lookup list is kept as a named variable.
reg_candidate = [1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1, 5, 10, 100]

# Hyperopt search space consumed by hyperparameter_tuning().
# Integer-valued entries are sampled with quniform and cast with int() by
# the consumer.
space = {
    'max_depth': hp.quniform("max_depth", 3, 8, 1),
    # NOTE(review): the range 0.01-0.011 with step 0.001 leaves only two
    # possible learning rates - confirm this is intended.
    'learning_rate': hp.quniform ('learning_rate', 0.01, 0.011, 0.001),
    'reg_alpha': hp.choice('reg_alpha', reg_candidate),
    'reg_lambda': hp.choice('reg_lambda', reg_candidate),
    'subsample': hp.quniform('subsample', 0.6, 1, 0.05),
    'colsample_bytree': hp.quniform('colsample_bytree', 0.6, 1, 0.05),
    'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1),
    'n_estimators': hp.quniform('n_estimators', 1000, 3000, 300),
}

def hyperparameter_tuning(space):
    """Train one XGBoost candidate and report its hold-out error to hyperopt.

    Parameters
    ----------
    space : dict
        One point sampled from the search space. 'n_estimators', 'max_depth'
        and 'min_child_weight' are cast to int before use.

    Returns
    -------
    dict
        'loss' (1 - accuracy on the hold-out split, the value fmin minimises),
        'status' (STATUS_OK) and 'model' (the fitted classifier).
    """
    model = XGBClassifier(n_estimators=int(space['n_estimators']),
                          max_depth=int(space['max_depth']),
                          learning_rate=space['learning_rate'],
                          reg_alpha=space['reg_alpha'],
                          reg_lambda=space['reg_lambda'],
                          subsample=space['subsample'],
                          colsample_bytree=space['colsample_bytree'],
                          min_child_weight=int(space['min_child_weight']),
                          random_state=42,
                          # Set on the estimator: passing eval_metric to fit()
                          # is deprecated since XGBoost 1.6 and no longer
                          # accepted by later releases.
                          eval_metric='auc')

    # Both splits are monitored during boosting; no early stopping is used.
    evaluation = [(x_train_transformed, y_train), (x_test_transformed, y_test)]

    model.fit(x_train_transformed, y_train,
              eval_set=evaluation,
              verbose=0)

    # scikit-learn convention: accuracy_score(y_true, y_pred).
    accuracy = accuracy_score(y_test, model.predict(x_test_transformed))
    # hyperopt minimises 'loss', so report the error rate.
    return {'loss': 1 - accuracy, 'status': STATUS_OK, 'model': model}

In [44]:
# Trials keeps the record of every evaluated candidate.
trials = Trials()

# Run 20 TPE evaluations; fmin returns the raw sample of the best one.
best = fmin(
    fn=hyperparameter_tuning,
    space=space,
    algo=tpe.suggest,
    max_evals=20,
    trials=trials,
)

# Integer-valued parameters come back as floats: cast them.
for int_param in ('max_depth', 'min_child_weight', 'n_estimators'):
    best[int_param] = int(best[int_param])

# hp.choice parameters come back as an index into reg_candidate: look the
# actual value up.
for choice_param in ('reg_alpha', 'reg_lambda'):
    best[choice_param] = reg_candidate[int(best[choice_param])]

print(best)

100%|█████| 20/20 [2:38:52<00:00, 476.60s/trial, best loss: 0.14447051921384568]
{'colsample_bytree': 0.7000000000000001, 'learning_rate': 0.011, 'max_depth': 8, 'min_child_weight': 3, 'n_estimators': 2400, 'reg_alpha': 0.01, 'reg_lambda': 0.01, 'subsample': 0.6000000000000001}


In [45]:
# Inspect the result dict (loss, status, fitted model) of the best trial.
trials.best_trial['result']

{'loss': 0.14447051921384568,
 'status': 'ok',
 'model': XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
               colsample_bylevel=1, colsample_bynode=1,
               colsample_bytree=0.7000000000000001, early_stopping_rounds=None,
               enable_categorical=False, eval_metric=None, gamma=0, gpu_id=-1,
               grow_policy='depthwise', importance_type=None,
               interaction_constraints='', learning_rate=0.011, max_bin=256,
               max_cat_to_onehot=4, max_delta_step=0, max_depth=8, max_leaves=0,
               min_child_weight=3, missing=nan, monotone_constraints='()',
               n_estimators=2400, n_jobs=0, num_parallel_tree=1,
               objective='multi:softprob', predictor='auto', random_state=42,
               reg_alpha=0.01, ...)}

In [46]:
# Final occupation classifier, built from an explicit parameter set.
# NOTE(review): min_child_weight, n_estimators, reg_alpha, reg_lambda and
# subsample differ from the `best` values printed by the search cell in this
# notebook - confirm they come from the intended tuning run.
final_params = dict(
    base_score=0.5,
    booster='gbtree',
    colsample_bylevel=1,
    colsample_bynode=1,
    colsample_bytree=0.7000000000000001,
    enable_categorical=False,
    gamma=0,
    gpu_id=-1,
    importance_type=None,
    interaction_constraints='',
    learning_rate=0.011,
    max_delta_step=0,
    max_depth=8,
    min_child_weight=5,
    monotone_constraints='()',
    n_estimators=2100,
    n_jobs=8,
    num_parallel_tree=1,
    objective='multi:softprob',
    predictor='auto',
    random_state=42,
    reg_alpha=0.0001,
    reg_lambda=0.0001,
    scale_pos_weight=None,
    subsample=0.65,
    tree_method='exact',
    validate_parameters=1,
    verbosity=None,
)
model = XGBClassifier(**final_params)

In [None]:
# Refit the chosen model on every labelled row (train and hold-out together).
model.fit(x_transformed, y)

In [None]:
# Predict an occupation code for each row whose occupation is missing.
y_predict = model.predict(null_x_transformed)

In [None]:
# Fill the missing occupations with the model's predictions and store the
# column as occupation names again.
train_data2 = train_data.copy()

# Rows still missing an occupation. They are in the same row order as null_x,
# and therefore as y_predict; a length mismatch raises instead of silently
# misaligning the predictions.
missing_occyp = train_data2['occyp_type'].isna()
train_data2.loc[missing_occyp, 'occyp_type'] = y_predict

# Decode the integer codes back to names: the Data Engineering section reads
# this file and maps occupation *names* to codes with occyp_type_dict, so
# writing raw codes would turn the whole column into NaN there.
code_to_occyp = {code: name for name, code in occyp_type_dict.items()}
train_data2['occyp_type'] = train_data2['occyp_type'].astype(int).map(code_to_occyp)

In [None]:
# Persist the imputed frame; its 'index' row index is written as the first CSV column.
train_data2.to_csv('occup_filled.csv')

# Data Engineering

In [29]:
# Reload the imputed data set written by the imputation section.
train_occ = pd.read_csv('occup_filled.csv')

In [30]:
# Preview the first rows of the imputed data.
train_occ.head()

Unnamed: 0,index,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,work_phone,phone,email,occyp_type,family_size,begin_month,credit
0,0,F,N,N,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,-13899,-4709,0,0,0,Laborers,2,-6,1
1,1,F,N,Y,1,247500.0,Commercial associate,Secondary / secondary special,Civil marriage,House / apartment,-11380,-1540,0,0,1,Laborers,3,-5,1
2,2,M,Y,Y,0,450000.0,Working,Higher education,Married,House / apartment,-19087,-4434,0,1,0,Managers,2,-22,2
3,3,F,N,Y,0,202500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,-15088,-2092,0,1,0,Sales staff,2,-37,0
4,4,F,Y,Y,0,157500.0,State servant,Higher education,Married,House / apartment,-15037,-2105,0,0,0,Managers,2,-26,2


In [31]:
# Check dtypes and non-null counts of the loaded frame.
train_occ.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26457 entries, 0 to 26456
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   index          26457 non-null  int64  
 1   gender         26457 non-null  object 
 2   car            26457 non-null  object 
 3   reality        26457 non-null  object 
 4   child_num      26457 non-null  int64  
 5   income_total   26457 non-null  float64
 6   income_type    26457 non-null  object 
 7   edu_type       26457 non-null  object 
 8   family_type    26457 non-null  object 
 9   house_type     26457 non-null  object 
 10  DAYS_BIRTH     26457 non-null  int64  
 11  DAYS_EMPLOYED  26457 non-null  int64  
 12  work_phone     26457 non-null  int64  
 13  phone          26457 non-null  int64  
 14  email          26457 non-null  int64  
 15  occyp_type     26457 non-null  object 
 16  family_size    26457 non-null  int64  
 17  begin_month    26457 non-null  int64  
 18  credit

In [32]:
# The former row index was read back as a regular column; remove it.
train_occ = train_occ.drop(columns=['index'])

In [33]:
# A positive DAYS_EMPLOYED marks the row as 'Unemployed'.
train_occ.loc[train_occ['DAYS_EMPLOYED'].gt(0), 'occyp_type'] = 'Unemployed'

In [34]:
# Integer codes for every categorical feature (label -> code).
gender_dict = {'F': 0, 'M': 1}
car_dict = {'N': 0, 'Y': 1}
realty_dict = {'N': 0, 'Y': 1}
income_type_dict = {'Student': 0,
                    'State servant': 1,
                    'Pensioner': 2,
                    'Commercial associate': 3,
                    'Working': 4}
edu_type_dict = {'Secondary / secondary special': 0,
                 'Higher education': 1,
                 'Incomplete higher': 2,
                 'Lower secondary': 3,
                 'Academic degree': 4}
family_type_dict = {'Married': 0,
                    'Single / not married': 1,
                    'Civil marriage': 2,
                    'Separated': 3,
                    'Widow': 4}
house_type_dict = {'House / apartment': 0,
                   'With parents': 1,
                   'Municipal apartment': 2,
                   'Rented apartment': 3,
                   'Office apartment': 4,
                   'Co-op apartment': 5}
occyp_type_dict = {'Laborers': 0, 'Core staff': 1, 'Sales staff': 2, 'Managers': 3,
                   'Drivers': 4, 'High skill tech staff': 5, 'Accountants': 6,
                   'Medicine staff': 7, 'Cooking staff': 8, 'Security staff': 9,
                   'Cleaning staff': 10, 'Private service staff': 11, 'Low-skill Laborers': 12,
                   'Waiters/barmen staff': 13, 'Secretaries': 14, 'Realty agents': 15,
                   'HR staff': 16, 'IT staff': 17, 'Unemployed': 18}

# Encode the categorical features (str -> int), one mapping per column.
categorical_maps = {
    'gender': gender_dict,
    'car': car_dict,
    'reality': realty_dict,
    'income_type': income_type_dict,
    'edu_type': edu_type_dict,
    'family_type': family_type_dict,
    'house_type': house_type_dict,
    'occyp_type': occyp_type_dict,
}
for column, mapping in categorical_maps.items():
    train_occ[column] = train_occ[column].map(mapping)

# Numerical features.
# income_total is left as is here; it is rescaled by the scaler in the
# preprocessing step.

# NOTE(review): 365.5 is kept from the original conversion; a mean calendar
# year is 365.25 days - confirm which divisor is intended.
DAYS_PER_YEAR = 365.5

# DAYS_BIRTH: days -> age in whole years.
train_occ['Age'] = (
    train_occ['DAYS_BIRTH'].abs().div(DAYS_PER_YEAR).round(0).astype(np.int32)
)

# DAYS_EMPLOYED: days -> years worked. A positive value is this notebook's
# marker for 'Unemployed', so it is clipped to 0 first; abs() alone would turn
# the marker into a positive number of worked years.
train_occ['worked_year'] = (
    train_occ['DAYS_EMPLOYED'].clip(upper=0).abs()
    .div(DAYS_PER_YEAR).round(0).astype(np.int32)
)

# Drop the raw columns that have been converted.
train_occ = train_occ.drop(['DAYS_BIRTH', 'DAYS_EMPLOYED'], axis=1)

In [35]:
# Preview the first rows after encoding.
train_occ.head()

Unnamed: 0,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,work_phone,phone,email,occyp_type,family_size,begin_month,credit,Age,worked_year
0,0,0,0,0,202500.0,3,1,0,2,0,0,0,0,2,-6,1,38,13
1,0,0,1,1,247500.0,3,0,2,0,0,0,1,0,3,-5,1,31,4
2,1,1,1,0,450000.0,4,1,0,0,0,1,0,3,2,-22,2,52,12
3,0,0,1,0,202500.0,3,0,0,0,0,1,0,2,2,-37,0,41,6
4,0,1,1,0,157500.0,1,1,0,0,0,0,0,3,2,-26,2,41,6


In [36]:
# List the columns that remain after encoding.
train_occ.columns

Index(['gender', 'car', 'reality', 'child_num', 'income_total', 'income_type',
       'edu_type', 'family_type', 'house_type', 'work_phone', 'phone', 'email',
       'occyp_type', 'family_size', 'begin_month', 'credit', 'Age',
       'worked_year'],
      dtype='object')

In [37]:
# Re-check dtypes and non-null counts after encoding.
train_occ.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26457 entries, 0 to 26456
Data columns (total 18 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   gender        26457 non-null  int64  
 1   car           26457 non-null  int64  
 2   reality       26457 non-null  int64  
 3   child_num     26457 non-null  int64  
 4   income_total  26457 non-null  float64
 5   income_type   26457 non-null  int64  
 6   edu_type      26457 non-null  int64  
 7   family_type   26457 non-null  int64  
 8   house_type    26457 non-null  int64  
 9   work_phone    26457 non-null  int64  
 10  phone         26457 non-null  int64  
 11  email         26457 non-null  int64  
 12  occyp_type    26457 non-null  int64  
 13  family_size   26457 non-null  int64  
 14  begin_month   26457 non-null  int64  
 15  credit        26457 non-null  int64  
 16  Age           26457 non-null  int32  
 17  worked_year   26457 non-null  int32  
dtypes: float64(1), int32(2), i

In [38]:
# Count missing values per column.
train_occ.isnull().sum()

gender          0
car             0
reality         0
child_num       0
income_total    0
income_type     0
edu_type        0
family_type     0
house_type      0
work_phone      0
phone           0
email           0
occyp_type      0
family_size     0
begin_month     0
credit          0
Age             0
worked_year     0
dtype: int64

In [39]:
# Save the encoded frame; the row index is written as well and is read back
# as the 'Unnamed: 0' column that the Preprocessing section drops.
train_occ.to_csv('preprocessing.csv', mode='w')

# Preprocessing

In [None]:
# Reload the encoded data set saved by the Data Engineering section.
df = pd.read_csv('preprocessing.csv')

In [None]:
# Remove the saved row-index column, then separate features from the target.
df = df.drop(columns=['Unnamed: 0'])
df_x = df.drop(columns=['credit'])
# Kept as a one-column DataFrame.
df_y = df.loc[:, ['credit']]

In [None]:
# 70/30 train/test split with a fixed seed.
train_x, test_x, train_y, test_y = model_selection.train_test_split(
    df_x,
    df_y,
    test_size=0.3,
    random_state=0,
)

In [None]:
# Preprocessing: group the features by how they are transformed.

# Integer-coded categorical columns are one-hot encoded; categories not seen
# during fit are ignored at transform time instead of raising.
categorical_features = ['gender', 'car', 'reality', 'income_type',
                        'edu_type', 'family_type', 'house_type',
                        'work_phone', 'phone', 'email', 'occyp_type']
categorical_transformer = OneHotEncoder(categories='auto', handle_unknown='ignore')

# Continuous columns are rescaled with a min-max scaler.
numerical_features = ['income_total', 'begin_month', 'Age', 'worked_year']
numerical_transformer = MinMaxScaler()

# Count columns are forwarded unchanged.
pass_through = ['child_num', 'family_size']

preprocessor = ColumnTransformer(
    transformers=[('cat', categorical_transformer, categorical_features),
                  ('num', numerical_transformer, numerical_features),
                  ('passthrough', 'passthrough', pass_through)])

# Fit on the training split only, then apply the same transform to both splits.
preprocessor_pipe = Pipeline(steps=[('preprocessor', preprocessor)])
preprocessor_pipe.fit(train_x)

train_x_transformed = preprocessor_pipe.transform(train_x)
test_x_transformed = preprocessor_pipe.transform(test_x)