In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
%pip install scikeras

Collecting scikeras
  Downloading scikeras-0.13.0-py3-none-any.whl.metadata (3.1 kB)
Downloading scikeras-0.13.0-py3-none-any.whl (26 kB)
Installing collected packages: scikeras
Successfully installed scikeras-0.13.0


In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# 전처리를 위한 라이브러리
from sklearn.preprocessing import OrdinalEncoder , StandardScaler
from sklearn.model_selection import train_test_split

# 모델 제작을 위한 라이브러리
import tensorflow as tf
from tensorflow.keras import layers , regularizers, callbacks
from sklearn.inspection import permutation_importance
from scikeras.wrappers import KerasClassifier

In [3]:
# 데이터 입력
tr_dat = pd.read_csv('/content/drive/MyDrive/Kaggle/data/train.csv')
te_dat = pd.read_csv('/content/drive/MyDrive/Kaggle/data/test.csv')
testid = te_dat['id']

# 추가 전처리: 데이터 column 수정.
tr_dat['cb_person_cred_hist_length'] = tr_dat['cb_person_cred_hist_length'].replace(0, np.nan)  # 분모의 0을 NaN으로 대체
tr_dat['emp_per_cred'] = tr_dat['person_emp_length'] / tr_dat['cb_person_cred_hist_length']
te_dat['cb_person_cred_hist_length'] = te_dat['cb_person_cred_hist_length'].replace(0, np.nan)  # 분모의 0을 NaN으로 대체
te_dat['emp_per_cred'] = te_dat['person_emp_length'] / te_dat['cb_person_cred_hist_length']

tr_dat = tr_dat.drop('id', axis = 1)
target_values = tr_dat['loan_status']
tr_dat = tr_dat.drop('loan_status', axis = 1)
te_dat = te_dat.drop('id', axis = 1)

# 전처리1: Encoding
def cate_dat(df , encoder=OrdinalEncoder()):
    cols = df.select_dtypes(exclude=[np.number])
    for i in cols:
        df[i] = encoder.fit_transform(df[[i]])
    return df
n_traindat = cate_dat(tr_dat)
ntestdata = cate_dat(te_dat)

# 전처리2: 입력값 정규화
xtr_dat = StandardScaler().fit_transform(n_traindat)
xtest = StandardScaler().fit_transform(ntestdata)


# 전처리3: split
xtrain , xvalid , ytrain , yvalid = train_test_split(xtr_dat , target_values ,train_size=0.7 ,test_size=0.3 , random_state=99)

In [17]:
# 모델 생성: Regularization, Batchnormalization, EarlyStop
tf.random.set_seed(1234)

def create_model(input_shape):
    model = tf.keras.Sequential([
      tf.keras.Input(shape = input_shape),                  ## Input layer with no, of features as units(neurons)
      layers.Dense(12  , activation='relu' , kernel_regularizer=regularizers.l1(0.01)),   ## 1 Hidden Layer
      layers.BatchNormalization(), ## Not a layer just to normalize the neural network
      layers.Dense(1 , activation='sigmoid')   ## Output layer
    ])
    model.compile(optimizer=tf.keras.optimizers.Adam(), loss=tf.keras.losses.BinaryCrossentropy(from_logits=False) ,  metrics=['auc'])

    return model

In [18]:
# KerasClassifier로 래핑
nn = KerasClassifier(model= create_model, model__input_shape = (12, ))

earlystop1 = callbacks.EarlyStopping(monitor='val_loss', mode='auto')
earlystop2 = callbacks.EarlyStopping(monitor='val_auc' , mode='auto')

# 학습된 모델
train_nn = nn.fit(xtrain , ytrain , validation_data=(xvalid , yvalid) ,epochs=10 , callbacks=[earlystop1 , earlystop2])

Epoch 1/10
[1m1283/1283[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 4ms/step - auc: 0.7472 - loss: 0.6929 - val_auc: 0.8977 - val_loss: 0.2935
Epoch 2/10
[1m1283/1283[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - auc: 0.8995 - loss: 0.2687 - val_auc: 0.9020 - val_loss: 0.2639
Epoch 3/10
[1m1283/1283[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - auc: 0.9055 - loss: 0.2435 - val_auc: 0.9039 - val_loss: 0.2564
Epoch 4/10
[1m1283/1283[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 6ms/step - auc: 0.9089 - loss: 0.2360 - val_auc: 0.9048 - val_loss: 0.2535
Epoch 5/10
[1m1283/1283[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 4ms/step - auc: 0.9099 - loss: 0.2339 - val_auc: 0.9058 - val_loss: 0.2502
Epoch 6/10
[1m1283/1283[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 5ms/step - auc: 0.9100 - loss: 0.2326 - val_auc: 0.9053 - val_loss: 0.2505
Epoch 7/10
[1m1283/1283[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[

In [12]:
# Permutation 중요도 계산
result = permutation_importance(nn, xvalid, yvalid , scoring = 'roc_auc', n_repeats=10, random_state=42)

# 중요도 데이터프레임 생성
importances = result.importances_mean
feature_names = te_dat.columns
perm_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
perm_df.sort_values(by='Importance', ascending=False, inplace=True)

# 상위 feature 추출
top = 5
top_features = perm_df['Feature'].head(top).tolist()


[1m550/550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
[1m550/550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step
[1m550/550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
[1m550/550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
[1m550/550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
[1m550/550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
[1m550/550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
[1m550/550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
[1m550/550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
[1m550/550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
[1m550/550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
[1m550/550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
[1m550/550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
[1m550/550[0m [32m━━━━

In [13]:
print(top_features)

['loan_grade', 'loan_percent_income', 'person_home_ownership', 'person_income', 'loan_int_rate']


In [14]:
# 모델 재학습

# 데이터셋 재 구성
tr_dat_sel = tr_dat[top_features]
te_dat_sel = te_dat[top_features]

# 전처리1: Encoding
n_traindat_sel = cate_dat(tr_dat_sel)
ntestdata_sel = cate_dat(te_dat_sel)

# 전처리2: 입력값 정규화
xtr_dat_sel = StandardScaler().fit_transform(n_traindat_sel)
xtest_sel = StandardScaler().fit_transform(ntestdata_sel)

# 전처리3: split
xtrain_s , xvalid_s , ytrain_s , yvalid_s = train_test_split(xtr_dat_sel , target_values ,train_size=0.7 ,test_size=0.3 , random_state=99)


In [15]:
print(tr_dat_sel)

       loan_grade  loan_percent_income  person_home_ownership  person_income  \
0             1.0                 0.17                    3.0          35000   
1             2.0                 0.07                    2.0          56000   
2             0.0                 0.21                    2.0          28800   
3             1.0                 0.17                    3.0          70000   
4             0.0                 0.10                    3.0          60000   
...           ...                  ...                    ...            ...   
58640         3.0                 0.21                    0.0         120000   
58641         2.0                 0.35                    3.0          28800   
58642         3.0                 0.15                    3.0          44000   
58643         0.0                 0.17                    3.0          30000   
58644         1.0                 0.20                    0.0          75000   

       loan_int_rate  
0              1

In [20]:
# 새로운 모델 학습
# 모델 생성: Regularization, Batchnormalization, EarlyStop
tf.random.set_seed(1234)

# KerasClassifier로 래핑
nn_sel = KerasClassifier(model= create_model, model__input_shape = (5, ))

earlystop1 = callbacks.EarlyStopping(monitor='val_loss', mode='auto')
earlystop2 = callbacks.EarlyStopping(monitor='val_auc' , mode='auto')

# 학습된 모델
train_nn_sel = nn_sel.fit(xtrain_s , ytrain_s , validation_data=(xvalid_s , yvalid_s) ,epochs=10 , callbacks=[earlystop1 , earlystop2])

Epoch 1/10
[1m1283/1283[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - auc: 0.7116 - loss: 0.5687 - val_auc: 0.9029 - val_loss: 0.2723
Epoch 2/10
[1m1283/1283[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step - auc: 0.9065 - loss: 0.2507 - val_auc: 0.9083 - val_loss: 0.2490
Epoch 3/10
[1m1283/1283[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - auc: 0.9095 - loss: 0.2372 - val_auc: 0.9096 - val_loss: 0.2432
Epoch 4/10
[1m1283/1283[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - auc: 0.9102 - loss: 0.2317 - val_auc: 0.9106 - val_loss: 0.2393
Epoch 5/10
[1m1283/1283[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - auc: 0.9105 - loss: 0.2294 - val_auc: 0.9110 - val_loss: 0.2381
Epoch 6/10
[1m1283/1283[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - auc: 0.9112 - loss: 0.2277 - val_auc: 0.9119 - val_loss: 0.2374
Epoch 7/10
[1m1283/1283[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[

In [49]:
# KerasClassifier의 Predict 체크
pred = nn.predict_proba(xtest)
pred = pred[:, 1].reshape(-1, 1)
pred = pred.reshape(pred.shape[1], pred.shape[0]).flatten()
print(pred)
print(pred.shape)

[1m1222/1222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step
[0.99994695 0.02093694 0.63012    ... 0.03794194 0.2843893  0.999583  ]
(39098,)


In [50]:
# Predictions of Neural Networks
# (n, 1)의 데이터 형식을 (1, n)으로 변경
def classifier_predict(neural_net , testdf):
    pred = neural_net.predict_proba(testdf)
    pred = pred[:, 1].reshape(-1, 1)
    predi = (pred.reshape(pred.shape[1] , pred.shape[0])).flatten()
    print(predi.shape)
    return predi

predictnn = classifier_predict(nn , xtest)
predictnn_sel = classifier_predict(nn_sel , xtest_sel)

# 데이터 프레임으로 변환
def create_df(data=[] , col_names=[]):
    sub_df = pd.DataFrame(data).T
    sub_df.columns = col_names
    return sub_df


df_nn = create_df([testid , predictnn] , ['id' , 'loan_status'])
df_nn_sel = create_df([testid , predictnn_sel] , ['id' , 'loan_status'])

[1m1222/1222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
(39098,)
[1m1222/1222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
(39098,)


In [51]:
df_nn_sel.to_csv('/content/drive/MyDrive/Kaggle/per_submission.csv' , index=False)