In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
%pip install scikeras

Collecting scikeras
  Downloading scikeras-0.13.0-py3-none-any.whl.metadata (3.1 kB)
Downloading scikeras-0.13.0-py3-none-any.whl (26 kB)
Installing collected packages: scikeras
Successfully installed scikeras-0.13.0


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# 전처리를 위한 라이브러리
from sklearn.preprocessing import OrdinalEncoder , StandardScaler
from sklearn.model_selection import train_test_split

# 모델 제작을 위한 라이브러리
import tensorflow as tf
from tensorflow.keras import layers , regularizers, callbacks
from sklearn.inspection import permutation_importance

In [3]:
# 데이터 입력
tr_dat = pd.read_csv('/content/drive/MyDrive/Kaggle/data/train.csv')
te_dat = pd.read_csv('/content/drive/MyDrive/Kaggle/data/test.csv')
testid = te_dat['id']

# 추가 전처리: 데이터 column 수정.
tr_dat['cb_person_cred_hist_length'] = tr_dat['cb_person_cred_hist_length'].replace(0, np.nan)  # 분모의 0을 NaN으로 대체
tr_dat['emp_per_cred'] = tr_dat['person_emp_length'] / tr_dat['cb_person_cred_hist_length']
te_dat['cb_person_cred_hist_length'] = te_dat['cb_person_cred_hist_length'].replace(0, np.nan)  # 분모의 0을 NaN으로 대체
te_dat['emp_per_cred'] = te_dat['person_emp_length'] / te_dat['cb_person_cred_hist_length']

tr_dat = tr_dat.drop('id', axis = 1)
target_values = tr_dat['loan_status']
tr_dat = tr_dat.drop('loan_status', axis = 1)
te_dat = te_dat.drop('id', axis = 1)

# 전처리1: Encoding
def cate_dat(df , encoder=OrdinalEncoder()):
    cols = df.select_dtypes(exclude=[np.number])
    for i in cols:
        df[i] = encoder.fit_transform(df[[i]])
    return df
n_traindat = cate_dat(tr_dat)
ntestdata = cate_dat(te_dat)

# 전처리2: 입력값 정규화
xtr_dat = StandardScaler().fit_transform(n_traindat)
xtest = StandardScaler().fit_transform(ntestdata)


# 전처리3: split
xtrain , xvalid , ytrain , yvalid = train_test_split(xtr_dat , target_values ,train_size=0.7 ,test_size=0.3 , random_state=99)

In [4]:
# 모델 생성: Regularization, Batchnormalization, EarlyStop
tf.random.set_seed(1234)

nn = tf.keras.Sequential([
    tf.keras.Input(shape=(12,)),                  ## Input layer with no, of features as units(neurons)
    layers.Dense(12  , activation='relu' , kernel_regularizer=regularizers.l1(0.01)),   ## 1 Hidden Layer
    layers.BatchNormalization(), ## Not a layer just to normalize the neural network
    layers.Dense(1 , activation='sigmoid')   ## Output layer
])
nn.summary()

nn.compile(optimizer=tf.keras.optimizers.Adam(), loss=tf.keras.losses.BinaryCrossentropy(from_logits=False) ,  metrics=['auc'])

earlystop1 = callbacks.EarlyStopping(monitor='val_loss', mode='auto')
earlystop2 = callbacks.EarlyStopping(monitor='val_auc' , mode='auto')

# 학습된 모델
train_nn = nn.fit(xtrain , ytrain , validation_data=(xvalid , yvalid) ,epochs=10 , callbacks=[earlystop1 , earlystop2])

Epoch 1/10
[1m1283/1283[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - auc: 0.7092 - loss: 0.6889 - val_auc: 0.9017 - val_loss: 0.2983
Epoch 2/10
[1m1283/1283[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - auc: 0.9078 - loss: 0.2648 - val_auc: 0.9050 - val_loss: 0.2641
Epoch 3/10
[1m1283/1283[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 4ms/step - auc: 0.9106 - loss: 0.2431 - val_auc: 0.9073 - val_loss: 0.2562
Epoch 4/10
[1m1283/1283[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - auc: 0.9111 - loss: 0.2372 - val_auc: 0.9087 - val_loss: 0.2527
Epoch 5/10
[1m1283/1283[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - auc: 0.9118 - loss: 0.2347 - val_auc: 0.9088 - val_loss: 0.2510
Epoch 6/10
[1m1283/1283[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - auc: 0.9121 - loss: 0.2336 - val_auc: 0.9096 - val_loss: 0.2496
Epoch 7/10
[1m1283/1283[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[

In [9]:
# Permutation 중요도 계산
result = permutation_importance(nn, xvalid, yvalid , scoring = 'roc_auc', n_repeats=10, random_state=42)

# 중요도 데이터프레임 생성
importances = result.importances_mean
feature_names = te_dat.columns
perm_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
perm_df.sort_values(by='Importance', ascending=False, inplace=True)

# 상위 feature 추출
top = 5
top_features = perm_df['Feature'].head(top).tolist()


AttributeError: Sequential has none of the following attributes: decision_function, predict_proba.

In [None]:
# 모델 재학습

# 데이터셋 재 구성
tr_dat_sel = tr_dat[top_features]
te_dat_sel = te_dat[top_features]

# 전처리1: Encoding
n_traindat_sel = cate_dat(tr_dat_sel)
ntestdata_sel = cate_dat(te_dat_sel)

# 전처리2: 입력값 정규화
xtr_dat_sel = StandardScaler().fit_transform(n_traindat_sel)
xtest_sel = StandardScaler().fit_transform(ntestdata_sel)

# 전처리3: split
xtrain_s , xvalid_s , ytrain_s , yvalid_s = train_test_split(xtr_dat_sel , target_values ,train_size=0.7 ,test_size=0.3 , random_state=99)


In [None]:
# 새로운 모델 학습
# 모델 생성: Regularization, Batchnormalization, EarlyStop
tf.random.set_seed(1234)

nn_sel = tf.keras.Sequential([
    tf.keras.Input(shape=(12,)),                  ## Input layer with no, of features as units(neurons)
    layers.Dense(12  , activation='relu' , kernel_regularizer=regularizers.l1(0.01)),   ## 1 Hidden Layer
    layers.BatchNormalization(), ## Not a layer just to normalize the neural network
    layers.Dense(1 , activation='sigmoid')   ## Output layer
])
nn_sel.summary()

nn_sel.compile(optimizer=tf.keras.optimizers.Adam(), loss=tf.keras.losses.BinaryCrossentropy(from_logits=False) ,  metrics=['auc'])

earlystop1 = callbacks.EarlyStopping(monitor='val_loss', mode='auto')
earlystop2 = callbacks.EarlyStopping(monitor='val_auc' , mode='auto')

# 학습된 모델
train_nn_sel = nn.fit(xtrain_s , ytrain_s , validation_data=(xvalid_s , yvalid_s) ,epochs=10 , callbacks=[earlystop1 , earlystop2])


In [None]:
# Predictions of Neural Networks
# (n, 1)의 데이터 형식을 (1, n)으로 변경
def test_predictions(neural_net , testdf):
    neural_predicts = neural_net.predict(x=testdf)
    predi = (neural_predicts.reshape(neural_predicts.shape[1] , neural_predicts.shape[0])).flatten()
    print(predi.shape)
    return predi

predictnn1 = test_predictions(nn , xtest)

# 데이터 프레임으로 변환
def create_df(data=[] , col_names=[]):
    sub_df = pd.DataFrame(data).T
    sub_df.columns = col_names
    return sub_df


df_nn1 = create_df([testid , predictnn1] , ['id' , 'loan_status'])

[1m1222/1222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step
(39098,)


In [None]:
df_nn1.to_csv('/content/drive/MyDrive/Kaggle/del_id_submission.csv' , index=False)