# 라이브러리

In [1]:
# Mount Google Drive inside the Colab runtime; the CSV files loaded further
# down are read from paths under /content/drive/Shareddrives/...
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import roc_curve, roc_auc_score, precision_recall_curve, auc

# combined 데이터

## 데이터 전처리

In [3]:
# Load the "combined" dataset from the mounted shared drive.
data = pd.read_csv(
    '/content/drive/Shareddrives/서울시 빅데이터(2022-2)/논문/3. 폐업 여부 분류 모델/0) 데이터/data_combined.csv'
)

In [4]:
# Encode the two flag columns as integers: "Y" -> 1, any other value -> 0.
for flag_column in ('폐업여부', '프랜차이즈여부'):
    data[flag_column] = data[flag_column].apply(lambda flag: 1 if flag == "Y" else 0)

In [5]:
# Cast the code-like columns to pandas 'category' dtype (they are one-hot
# encoded in the following step).
for code_column in ('표준산업분류코드', '행정동코드', '분기'):
    data[code_column] = data[code_column].astype('category')

In [6]:
# One-hot encode the three categorical columns, then drop the 점포명 column.
data_encoded = (
    pd.get_dummies(data, columns=['표준산업분류코드', '행정동코드', '분기'])
    .drop(columns=['점포명'])
)

In [7]:
# Split by year: 2021 rows form the training set, 2022 rows the test set.
train_data = data_encoded.loc[data_encoded['년도'].eq(2021)]
test_data = data_encoded.loc[data_encoded['년도'].eq(2022)]

In [8]:
# The year column was only needed for the split; remove it from both frames.
train_data, test_data = (
    split.drop(columns=['년도']) for split in (train_data, test_data)
)

In [9]:
# Separate the 폐업여부 target from the feature columns of each split.
y_train, y_test = train_data['폐업여부'], test_data['폐업여부']
x_train, x_test = (
    frame.drop(columns=['폐업여부']) for frame in (train_data, test_data)
)

## 모델링

### 기본

In [10]:
# Baseline: a single XGBoost classifier fitted on the whole training split.
model = XGBClassifier(
    random_state=42,
    learning_rate=0.1,
    max_depth=6,
    n_estimators=100,
)
model.fit(x_train, y_train)

# Probability of the positive class (폐업여부 == 1) for every test row.
ens_score = model.predict_proba(x_test)[:, 1]

# Test-split metrics: ROC AUC and the area under the precision-recall curve.
roc_score = roc_auc_score(y_test, ens_score)
precision, recall, _ = precision_recall_curve(y_test, ens_score)
pr_auc_score = auc(recall, precision)

print('ROC AUC 값: {0:.4f}'.format(roc_score))
print("PR AUC:", pr_auc_score)

ROC AUC 값: 0.5604
PR AUC: 0.0464846852231148


In [11]:
# Baseline variant: same setup with max_depth=7.
model = XGBClassifier(
    random_state=42,
    learning_rate=0.1,
    max_depth=7,
    n_estimators=100,
)
model.fit(x_train, y_train)

# Probability of the positive class (폐업여부 == 1) for every test row.
ens_score = model.predict_proba(x_test)[:, 1]

# Test-split metrics: ROC AUC and the area under the precision-recall curve.
roc_score = roc_auc_score(y_test, ens_score)
precision, recall, _ = precision_recall_curve(y_test, ens_score)
pr_auc_score = auc(recall, precision)

print('ROC AUC 값: {0:.4f}'.format(roc_score))
print("PR AUC:", pr_auc_score)

ROC AUC 값: 0.5595
PR AUC: 0.04574376784248774


In [12]:
# Baseline variant: same setup with max_depth=8.
model = XGBClassifier(
    random_state=42,
    learning_rate=0.1,
    max_depth=8,
    n_estimators=100,
)
model.fit(x_train, y_train)

# Probability of the positive class (폐업여부 == 1) for every test row.
ens_score = model.predict_proba(x_test)[:, 1]

# Test-split metrics: ROC AUC and the area under the precision-recall curve.
roc_score = roc_auc_score(y_test, ens_score)
precision, recall, _ = precision_recall_curve(y_test, ens_score)
pr_auc_score = auc(recall, precision)

print('ROC AUC 값: {0:.4f}'.format(roc_score))
print("PR AUC:", pr_auc_score)

ROC AUC 값: 0.5550
PR AUC: 0.045917533066677674


### 앙상블

In [13]:
# Under-sampling ensemble: each round fits one XGBoost model on every positive
# row (폐업여부 == 1) plus a fresh random sample of negative rows, and the
# per-round positive-class probabilities are averaged at the end.
num_iterations = 20

# Class subsets do not change between rounds, so build them once.
non_closed_pool = train_data[train_data['폐업여부'] == 0]
closed_data = train_data[train_data['폐업여부'] == 1]

train_pred_probas = []  # one probability array per round, scored on x_train
test_pred_probas = []   # one probability array per round, scored on x_test

for iteration in range(num_iterations):
    # A different seed per round gives each model a different negative sample.
    non_closed_data = non_closed_pool.sample(n=5000, random_state=iteration + 426)
    new_train_data = pd.concat([non_closed_data, closed_data], axis=0)

    new_x_train = new_train_data.drop(['폐업여부'], axis=1)
    new_y_train = new_train_data['폐업여부']

    model = XGBClassifier(random_state=iteration + 42, learning_rate=0.1, max_depth=6, n_estimators=100)
    model.fit(new_x_train, new_y_train)

    train_pred_probas.append(model.predict_proba(x_train)[:, 1])
    test_pred_probas.append(model.predict_proba(x_test)[:, 1])

# Ensemble prediction = element-wise mean over all rounds.
final_y_train_pred_proba = np.mean(train_pred_probas, axis=0)
final_y_test_pred_proba = np.mean(test_pred_probas, axis=0)

# Train-split metrics live in *_train names so the test-split metrics below
# do not overwrite them.
final_roc_auc_train = roc_auc_score(y_train, final_y_train_pred_proba)
print("Final Train ROC AUC:", final_roc_auc_train)
precision, recall, _ = precision_recall_curve(y_train, final_y_train_pred_proba)
final_pr_auc_train = auc(recall, precision)
print("Final Train PR AUC:", final_pr_auc_train)

final_roc_auc_test = roc_auc_score(y_test, final_y_test_pred_proba)
print("Final Test ROC AUC:", final_roc_auc_test)
precision, recall, _ = precision_recall_curve(y_test, final_y_test_pred_proba)
final_pr_auc_test = auc(recall, precision)
print("Final Test PR AUC:", final_pr_auc_test)

Final Train ROC AUC: 0.8887450657303722
Final Train PR AUC: 0.26871010022043734
Final Test ROC AUC: 0.5725524612124587
Final Test PR AUC: 0.049374459484651426


In [14]:
# Under-sampling ensemble (max_depth=8): each round fits one XGBoost model on
# every positive row (폐업여부 == 1) plus a fresh random sample of negative
# rows, and the per-round positive-class probabilities are averaged.
num_iterations = 20

# Class subsets do not change between rounds, so build them once.
non_closed_pool = train_data[train_data['폐업여부'] == 0]
closed_data = train_data[train_data['폐업여부'] == 1]

train_pred_probas = []  # one probability array per round, scored on x_train
test_pred_probas = []   # one probability array per round, scored on x_test

for iteration in range(num_iterations):
    # A different seed per round gives each model a different negative sample.
    non_closed_data = non_closed_pool.sample(n=5000, random_state=iteration + 426)
    new_train_data = pd.concat([non_closed_data, closed_data], axis=0)

    new_x_train = new_train_data.drop(['폐업여부'], axis=1)
    new_y_train = new_train_data['폐업여부']

    model = XGBClassifier(random_state=iteration + 42, learning_rate=0.1, max_depth=8, n_estimators=100)
    model.fit(new_x_train, new_y_train)

    train_pred_probas.append(model.predict_proba(x_train)[:, 1])
    test_pred_probas.append(model.predict_proba(x_test)[:, 1])

# Ensemble prediction = element-wise mean over all rounds.
final_y_train_pred_proba = np.mean(train_pred_probas, axis=0)
final_y_test_pred_proba = np.mean(test_pred_probas, axis=0)

# Train-split metrics live in *_train names so the test-split metrics below
# do not overwrite them.
final_roc_auc_train = roc_auc_score(y_train, final_y_train_pred_proba)
print("Final Train ROC AUC:", final_roc_auc_train)
precision, recall, _ = precision_recall_curve(y_train, final_y_train_pred_proba)
final_pr_auc_train = auc(recall, precision)
print("Final Train PR AUC:", final_pr_auc_train)

final_roc_auc_test = roc_auc_score(y_test, final_y_test_pred_proba)
print("Final Test ROC AUC:", final_roc_auc_test)
precision, recall, _ = precision_recall_curve(y_test, final_y_test_pred_proba)
final_pr_auc_test = auc(recall, precision)
print("Final Test PR AUC:", final_pr_auc_test)

Final Train ROC AUC: 0.9466324133227082
Final Train PR AUC: 0.405864742628228
Final Test ROC AUC: 0.5718811087714647
Final Test PR AUC: 0.04880009114084376


# total 데이터

## 데이터 전처리

In [15]:
# Load the "total" dataset from the mounted shared drive (replaces `data`).
data = pd.read_csv(
    '/content/drive/Shareddrives/서울시 빅데이터(2022-2)/논문/3. 폐업 여부 분류 모델/0) 데이터/data_total.csv'
)

In [16]:
# Encode the two flag columns as integers: "Y" -> 1, any other value -> 0.
for flag_column in ('폐업여부', '프랜차이즈여부'):
    data[flag_column] = data[flag_column].apply(lambda flag: 1 if flag == "Y" else 0)

In [17]:
# Cast the code-like columns to pandas 'category' dtype (they are one-hot
# encoded in the following step).
for code_column in ('표준산업분류코드', '행정동코드', '분기'):
    data[code_column] = data[code_column].astype('category')

In [18]:
# One-hot encode the three categorical columns, then drop the 점포명 column.
data_encoded = (
    pd.get_dummies(data, columns=['표준산업분류코드', '행정동코드', '분기'])
    .drop(columns=['점포명'])
)

In [19]:
# Split by year: 2021 rows form the training set, 2022 rows the test set.
train_data = data_encoded.loc[data_encoded['년도'].eq(2021)]
test_data = data_encoded.loc[data_encoded['년도'].eq(2022)]

In [20]:
# The year column was only needed for the split; remove it from both frames.
train_data, test_data = (
    split.drop(columns=['년도']) for split in (train_data, test_data)
)

In [21]:
# Separate the 폐업여부 target from the feature columns of each split.
y_train, y_test = train_data['폐업여부'], test_data['폐업여부']
x_train, x_test = (
    frame.drop(columns=['폐업여부']) for frame in (train_data, test_data)
)

## 모델링

### 기본

In [22]:
# Baseline: a single XGBoost classifier fitted on the whole training split.
model = XGBClassifier(
    random_state=42,
    learning_rate=0.1,
    max_depth=6,
    n_estimators=100,
)
model.fit(x_train, y_train)

# Probability of the positive class (폐업여부 == 1) for every test row.
ens_score = model.predict_proba(x_test)[:, 1]

# Test-split metrics: ROC AUC and the area under the precision-recall curve.
roc_score = roc_auc_score(y_test, ens_score)
precision, recall, _ = precision_recall_curve(y_test, ens_score)
pr_auc_score = auc(recall, precision)

print('ROC AUC 값: {0:.4f}'.format(roc_score))
print("PR AUC:", pr_auc_score)

ROC AUC 값: 0.5541
PR AUC: 0.04528122599619055


In [23]:
# Baseline variant: learning_rate lowered to 0.01 (max_depth=6, 100 trees).
model = XGBClassifier(
    random_state=42,
    learning_rate=0.01,
    max_depth=6,
    n_estimators=100,
)
model.fit(x_train, y_train)

# Probability of the positive class (폐업여부 == 1) for every test row.
ens_score = model.predict_proba(x_test)[:, 1]

# Test-split metrics: ROC AUC and the area under the precision-recall curve.
roc_score = roc_auc_score(y_test, ens_score)
precision, recall, _ = precision_recall_curve(y_test, ens_score)
pr_auc_score = auc(recall, precision)

print('ROC AUC 값: {0:.4f}'.format(roc_score))
print("PR AUC:", pr_auc_score)

ROC AUC 값: 0.5434
PR AUC: 0.044173557342417576


In [24]:
# Baseline variant: max_depth=10 (learning_rate=0.1, 100 trees).
model = XGBClassifier(
    random_state=42,
    learning_rate=0.1,
    max_depth=10,
    n_estimators=100,
)
model.fit(x_train, y_train)

# Probability of the positive class (폐업여부 == 1) for every test row.
ens_score = model.predict_proba(x_test)[:, 1]

# Test-split metrics: ROC AUC and the area under the precision-recall curve.
roc_score = roc_auc_score(y_test, ens_score)
precision, recall, _ = precision_recall_curve(y_test, ens_score)
pr_auc_score = auc(recall, precision)

print('ROC AUC 값: {0:.4f}'.format(roc_score))
print("PR AUC:", pr_auc_score)

ROC AUC 값: 0.5537
PR AUC: 0.04468612165126721


In [25]:
# Baseline variant: max_depth=8 (learning_rate=0.1, 100 trees).
model = XGBClassifier(
    random_state=42,
    learning_rate=0.1,
    max_depth=8,
    n_estimators=100,
)
model.fit(x_train, y_train)

# Probability of the positive class (폐업여부 == 1) for every test row.
ens_score = model.predict_proba(x_test)[:, 1]

# Test-split metrics: ROC AUC and the area under the precision-recall curve.
roc_score = roc_auc_score(y_test, ens_score)
precision, recall, _ = precision_recall_curve(y_test, ens_score)
pr_auc_score = auc(recall, precision)

print('ROC AUC 값: {0:.4f}'.format(roc_score))
print("PR AUC:", pr_auc_score)

ROC AUC 값: 0.5543
PR AUC: 0.04543880047229901


In [26]:
# Baseline variant: max_depth=8 with 500 trees (learning_rate=0.1).
model = XGBClassifier(
    random_state=42,
    learning_rate=0.1,
    max_depth=8,
    n_estimators=500,
)
model.fit(x_train, y_train)

# Probability of the positive class (폐업여부 == 1) for every test row.
ens_score = model.predict_proba(x_test)[:, 1]

# Test-split metrics: ROC AUC and the area under the precision-recall curve.
roc_score = roc_auc_score(y_test, ens_score)
precision, recall, _ = precision_recall_curve(y_test, ens_score)
pr_auc_score = auc(recall, precision)

print('ROC AUC 값: {0:.4f}'.format(roc_score))
print("PR AUC:", pr_auc_score)

ROC AUC 값: 0.5481
PR AUC: 0.04442475359526565


In [27]:
# Baseline variant: learning_rate=0.01, max_depth=8, 500 trees.
model = XGBClassifier(
    random_state=42,
    learning_rate=0.01,
    max_depth=8,
    n_estimators=500,
)
model.fit(x_train, y_train)

# Probability of the positive class (폐업여부 == 1) for every test row.
ens_score = model.predict_proba(x_test)[:, 1]

# Test-split metrics: ROC AUC and the area under the precision-recall curve.
roc_score = roc_auc_score(y_test, ens_score)
precision, recall, _ = precision_recall_curve(y_test, ens_score)
pr_auc_score = auc(recall, precision)

print('ROC AUC 값: {0:.4f}'.format(roc_score))
print("PR AUC:", pr_auc_score)

ROC AUC 값: 0.5479
PR AUC: 0.04428767516476433


In [28]:
# Baseline variant: max_depth=7 (learning_rate=0.1, 100 trees).
model = XGBClassifier(
    random_state=42,
    learning_rate=0.1,
    max_depth=7,
    n_estimators=100,
)
model.fit(x_train, y_train)

# Probability of the positive class (폐업여부 == 1) for every test row.
ens_score = model.predict_proba(x_test)[:, 1]

# Test-split metrics: ROC AUC and the area under the precision-recall curve.
roc_score = roc_auc_score(y_test, ens_score)
precision, recall, _ = precision_recall_curve(y_test, ens_score)
pr_auc_score = auc(recall, precision)

print('ROC AUC 값: {0:.4f}'.format(roc_score))
print("PR AUC:", pr_auc_score)

ROC AUC 값: 0.5553
PR AUC: 0.04614439641860431


In [29]:
# Baseline variant: max_depth=7 with 200 trees (learning_rate=0.1).
model = XGBClassifier(
    random_state=42,
    learning_rate=0.1,
    max_depth=7,
    n_estimators=200,
)
model.fit(x_train, y_train)

# Probability of the positive class (폐업여부 == 1) for every test row.
ens_score = model.predict_proba(x_test)[:, 1]

# Test-split metrics: ROC AUC and the area under the precision-recall curve.
roc_score = roc_auc_score(y_test, ens_score)
precision, recall, _ = precision_recall_curve(y_test, ens_score)
pr_auc_score = auc(recall, precision)

print('ROC AUC 값: {0:.4f}'.format(roc_score))
print("PR AUC:", pr_auc_score)

ROC AUC 값: 0.5564
PR AUC: 0.04558187412378902


### 앙상블

In [30]:
# Under-sampling ensemble (max_depth=8): each round fits one XGBoost model on
# every positive row (폐업여부 == 1) plus a fresh random sample of negative
# rows, and the per-round positive-class probabilities are averaged.
num_iterations = 20

# Class subsets do not change between rounds, so build them once.
non_closed_pool = train_data[train_data['폐업여부'] == 0]
closed_data = train_data[train_data['폐업여부'] == 1]

train_pred_probas = []  # one probability array per round, scored on x_train
test_pred_probas = []   # one probability array per round, scored on x_test

for iteration in range(num_iterations):
    # A different seed per round gives each model a different negative sample.
    non_closed_data = non_closed_pool.sample(n=5000, random_state=iteration + 426)
    new_train_data = pd.concat([non_closed_data, closed_data], axis=0)

    new_x_train = new_train_data.drop(['폐업여부'], axis=1)
    new_y_train = new_train_data['폐업여부']

    model = XGBClassifier(random_state=iteration + 42, learning_rate=0.1, max_depth=8, n_estimators=100)
    model.fit(new_x_train, new_y_train)

    train_pred_probas.append(model.predict_proba(x_train)[:, 1])
    test_pred_probas.append(model.predict_proba(x_test)[:, 1])

# Ensemble prediction = element-wise mean over all rounds.
final_y_train_pred_proba = np.mean(train_pred_probas, axis=0)
final_y_test_pred_proba = np.mean(test_pred_probas, axis=0)

# Train-split metrics live in *_train names so the test-split metrics below
# do not overwrite them.
final_roc_auc_train = roc_auc_score(y_train, final_y_train_pred_proba)
print("Final Train ROC AUC:", final_roc_auc_train)
precision, recall, _ = precision_recall_curve(y_train, final_y_train_pred_proba)
final_pr_auc_train = auc(recall, precision)
print("Final Train PR AUC:", final_pr_auc_train)

final_roc_auc_test = roc_auc_score(y_test, final_y_test_pred_proba)
print("Final Test ROC AUC:", final_roc_auc_test)
precision, recall, _ = precision_recall_curve(y_test, final_y_test_pred_proba)
final_pr_auc_test = auc(recall, precision)
print("Final Test PR AUC:", final_pr_auc_test)

Final Train ROC AUC: 0.9464465885817807
Final Train PR AUC: 0.4090494823023873
Final Test ROC AUC: 0.5706011411617131
Final Test PR AUC: 0.04872069701028375


In [31]:
# Under-sampling ensemble (max_depth=6): each round fits one XGBoost model on
# every positive row (폐업여부 == 1) plus a fresh random sample of negative
# rows, and the per-round positive-class probabilities are averaged.
num_iterations = 20

# Class subsets do not change between rounds, so build them once.
non_closed_pool = train_data[train_data['폐업여부'] == 0]
closed_data = train_data[train_data['폐업여부'] == 1]

train_pred_probas = []  # one probability array per round, scored on x_train
test_pred_probas = []   # one probability array per round, scored on x_test

for iteration in range(num_iterations):
    # A different seed per round gives each model a different negative sample.
    non_closed_data = non_closed_pool.sample(n=5000, random_state=iteration + 426)
    new_train_data = pd.concat([non_closed_data, closed_data], axis=0)

    new_x_train = new_train_data.drop(['폐업여부'], axis=1)
    new_y_train = new_train_data['폐업여부']

    model = XGBClassifier(random_state=iteration + 42, learning_rate=0.1, max_depth=6, n_estimators=100)
    model.fit(new_x_train, new_y_train)

    train_pred_probas.append(model.predict_proba(x_train)[:, 1])
    test_pred_probas.append(model.predict_proba(x_test)[:, 1])

# Ensemble prediction = element-wise mean over all rounds.
final_y_train_pred_proba = np.mean(train_pred_probas, axis=0)
final_y_test_pred_proba = np.mean(test_pred_probas, axis=0)

# Train-split metrics live in *_train names so the test-split metrics below
# do not overwrite them.
final_roc_auc_train = roc_auc_score(y_train, final_y_train_pred_proba)
print("Final Train ROC AUC:", final_roc_auc_train)
precision, recall, _ = precision_recall_curve(y_train, final_y_train_pred_proba)
final_pr_auc_train = auc(recall, precision)
print("Final Train PR AUC:", final_pr_auc_train)

final_roc_auc_test = roc_auc_score(y_test, final_y_test_pred_proba)
print("Final Test ROC AUC:", final_roc_auc_test)
precision, recall, _ = precision_recall_curve(y_test, final_y_test_pred_proba)
final_pr_auc_test = auc(recall, precision)
print("Final Test PR AUC:", final_pr_auc_test)

Final Train ROC AUC: 0.8867603463078356
Final Train PR AUC: 0.26626085708222536
Final Test ROC AUC: 0.5689745024018067
Final Test PR AUC: 0.048724868962774906


In [32]:
# Under-sampling ensemble (max_depth=7): each round fits one XGBoost model on
# every positive row (폐업여부 == 1) plus a fresh random sample of negative
# rows, and the per-round positive-class probabilities are averaged.
num_iterations = 20

# Class subsets do not change between rounds, so build them once.
non_closed_pool = train_data[train_data['폐업여부'] == 0]
closed_data = train_data[train_data['폐업여부'] == 1]

train_pred_probas = []  # one probability array per round, scored on x_train
test_pred_probas = []   # one probability array per round, scored on x_test

for iteration in range(num_iterations):
    # A different seed per round gives each model a different negative sample.
    non_closed_data = non_closed_pool.sample(n=5000, random_state=iteration + 426)
    new_train_data = pd.concat([non_closed_data, closed_data], axis=0)

    new_x_train = new_train_data.drop(['폐업여부'], axis=1)
    new_y_train = new_train_data['폐업여부']

    model = XGBClassifier(random_state=iteration + 42, learning_rate=0.1, max_depth=7, n_estimators=100)
    model.fit(new_x_train, new_y_train)

    train_pred_probas.append(model.predict_proba(x_train)[:, 1])
    test_pred_probas.append(model.predict_proba(x_test)[:, 1])

# Ensemble prediction = element-wise mean over all rounds.
final_y_train_pred_proba = np.mean(train_pred_probas, axis=0)
final_y_test_pred_proba = np.mean(test_pred_probas, axis=0)

# Train-split metrics live in *_train names so the test-split metrics below
# do not overwrite them.
final_roc_auc_train = roc_auc_score(y_train, final_y_train_pred_proba)
print("Final Train ROC AUC:", final_roc_auc_train)
precision, recall, _ = precision_recall_curve(y_train, final_y_train_pred_proba)
final_pr_auc_train = auc(recall, precision)
print("Final Train PR AUC:", final_pr_auc_train)

final_roc_auc_test = roc_auc_score(y_test, final_y_test_pred_proba)
print("Final Test ROC AUC:", final_roc_auc_test)
precision, recall, _ = precision_recall_curve(y_test, final_y_test_pred_proba)
final_pr_auc_test = auc(recall, precision)
print("Final Test PR AUC:", final_pr_auc_test)

Final Train ROC AUC: 0.9224059422113688
Final Train PR AUC: 0.33669624543304816
Final Test ROC AUC: 0.573247483293688
Final Test PR AUC: 0.04949045672602406


In [33]:
# Under-sampling ensemble (max_depth=7, 200 trees): each round fits one
# XGBoost model on every positive row (폐업여부 == 1) plus a fresh random
# sample of negative rows; per-round probabilities are averaged.
num_iterations = 20

# Class subsets do not change between rounds, so build them once.
non_closed_pool = train_data[train_data['폐업여부'] == 0]
closed_data = train_data[train_data['폐업여부'] == 1]

train_pred_probas = []  # one probability array per round, scored on x_train
test_pred_probas = []   # one probability array per round, scored on x_test

for iteration in range(num_iterations):
    # A different seed per round gives each model a different negative sample.
    non_closed_data = non_closed_pool.sample(n=5000, random_state=iteration + 426)
    new_train_data = pd.concat([non_closed_data, closed_data], axis=0)

    new_x_train = new_train_data.drop(['폐업여부'], axis=1)
    new_y_train = new_train_data['폐업여부']

    model = XGBClassifier(random_state=iteration + 42, learning_rate=0.1, max_depth=7, n_estimators=200)
    model.fit(new_x_train, new_y_train)

    train_pred_probas.append(model.predict_proba(x_train)[:, 1])
    test_pred_probas.append(model.predict_proba(x_test)[:, 1])

# Ensemble prediction = element-wise mean over all rounds.
final_y_train_pred_proba = np.mean(train_pred_probas, axis=0)
final_y_test_pred_proba = np.mean(test_pred_probas, axis=0)

# Train-split metrics live in *_train names so the test-split metrics below
# do not overwrite them.
final_roc_auc_train = roc_auc_score(y_train, final_y_train_pred_proba)
print("Final Train ROC AUC:", final_roc_auc_train)
precision, recall, _ = precision_recall_curve(y_train, final_y_train_pred_proba)
final_pr_auc_train = auc(recall, precision)
print("Final Train PR AUC:", final_pr_auc_train)

final_roc_auc_test = roc_auc_score(y_test, final_y_test_pred_proba)
print("Final Test ROC AUC:", final_roc_auc_test)
precision, recall, _ = precision_recall_curve(y_test, final_y_test_pred_proba)
final_pr_auc_test = auc(recall, precision)
print("Final Test PR AUC:", final_pr_auc_test)

Final Train ROC AUC: 0.9571463225240822
Final Train PR AUC: 0.45861172291406416
Final Test ROC AUC: 0.5689019664075375
Final Test PR AUC: 0.04862856947380083


# PredIndex 데이터

## 데이터 전처리

In [34]:
# Load the "PredIndex" dataset from the mounted shared drive (replaces `data`).
data = pd.read_csv(
    '/content/drive/Shareddrives/서울시 빅데이터(2022-2)/논문/3. 폐업 여부 분류 모델/0) 데이터/data_PredIndex.csv'
)

In [35]:
# Encode the two flag columns as integers: "Y" -> 1, any other value -> 0.
for flag_column in ('폐업여부', '프랜차이즈여부'):
    data[flag_column] = data[flag_column].apply(lambda flag: 1 if flag == "Y" else 0)

In [36]:
# Cast the code-like columns to pandas 'category' dtype (they are one-hot
# encoded in the following step).
for code_column in ('표준산업분류코드', '행정동코드', '분기'):
    data[code_column] = data[code_column].astype('category')

In [37]:
# One-hot encode the three categorical columns, then drop the 점포명 column.
data_encoded = (
    pd.get_dummies(data, columns=['표준산업분류코드', '행정동코드', '분기'])
    .drop(columns=['점포명'])
)

In [38]:
# Split by year: 2021 rows form the training set, 2022 rows the test set.
train_data = data_encoded.loc[data_encoded['년도'].eq(2021)]
test_data = data_encoded.loc[data_encoded['년도'].eq(2022)]

In [39]:
# The year column was only needed for the split; remove it from both frames.
train_data, test_data = (
    split.drop(columns=['년도']) for split in (train_data, test_data)
)

In [40]:
# Separate the 폐업여부 target from the feature columns of each split.
y_train, y_test = train_data['폐업여부'], test_data['폐업여부']
x_train, x_test = (
    frame.drop(columns=['폐업여부']) for frame in (train_data, test_data)
)

## 모델링

### 기본

In [41]:
# Baseline: a single XGBoost classifier fitted on the whole training split.
model = XGBClassifier(
    random_state=42,
    learning_rate=0.1,
    max_depth=6,
    n_estimators=100,
)
model.fit(x_train, y_train)

# Probability of the positive class (폐업여부 == 1) for every test row.
ens_score = model.predict_proba(x_test)[:, 1]

# Test-split metrics: ROC AUC and the area under the precision-recall curve.
roc_score = roc_auc_score(y_test, ens_score)
precision, recall, _ = precision_recall_curve(y_test, ens_score)
pr_auc_score = auc(recall, precision)

print('ROC AUC 값: {0:.4f}'.format(roc_score))
print("PR AUC:", pr_auc_score)

ROC AUC 값: 0.5784
PR AUC: 0.049196604241728954


In [42]:
# Baseline variant: learning_rate lowered to 0.01 (max_depth=6, 100 trees).
model = XGBClassifier(
    random_state=42,
    learning_rate=0.01,
    max_depth=6,
    n_estimators=100,
)
model.fit(x_train, y_train)

# Probability of the positive class (폐업여부 == 1) for every test row.
ens_score = model.predict_proba(x_test)[:, 1]

# Test-split metrics: ROC AUC and the area under the precision-recall curve.
roc_score = roc_auc_score(y_test, ens_score)
precision, recall, _ = precision_recall_curve(y_test, ens_score)
pr_auc_score = auc(recall, precision)

print('ROC AUC 값: {0:.4f}'.format(roc_score))
print("PR AUC:", pr_auc_score)

ROC AUC 값: 0.5364
PR AUC: 0.047777368960612174


In [43]:
# Baseline variant: max_depth=7 (learning_rate=0.1, 100 trees).
model = XGBClassifier(
    random_state=42,
    learning_rate=0.1,
    max_depth=7,
    n_estimators=100,
)
model.fit(x_train, y_train)

# Probability of the positive class (폐업여부 == 1) for every test row.
ens_score = model.predict_proba(x_test)[:, 1]

# Test-split metrics: ROC AUC and the area under the precision-recall curve.
roc_score = roc_auc_score(y_test, ens_score)
precision, recall, _ = precision_recall_curve(y_test, ens_score)
pr_auc_score = auc(recall, precision)

print('ROC AUC 값: {0:.4f}'.format(roc_score))
print("PR AUC:", pr_auc_score)

ROC AUC 값: 0.5749
PR AUC: 0.0489545949400127


In [44]:
# Baseline variant: max_depth=8 (learning_rate=0.1, 100 trees).
model = XGBClassifier(
    random_state=42,
    learning_rate=0.1,
    max_depth=8,
    n_estimators=100,
)
model.fit(x_train, y_train)

# Probability of the positive class (폐업여부 == 1) for every test row.
ens_score = model.predict_proba(x_test)[:, 1]

# Test-split metrics: ROC AUC and the area under the precision-recall curve.
roc_score = roc_auc_score(y_test, ens_score)
precision, recall, _ = precision_recall_curve(y_test, ens_score)
pr_auc_score = auc(recall, precision)

print('ROC AUC 값: {0:.4f}'.format(roc_score))
print("PR AUC:", pr_auc_score)

ROC AUC 값: 0.5858
PR AUC: 0.0501786789909575


In [45]:
# Baseline variant: max_depth=8 with only 50 trees (learning_rate=0.1).
model = XGBClassifier(
    random_state=42,
    learning_rate=0.1,
    max_depth=8,
    n_estimators=50,
)
model.fit(x_train, y_train)

# Probability of the positive class (폐업여부 == 1) for every test row.
ens_score = model.predict_proba(x_test)[:, 1]

# Test-split metrics: ROC AUC and the area under the precision-recall curve.
roc_score = roc_auc_score(y_test, ens_score)
precision, recall, _ = precision_recall_curve(y_test, ens_score)
pr_auc_score = auc(recall, precision)

print('ROC AUC 값: {0:.4f}'.format(roc_score))
print("PR AUC:", pr_auc_score)

ROC AUC 값: 0.5799
PR AUC: 0.04973121924152203


In [46]:
# Baseline variant: max_depth=8 with 200 trees (learning_rate=0.1).
model = XGBClassifier(
    random_state=42,
    learning_rate=0.1,
    max_depth=8,
    n_estimators=200,
)
model.fit(x_train, y_train)

# Probability of the positive class (폐업여부 == 1) for every test row.
ens_score = model.predict_proba(x_test)[:, 1]

# Test-split metrics: ROC AUC and the area under the precision-recall curve.
roc_score = roc_auc_score(y_test, ens_score)
precision, recall, _ = precision_recall_curve(y_test, ens_score)
pr_auc_score = auc(recall, precision)

print('ROC AUC 값: {0:.4f}'.format(roc_score))
print("PR AUC:", pr_auc_score)

ROC AUC 값: 0.5823
PR AUC: 0.0489400239290914


In [47]:
# Baseline variant: max_depth=9 (learning_rate=0.1, 100 trees).
model = XGBClassifier(
    random_state=42,
    learning_rate=0.1,
    max_depth=9,
    n_estimators=100,
)
model.fit(x_train, y_train)

# Probability of the positive class (폐업여부 == 1) for every test row.
ens_score = model.predict_proba(x_test)[:, 1]

# Test-split metrics: ROC AUC and the area under the precision-recall curve.
roc_score = roc_auc_score(y_test, ens_score)
precision, recall, _ = precision_recall_curve(y_test, ens_score)
pr_auc_score = auc(recall, precision)

print('ROC AUC 값: {0:.4f}'.format(roc_score))
print("PR AUC:", pr_auc_score)

ROC AUC 값: 0.5725
PR AUC: 0.04795935825282904


### 앙상블

In [48]:
# Under-sampling ensemble (4000 negatives per round, max_depth=8): each round
# fits one XGBoost model on every positive row (폐업여부 == 1) plus a fresh
# random sample of negative rows; per-round probabilities are averaged.
num_iterations = 20

# Class subsets do not change between rounds, so build them once.
non_closed_pool = train_data[train_data['폐업여부'] == 0]
closed_data = train_data[train_data['폐업여부'] == 1]

train_pred_probas = []  # one probability array per round, scored on x_train
test_pred_probas = []   # one probability array per round, scored on x_test

for iteration in range(num_iterations):
    # A different seed per round gives each model a different negative sample.
    non_closed_data = non_closed_pool.sample(n=4000, random_state=iteration + 20)
    new_train_data = pd.concat([non_closed_data, closed_data], axis=0)

    new_x_train = new_train_data.drop(['폐업여부'], axis=1)
    new_y_train = new_train_data['폐업여부']

    # The model seed is the same in every round; only the sample changes.
    model = XGBClassifier(random_state=42, learning_rate=0.1, max_depth=8, n_estimators=100)
    model.fit(new_x_train, new_y_train)

    train_pred_probas.append(model.predict_proba(x_train)[:, 1])
    test_pred_probas.append(model.predict_proba(x_test)[:, 1])

# Ensemble prediction = element-wise mean over all rounds.
final_y_train_pred_proba = np.mean(train_pred_probas, axis=0)
final_y_test_pred_proba = np.mean(test_pred_probas, axis=0)

# Train-split metrics live in *_train names so the test-split metrics below
# do not overwrite them.
final_roc_auc_train = roc_auc_score(y_train, final_y_train_pred_proba)
print("Final Train ROC AUC:", final_roc_auc_train)
precision, recall, _ = precision_recall_curve(y_train, final_y_train_pred_proba)
final_pr_auc_train = auc(recall, precision)
print("Final Train PR AUC:", final_pr_auc_train)

final_roc_auc_test = roc_auc_score(y_test, final_y_test_pred_proba)
print("Final Test ROC AUC:", final_roc_auc_test)
precision, recall, _ = precision_recall_curve(y_test, final_y_test_pred_proba)
final_pr_auc_test = auc(recall, precision)
print("Final Test PR AUC:", final_pr_auc_test)

Final Train ROC AUC: 0.8184082419237699
Final Train PR AUC: 0.17939479408554027
Final Test ROC AUC: 0.5835416643442424
Final Test PR AUC: 0.050235837125470945


In [49]:
# Under-sampling ensemble (30 rounds, max_depth=8): each round fits one
# XGBoost model on every positive row (폐업여부 == 1) plus a fresh random
# sample of negative rows; per-round probabilities are averaged.
num_iterations = 30

# Class subsets do not change between rounds, so build them once.
non_closed_pool = train_data[train_data['폐업여부'] == 0]
closed_data = train_data[train_data['폐업여부'] == 1]

train_pred_probas = []  # one probability array per round, scored on x_train
test_pred_probas = []   # one probability array per round, scored on x_test

for iteration in range(num_iterations):
    # A different seed per round gives each model a different negative sample.
    non_closed_data = non_closed_pool.sample(n=5000, random_state=iteration + 20)
    new_train_data = pd.concat([non_closed_data, closed_data], axis=0)

    new_x_train = new_train_data.drop(['폐업여부'], axis=1)
    new_y_train = new_train_data['폐업여부']

    # The model seed is the same in every round; only the sample changes.
    model = XGBClassifier(random_state=42, learning_rate=0.1, max_depth=8, n_estimators=100)
    model.fit(new_x_train, new_y_train)

    train_pred_probas.append(model.predict_proba(x_train)[:, 1])
    test_pred_probas.append(model.predict_proba(x_test)[:, 1])

# Ensemble prediction = element-wise mean over all rounds.
final_y_train_pred_proba = np.mean(train_pred_probas, axis=0)
final_y_test_pred_proba = np.mean(test_pred_probas, axis=0)

# Train-split metrics live in *_train names so the test-split metrics below
# do not overwrite them.
final_roc_auc_train = roc_auc_score(y_train, final_y_train_pred_proba)
print("Final Train ROC AUC:", final_roc_auc_train)
precision, recall, _ = precision_recall_curve(y_train, final_y_train_pred_proba)
final_pr_auc_train = auc(recall, precision)
print("Final Train PR AUC:", final_pr_auc_train)

final_roc_auc_test = roc_auc_score(y_test, final_y_test_pred_proba)
print("Final Test ROC AUC:", final_roc_auc_test)
precision, recall, _ = precision_recall_curve(y_test, final_y_test_pred_proba)
final_pr_auc_test = auc(recall, precision)
print("Final Test PR AUC:", final_pr_auc_test)

Final Train ROC AUC: 0.8206756771782355
Final Train PR AUC: 0.18794474976036102
Final Test ROC AUC: 0.5826620699716305
Final Test PR AUC: 0.05077781928640529


In [50]:
# Under-sampling ensemble (40 rounds, max_depth=8): each round fits one
# XGBoost model on every positive row (폐업여부 == 1) plus a fresh random
# sample of negative rows; per-round probabilities are averaged.
num_iterations = 40

# Class subsets do not change between rounds, so build them once.
non_closed_pool = train_data[train_data['폐업여부'] == 0]
closed_data = train_data[train_data['폐업여부'] == 1]

train_pred_probas = []  # one probability array per round, scored on x_train
test_pred_probas = []   # one probability array per round, scored on x_test

for iteration in range(num_iterations):
    # A different seed per round gives each model a different negative sample.
    non_closed_data = non_closed_pool.sample(n=5000, random_state=iteration + 20)
    new_train_data = pd.concat([non_closed_data, closed_data], axis=0)

    new_x_train = new_train_data.drop(['폐업여부'], axis=1)
    new_y_train = new_train_data['폐업여부']

    # The model seed is the same in every round; only the sample changes.
    model = XGBClassifier(random_state=42, learning_rate=0.1, max_depth=8, n_estimators=100)
    model.fit(new_x_train, new_y_train)

    train_pred_probas.append(model.predict_proba(x_train)[:, 1])
    test_pred_probas.append(model.predict_proba(x_test)[:, 1])

# Ensemble prediction = element-wise mean over all rounds.
final_y_train_pred_proba = np.mean(train_pred_probas, axis=0)
final_y_test_pred_proba = np.mean(test_pred_probas, axis=0)

# Train-split metrics live in *_train names so the test-split metrics below
# do not overwrite them.
final_roc_auc_train = roc_auc_score(y_train, final_y_train_pred_proba)
print("Final Train ROC AUC:", final_roc_auc_train)
precision, recall, _ = precision_recall_curve(y_train, final_y_train_pred_proba)
final_pr_auc_train = auc(recall, precision)
print("Final Train PR AUC:", final_pr_auc_train)

final_roc_auc_test = roc_auc_score(y_test, final_y_test_pred_proba)
print("Final Test ROC AUC:", final_roc_auc_test)
precision, recall, _ = precision_recall_curve(y_test, final_y_test_pred_proba)
final_pr_auc_test = auc(recall, precision)
print("Final Test PR AUC:", final_pr_auc_test)

Final Train ROC AUC: 0.8217850005361431
Final Train PR AUC: 0.18972599858046593
Final Test ROC AUC: 0.5833918075252047
Final Test PR AUC: 0.05098797503458252


In [51]:
# Under-sampling ensemble (20 rounds, max_depth=8): each round fits one
# XGBoost model on every positive row (폐업여부 == 1) plus a fresh random
# sample of negative rows; per-round probabilities are averaged.
num_iterations = 20

# Class subsets do not change between rounds, so build them once.
non_closed_pool = train_data[train_data['폐업여부'] == 0]
closed_data = train_data[train_data['폐업여부'] == 1]

train_pred_probas = []  # one probability array per round, scored on x_train
test_pred_probas = []   # one probability array per round, scored on x_test

for iteration in range(num_iterations):
    # A different seed per round gives each model a different negative sample.
    non_closed_data = non_closed_pool.sample(n=5000, random_state=iteration + 20)
    new_train_data = pd.concat([non_closed_data, closed_data], axis=0)

    new_x_train = new_train_data.drop(['폐업여부'], axis=1)
    new_y_train = new_train_data['폐업여부']

    # The model seed is the same in every round; only the sample changes.
    model = XGBClassifier(random_state=42, learning_rate=0.1, max_depth=8, n_estimators=100)
    model.fit(new_x_train, new_y_train)

    train_pred_probas.append(model.predict_proba(x_train)[:, 1])
    test_pred_probas.append(model.predict_proba(x_test)[:, 1])

# Ensemble prediction = element-wise mean over all rounds.
final_y_train_pred_proba = np.mean(train_pred_probas, axis=0)
final_y_test_pred_proba = np.mean(test_pred_probas, axis=0)

# Train-split metrics live in *_train names so the test-split metrics below
# do not overwrite them.
final_roc_auc_train = roc_auc_score(y_train, final_y_train_pred_proba)
print("Final Train ROC AUC:", final_roc_auc_train)
precision, recall, _ = precision_recall_curve(y_train, final_y_train_pred_proba)
final_pr_auc_train = auc(recall, precision)
print("Final Train PR AUC:", final_pr_auc_train)

final_roc_auc_test = roc_auc_score(y_test, final_y_test_pred_proba)
print("Final Test ROC AUC:", final_roc_auc_test)
precision, recall, _ = precision_recall_curve(y_test, final_y_test_pred_proba)
final_pr_auc_test = auc(recall, precision)
print("Final Test PR AUC:", final_pr_auc_test)

Final Train ROC AUC: 0.8197238145190396
Final Train PR AUC: 0.18702282631684902
Final Test ROC AUC: 0.5801780303082362
Final Test PR AUC: 0.050253261938743236


In [52]:
# Under-sampling ensemble (20 rounds, max_depth=6): each round fits one
# XGBoost model on every positive row (폐업여부 == 1) plus a fresh random
# sample of negative rows; per-round probabilities are averaged.
num_iterations = 20

# Class subsets do not change between rounds, so build them once.
non_closed_pool = train_data[train_data['폐업여부'] == 0]
closed_data = train_data[train_data['폐업여부'] == 1]

train_pred_probas = []  # one probability array per round, scored on x_train
test_pred_probas = []   # one probability array per round, scored on x_test

for iteration in range(num_iterations):
    # A different seed per round gives each model a different negative sample.
    non_closed_data = non_closed_pool.sample(n=5000, random_state=iteration + 20)
    new_train_data = pd.concat([non_closed_data, closed_data], axis=0)

    new_x_train = new_train_data.drop(['폐업여부'], axis=1)
    new_y_train = new_train_data['폐업여부']

    # The model seed is the same in every round; only the sample changes.
    model = XGBClassifier(random_state=42, learning_rate=0.1, max_depth=6, n_estimators=100)
    model.fit(new_x_train, new_y_train)

    train_pred_probas.append(model.predict_proba(x_train)[:, 1])
    test_pred_probas.append(model.predict_proba(x_test)[:, 1])

# Ensemble prediction = element-wise mean over all rounds.
final_y_train_pred_proba = np.mean(train_pred_probas, axis=0)
final_y_test_pred_proba = np.mean(test_pred_probas, axis=0)

# Train-split metrics live in *_train names so the test-split metrics below
# do not overwrite them.
final_roc_auc_train = roc_auc_score(y_train, final_y_train_pred_proba)
print("Final Train ROC AUC:", final_roc_auc_train)
precision, recall, _ = precision_recall_curve(y_train, final_y_train_pred_proba)
final_pr_auc_train = auc(recall, precision)
print("Final Train PR AUC:", final_pr_auc_train)

final_roc_auc_test = roc_auc_score(y_test, final_y_test_pred_proba)
print("Final Test ROC AUC:", final_roc_auc_test)
precision, recall, _ = precision_recall_curve(y_test, final_y_test_pred_proba)
final_pr_auc_test = auc(recall, precision)
print("Final Test PR AUC:", final_pr_auc_test)

Final Train ROC AUC: 0.7779295683192009
Final Train PR AUC: 0.14797310539447905
Final Test ROC AUC: 0.5838659638666908
Final Test PR AUC: 0.05055812961299175


In [53]:
# Under-sampling ensemble (max_depth=8, 200 trees, per-round model seed): each
# round fits one XGBoost model on every positive row (폐업여부 == 1) plus a
# fresh random sample of negative rows; per-round probabilities are averaged.
num_iterations = 20

# Class subsets do not change between rounds, so build them once.
non_closed_pool = train_data[train_data['폐업여부'] == 0]
closed_data = train_data[train_data['폐업여부'] == 1]

train_pred_probas = []  # one probability array per round, scored on x_train
test_pred_probas = []   # one probability array per round, scored on x_test

for iteration in range(num_iterations):
    # A different seed per round gives each model a different negative sample.
    non_closed_data = non_closed_pool.sample(n=5000, random_state=iteration + 20)
    new_train_data = pd.concat([non_closed_data, closed_data], axis=0)

    new_x_train = new_train_data.drop(['폐업여부'], axis=1)
    new_y_train = new_train_data['폐업여부']

    model = XGBClassifier(random_state=iteration + 42, learning_rate=0.1, max_depth=8, n_estimators=200)
    model.fit(new_x_train, new_y_train)

    train_pred_probas.append(model.predict_proba(x_train)[:, 1])
    test_pred_probas.append(model.predict_proba(x_test)[:, 1])

# Ensemble prediction = element-wise mean over all rounds.
final_y_train_pred_proba = np.mean(train_pred_probas, axis=0)
final_y_test_pred_proba = np.mean(test_pred_probas, axis=0)

# Train-split metrics live in *_train names so the test-split metrics below
# do not overwrite them.
final_roc_auc_train = roc_auc_score(y_train, final_y_train_pred_proba)
print("Final Train ROC AUC:", final_roc_auc_train)
precision, recall, _ = precision_recall_curve(y_train, final_y_train_pred_proba)
final_pr_auc_train = auc(recall, precision)
print("Final Train PR AUC:", final_pr_auc_train)

final_roc_auc_test = roc_auc_score(y_test, final_y_test_pred_proba)
print("Final Test ROC AUC:", final_roc_auc_test)
precision, recall, _ = precision_recall_curve(y_test, final_y_test_pred_proba)
final_pr_auc_test = auc(recall, precision)
print("Final Test PR AUC:", final_pr_auc_test)

Final Train ROC AUC: 0.8518852800415591
Final Train PR AUC: 0.21708457683026375
Final Test ROC AUC: 0.5804374292042334
Final Test PR AUC: 0.050121927524910855


In [56]:
# Under-sampling ensemble (1770 negatives per round, max_depth=8): each round
# fits one XGBoost model on every positive row (폐업여부 == 1) plus a fresh
# random sample of negative rows; per-round probabilities are averaged.
num_iterations = 20

# Class subsets do not change between rounds, so build them once.
non_closed_pool = train_data[train_data['폐업여부'] == 0]
closed_data = train_data[train_data['폐업여부'] == 1]

train_pred_probas = []  # one probability array per round, scored on x_train
test_pred_probas = []   # one probability array per round, scored on x_test

for iteration in range(num_iterations):
    # A different seed per round gives each model a different negative sample.
    non_closed_data = non_closed_pool.sample(n=1770, random_state=iteration + 42)
    new_train_data = pd.concat([non_closed_data, closed_data], axis=0)

    new_x_train = new_train_data.drop(['폐업여부'], axis=1)
    new_y_train = new_train_data['폐업여부']

    # The model seed is the same in every round; only the sample changes.
    model = XGBClassifier(random_state=42, learning_rate=0.1, max_depth=8, n_estimators=100)
    model.fit(new_x_train, new_y_train)

    train_pred_probas.append(model.predict_proba(x_train)[:, 1])
    test_pred_probas.append(model.predict_proba(x_test)[:, 1])

# Ensemble prediction = element-wise mean over all rounds.
final_y_train_pred_proba = np.mean(train_pred_probas, axis=0)
final_y_test_pred_proba = np.mean(test_pred_probas, axis=0)

# Train-split metrics live in *_train names so the test-split metrics below
# do not overwrite them.
final_roc_auc_train = roc_auc_score(y_train, final_y_train_pred_proba)
print("Final Train ROC AUC:", final_roc_auc_train)
precision, recall, _ = precision_recall_curve(y_train, final_y_train_pred_proba)
final_pr_auc_train = auc(recall, precision)
print("Final Train PR AUC:", final_pr_auc_train)

final_roc_auc_test = roc_auc_score(y_test, final_y_test_pred_proba)
print("Final Test ROC AUC:", final_roc_auc_test)
precision, recall, _ = precision_recall_curve(y_test, final_y_test_pred_proba)
final_pr_auc_test = auc(recall, precision)
print("Final Test PR AUC:", final_pr_auc_test)

Final Train ROC AUC: 0.8086165033080764
Final Train PR AUC: 0.15187630657970042
Final Test ROC AUC: 0.596274480070901
Final Test PR AUC: 0.05323928929455567


In [57]:
# Under-sampling ensemble (1770 negatives per round, max_depth=8, 50 trees):
# each round fits one XGBoost model on every positive row (폐업여부 == 1) plus
# a fresh random sample of negative rows; per-round probabilities are averaged.
num_iterations = 20

# Class subsets do not change between rounds, so build them once.
non_closed_pool = train_data[train_data['폐업여부'] == 0]
closed_data = train_data[train_data['폐업여부'] == 1]

train_pred_probas = []  # one probability array per round, scored on x_train
test_pred_probas = []   # one probability array per round, scored on x_test

for iteration in range(num_iterations):
    # A different seed per round gives each model a different negative sample.
    non_closed_data = non_closed_pool.sample(n=1770, random_state=iteration + 42)
    new_train_data = pd.concat([non_closed_data, closed_data], axis=0)

    new_x_train = new_train_data.drop(['폐업여부'], axis=1)
    new_y_train = new_train_data['폐업여부']

    # The model seed is the same in every round; only the sample changes.
    model = XGBClassifier(random_state=123, learning_rate=0.1, max_depth=8, n_estimators=50)
    model.fit(new_x_train, new_y_train)

    train_pred_probas.append(model.predict_proba(x_train)[:, 1])
    test_pred_probas.append(model.predict_proba(x_test)[:, 1])

# Ensemble prediction = element-wise mean over all rounds.
final_y_train_pred_proba = np.mean(train_pred_probas, axis=0)
final_y_test_pred_proba = np.mean(test_pred_probas, axis=0)

# Train-split metrics live in *_train names so the test-split metrics below
# do not overwrite them.
final_roc_auc_train = roc_auc_score(y_train, final_y_train_pred_proba)
print("Final Train ROC AUC:", final_roc_auc_train)
precision, recall, _ = precision_recall_curve(y_train, final_y_train_pred_proba)
final_pr_auc_train = auc(recall, precision)
print("Final Train PR AUC:", final_pr_auc_train)

final_roc_auc_test = roc_auc_score(y_test, final_y_test_pred_proba)
print("Final Test ROC AUC:", final_roc_auc_test)
precision, recall, _ = precision_recall_curve(y_test, final_y_test_pred_proba)
final_pr_auc_test = auc(recall, precision)
print("Final Test PR AUC:", final_pr_auc_test)

Final Train ROC AUC: 0.7859837274982774
Final Train PR AUC: 0.134173100713537
Final Test ROC AUC: 0.596537722102033
Final Test PR AUC: 0.05280685065927382
