# 라이브러리

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, roc_auc_score, precision_recall_curve, auc

# combine 데이터

## 데이터 전처리

In [3]:
data = pd.read_csv('/content/drive/Shareddrives/서울시 빅데이터(2022-2)/논문/3. 폐업 여부 분류 모델/0) 데이터/data_combined.csv')

In [4]:
data['폐업여부']=data['폐업여부'].apply(lambda x: 1 if x=="Y" else 0)
data['프랜차이즈여부']=data['프랜차이즈여부'].apply(lambda x: 1 if x=="Y" else 0)

In [5]:
data['표준산업분류코드']=data['표준산업분류코드'].astype('category')
data['행정동코드']=data['행정동코드'].astype('category')
data['분기']=data['분기'].astype('category')

In [6]:
data_encoded = pd.get_dummies(data, columns=['표준산업분류코드', '행정동코드', '분기'])
data_encoded = data_encoded.drop(['점포명'], axis=1)

In [7]:
train_data = data_encoded[data_encoded['년도'] == 2021]
test_data = data_encoded[data_encoded['년도'] == 2022]

In [8]:
train_data = train_data.drop(['년도'], axis=1)
test_data = test_data.drop(['년도'], axis=1)

In [9]:
x_train = train_data.drop(['폐업여부'], axis=1)
y_train = train_data['폐업여부']
x_test = test_data.drop(['폐업여부'], axis=1)
y_test = test_data['폐업여부']

## 모델링

### 기본

In [10]:
model = LogisticRegression()
model.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [13]:
ens_score = model.predict_proba(x_test)[:, 1]
roc_score = roc_auc_score(y_test, ens_score)
print('ROC AUC 값: {0:.4f}'.format(roc_score))

precision, recall, _ = precision_recall_curve(y_test, ens_score)
pr_auc_score = auc(recall, precision)
print("PR AUC:", pr_auc_score)

ROC AUC 값: 0.5821
PR AUC: 0.049491021848357145


### 앙상블

In [14]:
roc_auc_scores = []
pr_auc_scores = []
final_y_train_pred_proba = []
final_y_test_pred_proba = []

num_iterations = 20

for iteration in range(num_iterations):

    non_closed_data = train_data[train_data['폐업여부'] == 0].sample(n=5000, random_state= iteration+426)
    closed_data = train_data[train_data['폐업여부'] == 1]

    new_train_data = pd.concat([non_closed_data, closed_data], axis=0)

    new_x_train = new_train_data.drop(['폐업여부'], axis=1)
    new_y_train = new_train_data['폐업여부']

    model = LogisticRegression(max_iter=1000)

    model.fit(new_x_train, new_y_train)

    final_y_train_pred_proba.append(model.predict_proba(x_train)[:, 1])
    final_y_test_pred_proba.append(model.predict_proba(x_test)[:, 1])

final_y_train_pred_proba = np.mean(final_y_train_pred_proba, axis=0)
final_y_test_pred_proba = np.mean(final_y_test_pred_proba, axis=0)

final_roc_auc_test = roc_auc_score(y_train,final_y_train_pred_proba)
print("Final Train ROC AUC:", final_roc_auc_test)
precision, recall, _ = precision_recall_curve(y_train,final_y_train_pred_proba)
final_pr_auc_test = auc(recall, precision)
print("Final Train PR AUC:", final_pr_auc_test)

final_roc_auc_test = roc_auc_score(y_test, final_y_test_pred_proba)
print("Final Test ROC AUC:", final_roc_auc_test)
precision, recall, _ = precision_recall_curve(y_test, final_y_test_pred_proba)
final_pr_auc_test = auc(recall, precision)
print("Final Test PR AUC:", final_pr_auc_test)

Final Train ROC AUC: 0.7017846774664214
Final Train PR AUC: 0.08000638016326371
Final Test ROC AUC: 0.5835384065873066
Final Test PR AUC: 0.049542925171336336


# total 데이터

## 데이터 전처리

In [15]:
data = pd.read_csv('/content/drive/Shareddrives/서울시 빅데이터(2022-2)/논문/3. 폐업 여부 분류 모델/0) 데이터/data_total.csv')

In [16]:
data['폐업여부']=data['폐업여부'].apply(lambda x: 1 if x=="Y" else 0)
data['프랜차이즈여부']=data['프랜차이즈여부'].apply(lambda x: 1 if x=="Y" else 0)

In [17]:
data['표준산업분류코드']=data['표준산업분류코드'].astype('category')
data['행정동코드']=data['행정동코드'].astype('category')
data['분기']=data['분기'].astype('category')

In [19]:
data_encoded = pd.get_dummies(data, columns=['표준산업분류코드', '행정동코드', '분기'])
data_encoded = data_encoded.drop(['점포명'], axis=1)

In [20]:
train_data = data_encoded[data_encoded['년도'] == 2021]
test_data = data_encoded[data_encoded['년도'] == 2022]

In [21]:
train_data = train_data.drop(['년도'], axis=1)
test_data = test_data.drop(['년도'], axis=1)

In [22]:
x_train = train_data.drop(['폐업여부'], axis=1)
y_train = train_data['폐업여부']
x_test = test_data.drop(['폐업여부'], axis=1)
y_test = test_data['폐업여부']

## 모델링

### 기본

In [23]:
model = LogisticRegression()
model.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [24]:
ens_score = model.predict_proba(x_test)[:, 1]
roc_score = roc_auc_score(y_test, ens_score)
print('ROC AUC 값: {0:.4f}'.format(roc_score))

precision, recall, _ = precision_recall_curve(y_test, ens_score)
pr_auc_score = auc(recall, precision)
print("PR AUC:", pr_auc_score)

ROC AUC 값: 0.5820
PR AUC: 0.04951732896922671


### 앙상블

In [25]:
roc_auc_scores = []
pr_auc_scores = []
final_y_train_pred_proba = []
final_y_test_pred_proba = []

num_iterations = 20

for iteration in range(num_iterations):

    non_closed_data = train_data[train_data['폐업여부'] == 0].sample(n=5000, random_state= iteration+426)
    closed_data = train_data[train_data['폐업여부'] == 1]

    new_train_data = pd.concat([non_closed_data, closed_data], axis=0)

    new_x_train = new_train_data.drop(['폐업여부'], axis=1)
    new_y_train = new_train_data['폐업여부']

    model = LogisticRegression(max_iter=1000)

    model.fit(new_x_train, new_y_train)

    final_y_train_pred_proba.append(model.predict_proba(x_train)[:, 1])
    final_y_test_pred_proba.append(model.predict_proba(x_test)[:, 1])

final_y_train_pred_proba = np.mean(final_y_train_pred_proba, axis=0)
final_y_test_pred_proba = np.mean(final_y_test_pred_proba, axis=0)

final_roc_auc_test = roc_auc_score(y_train,final_y_train_pred_proba)
print("Final Train ROC AUC:", final_roc_auc_test)
precision, recall, _ = precision_recall_curve(y_train,final_y_train_pred_proba)
final_pr_auc_test = auc(recall, precision)
print("Final Train PR AUC:", final_pr_auc_test)

final_roc_auc_test = roc_auc_score(y_test, final_y_test_pred_proba)
print("Final Test ROC AUC:", final_roc_auc_test)
precision, recall, _ = precision_recall_curve(y_test, final_y_test_pred_proba)
final_pr_auc_test = auc(recall, precision)
print("Final Test PR AUC:", final_pr_auc_test)

Final Train ROC AUC: 0.7017903443097664
Final Train PR AUC: 0.08000557585005223
Final Test ROC AUC: 0.5832600210766693
Final Test PR AUC: 0.04948899378056962


# preindex 데이터

## 데이터 전처리

In [26]:
data = pd.read_csv('/content/drive/Shareddrives/서울시 빅데이터(2022-2)/논문/3. 폐업 여부 분류 모델/0) 데이터/data_PredIndex.csv')

In [27]:
data['폐업여부']=data['폐업여부'].apply(lambda x: 1 if x=="Y" else 0)
data['프랜차이즈여부']=data['프랜차이즈여부'].apply(lambda x: 1 if x=="Y" else 0)

In [28]:
data['표준산업분류코드']=data['표준산업분류코드'].astype('category')
data['행정동코드']=data['행정동코드'].astype('category')
data['분기']=data['분기'].astype('category')

In [29]:
data_encoded = pd.get_dummies(data, columns=['표준산업분류코드', '행정동코드', '분기'])
data_encoded = data_encoded.drop(['점포명'], axis=1)

In [30]:
train_data = data_encoded[data_encoded['년도'] == 2021]
test_data = data_encoded[data_encoded['년도'] == 2022]

In [31]:
train_data = train_data.drop(['년도'], axis=1)
test_data = test_data.drop(['년도'], axis=1)

In [32]:
x_train = train_data.drop(['폐업여부'], axis=1)
y_train = train_data['폐업여부']
x_test = test_data.drop(['폐업여부'], axis=1)
y_test = test_data['폐업여부']

## 모델링

### 기본

In [33]:
model = LogisticRegression()
model.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [34]:
ens_score = model.predict_proba(x_test)[:, 1]
roc_score = roc_auc_score(y_test, ens_score)
print('ROC AUC 값: {0:.4f}'.format(roc_score))

precision, recall, _ = precision_recall_curve(y_test, ens_score)
pr_auc_score = auc(recall, precision)
print("PR AUC:", pr_auc_score)

ROC AUC 값: 0.5851
PR AUC: 0.049742648144799714


### 앙상블

In [35]:
roc_auc_scores = []
pr_auc_scores = []
final_y_train_pred_proba = []
final_y_test_pred_proba = []

num_iterations = 20

for iteration in range(num_iterations):

    non_closed_data = train_data[train_data['폐업여부'] == 0].sample(n=5000, random_state= iteration+426)
    closed_data = train_data[train_data['폐업여부'] == 1]

    new_train_data = pd.concat([non_closed_data, closed_data], axis=0)

    new_x_train = new_train_data.drop(['폐업여부'], axis=1)
    new_y_train = new_train_data['폐업여부']

    model = LogisticRegression(max_iter=1000)

    model.fit(new_x_train, new_y_train)

    final_y_train_pred_proba.append(model.predict_proba(x_train)[:, 1])
    final_y_test_pred_proba.append(model.predict_proba(x_test)[:, 1])

final_y_train_pred_proba = np.mean(final_y_train_pred_proba, axis=0)
final_y_test_pred_proba = np.mean(final_y_test_pred_proba, axis=0)

final_roc_auc_test = roc_auc_score(y_train,final_y_train_pred_proba)
print("Final Train ROC AUC:", final_roc_auc_test)
precision, recall, _ = precision_recall_curve(y_train,final_y_train_pred_proba)
final_pr_auc_test = auc(recall, precision)
print("Final Train PR AUC:", final_pr_auc_test)

final_roc_auc_test = roc_auc_score(y_test, final_y_test_pred_proba)
print("Final Test ROC AUC:", final_roc_auc_test)
precision, recall, _ = precision_recall_curve(y_test, final_y_test_pred_proba)
final_pr_auc_test = auc(recall, precision)
print("Final Test PR AUC:", final_pr_auc_test)

Final Train ROC AUC: 0.700134337591825
Final Train PR AUC: 0.07970866907749101
Final Test ROC AUC: 0.5864943119818418
Final Test PR AUC: 0.04987625838318083
