In [None]:
import pandas as pd

In [None]:
# Read the dataset and discard the 'Unnamed: 0' column
# (presumably an index written by an earlier CSV export - confirm).
df = pd.read_csv('dataset_final.csv').drop(columns='Unnamed: 0')

In [None]:
# Summarise the loaded frame: its dimensions first, then the per-column report.
print('dataset shape: ', df.shape)
# DataFrame.info() writes its report to stdout and returns None, so wrapping it
# in print() would append a stray "None" line after the report.
df.info()

dataset shape:  (1241, 27)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1241 entries, 0 to 1240
Data columns (total 27 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   행정동_코드             1241 non-null   int64  
 1   행정동명               1241 non-null   object 
 2   상권_구분_코드           1241 non-null   object 
 3   상권_코드              1241 non-null   int64  
 4   상권_코드_명            1241 non-null   object 
 5   분기당_매출_금액          1241 non-null   int64  
 6   분기당_매출_건수          1241 non-null   int64  
 7   건당_매출액_평균          1241 non-null   int64  
 8   환산임대료(원, 21년도)     1241 non-null   int64  
 9   증감율(%, 19-21년도)    1241 non-null   float64
 10  임대료/매출             1241 non-null   float64
 11  매출/임대료             1241 non-null   float64
 12  점포수                1241 non-null   int64  
 13  면적                 1241 non-null   float64
 14  단위면적당 점포 수(k㎡)     1241 non-null   float64
 15  프랜차이즈_점포_수         1241 non-null   int64  
 1

In [None]:
# Number of rows per class in the target column.
label_counts = df['활성화_여부'].value_counts()
print(label_counts)

2    841
1    300
0    100
Name: 활성화_여부, dtype: int64


식별자 컬럼 제거

In [None]:
# Remove the identifier columns (district / trade-area codes and names) in place.
id_columns = ['행정동_코드', '행정동명', '상권_구분_코드', '상권_코드', '상권_코드_명']
df.drop(columns=id_columns, inplace=True)

분석에 사용할 속성 선택

In [None]:
# Attributes chosen for the analysis.
# NOTE(review): no later cell visible in this notebook reads `columns`; the
# feature matrix is built positionally with iloc instead - confirm intent.
columns = [
    '분기당_매출_금액',
    '분기당_매출_건수',
    '환산임대료(원, 21년도)',
    '점포수',
    '단위면적당 점포 수(k㎡)',
    '프랜차이즈_점포_수',
    '개인사업자_수',
    '2021년_2030여성_비율평균',
    '2021년_외국인_평균',
    '문화시설수',
    '지하철_역_수',
    '버스_정거장_수',
    '전체_카페_점포_수',
    '개인카페_점포수',
]

클래스 데이터셋과 피처 데이터셋 분리

In [None]:
# The final column is taken as the class label; every other column is a feature.
y_labels = df.iloc[:, -1]
X_features = df.iloc[:, :-1]
print(f'피처 데이터 shape:{X_features.shape}')

피처 데이터 shape:(1241, 21)


피처 변수 전처리(표준화)

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardise every feature column with StandardScaler's default settings.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# performed further down, so test rows contribute to the scaling statistics.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_features)

In [None]:
# Wrap the scaled array in a DataFrame again so the original column labels are kept.
feature_names = X_features.columns
X_features = pd.DataFrame(X_scaled, columns=feature_names)

학습 데이터셋과 테스트 데이터셋 분리

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
# NOTE(review): no `stratify=` is passed, so the class proportions of the two
# partitions are not forced to match.
X_train, X_test, y_train, y_test = train_test_split(
    X_features, y_labels, test_size=0.3, random_state=101
)

train_cnt, test_cnt = y_train.count(), y_test.count()
print(f'학습 세트 Shape:{X_train.shape}, 테스트 세트 Shape:{X_test.shape}')

# Relative frequency of each class label in the two partitions.
train_ratio = y_train.value_counts() / train_cnt
test_ratio = y_test.value_counts() / test_cnt

print(' 학습 세트 레이블 값 분포 비율')
print(train_ratio)
print('\n 테스트 세트 레이블 값 분포 비율')
print(test_ratio)

학습 세트 Shape:(868, 21), 테스트 세트 Shape:(373, 21)
 학습 세트 레이블 값 분포 비율
2    0.677419
1    0.247696
0    0.074885
Name: 활성화_여부, dtype: float64

 테스트 세트 레이블 값 분포 비율
2    0.678284
1    0.227882
0    0.093834
Name: 활성화_여부, dtype: float64


## SVM

SVM 모형 학습

In [None]:
from sklearn import svm
from sklearn.metrics import f1_score

In [None]:
# Support-vector classifier; only the decision-function shape ('ovo') is
# overridden, every other hyper-parameter keeps its default.
clf = svm.SVC(decision_function_shape='ovo')

SVC(decision_function_shape='ovo')

In [None]:
# SVM model: train on the training split, predict the test split and report
# the micro-averaged F1 score.
pred = clf.fit(X_train, y_train).predict(X_test)
f1 = f1_score(y_true=y_test, y_pred=pred, average='micro')
print(f1)

0.6890080428954424


## LightGBM

LightGBM 모델 학습 및 f1-score 확인

In [None]:
from lightgbm import LGBMClassifier
from lightgbm import early_stopping, log_evaluation
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV

In [None]:
# LightGBM classifier with a multiclass objective and n_estimators set to 300.
lgbm_clf = LGBMClassifier(n_estimators=300, objective='multiclass')

In [None]:
# Evaluation data monitored while boosting.
# NOTE(review): this is the test split, so early stopping is decided on the
# same rows that are later used to report the F1 score.
evals = [(X_test, y_test)]

# The `early_stopping_rounds` and `verbose` keyword arguments of fit() were
# removed in LightGBM 4.0; the same behaviour is requested through callbacks.
lgbm_clf.fit(
    X_train,
    y_train,
    eval_set=evals,
    callbacks=[
        early_stopping(stopping_rounds=100),  # stop after 100 rounds without improvement
        log_evaluation(period=1),             # print the eval metric every iteration
    ],
)

[1]	valid_0's multi_logloss: 0.799956
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's multi_logloss: 0.782859
[3]	valid_0's multi_logloss: 0.767243
[4]	valid_0's multi_logloss: 0.756791
[5]	valid_0's multi_logloss: 0.745836
[6]	valid_0's multi_logloss: 0.738513
[7]	valid_0's multi_logloss: 0.733716
[8]	valid_0's multi_logloss: 0.727355
[9]	valid_0's multi_logloss: 0.723117
[10]	valid_0's multi_logloss: 0.717622
[11]	valid_0's multi_logloss: 0.713845
[12]	valid_0's multi_logloss: 0.710566
[13]	valid_0's multi_logloss: 0.710288
[14]	valid_0's multi_logloss: 0.707629
[15]	valid_0's multi_logloss: 0.705356
[16]	valid_0's multi_logloss: 0.705136
[17]	valid_0's multi_logloss: 0.705464
[18]	valid_0's multi_logloss: 0.708216
[19]	valid_0's multi_logloss: 0.708929
[20]	valid_0's multi_logloss: 0.70612
[21]	valid_0's multi_logloss: 0.706234
[22]	valid_0's multi_logloss: 0.705568
[23]	valid_0's multi_logloss: 0.70536
[24]	valid_0's multi_logloss: 0.706971
[25]	valid_0

LGBMClassifier(n_estimators=300, objective='multiclass')

In [None]:
# LightGBM model: predict the test split and report the micro-averaged F1 score.
pred = lgbm_clf.predict(X_test)
f1 = f1_score(y_true=y_test, y_pred=pred, average='micro')
print(f1)

0.6916890080428955


하이퍼 파라미터 튜닝

In [None]:
# Fresh, unfitted estimator used as the base model for the hyper-parameter search.
lgbm_clf = LGBMClassifier(n_estimators=300, objective='multiclass')

In [None]:
# Candidate values per hyper-parameter; the grid search tries every combination.
# NOTE(review): LightGBM documents that `subsample` only takes effect when
# `subsample_freq` is positive, and the estimator does not set it - confirm
# whether the subsample axis changes anything.
params = dict(
    num_leaves=[32, 64, 128],
    max_depth=[32, 64, 128],
    min_child_samples=[60, 80, 100],
    subsample=[0.5, 0.7, 0.9],
)


In [None]:
# Exhaustive search over `params` with GridSearchCV's default CV and scoring.
gridcv = GridSearchCV(lgbm_clf, param_grid=params)

# fit() no longer accepts `early_stopping_rounds` as of LightGBM 4.0, so early
# stopping is supplied as a callback; GridSearchCV forwards the extra keyword
# arguments to every underlying LGBMClassifier.fit call.
# Per-iteration logging is deliberately not requested: the search trains many
# models and the log previously overflowed the notebook output.
# NOTE(review): the eval sets contain the full training split (including each
# fold's validation rows) and the test split, so early stopping sees test data.
gridcv.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    callbacks=[early_stopping(stopping_rounds=30, verbose=False)],
)

print('GridSearchCV 최적 파라미터:', gridcv.best_params_)

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
[31]	valid_0's multi_logloss: 0.58892	valid_1's multi_logloss: 0.720849
[32]	valid_0's multi_logloss: 0.58625	valid_1's multi_logloss: 0.720667
[33]	valid_0's multi_logloss: 0.583159	valid_1's multi_logloss: 0.721271
[34]	valid_0's multi_logloss: 0.57973	valid_1's multi_logloss: 0.721052
[35]	valid_0's multi_logloss: 0.577494	valid_1's multi_logloss: 0.721903
[36]	valid_0's multi_logloss: 0.573873	valid_1's multi_logloss: 0.722288
[37]	valid_0's multi_logloss: 0.5714	valid_1's multi_logloss: 0.72213
[38]	valid_0's multi_logloss: 0.56854	valid_1's multi_logloss: 0.723338
[39]	valid_0's multi_logloss: 0.565879	valid_1's multi_logloss: 0.723207
[40]	valid_0's multi_logloss: 0.562126	valid_1's multi_logloss: 0.722624
[41]	valid_0's multi_logloss: 0.560006	valid_1's multi_logloss: 0.723687
[42]	valid_0's multi_logloss: 0.557415	valid_1's multi_logloss: 0.723401
[43]	valid_0's multi_logloss: 0.553951	valid_1's multi_logloss: 0.723561
[44]	val

GridSearchCV 최적 파라미터: {'max_depth': 32, 'min_child_samples': 80, 'num_leaves': 32, 'subsample': 0.5}

In [None]:
# Rebuild the classifier with the parameter values reported by the grid search.
# NOTE(review): n_estimators is 100 here, whereas the searched estimator used 300.
tuned_params = {
    'max_depth': 32,
    'min_child_samples': 80,
    'num_leaves': 32,
    'subsample': 0.5,
}
lgbm_clf = LGBMClassifier(n_estimators=100, objective='multiclass', **tuned_params)

In [None]:
# Evaluation data monitored while boosting.
# NOTE(review): this is the test split, so early stopping is decided on the
# same rows that are later used for the reported metrics.
evals = [(X_test, y_test)]

# The `early_stopping_rounds` and `verbose` keyword arguments of fit() were
# removed in LightGBM 4.0; the same behaviour is requested through callbacks.
lgbm_clf.fit(
    X_train,
    y_train,
    eval_set=evals,
    callbacks=[
        early_stopping(stopping_rounds=50),  # stop after 50 rounds without improvement
        log_evaluation(period=1),            # print the eval metric every iteration
    ],
)

[1]	valid_0's multi_logloss: 0.804943
Training until validation scores don't improve for 50 rounds.
[2]	valid_0's multi_logloss: 0.790235
[3]	valid_0's multi_logloss: 0.778897
[4]	valid_0's multi_logloss: 0.769507
[5]	valid_0's multi_logloss: 0.760656
[6]	valid_0's multi_logloss: 0.754734
[7]	valid_0's multi_logloss: 0.747923
[8]	valid_0's multi_logloss: 0.742335
[9]	valid_0's multi_logloss: 0.737157
[10]	valid_0's multi_logloss: 0.734347
[11]	valid_0's multi_logloss: 0.729229
[12]	valid_0's multi_logloss: 0.725986
[13]	valid_0's multi_logloss: 0.724901
[14]	valid_0's multi_logloss: 0.72104
[15]	valid_0's multi_logloss: 0.717663
[16]	valid_0's multi_logloss: 0.716503
[17]	valid_0's multi_logloss: 0.713752
[18]	valid_0's multi_logloss: 0.710967
[19]	valid_0's multi_logloss: 0.709508
[20]	valid_0's multi_logloss: 0.708637
[21]	valid_0's multi_logloss: 0.70697
[22]	valid_0's multi_logloss: 0.706333
[23]	valid_0's multi_logloss: 0.705291
[24]	valid_0's multi_logloss: 0.704678
[25]	valid_0'

LGBMClassifier(max_depth=32, min_child_samples=80, num_leaves=32,
               objective='multiclass', subsample=0.5)

In [None]:
# Predict class labels for the test split with the re-fitted classifier.
pred = lgbm_clf.predict(X_test)

검증

In [None]:
# Micro-averaged F1 score of the tuned model on the test split.
f1 = f1_score(y_true=y_test, y_pred=pred, average='micro')
print(f1)

0.7050938337801609


In [None]:
# Confusion matrix for the test split; scikit-learn lays it out with true
# labels as rows and predicted labels as columns.
confusion_matrix(y_test, pred)

array([[ 14,   4,  17],
       [  5,   5,  75],
       [  4,   5, 244]])