# 행정구역별로 상권 나누기

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
# Environment setup
# Reset matplotlib to its defaults, then pick a font that can render Hangul
# and keep the minus sign renderable with that font.
plt.rcdefaults()
plt.rcParams["font.family"] = 'Haansoft Dotum'
plt.rcParams['axes.unicode_minus'] = False
# `display` is imported from the public IPython.display module; importing it
# from IPython.core.display is deprecated.
from IPython.display import display, HTML
# Widen the notebook container to the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

- df_merge = 최종 통합 데이터
- df_base = 서울시 우리마을가게 상권분석서비스(상권영역)

In [3]:
# Add a placeholder '상권_코드' (trade-area code) column, initialised to 0,
# so the original code can be restored into it.
df_merge['상권_코드'] = 0
df_merge.head()

Unnamed: 0.1,Unnamed: 0,기준_년_코드,기준_분기_코드,상권코드_0,상권코드_1,상권코드_2,상권코드_3,상권코드_4,상권코드_5,상권코드_6,...,총_유동인구_수,아파트_단지_수,아파트_평균_면적,아파트_평균_시가,총 상주인구 수,집객시설_수,당월_매출_금액,총_직장_인구_수,폐업률,상권_코드
0,0,2020.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,90544.0,14.48,82.273333,294285300.0,24.0,52.0,167195900000.0,15904.0,2.587177,0
1,1,2020.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3026454.0,22.0,89.0,527347100.0,8221.0,242.0,154618100000.0,24375.0,3.026257,0
2,2,2020.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4033679.0,5.0,39.0,112097900.0,1924.0,327.0,259239800000.0,32935.0,2.081362,0
3,3,2020.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3262023.0,8.0,53.0,237967400.0,2427.0,165.0,246639300000.0,6054.0,2.107308,0
4,4,2020.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3764785.0,14.48,82.273333,294285300.0,1104.0,593.0,376621400000.0,104830.0,2.16308,0


In [4]:
# Trade-area codes in the merged data are encoded as columns
# 상권코드_0 ... 상권코드_1495 (1,496 trade areas).
# Show the first 1,499 column labels.
df_merge.columns[:1499]

Index(['Unnamed: 0', '기준_년_코드', '기준_분기_코드', '상권코드_0', '상권코드_1', '상권코드_2',
       '상권코드_3', '상권코드_4', '상권코드_5', '상권코드_6',
       ...
       '상권코드_1486', '상권코드_1487', '상권코드_1488', '상권코드_1489', '상권코드_1490',
       '상권코드_1491', '상권코드_1492', '상권코드_1493', '상권코드_1494', '상권코드_1495'],
      dtype='object', length=1499)

### 1. Encoding 값을 다시 상권코드로 변환하기
- KNN 결측치 채우기 작업 파일 조회 결과 : 상권코드_0 = 상권_코드 1000001, 상권코드_1 = 상권_코드 1000002 ..... 상권코드_1495 = 상권_코드 1001496

In [5]:
# Restore each row's trade-area code from its one-hot column.
# According to the KNN imputation notebook, column 상권코드_i corresponds to
# 상권_코드 1000001 + i (상권코드_0 -> 1000001, ..., 상권코드_1495 -> 1001496).
# A boolean mask updates every matching row in one assignment instead of
# writing the rows one index label at a time.
for i in range(1496):
    is_this_area = df_merge['상권코드_' + str(i)] == 1
    df_merge.loc[is_this_area, '상권_코드'] = i + 1000001

In [6]:
# Check the result: the last column should now hold the restored code
df_merge.head()

Unnamed: 0.1,Unnamed: 0,기준_년_코드,기준_분기_코드,상권코드_0,상권코드_1,상권코드_2,상권코드_3,상권코드_4,상권코드_5,상권코드_6,...,총_유동인구_수,아파트_단지_수,아파트_평균_면적,아파트_평균_시가,총 상주인구 수,집객시설_수,당월_매출_금액,총_직장_인구_수,폐업률,상권_코드
0,0,2020.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,90544.0,14.48,82.273333,294285300.0,24.0,52.0,167195900000.0,15904.0,2.587177,1001496
1,1,2020.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3026454.0,22.0,89.0,527347100.0,8221.0,242.0,154618100000.0,24375.0,3.026257,1001495
2,2,2020.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4033679.0,5.0,39.0,112097900.0,1924.0,327.0,259239800000.0,32935.0,2.081362,1001494
3,3,2020.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3262023.0,8.0,53.0,237967400.0,2427.0,165.0,246639300000.0,6054.0,2.107308,1001493
4,4,2020.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3764785.0,14.48,82.273333,294285300.0,1104.0,593.0,376621400000.0,104830.0,2.16308,1001492


In [7]:
# Check the result: select the rows whose one-hot column 상권코드_1495 is set
c = df_merge[df_merge['상권코드_1495'] == 1]
c

Unnamed: 0.1,Unnamed: 0,기준_년_코드,기준_분기_코드,상권코드_0,상권코드_1,상권코드_2,상권코드_3,상권코드_4,상권코드_5,상권코드_6,...,총_유동인구_수,아파트_단지_수,아파트_평균_면적,아파트_평균_시가,총 상주인구 수,집객시설_수,당월_매출_금액,총_직장_인구_수,폐업률,상권_코드
0,0,2020.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,90544.0,14.48,82.273333,294285300.0,24.0,52.0,167195900000.0,15904.0,2.587177,1001496
1475,1475,2020.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,97812.0,9.0,98.293333,505200100.0,24.0,52.0,156432700000.0,15904.0,3.638368,1001496
2950,2950,2019.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,130493.0,23.82,69.953333,213890000.0,24.0,52.0,223148500000.0,15904.0,2.031063,1001496
4425,4425,2019.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,125094.0,16.92,62.546667,224651200.0,26.0,52.0,178673800000.0,18430.0,1.979045,1001496
5918,5918,2019.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,118702.0,12.206667,86.08,461499300.0,26.0,52.0,173926100000.0,18430.0,3.055229,1001496
7411,7411,2019.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,114633.0,20.733333,76.26,329685100.0,26.0,52.0,168780600000.0,26319.0,2.073733,1001496
8903,8903,2018.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,56902.0,36.306667,66.273333,279908700.0,26.0,52.0,174412100000.0,22562.0,3.189066,1001496
10246,10246,2018.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,52633.0,18.326667,73.36,360949100.0,24.0,52.0,159820400000.0,22562.0,3.670745,1001496
11589,11589,2018.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,51464.0,38.16,66.98,237347400.0,24.0,52.0,150414800000.0,22562.0,4.189044,1001496
12932,12932,2018.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,45386.0,27.36,71.333333,284087000.0,21.0,52.0,147801200000.0,22435.0,3.870293,1001496


In [8]:
# NOTE: this binds a second name to the same DataFrame object; it is not a
# copy, so later writes through df_merge_new also modify df_merge.
df_merge_new = df_merge

## 행정구역별 상권 탐색하기

- df_code = '자치구 코드'와 '자치구 명' 정보 파일

In [10]:
# Show the district-code / district-name lookup table
df_code

Unnamed: 0,자치구 코드,자치구 명
0,1100000000,인근지역
1,1111000000,종로구
2,1114000000,중구
3,1117000000,용산구
4,1120000000,성동구
5,1121500000,광진구
6,1123000000,동대문구
7,1126000000,중랑구
8,1129000000,성북구
9,1130500000,강북구


In [11]:
# Row count of the district table: the Seoul districts plus the extra
# '인근지역' (neighbouring area) entry
len(df_code) # 26 rows = 25 districts + '인근지역'

26

In [12]:
# Seoul "Our Village Store" commercial-district analysis service
# (trade-area boundary data)
df_base.head()

Unnamed: 0,기준_년월_코드,상권_구분_코드,상권_구분_코드_명,상권_코드,상권_코드_명,엑스좌표_값,와이좌표_값,시군구_코드,행정동_코드,형태정보
0,201810,R,전통시장,1001453,낙성대시장,196121,442084,11620,11620585,
1,201810,R,전통시장,1001454,봉천제일종합시장,195147,442413,11620,11620595,
2,201810,R,전통시장,1001474,도곡시장,204551,444227,11680,11680650,
3,201810,R,전통시장,1001475,강남개포시장,206065,443310,11680,11680670,
4,201810,R,전통시장,1001412,화곡본동시장,186203,449328,11500,11500590,


In [13]:
# Number of unique 시군구_코드 values = 25, the same as the number of
# districts in Seoul
df_base['시군구_코드'].unique()

array([11620, 11680, 11500, 11545, 11380, 11710, 11590, 11560, 11530,
       11470, 11650, 11110, 11170, 11140, 11305, 11320, 11290, 11200,
       11230, 11215, 11260, 11440, 11740, 11410, 11350], dtype=int64)

- '상권영역' 파일의 시군구_코드는 자치구 코드의 앞 5자리를 가져온 것이라 판단함  
- 실제로 '상권영역' 파일에서 시군구_코드가 11350인 상권의 상권명을 검색해보면 모두 노원구에 위치하고 있고, 노원구의 자치구 코드는 1135000000임

In [14]:
# Keep only the first five digits of each district code so that it lines up
# with df_base['시군구_코드'] (e.g. 1135000000 -> 11350).
# Integer floor division keeps the arithmetic exact and vectorised; there is
# no round trip through float.
# NOTE: the column is overwritten in place, so re-running this cell would
# truncate the already-shortened codes a second time.
new_code = (df_code['자치구 코드'] // 10**5).tolist()
df_code['자치구 코드'] = new_code
# Drop index label 0, which is the '인근지역' row rather than a district.
df_code_new = df_code.drop(0)
df_code_new

Unnamed: 0,자치구 코드,자치구 명
1,11110,종로구
2,11140,중구
3,11170,용산구
4,11200,성동구
5,11215,광진구
6,11230,동대문구
7,11260,중랑구
8,11290,성북구
9,11305,강북구
10,11320,도봉구


In [15]:
# Lookup table: five-digit district code -> district name
code_name_dict = dict(zip(df_code_new['자치구 코드'], df_code_new['자치구 명']))
code_name_dict

{11110: '종로구',
 11140: '중구',
 11170: '용산구',
 11200: '성동구',
 11215: '광진구',
 11230: '동대문구',
 11260: '중랑구',
 11290: '성북구',
 11305: '강북구',
 11320: '도봉구',
 11350: '노원구',
 11380: '은평구',
 11410: '서대문구',
 11440: '마포구',
 11470: '양천구',
 11500: '강서구',
 11530: '구로구',
 11545: '금천구',
 11560: '영등포구',
 11590: '동작구',
 11620: '관악구',
 11650: '서초구',
 11680: '강남구',
 11710: '송파구',
 11740: '강동구'}

In [16]:
# Group the trade-area codes by district: code_list[k] holds the 상권_코드
# values of the k-th district, in df_base['시군구_코드'].unique() order
district_codes = df_base['시군구_코드'].unique()
code_list = [
    list(df_base.loc[df_base['시군구_코드'] == district, '상권_코드'])
    for district in district_codes
]
district_codes

array([11620, 11680, 11500, 11545, 11380, 11710, 11590, 11560, 11530,
       11470, 11650, 11110, 11170, 11140, 11305, 11320, 11290, 11200,
       11230, 11215, 11260, 11440, 11740, 11410, 11350], dtype=int64)

In [17]:
# Sanity check: total number of trade-area codes across all district lists
count = sum(map(len, code_list))
count

1496

## 자치구별로 상권코드 분류하기

In [18]:
# Merged data with the trade-area code restored
df_merge_new.head()

Unnamed: 0.1,Unnamed: 0,기준_년_코드,기준_분기_코드,상권코드_0,상권코드_1,상권코드_2,상권코드_3,상권코드_4,상권코드_5,상권코드_6,...,총_유동인구_수,아파트_단지_수,아파트_평균_면적,아파트_평균_시가,총 상주인구 수,집객시설_수,당월_매출_금액,총_직장_인구_수,폐업률,상권_코드
0,0,2020.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,90544.0,14.48,82.273333,294285300.0,24.0,52.0,167195900000.0,15904.0,2.587177,1001496
1,1,2020.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3026454.0,22.0,89.0,527347100.0,8221.0,242.0,154618100000.0,24375.0,3.026257,1001495
2,2,2020.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4033679.0,5.0,39.0,112097900.0,1924.0,327.0,259239800000.0,32935.0,2.081362,1001494
3,3,2020.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3262023.0,8.0,53.0,237967400.0,2427.0,165.0,246639300000.0,6054.0,2.107308,1001493
4,4,2020.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3764785.0,14.48,82.273333,294285300.0,1104.0,593.0,376621400000.0,104830.0,2.16308,1001492


In [19]:
import warnings
# NOTE(review): with no category given, this silences every warning from
# here on, not only the ones raised by the cells below — confirm that is
# intended.
warnings.filterwarnings(action='ignore')
# Create the district-code and district-name columns, both initialised to 0
df_merge_new['시군구_코드'] = 0
df_merge_new['자치구_명'] = 0
df_merge_new.head()

Unnamed: 0.1,Unnamed: 0,기준_년_코드,기준_분기_코드,상권코드_0,상권코드_1,상권코드_2,상권코드_3,상권코드_4,상권코드_5,상권코드_6,...,아파트_평균_면적,아파트_평균_시가,총 상주인구 수,집객시설_수,당월_매출_금액,총_직장_인구_수,폐업률,상권_코드,시군구_코드,자치구_명
0,0,2020.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,82.273333,294285300.0,24.0,52.0,167195900000.0,15904.0,2.587177,1001496,0,0
1,1,2020.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,89.0,527347100.0,8221.0,242.0,154618100000.0,24375.0,3.026257,1001495,0,0
2,2,2020.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,39.0,112097900.0,1924.0,327.0,259239800000.0,32935.0,2.081362,1001494,0,0
3,3,2020.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,53.0,237967400.0,2427.0,165.0,246639300000.0,6054.0,2.107308,1001493,0,0
4,4,2020.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,82.273333,294285300.0,1104.0,593.0,376621400000.0,104830.0,2.16308,1001492,0,0


In [20]:
# Fill 시군구_코드 (district code) and 자치구_명 (district name) for every row.
# code_list[k] holds the trade-area codes of the k-th district in
# df_base['시군구_코드'].unique() order. That pairing is flattened into one
# 상권_코드 -> 시군구_코드 lookup and the whole column is mapped at once,
# instead of scanning every list for every row and writing cell by cell.
# If a code were listed under several districts, the later district wins,
# exactly as a row-by-row scan over code_list would behave.
district_codes = df_base['시군구_코드'].unique()
area_to_district = {
    area_code: district
    for district, area_codes in zip(district_codes, code_list)
    for area_code in area_codes
}
district_of_row = df_merge_new['상권_코드'].map(area_to_district)
matched = district_of_row.notna()
name_of_row = district_of_row.map(code_name_dict)
unnamed = matched & name_of_row.isna()
if unnamed.any():
    # A district used by some row has no entry in code_name_dict; fail loudly
    # rather than leaving a missing name behind.
    raise KeyError(sorted(set(district_of_row[unnamed])))
# Rows whose 상권_코드 is in none of the lists keep their current values.
# Assigning whole columns avoids writing strings into an integer column
# element by element.
df_merge_new['시군구_코드'] = district_of_row.where(
    matched, df_merge_new['시군구_코드']
).astype('int64')
df_merge_new['자치구_명'] = name_of_row.where(matched, df_merge_new['자치구_명'])

In [21]:
# Inspect the merged data, including the 시군구_코드 and 자치구_명 columns
df_merge_new

Unnamed: 0.1,Unnamed: 0,기준_년_코드,기준_분기_코드,상권코드_0,상권코드_1,상권코드_2,상권코드_3,상권코드_4,상권코드_5,상권코드_6,...,아파트_평균_면적,아파트_평균_시가,총 상주인구 수,집객시설_수,당월_매출_금액,총_직장_인구_수,폐업률,상권_코드,시군구_코드,자치구_명
0,0,2020.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,82.273333,2.942853e+08,24.0,52.000000,1.671959e+11,15904.000000,2.587177,1001496,11680,강남구
1,1,2020.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,89.000000,5.273471e+08,8221.0,242.000000,1.546181e+11,24375.000000,3.026257,1001495,11710,송파구
2,2,2020.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,39.000000,1.120979e+08,1924.0,327.000000,2.592398e+11,32935.000000,2.081362,1001494,11110,종로구
3,3,2020.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,53.000000,2.379674e+08,2427.0,165.000000,2.466393e+11,6054.000000,2.107308,1001493,11140,중구
4,4,2020.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,82.273333,2.942853e+08,1104.0,593.000000,3.766214e+11,104830.000000,2.163080,1001492,11140,중구
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32907,32907,2020.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,49.000000,7.108571e+07,108.0,37.766667,8.661690e+08,488.026667,5.555556,1001443,11590,동작구
32908,32908,2020.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,82.273333,2.942853e+08,15.0,2.000000,2.782282e+09,435.340000,0.000000,1001456,11620,관악구
32909,32909,2020.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,82.273333,2.942853e+08,162.0,2.000000,4.617167e+09,440.340000,1.492537,1001464,11620,관악구
32910,32910,2020.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,82.273333,2.942853e+08,52.0,2.000000,2.625743e+09,438.653333,0.000000,1001465,11620,관악구


In [22]:
# Number of distinct district codes in the merged data (missing values, if
# any, are counted as a value too)
df_merge_new['시군구_코드'].nunique(dropna=False)

25

In [23]:
# Number of distinct district names in the merged data (missing values, if
# any, are counted as a value too)
df_merge_new['자치구_명'].nunique(dropna=False)

25

In [27]:
# Check for missing values: total count of null cells across all columns
df_merge_new.isnull().sum().sum()

0