# build Gu-Dong map
- 기존 match 파일에 동이 다 담겨있지 않아 https://www.seoul.go.kr/seoul/autonomy_sub.do 에서 새롭게 맵핑 파일 생성
- 크롤링

In [383]:
import requests
from bs4 import BeautifulSoup
import re
import json

In [384]:
# Download the Seoul autonomous-district page that the gu/dong map is scraped from.
url = "https://www.seoul.go.kr/seoul/autonomy_sub.do"
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx answer instead of silently parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

In [385]:
# Collect the <div> elements whose class is "district tabcont"; each one is
# later handed to get_gu() (reads its <h5>) and get_dongs() (reads its <td> cells).
table = soup.find_all("div", class_="district tabcont")

In [386]:
# Gu name: Hangul syllables, optional whitespace, then "구" (so "중 구" also matches).
pattern_gu = r"[가-힣]+\s*구"
# Dong name: Hangul syllables and/or digits ending in "동" or "가".
# Inside [...] a "|" is a literal pipe, not alternation, so it is left out of
# both character classes; otherwise "A동|B가" would be glued into one match.
pattern_dong = r"[가-힣0-9]+[동가]"

def get_gu(data):
    """Return the gu name found in the first ``<h5>`` of ``data``.

    ``data`` must provide a ``find`` method (a BeautifulSoup tag in this
    notebook). The result of ``find("h5")`` is converted to text and searched
    with ``pattern_gu``.

    Returns the matched string when there is exactly one match; returns
    ``None`` when there are zero or several matches.
    """
    heading_html = str(data.find("h5"))
    matches = re.findall(pattern_gu, heading_html)
    return matches[0] if len(matches) == 1 else None

def get_dongs(data):
    """Return every ``pattern_dong`` match found in the ``<td>`` cells of ``data``.

    ``data`` must provide a ``find_all`` method (a BeautifulSoup tag in this
    notebook). Each cell is converted to text and searched separately; the
    matches are concatenated in cell order. An empty list is returned when
    nothing matches.
    """
    return [
        name
        for cell in data.find_all("td")
        for name in re.findall(pattern_dong, str(cell))
    ]

In [387]:
# Build {gu: [dong, ...]} from every scraped section in `table`.
results = dict()
for section in table:
    gu_name = get_gu(section)
    if gu_name:
        # Sections without a usable gu heading are skipped. The dongs are
        # copied into a fresh list; a falsy result is stored as an empty list.
        results[gu_name] = list(get_dongs(section) or [])

In [388]:
# The scraped heading yields the key "중 구" (with a space); store it under the
# canonical key "중구" instead. The membership guard makes the cell safe to
# re-run: without it a second run raises KeyError because the spaced key is gone.
if "중 구" in results:
    results["중구"] = results.pop("중 구")

In [389]:
import json

# First-pass save of the scraped gu -> dong map (Korean text kept unescaped).
with open("dataset/map_gu_dong.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(results, ensure_ascii=False, indent=4))


# match Gu-Dong in train, testset

## check match error cases and fix

In [390]:
import json
import pandas as pd

In [391]:
# load map
# Reads the saved {gu: [dong, ...]} JSON back into memory.
# NOTE(review): the name `map` shadows Python's built-in map() for the rest of
# the session; later cells refer to it by this name, so it is left unchanged here.
path_map = "dataset/map_gu_dong.json"
with open(path_map, "r", encoding="utf-8") as f:
    map = json.load(f)

In [392]:
# Invert {gu: [dong, ...]} into a flat {dong: gu} lookup.
# NOTE(review): a dong name listed under more than one gu keeps only the last
# gu seen - confirm the map has no dong name shared between districts.
match_dong = {dong: gu for gu in map for dong in map[gu]}

In [393]:
# Sanity check: number of distinct dong keys, then a spot lookup of one dong.
print(len(match_dong))
match_dong["사당5동"]

798


'동작구'

In [394]:
# load dataset
# Paths of the train and test CSV files, relative to the working directory.
path_train = "dataset/train.csv"
path_test = "dataset/test.csv"

In [395]:
# Load the train set and keep only the rows whose city is Seoul,
# then show the resulting shape and the last two rows.
df = pd.read_csv(path_train)
df = df[df["city"].eq("서울특별시")]
print(df.shape)
df.tail(2)

(742285, 13)


Unnamed: 0,transaction_id,apartment_id,city,dong,jibun,apt,addr_kr,exclusive_use_area,year_of_completion,transaction_year_month,transaction_date,floor,transaction_real_price
1184017,1196848,680,서울특별시,강일동,665,강일리버파크1단지,강일동 665 강일리버파크1단지,84.74,2009,201711,21~30,9,55500
1184018,1196849,996,서울특별시,강일동,717,고덕리엔파크2단지,강일동 717 고덕리엔파크2단지,84.98,2011,201711,21~30,8,63750


In [396]:
# Collect every dong value in df that has no gu in match_dong.
dongs = df["dong"]
errors = [dong for dong in dongs if dong not in match_dong]
unmatched = set(errors)
print(len(unmatched))  # two dongs have no gu match data -> add them manually
print(unmatched)

2
{'신문로1가', '인현동2가'}


In [397]:
# fix & save
# Dongs missing from the scraped map, with the gu each one belongs to.
manual_fixes = {"인현동2가": "중구", "신문로1가": "종로구"}
for fix_dong, fix_gu in manual_fixes.items():
    # Only append when absent, so re-running the cell does not store the same
    # dong twice in the list that is written back to the JSON file.
    if fix_dong not in map[fix_gu]:
        map[fix_gu].append(fix_dong)

In [400]:
# Save the corrected map back to the gu-dong map file.
with open("dataset/map_gu_dong.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(map, ensure_ascii=False, indent=4))

In [401]:
# Added to the in-memory lookup after checking these two dongs by hand.
match_dong.update({"인현동2가": "중구", "신문로1가": "종로구"})

In [402]:
# Load the test set and keep only the rows whose city is Seoul,
# then show the resulting shape and the last two rows.
df = pd.read_csv(path_test)
df = df[df["city"].eq("서울특별시")]
print(df.shape)
df.tail(2)

(3911, 12)


Unnamed: 0,transaction_id,apartment_id,city,dong,jibun,apt,addr_kr,exclusive_use_area,year_of_completion,transaction_year_month,transaction_date,floor
5460,1157024,5550,서울특별시,구로동,256-1,삼성래미안,구로동 256-1 삼성래미안,110.188,2004,201712,21~31,23
5461,1136863,3371,서울특별시,홍제동,24-16,동양드림,홍제동 24-16 동양드림,92.44,2005,201710,21~31,1


In [403]:
# Collect every dong value in df that has no gu in match_dong.
dongs = df["dong"]
errors = [dong for dong in dongs if dong not in match_dong]
unmatched = set(errors)
print(len(unmatched))  # the test set has no unmatched dong
print(unmatched)

0
set()


## join table & save seoul cases

In [404]:
import json
import pandas as pd

In [405]:
# load map
# Reads the saved {gu: [dong, ...]} JSON back into memory.
# NOTE(review): the name `map` shadows Python's built-in map() for the rest of
# the session; the next cell refers to it by this name, so it is left unchanged here.
path_map = "dataset/map_gu_dong.json"
with open(path_map, "r", encoding="utf-8") as f:
    map = json.load(f)

In [406]:
# Invert {gu: [dong, ...]} into a flat {dong: gu} lookup.
# NOTE(review): a dong name listed under more than one gu keeps only the last
# gu seen - confirm the map has no dong name shared between districts.
match_dong = {dong: gu for gu in map for dong in map[gu]}

In [407]:
# load dataset
# Paths of the train and test CSV files, relative to the working directory.
path_train = "dataset/train.csv"
path_test = "dataset/test.csv"

In [410]:
# Load the train set and keep only the Seoul rows. The explicit .copy() makes
# df_train an independent frame, so adding columns to it afterwards does not
# operate on a slice of the full frame (the cause of SettingWithCopyWarning).
df_train = pd.read_csv(path_train)
df_train = df_train.loc[df_train["city"] == "서울특별시"].copy()  # get in Seoul cases
print(df_train.shape)
df_train.tail(2)

(742285, 13)


Unnamed: 0,transaction_id,apartment_id,city,dong,jibun,apt,addr_kr,exclusive_use_area,year_of_completion,transaction_year_month,transaction_date,floor,transaction_real_price
1184017,1196848,680,서울특별시,강일동,665,강일리버파크1단지,강일동 665 강일리버파크1단지,84.74,2009,201711,21~30,9,55500
1184018,1196849,996,서울특별시,강일동,717,고덕리엔파크2단지,강일동 717 고덕리엔파크2단지,84.98,2011,201711,21~30,8,63750


In [412]:
# Add a gu column by looking up each row's dong in match_dong.
# NOTE(review): a dong missing from match_dong presumably becomes NaN here
# (Series.map with a dict) - confirm no NaN remains in the gu column.
df_train["gu"] = df_train["dong"].map(match_dong) # {dong: gu} match dict

In [413]:
# Load the test set and keep only the Seoul rows. The explicit .copy() makes
# df_test an independent frame, so adding columns to it afterwards does not
# operate on a slice of the full frame (the cause of SettingWithCopyWarning).
df_test = pd.read_csv(path_test)
df_test = df_test.loc[df_test["city"] == "서울특별시"].copy()
print(df_test.shape)

(3911, 12)


In [414]:
# Add a gu column by looking up each row's dong in match_dong.
# NOTE(review): a dong missing from match_dong presumably becomes NaN here
# (Series.map with a dict) - confirm no NaN remains in the gu column.
df_test["gu"] = df_test["dong"].map(match_dong) # {dong: gu} match dict

In [415]:
# Write the Seoul-only train frame (including the new gu column) without the index.
df_train.to_csv("dataset/train_seoul.csv", index=False)

In [416]:
# Write the Seoul-only test frame (including the new gu column) without the index.
df_test.to_csv("dataset/test_seoul.csv", index=False)

# park
- 기존 park.csv 데이터에는 마포구, 성북구가 없음
- 서울시 주요 공원: https://parks.seoul.go.kr/story/data/detailView.do?searchTp=&searchWd=&currentPage=1&bIdx=514
  - 1편2.구별현황 1.공원종류별 현황 tab
  - "dataset/park_seoul.csv"

In [329]:
import pandas as pd
import json

In [422]:
# Column names are supplied explicitly as gu and num.
# NOTE(review): with names= and no header=, the first line is read as data -
# assumes the raw file has no header row; confirm.
df = pd.read_csv("dataset/park_seoul_raw.csv", names=["gu", "num"])

In [423]:
# First look at the raw values: {gu label exactly as read: park count as int}.
dic_park = {label: int(count) for label, count in zip(df["gu"], df["num"])}

In [427]:
print(dic_park) # to do: append "구" to 동대문, 서대문, 영등포; remove spaces from every gu name; "중 구" -> "중구"

{' 종로구 ': 106, ' 중  구 ': 72, ' 용산구 ': 107, ' 성동구 ': 100, ' 광진구 ': 68, ' 동대문 ': 92, ' 중랑구 ': 113, ' 성북구 ': 122, ' 강북구 ': 87, ' 도봉구 ': 80, ' 노원구 ': 187, ' 은평구 ': 136, ' 서대문 ': 123, ' 마포구 ': 144, ' 양천구 ': 123, ' 강서구 ': 173, ' 구로구 ': 114, ' 금천구 ': 55, ' 영등포 ': 108, ' 동작구 ': 86, ' 관악구 ': 135, ' 서초구 ': 179, ' 강남구 ': 165, ' 송파구 ': 174, ' 강동구 ': 129}


In [436]:
# Gu labels that appear in the raw file without their trailing "구".
target = {"영등포", "서대문", "동대문"}
# Build {clean gu name: park count}. Columns are iterated directly instead of
# building one row Series per index; tolist() yields plain Python numbers so
# the result stays JSON-serialisable.
dic_park = dict()
for gu, num in zip(df["gu"], df["num"].tolist()):
    # Remove every whitespace character (e.g. " 중  구 " -> "중구"). split/join
    # is used so this cell does not rely on `re`, which this section never imports.
    gu = "".join(gu.split())
    if gu in target:
        gu += "구"
    dic_park[gu] = num

In [438]:
# Show the cleaned {gu: park count} dict to confirm the gu names.
print(dic_park)

{'종로구': 106, '중구': 72, '용산구': 107, '성동구': 100, '광진구': 68, '동대문구': 92, '중랑구': 113, '성북구': 122, '강북구': 87, '도봉구': 80, '노원구': 187, '은평구': 136, '서대문구': 123, '마포구': 144, '양천구': 123, '강서구': 173, '구로구': 114, '금천구': 55, '영등포구': 108, '동작구': 86, '관악구': 135, '서초구': 179, '강남구': 165, '송파구': 174, '강동구': 129}


In [439]:
# Save the cleaned {gu: park count} dict as UTF-8 JSON (Korean keys unescaped).
with open("dataset/park_seoul.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(dic_park, ensure_ascii=False, indent=4))

# daycare
- 기존 daycare 데이터는 서울 21개구만 존재
- https://data.seoul.go.kr/dataList/OA-15457/S/1/datasetView.do
- 2023년 daycare center

In [None]:
import pandas as pd

In [464]:
# Read the raw daycare table through the same relative dataset/ directory as
# every other file in this notebook, rather than a machine-specific absolute
# path that only exists on one developer's home directory.
df = pd.read_csv("dataset/daycare_seoul_raw.csv")

In [466]:
# Build {gu: facility total}. Every district label except "중구" gets a "구"
# suffix appended; tolist() yields plain Python numbers for the JSON dump.
dic_daycare = {
    (name if name == "중구" else name + "구"): total
    for name, total in zip(df["자치구명"], df["시설수합계"].tolist())
}

In [469]:
# Save the {gu: facility total} dict as UTF-8 JSON (Korean keys unescaped).
with open("dataset/daycare_seoul.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(dic_daycare, ensure_ascii=False, indent=4))

# join tables
- train, test 데이터에 구를 key로 공원, 보육시설 통합

In [470]:
# load dataset
# Seoul-only train/test CSVs and the per-gu park / daycare JSON lookups.
path_train = "dataset/train_seoul.csv"
path_test = "dataset/test_seoul.csv"
path_park = "dataset/park_seoul.json"
path_daycare = "dataset/daycare_seoul.json"

In [477]:
# Load the Seoul-only train and test frames and print their shapes.
df_train = pd.read_csv(path_train)
df_test = pd.read_csv(path_test)
print(df_train.shape)
print(df_test.shape)

(742285, 16)
(3911, 15)


In [478]:
# Load the per-gu lookups. Both files hold unescaped Korean keys written as
# UTF-8, so the encoding is stated explicitly instead of relying on the
# platform default (which is not UTF-8 on every system).
with open(path_park, encoding="utf-8") as f:
    map_park = json.load(f)

with open(path_daycare, encoding="utf-8") as f:
    map_daycare = json.load(f)

In [479]:
# Attach the park and daycare values to both frames, keyed by the gu column.
for frame in (df_train, df_test):
    frame["park"] = frame["gu"].map(map_park)
    frame["daycare"] = frame["gu"].map(map_daycare)

In [480]:
# Write the frames back without the index.
# NOTE: path_train / path_test are the same files these frames were read from
# in this section, so the inputs are overwritten in place.
df_train.to_csv(path_train, index=False)
df_test.to_csv(path_test, index=False)