### Match KGSS's blockno with the names of towns
Last modified : Dec 26 2020 by Imryoung Jeong (neptune0118@gmail.com)

In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
import re

###### Functions
---

In [16]:
# This function creates the (official) list of towns from shp files
def admin_list(data):
    """Attach county and province names to a town table and build a merge key.

    Reads the module-level ``cty`` (county) and ``prv`` (province) tables and
    calls ``func`` to shorten province names, so all three must be defined
    before this function is called.

    Parameters
    ----------
    data : pandas.DataFrame
        Town-level table. Must contain 'adm_dr_cd' and 'adm_dr_nm'; the drop
        list below additionally expects 'geometry' and 'base_year' columns in
        each of the three tables being joined.

    Returns
    -------
    pandas.DataFrame
        A new frame with columns renamed to 'twn_cd', 'twn_nm', 'cty_nm',
        'prv_nm' and an added 'id' column of the form
        "<short province name>_<town name>". The input frame is not modified.
    """
    # The first 5 and first 2 characters of the town code are used as the
    # join keys against the county and province tables. `assign` keeps these
    # temporary columns out of the caller's frame.
    keyed = data.assign(
        cty_cd=data['adm_dr_cd'].str[:5],
        prv_cd=data['adm_dr_cd'].str[:2],
    )

    merged = keyed.merge(cty, how='left', left_on='cty_cd', right_on='sigungu_cd')
    merged = merged.merge(prv, how='left', left_on='prv_cd', right_on='sido_cd')

    # Drop join keys plus the geometry / base_year columns contributed by all
    # three tables (the _x / _y suffixes come from the first merge).
    merged = merged.drop(columns=['geometry_x', 'base_year_y', 'sigungu_cd',
                                  'geometry_y', 'base_year_x', 'sido_cd',
                                  'geometry', 'base_year',
                                  'cty_cd', 'prv_cd'])

    merged = merged.rename(columns={'adm_dr_cd': 'twn_cd',
                                    'adm_dr_nm': 'twn_nm',
                                    'sigungu_nm': 'cty_nm',
                                    'sido_nm': 'prv_nm'})

    # Column-wise map instead of a row-wise apply: same values, one call per
    # province name rather than one Series construction per row.
    merged['id'] = merged['prv_nm'].map(func) + "_" + merged['twn_nm']

    return merged

In [None]:
# This function cleans the pattern of KGSS province names
def func(prv_full_nm):
    """Shorten a full province name to its two-syllable form.

    The argument is converted with ``str()`` once, up front, so indexing is
    always done on a string.

    Parameters
    ----------
    prv_full_nm : object
        Full province name, e.g. '서울특별시', '강원도', '전라남도'.

    Returns
    -------
    str
        The short name: the first two characters for names ending in '시',
        three-character names, and names ending in '특별자치도'; the first and
        third characters for the remaining names; names shorter than three
        characters are returned unchanged.
    """
    name = str(prv_full_nm)

    # e.g. 서울특별시, 부산광역시 -> 서울, 부산 / 강원도, 경기도, 제주도 -> 강원, 경기, 제주
    # Special self-governing provinces (e.g. 제주특별자치도 -> 제주) also keep
    # their first two characters; without this they would fall through to the
    # first+third rule below and come out wrong.
    if (re.search(r'[가-힣]+시$', name)
            or len(name) == 3
            or name.endswith('특별자치도')):
        return name[:2]

    # Too short for the first+third rule (e.g. an already shortened name).
    if len(name) < 3:
        return name

    # e.g. 전라남도 -> 전남
    return name[0] + name[2]

In [18]:
# This function extracts county name from KGSS's raw address
def addr_cty(raw):
    """Return the county part of a raw address (province prefix removed).

    Parameters
    ----------
    raw : object
        Raw address text; converted with ``str()`` before matching.

    Returns
    -------
    str or None
        The county name when one is present, '' when the text ends in a town
        name only, and None when none of the patterns match.
    """
    text = str(raw)

    # Character classes list the suffix syllables directly; a '|' inside
    # [...] is a literal bar, not alternation, so it is left out.

    # County followed by town, e.g. 전주시평화2동
    county_and_town = re.search(r'([가-힣]+[시군구])([가-힣]+[0-9]?[읍면동])', text)
    if county_and_town:
        return county_and_town.group(1)

    # County only, e.g. 부안군
    county_only = re.search(r'([가-힣]+[시군구])$', text)
    if county_only:
        return county_only.group(1)

    # Town only, e.g. 화북동
    if re.search(r'([가-힣]+[0-9]?[읍면동])$', text):
        return ''

    return None

# This function extracts town name from KGSS's raw address
def addr_twn(raw):
    """Return the town part of a raw address (province prefix removed).

    Parameters
    ----------
    raw : object
        Raw address text; converted with ``str()`` before matching.

    Returns
    -------
    str or None
        The town name when one is present, '' when the text ends in a county
        name only, and None when none of the patterns match.
    """
    text = str(raw)

    # Character classes list the suffix syllables directly; a '|' inside
    # [...] is a literal bar, not alternation, so it is left out.

    # County followed by town, e.g. 전주시평화2동
    county_and_town = re.search(r'([가-힣]+[시군구])([가-힣]+[0-9]?[읍면동])', text)
    if county_and_town:
        return county_and_town.group(2)

    # County only, e.g. 부안군
    if re.search(r'([가-힣]+[시군구])$', text):
        return ''

    # Town only, e.g. 화북동
    town_only = re.search(r'([가-힣]+[0-9]?[읍면동])$', text)
    if town_only:
        return town_only.group(1)

    return None


# This function cleans raw address of KGSS 
def blockno_list(data):
    """Split the raw KGSS address into province, county and town columns.

    Calls the sibling helpers ``addr_cty`` and ``addr_twn`` on the address
    text that remains after the two-character province prefix.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a string column 'addr_raw'.

    Returns
    -------
    pandas.DataFrame
        The same frame, modified in place: 'prv', 'cty', 'twn' and 'id'
        ("<prv>_<twn>") are added and 'addr_raw' is dropped.
    """
    raw = data['addr_raw']

    # First two characters are the province; the rest holds county / town.
    data['prv'] = raw.str[:2]
    remainder = raw.str[2:]

    # Column-wise map instead of a row-wise apply: the helpers only need the
    # address text, so there is no reason to build a Series for every row.
    data['cty'] = remainder.map(addr_cty)
    data['twn'] = remainder.map(addr_twn)

    data['id'] = data['prv'] + "_" + data['twn']

    # 'addr_raw' is no longer needed once it has been split.
    data.drop(columns=['addr_raw'], inplace=True)

    return data

---

In [20]:
# Root folder of the census shapefiles; the township / county / province
# shapefile paths are built from it in the loop below
route = "/Users/imryoung/Dropbox/gis_maps/"

# This file matches blockno and town names of KGSS
# (read with the 'blockno' column plus one column named after the year)
blockno = "/Users/imryoung/Dropbox/admin_tracking/blockno_code_2003-2013.csv"

# Folder the cleaned per-year CSV files are written to
out_route = "/Users/imryoung/Dropbox/admin_tracking/1_kgss_cleaned/"

In [21]:
# Build one cleaned blockno-to-town table per year.
# range() excludes its upper bound, so this currently processes 2004 only.
for i in range(2004, 2005):

    # 1. Set up
    usecols = ['blockno', str(i)]

    shp_twn = route + "/census_township/census_township_{}/bnd_dong_00_{}.shp".format(i, i)
    shp_cty = route + "/census_county/census_county_{}/bnd_sigungu_00_{}.shp".format(i, i)
    shp_prv = route + "/census_province/census_province_{}/bnd_sido_00_{}.shp".format(i, i)

    out_name = "kgss_cleaned_{}.csv".format(i)

    # 2. Import files
    # `cty` and `prv` are read as globals by admin_list(), so they must be
    # assigned here before it is called.
    twn = pd.DataFrame(gpd.read_file(shp_twn, encoding='euc-kr'))
    cty = pd.DataFrame(gpd.read_file(shp_cty, encoding='euc-kr'))
    # NOTE(review): only the county table has its column names lowercased;
    # presumably the other two already use lowercase names - confirm.
    cty.columns = cty.columns.str.lower()
    prv = pd.DataFrame(gpd.read_file(shp_prv, encoding='euc-kr'))

    df = pd.read_csv(blockno, usecols=usecols)
    df.rename(columns={str(i): 'addr_raw'}, inplace=True)

    # 3. Extract the town list from shp file
    admin = admin_list(twn)

    # 4. Clean the KGSS addresses
    df = blockno_list(df)

    # 5. Merge blockno and the cleaned addresses above
    df = df.merge(admin, how='left', left_on='id', right_on='id', indicator=True)
    # A bare expression inside a loop body is not displayed by the notebook,
    # so print the match counts explicitly.
    print(i)
    print(df['_merge'].value_counts())

    # 6. Export the result
    df.to_csv(out_route + out_name)