# 6장. 파이썬과 MySQL로 구축하는 주소 데이터베이스 

- 작성자: 박하람
- 작성일자: 2024.02.22

## 6.2 프로젝트 환경 구축하기 

### 파이썬 노트북으로 MySQL 연결하기

In [1]:
!pip3 install pymysql



In [139]:
import pymysql

def init_connection():
    """Open a connection to the local MySQL server without selecting a database.

    Returns:
        A new PyMySQL connection configured to use ``DictCursor`` cursors.
    """
    settings = {
        "host": "localhost",
        "port": 3306,
        "user": "root",
        "password": "root",
        "cursorclass": pymysql.cursors.DictCursor,
        "charset": "utf8",
    }
    return pymysql.connect(**settings)

# List the databases visible on the server, one row per line.
sql = "SHOW DATABASES;"
conn = init_connection()

# A single `with` manages both the connection and its cursor.
with conn, conn.cursor() as cur:
    cur.execute(sql)
    for data in cur.fetchall():
        print(data)

{'Database': 'information_schema'}
{'Database': 'mysql'}
{'Database': 'performance_schema'}
{'Database': 'sys'}


## 6.3 데이터베이스 생성하기

### 데이터베이스 생성하기

In [142]:
# Create the `address` database with utf8 as its default character set.
conn = init_connection()
sql = "CREATE DATABASE address DEFAULT CHARACTER SET utf8;"

# A single `with` manages both the connection and its cursor.
with conn, conn.cursor() as cur:
    cur.execute(sql)
    conn.commit()

In [143]:
# List the databases again to confirm that `address` now exists.
conn = init_connection()
sql = "SHOW DATABASES;"

# A single `with` manages both the connection and its cursor.
with conn, conn.cursor() as cur:
    cur.execute(sql)
    for data in cur.fetchall():
        print(data)

{'Database': 'address'}
{'Database': 'information_schema'}
{'Database': 'mysql'}
{'Database': 'performance_schema'}
{'Database': 'sys'}


## 6.4 테이블 생성하기

### 테이블 생성하기

In [144]:
def init_db_connection():
    """Open a connection to the ``address`` database on the local MySQL server.

    Returns:
        A new PyMySQL connection configured to use ``DictCursor`` cursors,
        created with ``local_infile=True``.
    """
    settings = {
        "host": "localhost",
        "port": 3306,
        "user": "root",
        "password": "root",
        "database": "address",
        "cursorclass": pymysql.cursors.DictCursor,
        "charset": "utf8",
        "local_infile": True,
    }
    return pymysql.connect(**settings)

def query_get(sql):
    """Execute a statement on a fresh connection and return all of its rows.

    Args:
        sql: The SQL statement to execute.

    Returns:
        Whatever ``cursor.fetchall()`` yields for the statement.
    """
    db = init_db_connection()
    with db, db.cursor() as cur:
        cur.execute(sql)
        rows = cur.fetchall()
    return rows

def query_update(sql):
    """Execute a statement on a fresh connection and commit it.

    Args:
        sql: The SQL statement to execute.

    Returns:
        ``True`` once the statement has been executed and committed.
    """
    db = init_db_connection()
    with db, db.cursor() as cur:
        cur.execute(sql)
        db.commit()
    return True

In [160]:
# DDL for the road-name address table `rnaddrkor`.
# The composite primary key is (road-name address management number,
# road-name code, underground flag, building main number, building sub number).
# NOTE(review): integer display widths and ZEROFILL are deprecated in recent
# MySQL 8.0 releases — confirm against the target server version.
sql = '''
    CREATE TABLE `rnaddrkor` (
        `도로명주소관리번호` varchar(26) NOT NULL,
        `법정동코드` varchar(10),
        `시도명` varchar(40),
        `시군구명` varchar(40),
        `읍면동명` varchar(40),
        `리명` varchar(40),
        `산여부` varchar(1),
        `번지` varchar(4),
        `호` varchar(4),
        `도로명코드` varchar(12) NOT NULL,
        `도로명` varchar(80),
        `지하여부` varchar(1) NOT NULL,
        `건물본번` int(5) ZEROFILL NOT NULL,
        `건물부번` int(5) ZEROFILL NOT NULL,
        `행정동코드` varchar(60),
        `행정동명` varchar(60),
        `기초구역번호(우편번호)` varchar(5),
        `이전도로명주소` varchar(400),
        `효력발생일` varchar(8),
        `공동주택구분` varchar(1),
        `이동사유코드` varchar(2),
        `건축물대장건물명` varchar(400),
        `시군구용건물명` varchar(400),
        `비고` varchar(200),
        PRIMARY KEY (`도로명주소관리번호`, `도로명코드`, `지하여부`, `건물본번`, `건물부번`)
    );
'''

# Run the DDL; query_update executes the statement and commits.
query_update(sql)

True

In [161]:
# DDL for the lot-number (jibun) address table `rnaddrkor_jibun`.
# The composite primary key is (road-name address management number,
# legal-dong code, mountain flag, lot main number, lot sub number).
# NOTE(review): integer display widths and ZEROFILL are deprecated in recent
# MySQL 8.0 releases — confirm against the target server version.
sql = '''
    CREATE TABLE `rnaddrkor_jibun` (
        `도로명주소관리번호` varchar(26) NOT NULL,
        `법정동코드` varchar(10) NOT NULL,
        `시도명` varchar(40),
        `시군구명` varchar(40),
        `법정읍면동명` varchar(40),
        `법정리명` varchar(40),
        `산여부` varchar(1) NOT NULL,
        `지번본번(번지)` int(4) ZEROFILL NOT NULL,
        `지번부번(호)` int(4) ZEROFILL NOT NULL,
        `도로명코드` varchar(12),
        `지하여부` varchar(1),
        `건물본번` int(5),
        `건물부번` int(5),
        `이동사유코드` varchar(2),
        PRIMARY KEY (`도로명주소관리번호`, `법정동코드`, `산여부`, `지번본번(번지)`, `지번부번(호)`)
    );
'''

# Run the DDL; query_update executes the statement and commits.
query_update(sql)

True

In [162]:
# List the tables in the connected database to confirm both were created.
query_get("SHOW TABLES;")

[{'Tables_in_address': 'rnaddrkor'}, {'Tables_in_address': 'rnaddrkor_jibun'}]

In [163]:
# Show the column definitions of `rnaddrkor` as the server reports them.
query_get("DESC rnaddrkor;")

[{'Field': '도로명주소관리번호',
  'Type': 'varchar(26)',
  'Null': 'NO',
  'Key': 'PRI',
  'Default': None,
  'Extra': ''},
 {'Field': '법정동코드',
  'Type': 'varchar(10)',
  'Null': 'YES',
  'Key': '',
  'Default': None,
  'Extra': ''},
 {'Field': '시도명',
  'Type': 'varchar(40)',
  'Null': 'YES',
  'Key': '',
  'Default': None,
  'Extra': ''},
 {'Field': '시군구명',
  'Type': 'varchar(40)',
  'Null': 'YES',
  'Key': '',
  'Default': None,
  'Extra': ''},
 {'Field': '읍면동명',
  'Type': 'varchar(40)',
  'Null': 'YES',
  'Key': '',
  'Default': None,
  'Extra': ''},
 {'Field': '리명',
  'Type': 'varchar(40)',
  'Null': 'YES',
  'Key': '',
  'Default': None,
  'Extra': ''},
 {'Field': '산여부',
  'Type': 'varchar(1)',
  'Null': 'YES',
  'Key': '',
  'Default': None,
  'Extra': ''},
 {'Field': '번지',
  'Type': 'varchar(4)',
  'Null': 'YES',
  'Key': '',
  'Default': None,
  'Extra': ''},
 {'Field': '호',
  'Type': 'varchar(4)',
  'Null': 'YES',
  'Key': '',
  'Default': None,
  'Extra': ''},
 {'Field': '도로명코드',
  '

## 6.5 데이터 삽입하기

### local infile 허용하기

In [164]:
def init_db_connection():
    """Open a connection to the ``address`` database on the local MySQL server.

    Returns:
        A new PyMySQL connection configured to use ``DictCursor`` cursors,
        created with ``local_infile=True``.
    """
    settings = {
        "host": "localhost",
        "port": 3306,
        "user": "root",
        "password": "root",
        "database": "address",
        "cursorclass": pymysql.cursors.DictCursor,
        "charset": "utf8",
        # Client-side option added for the LOAD DATA LOCAL INFILE uploads.
        "local_infile": True,
    }
    return pymysql.connect(**settings)

### 데이터 경로 확인하기

In [165]:
import glob

# Collect the road-name address dump files and print each matched path.
file_list = list(glob.iglob("data/rnaddrkor/rnaddrkor_*.txt"))
for file in file_list:
    print(file)

data/rnaddrkor/rnaddrkor_sejong.txt
data/rnaddrkor/rnaddrkor_jeonnam.txt
data/rnaddrkor/rnaddrkor_seoul.txt
data/rnaddrkor/rnaddrkor_ulsan.txt
data/rnaddrkor/rnaddrkor_gyeongnam.txt
data/rnaddrkor/rnaddrkor_incheon.txt
data/rnaddrkor/rnaddrkor_chungnam.txt
data/rnaddrkor/rnaddrkor_daejeon.txt
data/rnaddrkor/rnaddrkor_daegu.txt
data/rnaddrkor/rnaddrkor_chungbuk.txt
data/rnaddrkor/rnaddrkor_gangwon.txt
data/rnaddrkor/rnaddrkor_gyunggi.txt
data/rnaddrkor/rnaddrkor_gwangju.txt
data/rnaddrkor/rnaddrkor_jeju.txt
data/rnaddrkor/rnaddrkor_busan.txt
data/rnaddrkor/rnaddrkor_gyeongbuk.txt
data/rnaddrkor/rnaddrkor_jeonbuk.txt


### 데이터 업로드하기

#### 도로명주소 업로드하기

In [167]:
import os
from tqdm import tqdm

# Bulk-load every road-name address file into the `rnaddrkor` table.
# Each file is read as CP949, rewritten as UTF-8 to a scratch file, and that
# scratch file is handed to LOAD DATA LOCAL INFILE.

total_line_count = 0  # running total of lines across all files

# The same scratch file (in the current working directory) is reused per input.
temp_file_path = "temp_file.txt"

for file in tqdm(file_list, desc='Processing files'):
    file_path = os.path.abspath(file)

    # Read the file as CP949; bytes that cannot be decoded are dropped
    # because of errors='ignore'.
    with open(file_path, 'r', encoding='cp949', errors='ignore') as f:
        lines = f.readlines()

    # Count this file's lines and add them to the running total.
    line_count = len(lines)
    total_line_count += line_count

    # Write the decoded lines back out as UTF-8 into the scratch file.
    with open(temp_file_path, 'w', encoding='utf8') as f:
        f.writelines(lines)

    sql = f'''
        LOAD DATA LOCAL INFILE "{temp_file_path}" INTO TABLE rnaddrkor
        FIELDS TERMINATED BY "|";
    '''
    print(f"Processing file: {file_path}, Number of lines: {line_count}")
    try:
        # query_update executes the statement and commits it.
        query_update(sql)
    finally:
        # Remove the scratch file even if the load fails.
        os.remove(temp_file_path)

print(f"Total number of lines: {total_line_count}")

Processing files:   0%|          | 0/17 [00:00<?, ?it/s]

Processing file: /Users/harampark/Desktop/labs/3.study/address-using-guide/code/chapter-6/data/rnaddrkor/rnaddrkor_sejong.txt, Number of lines: 27510


Processing files:   6%|▌         | 1/17 [00:00<00:10,  1.49it/s]

Processing file: /Users/harampark/Desktop/labs/3.study/address-using-guide/code/chapter-6/data/rnaddrkor/rnaddrkor_jeonnam.txt, Number of lines: 598298


Processing files:  12%|█▏        | 2/17 [00:10<01:34,  6.29s/it]

Processing file: /Users/harampark/Desktop/labs/3.study/address-using-guide/code/chapter-6/data/rnaddrkor/rnaddrkor_seoul.txt, Number of lines: 530721


Processing files:  18%|█▊        | 3/17 [00:18<01:34,  6.72s/it]

Processing file: /Users/harampark/Desktop/labs/3.study/address-using-guide/code/chapter-6/data/rnaddrkor/rnaddrkor_ulsan.txt, Number of lines: 104458


Processing files:  24%|██▎       | 4/17 [00:19<01:01,  4.69s/it]

Processing file: /Users/harampark/Desktop/labs/3.study/address-using-guide/code/chapter-6/data/rnaddrkor/rnaddrkor_gyeongnam.txt, Number of lines: 653480


Processing files:  29%|██▉       | 5/17 [00:29<01:17,  6.46s/it]

Processing file: /Users/harampark/Desktop/labs/3.study/address-using-guide/code/chapter-6/data/rnaddrkor/rnaddrkor_incheon.txt, Number of lines: 186039


Processing files:  35%|███▌      | 6/17 [00:31<00:55,  5.07s/it]

Processing file: /Users/harampark/Desktop/labs/3.study/address-using-guide/code/chapter-6/data/rnaddrkor/rnaddrkor_chungnam.txt, Number of lines: 491608


Processing files:  41%|████      | 7/17 [00:40<01:02,  6.25s/it]

Processing file: /Users/harampark/Desktop/labs/3.study/address-using-guide/code/chapter-6/data/rnaddrkor/rnaddrkor_daejeon.txt, Number of lines: 114026


Processing files:  47%|████▋     | 8/17 [00:42<00:43,  4.86s/it]

Processing file: /Users/harampark/Desktop/labs/3.study/address-using-guide/code/chapter-6/data/rnaddrkor/rnaddrkor_daegu.txt, Number of lines: 229727


Processing files:  53%|█████▎    | 9/17 [00:45<00:34,  4.35s/it]

Processing file: /Users/harampark/Desktop/labs/3.study/address-using-guide/code/chapter-6/data/rnaddrkor/rnaddrkor_chungbuk.txt, Number of lines: 335225


Processing files:  59%|█████▉    | 10/17 [00:51<00:33,  4.73s/it]

Processing file: /Users/harampark/Desktop/labs/3.study/address-using-guide/code/chapter-6/data/rnaddrkor/rnaddrkor_gangwon.txt, Number of lines: 364686


Processing files:  65%|██████▍   | 11/17 [00:56<00:29,  4.96s/it]

Processing file: /Users/harampark/Desktop/labs/3.study/address-using-guide/code/chapter-6/data/rnaddrkor/rnaddrkor_gyunggi.txt, Number of lines: 1020563


Processing files:  71%|███████   | 12/17 [01:18<00:51, 10.22s/it]

Processing file: /Users/harampark/Desktop/labs/3.study/address-using-guide/code/chapter-6/data/rnaddrkor/rnaddrkor_gwangju.txt, Number of lines: 120474


Processing files:  76%|███████▋  | 13/17 [01:20<00:30,  7.67s/it]

Processing file: /Users/harampark/Desktop/labs/3.study/address-using-guide/code/chapter-6/data/rnaddrkor/rnaddrkor_jeju.txt, Number of lines: 155426


Processing files:  82%|████████▏ | 14/17 [01:22<00:17,  5.96s/it]

Processing file: /Users/harampark/Desktop/labs/3.study/address-using-guide/code/chapter-6/data/rnaddrkor/rnaddrkor_busan.txt, Number of lines: 301822


Processing files:  88%|████████▊ | 15/17 [01:26<00:10,  5.34s/it]

Processing file: /Users/harampark/Desktop/labs/3.study/address-using-guide/code/chapter-6/data/rnaddrkor/rnaddrkor_gyeongbuk.txt, Number of lines: 713862


Processing files:  94%|█████████▍| 16/17 [01:37<00:07,  7.06s/it]

Processing file: /Users/harampark/Desktop/labs/3.study/address-using-guide/code/chapter-6/data/rnaddrkor/rnaddrkor_jeonbuk.txt, Number of lines: 437063


Processing files: 100%|██████████| 17/17 [01:43<00:00,  6.12s/it]

Total number of lines: 6384988





In [169]:
# Preview the first five loaded rows of `rnaddrkor`.
sql = "SELECT * FROM rnaddrkor LIMIT 5;"
query_get(sql)

[{'도로명주소관리번호': '11110101310001200009400000',
  '법정동코드': '1111010100',
  '시도명': '서울특별시',
  '시군구명': '종로구',
  '읍면동명': '청운동',
  '리명': '',
  '산여부': '0',
  '번지': '144',
  '호': '3',
  '도로명코드': '111103100012',
  '도로명': '자하문로',
  '지하여부': '0',
  '건물본번': 94,
  '건물부번': 0,
  '행정동코드': '1111051500',
  '행정동명': '청운효자동',
  '기초구역번호(우편번호)': '03047',
  '이전도로명주소': '',
  '효력발생일': '20110729',
  '공동주택구분': '0',
  '이동사유코드': '',
  '건축물대장건물명': '',
  '시군구용건물명': '',
  '비고': ''},
 {'도로명주소관리번호': '11110101310001200009600000',
  '법정동코드': '1111010100',
  '시도명': '서울특별시',
  '시군구명': '종로구',
  '읍면동명': '청운동',
  '리명': '',
  '산여부': '0',
  '번지': '108',
  '호': '14',
  '도로명코드': '111103100012',
  '도로명': '자하문로',
  '지하여부': '0',
  '건물본번': 96,
  '건물부번': 0,
  '행정동코드': '1111051500',
  '행정동명': '청운효자동',
  '기초구역번호(우편번호)': '03047',
  '이전도로명주소': '',
  '효력발생일': '20110729',
  '공동주택구분': '1',
  '이동사유코드': '',
  '건축물대장건물명': '',
  '시군구용건물명': '평안빌',
  '비고': ''},
 {'도로명주소관리번호': '11110101310001200009800000',
  '법정동코드': '1111010100',
  '시도명': '서울특별시',
  

In [168]:
# Count the rows in `rnaddrkor` for comparison with the total printed by the upload loop.
sql = "SELECT COUNT(*) FROM rnaddrkor;"
query_get(sql)

[{'COUNT(*)': 6384988}]

#### 지번주소 업로드하기

In [170]:
import glob

# Collect the lot-number (jibun) address dump files and print each matched path.
file_list = list(glob.iglob("data/rnaddrkor/jibun_*.txt"))
for file in file_list:
    print(file)

data/rnaddrkor/jibun_rnaddrkor_daegu.txt
data/rnaddrkor/jibun_rnaddrkor_jeju.txt
data/rnaddrkor/jibun_rnaddrkor_chungbuk.txt
data/rnaddrkor/jibun_rnaddrkor_gyunggi.txt
data/rnaddrkor/jibun_rnaddrkor_gwangju.txt
data/rnaddrkor/jibun_rnaddrkor_gangwon.txt
data/rnaddrkor/jibun_rnaddrkor_gyeongbuk.txt
data/rnaddrkor/jibun_rnaddrkor_jeonbuk.txt
data/rnaddrkor/jibun_rnaddrkor_ulsan.txt
data/rnaddrkor/jibun_rnaddrkor_seoul.txt
data/rnaddrkor/jibun_rnaddrkor_gyeongnam.txt
data/rnaddrkor/jibun_rnaddrkor_jeonnam.txt
data/rnaddrkor/jibun_rnaddrkor_busan.txt
data/rnaddrkor/jibun_rnaddrkor_incheon.txt
data/rnaddrkor/jibun_rnaddrkor_chungnam.txt
data/rnaddrkor/jibun_rnaddrkor_sejong.txt
data/rnaddrkor/jibun_rnaddrkor_daejeon.txt


In [172]:
import os
from tqdm import tqdm

# Bulk-load every lot-number (jibun) address file into `rnaddrkor_jibun`.
# Each file is read as CP949, rewritten as UTF-8 to a scratch file, and that
# scratch file is handed to LOAD DATA LOCAL INFILE.

total_line_count = 0  # running total of lines across all files

# The same scratch file (in the current working directory) is reused per input.
temp_file_path = "temp_file.txt"

for file in tqdm(file_list, desc='Processing files'):
    file_path = os.path.abspath(file)

    # Read the file as CP949; bytes that cannot be decoded are dropped
    # because of errors='ignore'.
    with open(file_path, 'r', encoding='cp949', errors='ignore') as f:
        lines = f.readlines()

    # Count this file's lines and add them to the running total.
    line_count = len(lines)
    total_line_count += line_count

    # Write the decoded lines back out as UTF-8 into the scratch file.
    with open(temp_file_path, 'w', encoding='utf8') as f:
        f.writelines(lines)

    sql = f'''
        LOAD DATA LOCAL INFILE "{temp_file_path}" INTO TABLE rnaddrkor_jibun
        FIELDS TERMINATED BY "|";
    '''
    print(f"Processing file: {file_path}, Number of lines: {line_count}")
    try:
        # query_update executes the statement and commits it.
        query_update(sql)
    finally:
        # Remove the scratch file even if the load fails.
        os.remove(temp_file_path)

print(f"Total number of lines: {total_line_count}")

Processing files:   0%|          | 0/17 [00:00<?, ?it/s]

Processing file: /Users/harampark/Desktop/labs/3.study/address-using-guide/code/chapter-6/data/rnaddrkor/jibun_rnaddrkor_daegu.txt, Number of lines: 28049


Processing files:   6%|▌         | 1/17 [00:00<00:08,  1.92it/s]

Processing file: /Users/harampark/Desktop/labs/3.study/address-using-guide/code/chapter-6/data/rnaddrkor/jibun_rnaddrkor_jeju.txt, Number of lines: 19240


Processing files:  12%|█▏        | 2/17 [00:00<00:05,  2.91it/s]

Processing file: /Users/harampark/Desktop/labs/3.study/address-using-guide/code/chapter-6/data/rnaddrkor/jibun_rnaddrkor_chungbuk.txt, Number of lines: 84792


Processing files:  18%|█▊        | 3/17 [00:01<00:08,  1.67it/s]

Processing file: /Users/harampark/Desktop/labs/3.study/address-using-guide/code/chapter-6/data/rnaddrkor/jibun_rnaddrkor_gyunggi.txt, Number of lines: 285178


Processing files:  29%|██▉       | 5/17 [00:05<00:13,  1.16s/it]

Processing file: /Users/harampark/Desktop/labs/3.study/address-using-guide/code/chapter-6/data/rnaddrkor/jibun_rnaddrkor_gwangju.txt, Number of lines: 17851
Processing file: /Users/harampark/Desktop/labs/3.study/address-using-guide/code/chapter-6/data/rnaddrkor/jibun_rnaddrkor_gangwon.txt, Number of lines: 178859


Processing files:  35%|███▌      | 6/17 [00:07<00:17,  1.58s/it]

Processing file: /Users/harampark/Desktop/labs/3.study/address-using-guide/code/chapter-6/data/rnaddrkor/jibun_rnaddrkor_gyeongbuk.txt, Number of lines: 143788


Processing files:  41%|████      | 7/17 [00:09<00:17,  1.80s/it]

Processing file: /Users/harampark/Desktop/labs/3.study/address-using-guide/code/chapter-6/data/rnaddrkor/jibun_rnaddrkor_jeonbuk.txt, Number of lines: 114937


Processing files:  47%|████▋     | 8/17 [00:11<00:15,  1.71s/it]

Processing file: /Users/harampark/Desktop/labs/3.study/address-using-guide/code/chapter-6/data/rnaddrkor/jibun_rnaddrkor_ulsan.txt, Number of lines: 18629


Processing files:  53%|█████▎    | 9/17 [00:11<00:10,  1.25s/it]

Processing file: /Users/harampark/Desktop/labs/3.study/address-using-guide/code/chapter-6/data/rnaddrkor/jibun_rnaddrkor_seoul.txt, Number of lines: 86986


Processing files:  59%|█████▉    | 10/17 [00:12<00:08,  1.19s/it]

Processing file: /Users/harampark/Desktop/labs/3.study/address-using-guide/code/chapter-6/data/rnaddrkor/jibun_rnaddrkor_gyeongnam.txt, Number of lines: 300283


Processing files:  65%|██████▍   | 11/17 [00:17<00:13,  2.21s/it]

Processing file: /Users/harampark/Desktop/labs/3.study/address-using-guide/code/chapter-6/data/rnaddrkor/jibun_rnaddrkor_jeonnam.txt, Number of lines: 308309


Processing files:  71%|███████   | 12/17 [00:21<00:13,  2.77s/it]

Processing file: /Users/harampark/Desktop/labs/3.study/address-using-guide/code/chapter-6/data/rnaddrkor/jibun_rnaddrkor_busan.txt, Number of lines: 42193


Processing files:  76%|███████▋  | 13/17 [00:22<00:09,  2.34s/it]

Processing file: /Users/harampark/Desktop/labs/3.study/address-using-guide/code/chapter-6/data/rnaddrkor/jibun_rnaddrkor_incheon.txt, Number of lines: 31745


Processing files:  82%|████████▏ | 14/17 [00:23<00:05,  1.77s/it]

Processing file: /Users/harampark/Desktop/labs/3.study/address-using-guide/code/chapter-6/data/rnaddrkor/jibun_rnaddrkor_chungnam.txt, Number of lines: 186499


Processing files:  88%|████████▊ | 15/17 [00:25<00:03,  1.92s/it]

Processing file: /Users/harampark/Desktop/labs/3.study/address-using-guide/code/chapter-6/data/rnaddrkor/jibun_rnaddrkor_sejong.txt, Number of lines: 5327
Processing file: /Users/harampark/Desktop/labs/3.study/address-using-guide/code/chapter-6/data/rnaddrkor/jibun_rnaddrkor_daejeon.txt, Number of lines: 11926


Processing files: 100%|██████████| 17/17 [00:25<00:00,  1.50s/it]

Total number of lines: 1864591





In [173]:
# Preview the first five loaded rows of `rnaddrkor_jibun`.
sql = "SELECT * FROM rnaddrkor_jibun LIMIT 5;"
query_get(sql)

[{'도로명주소관리번호': '11110101310001200009900004',
  '법정동코드': '1111010100',
  '시도명': '서울특별시',
  '시군구명': '종로구',
  '법정읍면동명': '청운동',
  '법정리명': '',
  '산여부': '0',
  '지번본번(번지)': 130,
  '지번부번(호)': 3,
  '도로명코드': '111103100012',
  '지하여부': '0',
  '건물본번': 99,
  '건물부번': 4,
  '이동사유코드': ''},
 {'도로명주소관리번호': '11110101310001200010100000',
  '법정동코드': '1111010100',
  '시도명': '서울특별시',
  '시군구명': '종로구',
  '법정읍면동명': '청운동',
  '법정리명': '',
  '산여부': '0',
  '지번본번(번지)': 129,
  '지번부번(호)': 2,
  '도로명코드': '111103100012',
  '지하여부': '0',
  '건물본번': 101,
  '건물부번': 0,
  '이동사유코드': ''},
 {'도로명주소관리번호': '11110101310001200010100000',
  '법정동코드': '1111010100',
  '시도명': '서울특별시',
  '시군구명': '종로구',
  '법정읍면동명': '청운동',
  '법정리명': '',
  '산여부': '0',
  '지번본번(번지)': 129,
  '지번부번(호)': 3,
  '도로명코드': '111103100012',
  '지하여부': '0',
  '건물본번': 101,
  '건물부번': 0,
  '이동사유코드': ''},
 {'도로명주소관리번호': '11110101310001200010100000',
  '법정동코드': '1111010100',
  '시도명': '서울특별시',
  '시군구명': '종로구',
  '법정읍면동명': '청운동',
  '법정리명': '',
  '산여부': '0',
  '지번본번(번지)': 131,
  '지번부번(

In [174]:
# Count the rows in `rnaddrkor_jibun` for comparison with the total printed by the upload loop.
sql = "SELECT COUNT(*) FROM rnaddrkor_jibun;"
query_get(sql)

[{'COUNT(*)': 1864591}]

## 6.6 SQL로 데이터 분석하기