In [2]:
import duckdb as duck
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd
from scipy.spatial import Voronoi
from shapely.geometry import Polygon
# import fiona
import warnings
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
# pandas display settings: up to 100 rows / 100 columns, floats rendered with two decimals.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('display.float_format', '{:.2f}'.format)

In [None]:
import os

# On-disk size of the DuckDB database file, converted from bytes to GiB.
size = os.path.getsize('myanalysis.db')
size / 1024 ** 3

In [None]:
# Open the DuckDB database file (read/write).
con = duck.connect(database='myanalysis.db', read_only=False) # author's observation: only one connection can be open at a time
# con = duck.connect()
# Memory cap. NOTE(review): the original note said 110GB, but the PRAGMA below sets 100GB.
# Spill-to-disk settings: temporary files go to /tmp.
con.execute("PRAGMA temp_directory='/tmp';")
con.execute("PRAGMA memory_limit='100GB';")

In [3]:
# Weekdays only (Mon-Fri) of July 2024, kept as 'YYYYMMDD' strings.
dates = pd.date_range('2024-07-01', '2024-07-31', freq='D')
weekday = dates[dates.dayofweek <= 4]
weekday_strs = [day.strftime('%Y%m%d') for day in weekday]

In [None]:
# Check how many weekdays fall in the selected month.
len(weekday_strs)

In [None]:
# sample_df = pd.read_csv('import_data/TB_KTS_DWTCD_METROPOLITAN/202407/TB_KTS_DWTCD_METROPOLITAN_20240701.csv')
# sample_df.dtypes

In [None]:
# 1. Parquet 파일 db로 이관

In [None]:
# Fetch the column names and types of the source Parquet file.
con.execute("desc from 'tb_transport_202407.parquet'").df()

In [None]:
# 1. 2024 7월 수단통행 DB 생성

# create_q = '''
# CREATE OR REPLACE TABLE tb_transport_202407
# (운행일자 VARCHAR(8),
# 가상카드번호 VARCHAR,
# 정산지역코드 VARCHAR,
# 교통수단코드 VARCHAR,
# 승차일시 VARCHAR,
# 정산사승차정류장ID VARCHAR,
# 하차일시 VARCHAR,
# 정산사하차정류장ID VARCHAR,
# 트랜잭션ID VARCHAR,
# 환승건수 INT,
# 이용거리 FLOAT,
# 탑승시간 FLOAT,
# 교통수단구분 VARCHAR)
# '''
# con.execute(create_q)

In [None]:
# 2. Parquet 파일 -> INSERT

# query = '''
# INSERT INTO tb_transport_202407
# SELECT *,
#     CASE
#         WHEN 교통수단코드 BETWEEN 200 AND 299
#     THEN 'T'
#         ELSE 'B'
#         END AS 교통수단구분
# FROM 'tb_transport_202407.parquet';
# '''
# con.execute(query)

# con.execute('SHOW TABLES').df()
# 테이블 설명
# >>> 1. tb_linked_transport_202407: 7월 한달에 대한 목적통행
# >>> 2. tb_station_cluster: 정류장 - 클러스터 연계된 파일

In [None]:
# 평일 기준 테이블 구분
# query = f'''
# INSERT INTO tb_transport_202407_weekday
# SELECT *
# FROM tb_transport_202407
# WHERE 운행일자 IN {weekday_strs}
# '''
# con.execute(query)

In [None]:
# 3. 정류장ID 앞에 숫자 없애기

# query = '''
# UPDATE tb_transport_202407
# SET
#     정산사승차정류장ID = REGEXP_REPLACE(정산사승차정류장ID, '^0+', ''),
#     정산사하차정류장ID = REGEXP_REPLACE(정산사하차정류장ID, '^0+', '')
# '''
# con.execute(query).df()

In [None]:
## 1.1. 목적통행으로 집계

In [None]:
# List the tables currently present in the database.
con.execute('show tables').df()

In [None]:
# # 목적통행용 테이블 생성
# query = '''
# CREATE OR REPLACE TABLE tb_linked_transport_202407
# (운행일자 VARCHAR(8),
# 가상카드번호 VARCHAR,
# 트랜잭션ID VARCHAR,
# 승차정류장ID VARCHAR,
# 하차정류장ID VARCHAR,
# 승차일시 VARCHAR,
# 하차일시 VARCHAR,
# 승차지역코드 VARCHAR,
# 하차지역코드 VARCHAR,
# 승차교통수단구분 VARCHAR,
# 하차교통수단구분 VARCHAR,
# 총이동거리 FLOAT,
# 총탑승시간 FLOAT
# )'''
# con.execute(query)
# print(con.execute('show tables').df())
# con.execute("select * from tb_linked_transport_202407 limit 10").df()

In [None]:
# 목적 통행 테이블 INSERT

# for weekday in weekday_strs:
#     query = f'''
#     INSERT INTO tb_linked_transport_202407
#     WITH summary AS(
#     SELECT 운행일자,
#         가상카드번호,
#         트랜잭션ID,
#         MAX(환승건수) AS 최대환승수,
#         SUM(이용거리) AS 총이동거리,
#         SUM(탑승시간) AS 총탑승시간
#     FROM tb_transport_202407_weekday
#     WHERE 운행일자 = {weekday}
#     GROUP BY 운행일자, 가상카드번호, 트랜잭션ID
#     )
#     SELECT s.운행일자,
#         s.가상카드번호,
#         s.트랜잭션ID,
#         t1.정산사승차정류장ID AS 승차정류장ID,
#         t2.정산사하차정류장ID AS 하차정류장ID,
#         t1.승차일시 AS 승차일시,
#         t2.하차일시 AS 하차일시,
#         t1.정산지역코드 AS 승차지역코드,
#         t2.정산지역코드 AS 하차지역코드,
#         t1.교통수단구분 AS 승차교통수단구분,
#         t2.교통수단구분 AS 하차교통수단구분,
#         s.총이동거리,
#         s.총탑승시간

#     FROM summary s
#     -- t1: 첫 통행
#     LEFT JOIN tb_transport_202407_weekday t1
#         ON s.운행일자 = t1.운행일자
#         AND s.가상카드번호 = t1.가상카드번호
#         AND s.트랜잭션ID = t1.트랜잭션ID
#         AND t1.환승건수 = 0
#     -- t2: 마지막 통행
#     LEFT JOIN tb_transport_202407_weekday t2
#         ON s.운행일자 = t2.운행일자
#         AND s.가상카드번호 = t2.가상카드번호
#         AND s.트랜잭션ID = t2.트랜잭션ID
#         AND s.최대환승수 = t2.환승건수
#     '''
#     con.query(query)
#     print(f"{weekday} db화 완료")

In [None]:
# 중간 확인용
# con.query("SELECT * FROM tb_linked_transport_202407 LIMIT 10").df()

# 779건(0.01% 원본데이터 문제 -> 드랍하고 해도 될 것)
# 데이터 정확성을 위해 드랍

# print(con.query("SELECT count(*) FROM tb_linked_transport_202407").df())
# con.query("DELETE FROM tb_linked_transport_202407 WHERE 승차일시 > 하차일시")
# print(con.query("SELECT count(*) FROM tb_linked_transport_202407").df()) # -> 16283개 드랍

In [None]:
# 가상카드번호 기준 몇 명 집계되었는지 > 총: 327,308,993행, 24,261,395명
# con.execute("select DISTINCT(가상카드번호) from tb_linked_transport_202407;").df()

In [None]:
# List the tables again after the linked-trip aggregation step.
con.execute("show tables").df()

In [None]:
# 2. 정류장 집계

In [None]:
## 2.1. 정류장,클러스터ID DB 이관

In [None]:
# Import the station file in which every station already has a cluster ID mapped to it.

# Load it into DuckDB: create the table with explicit column types, then insert the CSV rows.
# The statement is only built here; the execute call right below is commented out, so the
# preview at the end of this cell relies on tb_station_cluster already existing in the database.
query = '''
CREATE OR REPLACE TABLE tb_station_cluster (
column00 BIGINT,
운행일자 BIGINT,
정산사코드 BIGINT,
지역코드 VARCHAR,
교통수단구분 VARCHAR,
정류장ID VARCHAR,
정류장명칭 VARCHAR,
법정동코드 BIGINT,
정류장ARS번호 VARCHAR,
정류장GPSY좌표 DOUBLE,
정류장GPSX좌표 DOUBLE,
geometry VARCHAR,
cluster_id INT);
INSERT INTO tb_station_cluster
SELECT *
FROM read_csv('station_labeled_2407.csv');
'''
# con.execute(query)

# Drop the columns that are not needed (the original note said "tables", but these are columns).
# query = '''
# ALTER TABLE tb_station_cluster DROP COLUMN column00;
# ALTER TABLE tb_station_cluster DROP COLUMN 운행일자;
# ALTER TABLE tb_station_cluster DROP COLUMN 정산사코드;
# ALTER TABLE tb_station_cluster DROP COLUMN 정류장ARS번호;
# '''
# con.execute(query)

# Preview the first rows of the station-cluster table.
con.execute("SELECT * FROM tb_station_cluster limit 5").df()

In [None]:
# Show the schema of both tables that are joined below.
# A notebook cell renders only its last expression, so each result is passed to display()
# explicitly; otherwise the tb_station_cluster schema would be computed and thrown away.
display(con.execute("desc tb_station_cluster").df())
display(con.execute("desc tb_linked_transport_202407").df())

In [None]:
# Count linked trips that have no cluster, to check how well the clustering covers the data.

# Recorded results, out of 327308993 rows in total:
# boarding cluster NULL      - 2686898 rows (0.82%)
# boarding station ID NULL   - 354142 rows (0.11%)
# alighting cluster NULL     - 9110372 rows (2.78%)
# alighting station ID NULL  - 6909652 rows (2.11%)

# As written, the WHERE clause counts only the "boarding station ID NULL" case;
# the other three figures are not produced by this exact query.
con.execute('''
SELECT COUNT(*)
FROM tb_linked_transport_202407 t
-- 1) 승차클러스터 조인
LEFT JOIN tb_station_cluster a --
    ON t.승차정류장ID = a.정류장ID
    AND t.승차지역코드 = a.지역코드
    AND t.승차교통수단구분 = a.교통수단구분
-- 2) 하차클러스터 조인
LEFT JOIN tb_station_cluster b
    ON t.하차정류장ID = b.정류장ID
    AND t.하차지역코드 = b.지역코드
    AND t.하차교통수단구분 = b.교통수단구분
WHERE 승차정류장ID IS NULL
''').df()

In [None]:
# Express each recorded NULL count as a percentage of all linked-trip rows.
values = [2686898, 354142, 9110372, 6909652]
total_rows = 327308993
for value in values:
    percent = round(value / total_rows * 100, 2)
    print(f"{percent} %")

In [None]:
## 2.2. 정류장, 클러스터ID 결합

In [None]:
# Preview: linked trips with the boarding / alighting cluster IDs attached.
# Stations are matched on station ID + region code + transport mode, the same key the
# commuting tables in this notebook are built with. Matching on station ID alone can fan
# one trip out into several rows if an ID appears under more than one region or mode.
con.execute('''SELECT t.*,
     a.cluster_id AS 승차클러스터ID,
     b.cluster_id AS 하차클러스터ID
FROM tb_linked_transport_202407 t
LEFT JOIN tb_station_cluster a
    ON t.승차정류장ID = a.정류장ID
    AND t.승차지역코드 = a.지역코드
    AND t.승차교통수단구분 = a.교통수단구분
LEFT JOIN tb_station_cluster b
    ON t.하차정류장ID = b.정류장ID
    AND t.하차지역코드 = b.지역코드
    AND t.하차교통수단구분 = b.교통수단구분
LIMIT 10''').df()

In [None]:
# Build tb_commuting_cardid_202407: one row per card with an inferred home cluster
# (주거지클러스터) and work cluster (업무지클러스터).
#
# Steps inside the statement:
#   tagging      - linked trips with boarding / alighting cluster IDs attached
#                  (station ID + region code + transport mode as the join key).
#   *_freq       - per card and cluster, the number of trips inside a time window.
#   *_mode       - per card, the cluster(s) with the highest count; ties yield several rows.
#   final        - home cluster is set when the AM-boarding mode equals the PM-alighting mode,
#                  work cluster when the AM-alighting mode equals the PM-boarding mode.
#   outer SELECT - MIN() collapses tied candidates to one value per card, and HAVING keeps
#                  only cards for which both clusters are non-NULL.
#
# NOTE(review): SUBSTR(<timestamp>, 9, 2) is presumably the hour of a 'YYYYMMDDHHMMSS'
#   string - confirm against the source data. BETWEEN is inclusive, so the AM window
#   also takes hour 12 and the PM window is written as 16..24.
# NOTE(review): trips whose cluster ID is NULL are grouped like any other value, so NULL
#   can become a card's most frequent "cluster"; such a card then fails the equality test
#   and is dropped by HAVING. Confirm this is intended.
# NOTE(review): `final` starts from am_dep_mode, so a card without any AM boarding in the
#   window never reaches the result.
q = '''
CREATE OR REPLACE TABLE tb_commuting_cardid_202407 AS
-- 1. 정류장 정보
WITH tagging AS(
SELECT t.*,
     a.cluster_id AS 승차클러스터ID,
     b.cluster_id AS 하차클러스터ID
FROM tb_linked_transport_202407 t
LEFT JOIN tb_station_cluster a
    ON t.승차정류장ID = a.정류장ID
    AND t.승차지역코드 = a.지역코드
    AND t.승차교통수단구분 = a.교통수단구분
LEFT JOIN tb_station_cluster b
    ON t.하차정류장ID = b.정류장ID
    AND t.하차지역코드 = b.지역코드
    AND t.하차교통수단구분 = b.교통수단구분
),


-- 2. 오전 6~12시 승차 기준 주거클러스터 후보
am_dep_freq AS(
    SELECT
    가상카드번호,
    승차클러스터ID
    , COUNT(*) AS freq
    FROM tagging
    WHERE SUBSTR(승차일시, 9, 2)::INT BETWEEN 6 AND 12
    GROUP BY 가상카드번호, 승차클러스터ID
),
am_dep_mode AS(
    SELECT
    f.가상카드번호,
    f.승차클러스터ID AS am_dep_cluster
    FROM am_dep_freq f
    JOIN(
        SELECT
        가상카드번호, MAX(freq) AS maxf
        FROM am_dep_freq
        GROUP BY 가상카드번호
    ) m
    ON f.가상카드번호 = m.가상카드번호 AND f.freq = m.maxf
),

-- 3. 오후 16-24시 하차 기준 주거클러스터 후보
pm_arr_freq AS(
    SELECT 가상카드번호, 하차클러스터ID, COUNT(*) AS freq
    FROM tagging
    WHERE SUBSTR(하차일시, 9,2)::INT BETWEEN 16 AND 24
    GROUP BY 가상카드번호, 하차클러스터ID
),
pm_arr_mode AS(
    SELECT
    f.가상카드번호,
    f.하차클러스터ID AS pm_arr_cluster
    FROM pm_arr_freq f
    JOIN(
        SELECT 가상카드번호, MAX(freq) AS maxf
        FROM pm_arr_freq
        GROUP BY 가상카드번호
        ) m
    ON f.가상카드번호 = m.가상카드번호 AND f.freq = m.maxf
),

-- 4. 오전 6~12시 하차 기준 업무클러스터 후보
am_arr_freq AS(
    SELECT
    가상카드번호,
    하차클러스터ID
    , COUNT(*) AS freq
    FROM tagging
    WHERE SUBSTR(하차일시, 9, 2)::INT BETWEEN 6 AND 12
    GROUP BY 가상카드번호, 하차클러스터ID
),
am_arr_mode AS(
    SELECT
    f.가상카드번호,
    f.하차클러스터ID AS am_arr_cluster
    FROM am_arr_freq f
    JOIN(
        SELECT 가상카드번호, MAX(freq) AS maxf
        FROM am_arr_freq
        GROUP BY 가상카드번호
    ) m
    ON f.가상카드번호 = m.가상카드번호 AND f.freq = m.maxf
),

-- 5. 오후 16~24시 승차 기준 업무클러스터 후보
pm_dep_freq AS(
    SELECT 가상카드번호, 승차클러스터ID, COUNT(*) AS freq
    FROM tagging
    WHERE SUBSTR(승차일시, 9,2)::INT BETWEEN 16 AND 24
    GROUP BY 가상카드번호, 승차클러스터ID
),
pm_dep_mode AS(
    SELECT
    f.가상카드번호,
    f.승차클러스터ID AS pm_dep_cluster
    FROM pm_dep_freq f
    JOIN(
        SELECT 가상카드번호, MAX(freq) AS maxf
        FROM pm_dep_freq
        GROUP BY 가상카드번호
        ) m
    ON f.가상카드번호 = m.가상카드번호 AND f.freq = m.maxf
),

-- 6. 최종 결과물 테이블
final AS (
    SELECT
    amd.가상카드번호,
    amd.am_dep_cluster,
    pma.pm_arr_cluster,
    ama.am_arr_cluster,
    pmd.pm_dep_cluster,

    -- 주거클러스터 조건
    CASE WHEN amd.am_dep_cluster = pma.pm_arr_cluster THEN amd.am_dep_cluster END AS 주거지클러스터,

    -- 업무클러스터 조건
    CASE WHEN ama.am_arr_cluster = pmd.pm_dep_cluster THEN ama.am_arr_cluster END AS 업무지클러스터
    FROM am_dep_mode amd
    LEFT JOIN pm_arr_mode pma
        ON amd.가상카드번호 = pma.가상카드번호
    LEFT JOIN am_arr_mode ama
        ON amd.가상카드번호 = ama.가상카드번호
    LEFT JOIN pm_dep_mode pmd
        ON amd.가상카드번호 = pmd.가상카드번호
)
SELECT
    가상카드번호,
    MIN(주거지클러스터) AS 주거지클러스터, -- 중복 제거
    MIN(업무지클러스터) AS 업무지클러스터
FROM final
GROUP BY 가상카드번호
HAVING
    MIN(주거지클러스터) IS NOT NULL
    AND MIN(업무지클러스터) IS NOT NULL
'''
# Run the statement; .df() returns whatever result DuckDB reports for the CREATE TABLE AS.
con.execute(q).df()

In [None]:
# Number of distinct cards identified as commuters.
# The count is computed inside DuckDB instead of pulling every card ID into pandas
# only to read the row count off the rendered frame.
con.execute('SELECT COUNT(DISTINCT 가상카드번호) AS commuter_count FROM tb_commuting_cardid_202407').df()
# 1720~   (original author's note; its meaning is not evident from the notebook)

In [None]:
## 2.3. 출퇴근 정류장 기반 집계

In [None]:
# List the tables available before building the station-level commute tables.
con.execute('show tables').df()

In [None]:
# Peek at one linked-trip row to recall the column layout.
con.execute('select * from tb_linked_transport_202407 limit 1').df()

In [None]:
# Peek at one station-cluster row to recall the column layout.
con.execute('select * from tb_station_cluster limit 1').df()

In [None]:
# (Re)create the two empty result tables, one for trips towards work (morning) and one for
# trips towards home (evening). Both share the same layout: home ("live") and work station
# ID, name, transport mode, admin code and coordinates, followed by the trip count and the
# average / median ride time and distance. CREATE OR REPLACE makes the cell safe to re-run.
query = '''
-- 1. 출근 통행 테이블
CREATE OR REPLACE TABLE tb_morning_commute_transport_202407
(live_station_id VARCHAR,
work_station_id VARCHAR,
live_station_name VARCHAR,
work_station_name VARCHAR,
live_station_type VARCHAR,
work_station_type VARCHAR,
live_station_admin VARCHAR,
work_station_admin VARCHAR,
live_station_x DOUBLE,
live_station_y DOUBLE,
work_station_x DOUBLE,
work_station_y DOUBLE,
morning_commute_count INT,
morning_commute_average_time FLOAT,
morning_commute_median_time FLOAT,
morning_commute_average_distance FLOAT,
morning_commute_median_distance FLOAT);

-- 2. 퇴근 통행 테이블
CREATE OR REPLACE TABLE tb_evening_commute_transport_202407
(live_station_id VARCHAR,
work_station_id VARCHAR,
live_station_name VARCHAR,
work_station_name VARCHAR,
live_station_type VARCHAR,
work_station_type VARCHAR,
live_station_admin VARCHAR,
work_station_admin VARCHAR,
live_station_x DOUBLE,
live_station_y DOUBLE,
work_station_x DOUBLE,
work_station_y DOUBLE,
evening_commute_count INT,
evening_commute_average_time FLOAT,
evening_commute_median_time FLOAT,
evening_commute_average_distance FLOAT,
evening_commute_median_distance FLOAT);
'''
con.execute(query)

In [None]:
# Fill tb_morning_commute_transport_202407 (trips towards work).
# `tagging` attaches cluster ID, station name, admin code and coordinates to both ends of
# every linked trip. A trip is kept when its card is in tb_commuting_cardid_202407 and the
# trip boards in that card's home cluster and alights in its work cluster; the boarding
# station is therefore the "live" station and the alighting station the "work" station.
# Kept trips are aggregated per station pair (count, mean / median ride time and distance).
#
# NOTE(review): the SELECT aliases spelled "averge" differ from the target column names
#   ("average"); presumably harmless because INSERT ... SELECT maps columns by position.
# NOTE(review): no time-of-day filter is applied here, so every home-to-work-cluster trip
#   counts as a "morning" commute regardless of its hour - confirm this is intended.
query = '''
-- 1. 정류장 정보
INSERT INTO tb_morning_commute_transport_202407
WITH tagging AS(
SELECT t.*,
     a.cluster_id AS 승차클러스터ID,
     b.cluster_id AS 하차클러스터ID,
     a.정류장명칭 AS 승차정류장명칭,
     b.정류장명칭 AS 하차정류장명칭,
     a.법정동코드 AS 승차법정동코드,
     b.법정동코드 aS 하차법정동코드,
     a.정류장GPSX좌표 AS 승차정류장_x,
     a.정류장GPSY좌표 AS 승차정류장_y,
     b.정류장GPSX좌표 AS 하차정류장_x,
     b.정류장GPSY좌표 AS 하차정류장_y
FROM tb_linked_transport_202407 t
LEFT JOIN tb_station_cluster a
    ON t.승차정류장ID = a.정류장ID
    AND t.승차지역코드 = a.지역코드
    AND t.승차교통수단구분 = a.교통수단구분
LEFT JOIN tb_station_cluster b
    ON t.하차정류장ID = b.정류장ID
    AND t.하차지역코드 = b.지역코드
    AND t.하차교통수단구분 = b.교통수단구분
)

-- 2. 출근 통행 테이블

SELECT t.승차정류장ID AS live_station_id,
    t.하차정류장ID AS work_station_id,
    t.승차정류장명칭 AS live_station_name,
    t.하차정류장명칭 AS work_station_name,
    t.승차교통수단구분 AS live_station_type,
    t.하차교통수단구분 AS work_station_type,
    t.승차법정동코드 AS live_station_admin,
    t.하차법정동코드 AS work_station_admin,
    t.승차정류장_x AS live_station_x,
    t.승차정류장_y AS live_station_y,
    t.하차정류장_x AS work_station_x,
    t.하차정류장_y AS work_station_y,
    COUNT(*) AS morning_commute_count,
    AVG(t.총탑승시간) AS morning_commute_averge_time,
    MEDIAN(t.총탑승시간) AS morning_commute_median_time,
    AVG(t.총이동거리) AS morning_commute_averge_distance,
    MEDIAN(t.총이동거리) AS morning_commute_median_distance
FROM tagging t
JOIN tb_commuting_cardid_202407 c
    ON t.가상카드번호 = c.가상카드번호
    AND t.승차클러스터ID = c.주거지클러스터
    AND t.하차클러스터ID = c.업무지클러스터
GROUP BY t.승차정류장ID, t.하차정류장ID, t.승차교통수단구분, t.하차교통수단구분, t.승차정류장명칭,
    t.하차정류장명칭, t.승차법정동코드, t.하차법정동코드,  t.하차정류장_x, t.하차정류장_y, t.승차정류장_x, t.승차정류장_y

'''
con.execute(query)

In [None]:
# Fill tb_evening_commute_transport_202407 (trips towards home).
# `tagging` attaches cluster ID, station name, admin code and coordinates to both ends of
# every linked trip. A trip is kept when its card is in tb_commuting_cardid_202407 and the
# trip boards in that card's work cluster and alights in its home cluster. For these trips
# the ALIGHTING station is the "live" station and the BOARDING station is the "work" station.
#
# Fix: every live_* column is now taken from the alighting side and every work_* column from
# the boarding side. Previously only the IDs and coordinates were swapped, while name,
# transport mode and admin code were still copied from the opposite station, so each row
# mixed attributes of two different stations.
#
# The "averge" aliases are left as they were; INSERT ... SELECT maps columns by position,
# so the alias spelling does not affect what is written.
query = '''
-- 1. 정류장 정보
INSERT INTO tb_evening_commute_transport_202407
WITH tagging AS(
SELECT t.*,
     a.cluster_id AS 승차클러스터ID,
     b.cluster_id AS 하차클러스터ID,
     a.정류장명칭 AS 승차정류장명칭,
     b.정류장명칭 AS 하차정류장명칭,
     a.법정동코드 AS 승차법정동코드,
     b.법정동코드 AS 하차법정동코드,
     a.정류장GPSX좌표 AS 승차정류장_x,
     a.정류장GPSY좌표 AS 승차정류장_y,
     b.정류장GPSX좌표 AS 하차정류장_x,
     b.정류장GPSY좌표 AS 하차정류장_y
FROM tb_linked_transport_202407 t
LEFT JOIN tb_station_cluster a
    ON t.승차정류장ID = a.정류장ID
    AND t.승차지역코드 = a.지역코드
    AND t.승차교통수단구분 = a.교통수단구분
LEFT JOIN tb_station_cluster b
    ON t.하차정류장ID = b.정류장ID
    AND t.하차지역코드 = b.지역코드
    AND t.하차교통수단구분 = b.교통수단구분
)
-- 2. 퇴근 통행 테이블
SELECT t.하차정류장ID AS live_station_id,
    t.승차정류장ID AS work_station_id,
    t.하차정류장명칭 AS live_station_name,
    t.승차정류장명칭 AS work_station_name,
    t.하차교통수단구분 AS live_station_type,
    t.승차교통수단구분 AS work_station_type,
    t.하차법정동코드 AS live_station_admin,
    t.승차법정동코드 AS work_station_admin,
    t.하차정류장_x AS live_station_x,
    t.하차정류장_y AS live_station_y,
    t.승차정류장_x AS work_station_x,
    t.승차정류장_y AS work_station_y,
    COUNT(*) AS evening_commute_count,
    AVG(t.총탑승시간) AS evening_commute_averge_time,
    MEDIAN(t.총탑승시간) AS evening_commute_median_time,
    AVG(t.총이동거리) AS evening_commute_averge_distance,
    MEDIAN(t.총이동거리) AS evening_commute_median_distance
FROM tagging t
JOIN tb_commuting_cardid_202407 c
    ON t.가상카드번호 = c.가상카드번호
    AND t.승차클러스터ID = c.업무지클러스터
    AND t.하차클러스터ID = c.주거지클러스터
GROUP BY t.승차정류장ID, t.하차정류장ID, t.승차교통수단구분, t.하차교통수단구분, t.승차정류장명칭,
    t.하차정류장명칭, t.승차법정동코드, t.하차법정동코드, t.하차정류장_x, t.하차정류장_y, t.승차정류장_x, t.승차정류장_y
'''

con.execute(query)

In [None]:
# Pull both station-level commute tables into pandas.
df_morning = con.execute('SELECT * FROM tb_morning_commute_transport_202407').df()
df_evening = con.execute('SELECT * FROM tb_evening_commute_transport_202407').df()

# Missing-value count per column, morning table first.
for commute_df in (df_morning, df_evening):
    print(commute_df.isna().sum())

In [None]:
# Row counts of the morning and evening tables, one per line.
print(len(df_morning), len(df_evening), sep='\n')

In [None]:
# Preview the first rows of the morning commute table.
df_morning.head()

In [None]:
# 중복여부 확인 - 정류장ID, 좌표, 교통수단구분

In [None]:
# Check for duplicated home-station / work-station pairs.
# Author's finding: keyed on station ID, coordinates and transport mode there are no duplicates.
# A notebook cell renders only its last expression, so both results are passed to display();
# otherwise the morning check would be computed and silently discarded.
dup_key = ['live_station_id', 'work_station_id', 'live_station_x', 'live_station_y', 'work_station_x', 'work_station_y', 'live_station_type', 'work_station_type']
display(df_morning[df_morning.duplicated(subset=dup_key, keep=False)].sort_values(['live_station_id', 'work_station_id']))
display(df_evening[df_evening.duplicated(subset=dup_key, keep=False)].sort_values(['live_station_id', 'work_station_id']))

In [None]:
# Export both commute tables as Parquet files (the original note said CSV, but FORMAT PARQUET is used).
# NOTE(review): presumably the 'output/' directory must already exist - confirm before running.
query = '''
COPY tb_morning_commute_transport_202407 TO 'output/morning_commuting_transport_202407.parquet' (FORMAT PARQUET);
COPY tb_evening_commute_transport_202407 TO 'output/evening_commuting_transport_202407.parquet' (FORMAT PARQUET);
'''
con.execute(query)