In [None]:
import duckdb as duck
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd
import seaborn as sns
from scipy.spatial import Voronoi
import warnings
# Silence every warning for the rest of the session. Note that this also hides
# pandas/geopandas diagnostics (e.g. chained-assignment warnings).
warnings.filterwarnings(action='ignore')

# Notebook display preferences: longer/wider frame previews and two-decimal floats.
_display_options = {
    'display.max_rows': 150,
    'display.max_columns': 100,
    'display.float_format': '{:.2f}'.format,
}
for _option_name, _option_value in _display_options.items():
    pd.set_option(_option_name, _option_value)

from teenager_cluster import *

In [None]:
# Open the DuckDB database file used throughout this notebook (read-write).
con = duck.connect(database='myanalysis.db', read_only=False) # observed: only one connection can be open at a time
# con = duck.connect()

# Spill-to-disk directory and memory cap for large queries.
# NOTE(review): the original comment said "110GB limit" but the PRAGMA below sets 100GB — confirm which is intended.
con.execute("PRAGMA temp_directory='/tmp';")
con.execute("PRAGMA memory_limit='100GB';")

# 1. 학구도 & 학교 데이터 import

In [None]:
# School-district (hakgudo) polygons for high and middle schools; both shapefiles are read as EUC-KR.
high_hakgudo, middle_hakgudo = (
    gpd.read_file(shp_path, encoding='euc-kr')
    for shp_path in ('upload/HighschoolHakgudo.shp', 'upload/MiddleSchoolHakgudo.shp')
)

# School master table, trimmed to the columns this notebook uses.
school_columns = ['학교ID', '학교명', '학교급구분', '설립형태', '운영상태', '위도','경도']
school_map = pd.read_csv('upload/hakgudoKoreaSchoolMap20250325.csv')[school_columns]

In [None]:
# Stack the middle- and high-school district polygons into a single GeoDataFrame.
district_columns = ['HAKGUDO_ID', 'HAKGUDO_NM', 'geometry']
print(len(high_hakgudo))
print(len(middle_hakgudo))
# ignore_index=True: each input carries its own index, so a plain concat would leave
# duplicate labels and make `index_right` from later spatial joins ambiguous.
hakgudo_df = pd.concat([high_hakgudo[district_columns], middle_hakgudo[district_columns]],
                       ignore_index=True)
print(len(hakgudo_df))

# Reproject to EPSG:5179 so that it matches the school/station layers used below.
hakgudo_gdf = gpd.GeoDataFrame(hakgudo_df, geometry='geometry')
hakgudo_gdf = hakgudo_gdf.to_crs('epsg:5179')
hakgudo_gdf.head()

# 2. 사립(NON-공립) 학교 buffer 설정
- 사립 학교 위치를 중심으로 반경 800미터 buffer 생성

In [None]:
# Build school points from longitude/latitude (EPSG:4326), reproject to EPSG:5179,
# then drop elementary schools.
school_points = gpd.points_from_xy(school_map['경도'], school_map['위도'])
school_gdf = gpd.GeoDataFrame(school_map, geometry=school_points, crs='epsg:4326').to_crs('epsg:5179')
school_gdf = school_gdf[school_gdf['학교급구분'] != '초등학교']
school_gdf.head()

In [None]:
# Distribution of establishment types (설립형태) among the remaining schools.
school_gdf['설립형태'].value_counts()

In [None]:
# Split schools by establishment type and give each private school a circular buffer.
PRIVATE_BUFFER_RADIUS_M = 800  # radius in CRS units; school_gdf is in EPSG:5179 (metres)

non_private = school_gdf.query("설립형태 != '사립'")
# .copy() so the geometry replacement below writes to an independent frame instead of a
# slice of school_gdf (a chained-assignment hazard that the global warning filter hides).
private_buffer = school_gdf.query("설립형태 == '사립'").copy()

# Replace each private-school point with its buffer polygon. The frame is already a
# GeoDataFrame, so assigning to its geometry column needs no re-wrapping.
private_buffer['geometry'] = private_buffer.geometry.buffer(PRIVATE_BUFFER_RADIUS_M)
private_buffer.head(3)

In [None]:
# private_buffer.to_file('private_test.geojson')

# 3. 학구도 프로세스 정의

In [None]:
# Station coordinate data (the file name suggests a 2024-12-31 snapshot).
station_coords = pd.read_csv('import_data/TB_KTS_STTN/202412/TB_KTS_STTN_20241231.csv')
station_points = gpd.points_from_xy(station_coords['정류장GPSX좌표'], station_coords['정류장GPSY좌표'])
station_gdf = gpd.GeoDataFrame(station_coords, geometry=station_points, crs='epsg:4326')

# Reproject to EPSG:5179 and keep only the identifying columns plus geometry.
station_keep_columns = ['정류장ID', '지역코드', '교통수단구분', '정류장명칭', 'geometry']
station_gdf = station_gdf.to_crs('epsg:5179')[station_keep_columns]
station_gdf.head()

In [None]:
from shapely.geometry import Point
# Prototype of the district check on a tiny hand-made table.
# Step 1: does the centroid of a card's school cluster fall inside a private-school 800 m buffer?
# Planned output: when it does not, add a tag column is_hakgudo => True / False.

# Dummy card-level records, one row per (card, station) visit.
_dummy_columns = ['가상카드번호', '정류장ID', '지역코드', '교통수단구분', '클러스터구분', 'x_5179', 'y_5179']
_dummy_rows = [
    ('2134', 'A', 'ab123', 'T', 'home', 111100.444, 11110.444),
    ('2134', 'B', 'ab123', 'T', 'home', 102050.504, 10200.504),
    ('1234', 'A', 'ab123', 'T', 'home', 104433.530, 10443.530),
    ('1234', 'B', 'ab123', 'T', 'school', 140429.402, 14042.402),
    ('1234', 'C', 'ab123', 'T', 'school', 103055.302, 10305.302),
]
dummy_card_data = pd.DataFrame(_dummy_rows, columns=_dummy_columns)

# 1) Centroid of each (card, cluster) group: the mean of its station coordinates.
grouped = dummy_card_data.groupby(['가상카드번호','클러스터구분'])
centroids_df = grouped[['x_5179', 'y_5179']].agg('mean').reset_index()
centroids_df['geometry'] = [
    Point(x_coord, y_coord)
    for x_coord, y_coord in zip(centroids_df['x_5179'], centroids_df['y_5179'])
]
centroids_gdf = gpd.GeoDataFrame(centroids_df, geometry='geometry', crs='epsg:5179')
centroids_gdf.head()

In [None]:
# Tag school-cluster centroids that fall inside a private-school buffer.
school_centroids = centroids_gdf[centroids_gdf['클러스터구분'] == 'school']
school_private_match = gpd.sjoin(school_centroids, private_buffer, how='left', predicate='within')

# A non-null index_right means the left join found a containing buffer.
inside_private_buffer = school_private_match['index_right'].notnull()
school_private_match['is_hakgudo'] = inside_private_buffer.map({True:'사립', False:None})
school_private_match

In [None]:
# Next question: do a card's home and school clusters sit in a single hakgudo?
# Start by isolating the home-cluster centroids.
home_centroids = centroids_gdf[centroids_gdf['클러스터구분'] == 'home']



# 4. Time-windowed table

In [None]:
# Row count of the DBSCAN-clustered card table.
row_count_sql = "select count(*) from teenager.tb_cardid_dbscan_clustered"
con.execute(row_count_sql).df()

In [None]:
# Per-card, per-cluster centroid: average the station x/y over each
# (가상카드번호, 클러스터구분) group of the DBSCAN-clustered table.
# - Stations with a NULL x or y are dropped before the join.
# - The LEFT JOIN keeps clustered rows that have no matching station; AVG skips their
#   NULL coordinates, so a group with no matched station at all gets a NULL centroid.
# - Station ids are cast to BIGINT on both sides before being compared.
query = '''
WITH dbscan_clustered AS(
    SELECT *
    FROM teenager.tb_cardid_dbscan_clustered
),
-- 정류장 결합
stations AS (
    SELECT 지역코드, 
        교통수단구분, 
        정류장ID,
        x, 
        y 
    FROM tb_station_fixed_20241231
    WHERE x IS NOT NULL 
        AND y IS NOT NULL
)
SELECT 
    dc.가상카드번호,
    dc.클러스터구분,
    AVG(s.x) AS x_centroid,
    AVG(s.y) AS y_centroid
FROM dbscan_clustered dc
LEFT JOIN stations s
    ON dc.정류장ID::BIGINT = s.정류장ID::BIGINT
    AND dc.지역코드 = s.지역코드
    AND dc.교통수단구분 = s.교통수단구분
GROUP BY dc.가상카드번호, dc.클러스터구분
'''
# Run the aggregation in DuckDB and pull the result into a pandas DataFrame.
home_school_centroid = con.execute(query).df()
home_school_centroid

In [None]:
# Percentage difference between two hard-coded row counts.
# NOTE(review): presumably 2326402 = centroid rows before, and 2201122 = rows after keeping
# only cards with both clusters — confirm against the cells that produced these numbers.
rows_before, rows_after = 2326402, 2201122
(rows_before - rows_after)/rows_before*100

In [None]:
# Compare the number of distinct cards overall with the number of cards that have
# exactly two centroid rows. The second figure is computed inline because
# `h_s_clustered` is only assigned in a later cell, so referencing it here raised
# NameError on a fresh kernel (Restart & Run All).
centroid_rows_by_card = home_school_centroid.groupby('가상카드번호').size()
print(home_school_centroid.가상카드번호.nunique())
print(int((centroid_rows_by_card == 2).sum()))

In [None]:
# Keep only cards that have exactly two centroid rows, i.e. both a home (H) and a school (S)
# cluster — assumes those are the only two cluster labels; TODO confirm.
# Recorded run: 2201122 rows -> 1,100,561 people
cluster_count_by_card = home_school_centroid.groupby('가상카드번호').size()
card_has_two_rows = cluster_count_by_card.eq(2)
h_s_clustered = home_school_centroid.loc[home_school_centroid['가상카드번호'].map(card_has_two_rows)]
len(h_s_clustered)

In [None]:
# Turn the filtered centroids into point geometries and reproject to EPSG:5179.
# NOTE(review): x_centroid / y_centroid are declared as EPSG:4326 here — confirm that
# tb_station_fixed_20241231.x / y are stored as longitude / latitude.
centroid_points = gpd.points_from_xy(h_s_clustered['x_centroid'], h_s_clustered['y_centroid'])
home_school_centroid_gdf = gpd.GeoDataFrame(h_s_clustered, geometry=centroid_points, crs='epsg:4326')
home_school_centroid_gdf = home_school_centroid_gdf.to_crs('epsg:5179')[['가상카드번호', '클러스터구분', 'geometry']]
home_school_centroid_gdf

In [None]:
# 1. Check whether each school-cluster centroid lies inside a private-school buffer.
school_centroid_gdf = home_school_centroid_gdf.query("클러스터구분 == 'school'")
school_private_mapped = school_centroid_gdf.sjoin(private_buffer, how='left', predicate='within')

# Buffers of nearby private schools can overlap, in which case the left join emits one row
# per containing buffer. The join keeps the left index, so collapse back to one row per
# centroid; nothing is lost because only the "inside any buffer" flag is kept below.
# Without this, the total printed next and every later merge on 가상카드번호 are inflated.
school_private_mapped = school_private_mapped[~school_private_mapped.index.duplicated(keep='first')].copy()
print(f"전체 학교 센트로이드: {len(school_private_mapped)}")
print(f"국공립 예상: {school_private_mapped.index_right.isnull().sum()}")

# Flag column: True when the centroid matched a private-school buffer.
school_private_mapped['is_hakgudo_private'] = school_private_mapped['index_right'].notnull()
school_private_mapped = school_private_mapped[['가상카드번호', '클러스터구분', 'geometry', 'is_hakgudo_private']]
school_private_mapped.head()

In [None]:
# District comparison: find the hakgudo polygon containing each home and school centroid.
home_centroid_gdf = home_school_centroid_gdf.query("클러스터구분 == 'home'")

school_joined = gpd.sjoin(school_centroid_gdf, hakgudo_gdf, how='left', predicate='within')
home_joined = gpd.sjoin(home_centroid_gdf, hakgudo_gdf, how='left', predicate='within')

school_joined = school_joined[['가상카드번호', 'HAKGUDO_ID']].rename(columns={'HAKGUDO_ID':'school_hakgudo_id'})
home_joined = home_joined[['가상카드번호', 'HAKGUDO_ID']].rename(columns={'HAKGUDO_ID':'home_hakgudo_id'})

# Merge and compare.
# NOTE(review): hakgudo_gdf stacks middle- and high-school districts, so a centroid may match
# more than one polygon; the merge then holds every school x home combination per card and
# is_hakgudo is evaluated per combination, not per card — confirm this is intended.
home_school_joined_merge = pd.merge(school_joined, home_joined, on ='가상카드번호', how='outer')
# Vectorized equality instead of a row-wise apply (same result, far cheaper on millions of
# rows); a missing district id on either side compares unequal and yields False.
home_school_joined_merge['is_hakgudo'] = (
    home_school_joined_merge['school_hakgudo_id']
    .eq(home_school_joined_merge['home_hakgudo_id'])
    .fillna(False)
    .astype(bool)
)
home_school_joined_merge.head()

In [None]:
# Attach the private-school flag to the district comparison.
final_df = pd.merge(home_school_joined_merge, school_private_mapped, on='가상카드번호', how='left')

# is_hakgudo_private is a plain boolean (False, not missing, when no buffer matched), so
# combine_first() never fell through to is_hakgudo and the result merely repeated the
# private flag. Combine explicitly instead: a private match gives True, otherwise the
# district comparison decides. Rows with no flag after the left merge count as not private.
# NOTE(review): confirm that private-school matches should count as True here.
private_flag = final_df['is_hakgudo_private'].eq(True)
final_df['final_is_hakgudo'] = private_flag | final_df['is_hakgudo'].astype(bool)
final_df.final_is_hakgudo.value_counts()

In [None]:
# Share (%) of the first hard-coded count in the total of both counts.
# NOTE(review): presumably copied from the final_is_hakgudo value_counts() output —
# confirm which count is True and which is False.
count_first, count_second = 1773300, 5187975
count_first/(count_first+count_second)*100

In [None]:
# TODO: tidy the result back into unique rows (presumably one per 가상카드번호 — confirm).
final_df