In [None]:
import os
import warnings

import duckdb
import geopandas as gpd
import matplotlib.pyplot as plt
import pandas as pd
# Silence every warning for the rest of the session.
# NOTE(review): a blanket 'ignore' filter would also hide any deprecation or
# chained-assignment warnings emitted by later cells.
warnings.filterwarnings(action='ignore')

# pandas: render floats with two decimal places.
pd.options.display.float_format = '{:.2f}'.format

# matplotlib: draw text with the NanumGothic font and disable the Unicode
# minus glyph for negative tick labels.
plt.rcParams.update({
    'font.family': 'NanumGothic',
    'axes.unicode_minus': False,
})

In [None]:
# Size of the DuckDB database file on disk, expressed in GiB.
# `os` is provided by the notebook's import cell.
# os.path.getsize raises if 'myanalysis.db' does not exist; this cell runs
# before the cell that connects to (and could create) the database.
size = os.path.getsize('myanalysis.db')
size / 1024 ** 3

In [None]:
# Open the on-disk DuckDB database in read-write mode and list its tables.
# NOTE(review): the connection is not closed in any cell visible here.
db = duckdb.connect('myanalysis.db', read_only=False)
db.execute('show tables').df()

In [None]:
# 1. Load the 2024-07-06 data as purpose (linked) trips

In [None]:
# Load day-1 purpose trips: one result row per (가상카드번호, 트랜잭션ID).
#
# The SQL below works in three steps:
#   1. transport_0706 reads the daily CSV with read_csv_auto, keeps the listed
#      columns and derives 교통수단구분 = 'T' when 교통수단코드 is between 200 and
#      299, otherwise 'B'.
#   2. temp_0706 groups by (가상카드번호, 트랜잭션ID) and takes MAX(환승건수),
#      SUM(이용거리) and SUM(탑승시간).
#   3. The final SELECT LEFT JOINs the leg with 환승건수 = 0 (t1, boarding side)
#      and the leg whose 환승건수 equals the group maximum (t2, alighting side).
# Because both joins are LEFT JOINs, a transaction without a matching leg is
# kept, with NULLs in the corresponding boarding/alighting columns.
# NOTE(review): if a transaction holds more than one row with the same 환승건수,
# the joins would return several rows for it — confirm uniqueness upstream.
query = '''
WITH transport_0706 AS(
    SELECT
        운행일자,
        가상카드번호,
        정산지역코드,
        승차일시,
        정산사승차정류장ID,
        하차일시,
        정산사하차정류장ID,
        트랜잭션ID,
        환승건수,
        이용자수,
        이용거리,
        탑승시간,
        CASE WHEN 교통수단코드 BETWEEN 200 AND 299 THEN 'T' ELSE 'B' END AS 교통수단구분

    FROM read_csv_auto('import_data/TB_KTS_DWTCD_METROPOLITAN/202407/TB_KTS_DWTCD_METROPOLITAN_20240706.csv')
),
temp_0706 AS(
SELECT
        가상카드번호,
        트랜잭션ID,
        MAX(환승건수) AS 최대환승수,
        SUM(이용거리) AS 총이동거리,
        SUM(탑승시간) AS 총탑승시간
    FROM transport_0706
    GROUP BY 가상카드번호, 트랜잭션ID
)
SELECT
        t.가상카드번호,
        t.트랜잭션ID,
        t1.정산사승차정류장ID AS 승차정류장ID,
        t2.정산사하차정류장ID AS 하차정류장ID,
        t1.승차일시 AS 승차일시,
        t2.하차일시 AS 하차일시,
        t1.정산지역코드 AS 승차지역코드,
        t2.정산지역코드 AS 하차지역코드,
        t1.교통수단구분 AS 승차교통수단구분,
        t2.교통수단구분 AS 하차교통수단구분,
        t.총이동거리,
        t.총탑승시간

FROM temp_0706 as t
    -- t1: 첫 통행
    LEFT JOIN transport_0706 t1
        ON t.가상카드번호 = t1.가상카드번호
        AND t.트랜잭션ID = t1.트랜잭션ID
        AND t1.환승건수 = 0
    -- t2: 마지막 통행
    LEFT JOIN transport_0706 t2
        ON t.가상카드번호 = t2.가상카드번호
        AND t.트랜잭션ID = t2.트랜잭션ID
        AND t.최대환승수 = t2.환승건수

'''
# Run the query and materialise the result as a pandas DataFrame.
df = db.execute(query).df()

In [None]:
# Row counts recorded from an earlier run:
# raw CSV 13267940 -> purpose-trip table 10557556 (79.6%).
# The raw count came from this one-off query (kept disabled for reference):
# print(db.execute('''select count(*)
# from read_csv_auto('import_data/TB_KTS_DWTCD_METROPOLITAN/202407/TB_KTS_DWTCD_METROPOLITAN_20240706.csv')
# ''').df())

print(df.shape[0])

# Share of missing values per column, noted from an earlier run.
# The bare string literal below is evaluated and discarded; it only serves as
# an in-cell note.
'''
승차정류장ID 0.15%
하차정류장ID 2.23%
승차일시 0.15%
하차일시 2.23%
승차지역코드 0.15%
승차교통수단구분 0.15%
총이동거리 2.53%
'''

# Missing-value count for every column of the purpose-trip table.
df.isna().sum()

In [None]:
# 2. Extract the "inverse" (인버스) table

In [None]:
# Load the raw daily transaction CSV into a pandas DataFrame.
# NOTE(review): `transport_df` is not referenced by any other cell visible in
# this notebook, and the same file is already read through DuckDB in the
# purpose-trip query — confirm it is still needed before paying the load cost.
transport_df = pd.read_csv('import_data/TB_KTS_DWTCD_METROPOLITAN/202407/TB_KTS_DWTCD_METROPOLITAN_20240706.csv')

In [None]:
# Parse the boarding and alighting timestamps using the compact
# year-month-day-hour-minute-second layout.
df['승차일시'] = pd.to_datetime(df['승차일시'], format='%Y%m%d%H%M%S')
df['하차일시'] = pd.to_datetime(df['하차일시'], format='%Y%m%d%H%M%S')

# Store both station IDs with the nullable integer dtype.
df['승차정류장ID'] = df['승차정류장ID'].astype('Int64')
df['하차정류장ID'] = df['하차정류장ID'].astype('Int64')
df.head(5)

In [None]:
# Order trips by card and boarding time so that "previous trip" is well defined.
# The counts in the trailing comments were recorded from an earlier run.
df = df.sort_values(by=['가상카드번호', '승차일시'])
print(df.shape[0])  # 10557556 rows
# Drop trips that lack a boarding or an alighting timestamp.
df = df.dropna(subset=['승차일시', '하차일시'])
print(df.shape[0])  # 10306857 rows

# Data-quality filter: discard trips whose boarding time is later than their
# alighting time.
df = df.loc[~df['승차일시'].gt(df['하차일시'])]
print(df.shape[0])  # 10306385 rows (-472)

# For every trip, pull in the alighting details of the same card's previous
# trip; the first trip of each card has no predecessor and receives missing
# values.
duration_df = df.copy()
trips_by_card = duration_df.groupby('가상카드번호')
duration_df['이전하차일시'] = trips_by_card['하차일시'].shift(1)
duration_df['이전하차정류장ID'] = trips_by_card['하차정류장ID'].shift(1)
duration_df['이전하차지역코드'] = trips_by_card['하차지역코드'].shift(1)
duration_df['이전하차교통수단구분'] = trips_by_card['하차교통수단구분'].shift(1)

# Stay time = gap between the previous alighting and this boarding.
duration_df['체류시간'] = duration_df['승차일시'].sub(duration_df['이전하차일시'])

# Rows without a previous trip have no stay time; remove them.
duration_df = duration_df.dropna(subset=['체류시간'])
print(duration_df.shape[0])

# Half of the gap is attributed to each of the two stations involved
# (previous alighting station and current boarding station).
duration_df['정류장별체류시간'] = duration_df['체류시간'].div(2)

# Same quantity expressed in minutes as a float.
duration_df['정류장별체류시간(분)'] = (
    pd.to_timedelta(duration_df['정류장별체류시간']).dt.total_seconds() / 60
)

duration_df.head(5)

In [None]:
# Long-format stay-time table: one row per (station, stay) observation.

# Boarding side of each stay, relabelled to the shared station schema.
duration_board = duration_df[
    ['승차정류장ID', '승차지역코드', '승차교통수단구분', '정류장별체류시간(분)']
].set_axis(['정류장ID', '지역코드', '교통수단구분', '정류장별체류시간(분)'], axis=1)

# Previous-alighting side of each stay, relabelled the same way.
duration_alight = duration_df[
    ['이전하차정류장ID', '이전하차지역코드', '이전하차교통수단구분', '정류장별체류시간(분)']
].set_axis(['정류장ID', '지역코드', '교통수단구분', '정류장별체류시간(분)'], axis=1)

# Stack both sides and tag every row with a head count of 1 so that a later
# sum yields the number of stays per station.
duration_total = pd.concat([duration_board, duration_alight]).assign(
    **{'정류장별체류인원': 1}
)
duration_total.head(5)

In [None]:
# Ad-hoc arithmetic: difference between two hard-coded figures.
# NOTE(review): presumably row counts copied from earlier outputs; their
# origin is not shown in the notebook — confirm or derive them from the frames.
(9483230-9115376)

In [None]:
# Count rows with a negative per-station stay time.
print(len(duration_total.loc[lambda d: d['정류장별체류시간(분)'].lt(0)]))  # 24442 (0.3%) in an earlier run
print(duration_total.shape[0])

# Keep only rows whose per-station stay time is strictly greater than 5 minutes.
# NOTE(review): the original remark read "5 minutes or more", but the comparison
# is strict, and the column holds half of the full gap between trips — confirm
# the intended threshold.
duration_positive = duration_total.loc[lambda d: d['정류장별체류시간(분)'].gt(5)]

print(duration_positive.shape[0])

In [None]:
# Summary statistics of the filtered per-station stay time, followed by a
# box plot of the same column.
print(duration_positive.loc[:, '정류장별체류시간(분)'].describe())
duration_positive.loc[:, '정류장별체류시간(분)'].plot(kind='box')

In [None]:
# Aggregate the filtered stays per station (ID, region code, mode flag):
# number of stays plus total / mean / median stay time in minutes, with the
# three time measures rounded to two decimals.
# NOTE(review): this rebinds `duration_df`, replacing the trip-level frame of
# the same name built earlier; re-running the earlier cells that read the
# trip-level columns after this one would fail.
duration_df = (
    duration_positive
    .groupby(['정류장ID', '지역코드', '교통수단구분'])
    .agg(
        duration_count=('정류장별체류인원', 'sum'),
        duration_total_min=('정류장별체류시간(분)', 'sum'),
        duration_mean_min=('정류장별체류시간(분)', 'mean'),
        duration_median_min=('정류장별체류시간(분)', 'median'),
    )
    .round({
        'duration_total_min': 2,
        'duration_mean_min': 2,
        'duration_median_min': 2,
    })
    .reset_index()
)
duration_df

In [None]:
# 3. Attach station information

In [None]:
# Cast the station ID to the nullable integer dtype.
duration_df['정류장ID'] = duration_df['정류장ID'].astype('Int64')

In [None]:
# Ad-hoc percentage: 48894 out of 50522.
# NOTE(review): presumably the share of stations matched by the station merge
# in the following cell; the figures are hard-coded — confirm against the
# merge indicator counts.
48894/50522*100

In [None]:
# Station master data for the same day.
station = pd.read_csv('import_data/TB_KTS_STTN/202407/TB_KTS_STTN_20240706.csv')

# Left-join station attributes onto the per-station stay statistics, keeping
# every aggregated station even when no master record matches, then discard
# three station columns that are not needed downstream.
duration_station_df = pd.merge(
    duration_df,
    station,
    how='left',
    on=['정류장ID', '지역코드', '교통수단구분'],
    indicator=True,
).drop(columns=['운행일자', '정산사코드', '정류장ARS번호'])

# Report how many rows matched ('both') versus stayed unmatched ('left_only'),
# then remove the helper indicator column.
print(duration_station_df['_merge'].value_counts())
duration_station_df = duration_station_df.drop(columns=['_merge'])
duration_station_df

In [None]:
# Reproject the station coordinates to EPSG:5179 and export the table as CSV.

# Build point geometries from the station GPS X / GPS Y columns, declared as
# EPSG:4326.
# NOTE(review): rows whose coordinates are missing (e.g. stations left
# unmatched by an upstream left join) go through the reprojection as well —
# verify what x_5179 / y_5179 contain for them.
duration_station_gdf = gpd.GeoDataFrame(
    duration_station_df,
    geometry=gpd.points_from_xy(duration_station_df['정류장GPSX좌표'],
                                duration_station_df['정류장GPSY좌표']),
    crs='EPSG:4326',
)
duration_station_gdf = duration_station_gdf.to_crs('EPSG:5179')

# Keep the projected coordinates as plain numeric columns and drop the
# geometry column so the table can be written as a flat CSV.
duration_station_gdf['x_5179'] = duration_station_gdf.geometry.x
duration_station_gdf['y_5179'] = duration_station_gdf.geometry.y
duration_station_gdf.drop(columns='geometry', inplace=True)

# Export to CSV. to_csv does not create missing parent directories, so make
# sure the output folder exists first (`os` comes from the import cell).
# The row index is written as the first column (pandas default).
os.makedirs('_output', exist_ok=True)
duration_station_gdf.to_csv('_output/240706_체류시간_위경도포함.csv')
# Disabled variant that omits the raw GPS columns:
# duration_station_df.drop(columns = ['정류장GPSY좌표', '정류장GPSX좌표']).to_csv('_output/240706_체류시간_위경도제거.csv')

In [None]:
# Display the exported table (per-station stay statistics with the original
# and EPSG:5179 coordinates).
duration_station_gdf

In [None]:
# Summary statistics for the numeric columns of the exported table.
duration_station_gdf.describe()