In [7]:
import warnings
warnings.filterwarnings(action='ignore') 

import datetime
import pandas as pd
import numpy as np
import csv
import folium
from folium.plugins import HeatMap 
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler 
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.cluster import KMeans    ##  K-means 임포트
from sklearn.metrics import silhouette_score
import scipy as sp
from scipy.cluster.hierarchy import dendrogram, linkage
import statsmodels.formula.api as smf
from dateutil.relativedelta import relativedelta
# import setuptools.dist
from yellowbrick.cluster import KElbowVisualizer
from mpl_toolkits.mplot3d import Axes3D
plt.rc('font',family='D2CodingLigature Nerd Font')
# plt.rcParams['axes.unicode_minus']=False  # '- 표시

## 데이터 불러오기

In [8]:
# 승하차 인원 CSV 파일 읽어오기
data = pd.read_csv("../../data/seoul-metro-2021.logs.csv")

data

Unnamed: 0,timestamp,station_code,people_in,people_out
0,2021-01-01T05:00:00.000+09:00,150,86,85
1,2021-01-01T06:00:00.000+09:00,150,111,355
2,2021-01-01T07:00:00.000+09:00,150,157,438
3,2021-01-01T08:00:00.000+09:00,150,306,592
4,2021-01-01T09:00:00.000+09:00,150,333,841
...,...,...,...,...
1941548,2021-12-31T19:00:00.000+09:00,2828,74,263
1941549,2021-12-31T20:00:00.000+09:00,2828,73,145
1941550,2021-12-31T21:00:00.000+09:00,2828,95,209
1941551,2021-12-31T22:00:00.000+09:00,2828,54,138


In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1941553 entries, 0 to 1941552
Data columns (total 4 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   timestamp     object
 1   station_code  int64 
 2   people_in     int64 
 3   people_out    int64 
dtypes: int64(3), object(1)
memory usage: 59.3+ MB


In [10]:
# 지하철 역 정보 CSV 파일 읽어오기
station_info = pd.read_csv("../../data/seoul-metro-station-info.csv")
station_info.head(2)

Unnamed: 0,station.code,station.fr_code,line.num,line.name,line.name_sub,line.station_seq,station.name_full,station.name,station.name_chc,station.name_chn,station.name_en,station.name_jp,geo.latitude,geo.longitude,geo.sigungu_code,geo.sigungu_name,geo.addres_road,geo.address_land,geo.phone
0,158,124,1,1호선,지하철1호선,1,청량리(서울시립대입구),청량리|서울시립대입구,祭基洞,祭基洞,Jegidong,チェギドン,37.580178,127.046835,11060,동대문구,서울특별시 동대문구 왕산로 지하205(전농동),서울특별시 동대문구 전농동 620-69 청량리역(1호선),02-6110-1241
1,157,125,1,1호선,지하철1호선,2,제기동,제기동,新設洞,新设洞,Sinseoldong,シンソルトン,37.578103,127.034893,11060,동대문구,서울특별시 동대문구 왕산로 지하93(제기동),서울특별시 동대문구 제기동 65 제기동역(1호선),02-6110-1251


In [11]:
station_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 285 entries, 0 to 284
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   station.code       285 non-null    int64  
 1   station.fr_code    285 non-null    object 
 2   line.num           285 non-null    int64  
 3   line.name          285 non-null    object 
 4   line.name_sub      285 non-null    object 
 5   line.station_seq   285 non-null    int64  
 6   station.name_full  285 non-null    object 
 7   station.name       285 non-null    object 
 8   station.name_chc   285 non-null    object 
 9   station.name_chn   285 non-null    object 
 10  station.name_en    285 non-null    object 
 11  station.name_jp    285 non-null    object 
 12  geo.latitude       285 non-null    float64
 13  geo.longitude      285 non-null    float64
 14  geo.sigungu_code   285 non-null    int64  
 15  geo.sigungu_name   285 non-null    object 
 16  geo.addres_road    285 non

In [12]:
# 승하차 인원을 역코드별 그룹화하여 승하차 인원 수 각각 합
station_sum = data.groupby("station_code").sum()
station_sum

Unnamed: 0_level_0,timestamp,people_in,people_out
station_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
150,2021-01-01T05:00:00.000+09:002021-01-01T06:00:...,12697273,12109991
151,2021-01-01T05:00:00.000+09:002021-01-01T06:00:...,5997344,6030491
152,2021-01-01T05:00:00.000+09:002021-01-01T06:00:...,9638952,9284693
153,2021-01-01T05:00:00.000+09:002021-01-01T06:00:...,7704599,7090896
154,2021-01-01T05:00:00.000+09:002021-01-01T06:00:...,6651283,6609055
...,...,...,...
2824,2021-01-01T05:00:00.000+09:002021-01-01T06:00:...,3206211,3097722
2825,2021-01-01T05:00:00.000+09:002021-01-01T06:00:...,1395919,1495603
2826,2021-01-01T05:00:00.000+09:002021-01-01T06:00:...,1614590,1475010
2827,2021-01-01T05:00:00.000+09:002021-01-01T06:00:...,1444073,1147602


In [13]:
station_info = station_info[['station.code','station.name','geo.latitude', 'geo.longitude']]
station_info

Unnamed: 0,station.code,station.name,geo.latitude,geo.longitude
0,158,청량리|서울시립대입구,37.580178,127.046835
1,157,제기동,37.578103,127.034893
2,156,신설동,37.575297,127.025087
3,159,동묘앞,37.572627,127.016429
4,155,동대문,37.571420,127.009745
...,...,...,...,...
280,2823,남한산성입구|성남법원|검찰청,37.451535,127.159816
281,2824,단대오거리,37.445210,127.156866
282,2825,신흥,37.440918,127.147564
283,2826,수진,37.437428,127.140722


In [14]:
# 인덱스 변경
station_info = station_info.set_index('station.code')

# 데이터 확인하기
station_info

Unnamed: 0_level_0,station.name,geo.latitude,geo.longitude
station.code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
158,청량리|서울시립대입구,37.580178,127.046835
157,제기동,37.578103,127.034893
156,신설동,37.575297,127.025087
159,동묘앞,37.572627,127.016429
155,동대문,37.571420,127.009745
...,...,...,...
2823,남한산성입구|성남법원|검찰청,37.451535,127.159816
2824,단대오거리,37.445210,127.156866
2825,신흥,37.440918,127.147564
2826,수진,37.437428,127.140722


In [15]:
# 데이터 합치기
joined_data = station_sum.join(station_info)

# 데이터 확인하기
joined_data

## how = 'left'가 기본, right, outer(양쪽 인덱스 모두 _합집합), inner(양쪽에 공통으로 들어있는 인덱스_교집합)

Unnamed: 0_level_0,timestamp,people_in,people_out,station.name,geo.latitude,geo.longitude
station_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
150,2021-01-01T05:00:00.000+09:002021-01-01T06:00:...,12697273,12109991,서울역,37.554648,126.972559
151,2021-01-01T05:00:00.000+09:002021-01-01T06:00:...,5997344,6030491,시청,37.564718,126.977108
152,2021-01-01T05:00:00.000+09:002021-01-01T06:00:...,9638952,9284693,종각,37.570161,126.982923
153,2021-01-01T05:00:00.000+09:002021-01-01T06:00:...,7704599,7090896,종로3가|탑골공원,37.571607,126.991806
154,2021-01-01T05:00:00.000+09:002021-01-01T06:00:...,6651283,6609055,종로5가,37.570926,127.001849
...,...,...,...,...,...,...
2824,2021-01-01T05:00:00.000+09:002021-01-01T06:00:...,3206211,3097722,단대오거리,37.445210,127.156866
2825,2021-01-01T05:00:00.000+09:002021-01-01T06:00:...,1395919,1495603,신흥,37.440918,127.147564
2826,2021-01-01T05:00:00.000+09:002021-01-01T06:00:...,1614590,1475010,수진,37.437428,127.140722
2827,2021-01-01T05:00:00.000+09:002021-01-01T06:00:...,1444073,1147602,모란,37.432130,127.129087


In [16]:
# 기준 지도
seoul_in = folium.Map(location=[37.56678, 126.9782], zoom_start = 12)   # 서울시청
seoul_in

In [17]:
HeatMap(data = joined_data[['geo.latitude', 'geo.longitude', 'people_in']]).add_to(seoul_in)
seoul_in

In [18]:
## 하차인원 지도
seoul_out = folium.Map(location=[37.56678, 126.9782], zoom_start = 12)

# 히트맵 지도에 추가하기
HeatMap(data = joined_data[['geo.latitude', 'geo.longitude', 'people_out']]).add_to(seoul_out)

# 역 이름을 마커로 추가
for idx, row in joined_data.iterrows():
    folium.Marker(
        location=[row['geo.latitude'], row['geo.longitude']],
        popup=f"{idx}: {row['people_out']}명",  # 팝업으로 역 코드와 하차 인원 표시
        # icon=folium.Icon(color='red', icon='info', prefix='fa'),
        tooltip=row['station.name'],  # 툴팁으로 역 이름만 표시
     ).add_to(seoul_out)
    
seoul_out

## 시간대를 고려하여 데이터 가공 후 지도 표시

In [19]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1941553 entries, 0 to 1941552
Data columns (total 4 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   timestamp     object
 1   station_code  int64 
 2   people_in     int64 
 3   people_out    int64 
dtypes: int64(3), object(1)
memory usage: 59.3+ MB


In [20]:
data.head(3)

Unnamed: 0,timestamp,station_code,people_in,people_out
0,2021-01-01T05:00:00.000+09:00,150,86,85
1,2021-01-01T06:00:00.000+09:00,150,111,355
2,2021-01-01T07:00:00.000+09:00,150,157,438


In [21]:
# 출근 시간대 데이터 추출
morning_data = data[pd.to_datetime(data.timestamp).dt.hour < 9]
morning_data

Unnamed: 0,timestamp,station_code,people_in,people_out
0,2021-01-01T05:00:00.000+09:00,150,86,85
1,2021-01-01T06:00:00.000+09:00,150,111,355
2,2021-01-01T07:00:00.000+09:00,150,157,438
3,2021-01-01T08:00:00.000+09:00,150,306,592
19,2021-01-01T05:00:00.000+09:00,151,43,40
...,...,...,...,...
1941518,2021-12-31T08:00:00.000+09:00,2827,391,311
1941534,2021-12-31T05:00:00.000+09:00,2828,20,10
1941535,2021-12-31T06:00:00.000+09:00,2828,83,41
1941536,2021-12-31T07:00:00.000+09:00,2828,279,119


In [22]:
# 출근시간대 데이터를 역 기준으로 그룹화하여 sum
morning_station_sum = morning_data.groupby("station_code").sum()
morning_station_sum

Unnamed: 0_level_0,timestamp,people_in,people_out
station_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
150,2021-01-01T05:00:00.000+09:002021-01-01T06:00:...,1280861,3455240
151,2021-01-01T05:00:00.000+09:002021-01-01T06:00:...,215072,2744871
152,2021-01-01T05:00:00.000+09:002021-01-01T06:00:...,221218,3643820
153,2021-01-01T05:00:00.000+09:002021-01-01T06:00:...,218777,1222960
154,2021-01-01T05:00:00.000+09:002021-01-01T06:00:...,185510,1776080
...,...,...,...
2824,2021-01-01T05:00:00.000+09:002021-01-01T06:00:...,1132603,457685
2825,2021-01-01T05:00:00.000+09:002021-01-01T06:00:...,414540,164925
2826,2021-01-01T05:00:00.000+09:002021-01-01T06:00:...,544243,174746
2827,2021-01-01T05:00:00.000+09:002021-01-01T06:00:...,282770,180392


In [23]:
# 역 정보를 출근정보와 합치기
morning_joined_data = morning_station_sum.join(station_info)
morning_joined_data

Unnamed: 0_level_0,timestamp,people_in,people_out,station.name,geo.latitude,geo.longitude
station_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
150,2021-01-01T05:00:00.000+09:002021-01-01T06:00:...,1280861,3455240,서울역,37.554648,126.972559
151,2021-01-01T05:00:00.000+09:002021-01-01T06:00:...,215072,2744871,시청,37.564718,126.977108
152,2021-01-01T05:00:00.000+09:002021-01-01T06:00:...,221218,3643820,종각,37.570161,126.982923
153,2021-01-01T05:00:00.000+09:002021-01-01T06:00:...,218777,1222960,종로3가|탑골공원,37.571607,126.991806
154,2021-01-01T05:00:00.000+09:002021-01-01T06:00:...,185510,1776080,종로5가,37.570926,127.001849
...,...,...,...,...,...,...
2824,2021-01-01T05:00:00.000+09:002021-01-01T06:00:...,1132603,457685,단대오거리,37.445210,127.156866
2825,2021-01-01T05:00:00.000+09:002021-01-01T06:00:...,414540,164925,신흥,37.440918,127.147564
2826,2021-01-01T05:00:00.000+09:002021-01-01T06:00:...,544243,174746,수진,37.437428,127.140722
2827,2021-01-01T05:00:00.000+09:002021-01-01T06:00:...,282770,180392,모란,37.432130,127.129087


In [24]:
# 출근 시간대 승차 인원
morning_seoul_in = folium.Map(location=[37.55, 126.98], zoom_start = 12)
HeatMap(data = morning_joined_data[['geo.latitude', 'geo.longitude', 'people_in']]).add_to(morning_seoul_in)
morning_seoul_in

In [25]:
# 출근 시간대 하차 인원
morning_seoul_out = folium.Map(location=[37.55, 126.98], zoom_start = 12)
HeatMap(data = morning_joined_data[['geo.latitude', 'geo.longitude', 'people_out']]).add_to(morning_seoul_out)
morning_seoul_out

In [26]:
# 퇴근 시간대 데이터 추출
evening_data = data[pd.to_datetime(data['timestamp']).dt.hour >= 17]
evening_data

Unnamed: 0,timestamp,station_code,people_in,people_out
12,2021-01-01T17:00:00.000+09:00,150,839,626
13,2021-01-01T18:00:00.000+09:00,150,658,437
14,2021-01-01T19:00:00.000+09:00,150,579,425
15,2021-01-01T20:00:00.000+09:00,150,479,354
16,2021-01-01T21:00:00.000+09:00,150,510,307
...,...,...,...,...
1941548,2021-12-31T19:00:00.000+09:00,2828,74,263
1941549,2021-12-31T20:00:00.000+09:00,2828,73,145
1941550,2021-12-31T21:00:00.000+09:00,2828,95,209
1941551,2021-12-31T22:00:00.000+09:00,2828,54,138


In [27]:
# 퇴근시간대 데이터를 역 기준으로 그룹화하여 sum
evening_station_sum = evening_data.groupby("station_code").sum()
evening_station_sum

Unnamed: 0_level_0,timestamp,people_in,people_out
station_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
150,2021-01-01T17:00:00.000+09:002021-01-01T18:00:...,6304530,3051702
151,2021-01-01T17:00:00.000+09:002021-01-01T18:00:...,3968167,610346
152,2021-01-01T17:00:00.000+09:002021-01-01T18:00:...,6460197,1175399
153,2021-01-01T17:00:00.000+09:002021-01-01T18:00:...,3964421,1082916
154,2021-01-01T17:00:00.000+09:002021-01-01T18:00:...,3254959,804216
...,...,...,...
2824,2021-01-01T17:00:00.000+09:002021-01-01T18:00:...,747595,1623770
2825,2021-01-01T17:00:00.000+09:002021-01-01T18:00:...,339222,736146
2826,2021-01-01T17:00:00.000+09:002021-01-01T18:00:...,334808,720846
2827,2021-01-01T17:00:00.000+09:002021-01-01T18:00:...,435555,396286


In [28]:
# 역 정보를 퇴근정보와 합치기
evening_joined_data = evening_station_sum.join(station_info)
evening_joined_data

Unnamed: 0_level_0,timestamp,people_in,people_out,station.name,geo.latitude,geo.longitude
station_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
150,2021-01-01T17:00:00.000+09:002021-01-01T18:00:...,6304530,3051702,서울역,37.554648,126.972559
151,2021-01-01T17:00:00.000+09:002021-01-01T18:00:...,3968167,610346,시청,37.564718,126.977108
152,2021-01-01T17:00:00.000+09:002021-01-01T18:00:...,6460197,1175399,종각,37.570161,126.982923
153,2021-01-01T17:00:00.000+09:002021-01-01T18:00:...,3964421,1082916,종로3가|탑골공원,37.571607,126.991806
154,2021-01-01T17:00:00.000+09:002021-01-01T18:00:...,3254959,804216,종로5가,37.570926,127.001849
...,...,...,...,...,...,...
2824,2021-01-01T17:00:00.000+09:002021-01-01T18:00:...,747595,1623770,단대오거리,37.445210,127.156866
2825,2021-01-01T17:00:00.000+09:002021-01-01T18:00:...,339222,736146,신흥,37.440918,127.147564
2826,2021-01-01T17:00:00.000+09:002021-01-01T18:00:...,334808,720846,수진,37.437428,127.140722
2827,2021-01-01T17:00:00.000+09:002021-01-01T18:00:...,435555,396286,모란,37.432130,127.129087


In [30]:
# 퇴근 시간대 승차 인원
evening_seoul_in = folium.Map(location=[37.55, 126.98], zoom_start = 12)
HeatMap(data = evening_joined_data[['geo.latitude', 'geo.longitude', 'people_in']]).add_to(evening_seoul_in)
evening_seoul_in

In [32]:
# 퇴근 시간대 하차 인원
evening_seoul_out = folium.Map(location=[37.55, 126.98], zoom_start = 12)
HeatMap(data = evening_joined_data[['geo.latitude', 'geo.longitude', 'people_out']],
        gradient={0.2: 'blue', 0.4: 'lime', 0.6: 'yellow', 0.8: 'red'}
       ).add_to(evening_seoul_out)
evening_seoul_out