In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import json
import pathlib
import pickle
import os
current_path = os.getcwd()

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('float_format', '{:f}'.format)

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl

from datetime import datetime, timedelta
import statistics
import time

import re

import pymysql  
from haversine import haversine

from geoband.API import *
import geopandas as gpd
import folium
from folium.plugins import FastMarkerCluster, MarkerCluster
import geoplot as gplt
import geoplot.crs as gcrs
import imageio
import mapclassify as mc

import random
from functools import reduce
from collections import defaultdict

from IPython.display import display
from tqdm.notebook import tqdm
from tqdm import tqdm, tqdm_notebook

import sklearn.cluster
import tensorflow as tf
import pydeck as pdk
import cufflinks as cf 
cf.go_offline(connected=True)
cf.set_config_file(theme='polar')

import shapely
from shapely import wkt
from shapely.geometry import Polygon, Point, shape

# 최적화 solver
from mip import Model, xsum, maximize, BINARY  

# font 
import matplotlib.font_manager as font_manager
path = current_path+'/NanumBarunGothic.ttf'
fontprop = font_manager.FontProperties(fname=path)

font_dirs = [current_path, ]
font_files = font_manager.findSystemFonts(fontpaths=font_dirs)
font_list = font_manager.createFontList(font_files)
font_manager.fontManager.ttflist.extend(font_list)
plt.rcParams["font.family"] = 'NanumGothic'
mpl.rcParams['font.family'] = 'NanumBarunGothic'

Using Python-MIP package version 1.5.3


In [2]:
# Data Load 
df_01 = pd.read_csv(current_path+'/input/1.수원시_버스정류장.csv')
df_02 = pd.read_csv(current_path+'/input/2.수원시_버스정류장별_승하차이력(1).csv')
df_03 = pd.read_csv(current_path+'/input/3.수원시_버스정류장별_승하차이력(2).csv')
df_04 = pd.read_csv(current_path+'/input/4.수원시_버스정류장별_승하차이력(3).csv')
df_05 = pd.read_csv(current_path+'/input/5.수원시_버스정류장별_승하차이력(4).csv')
df_06 = pd.read_csv(current_path+'/input/6.수원시_버스정류장별_승하차이력(5).csv')
df_07 = pd.read_csv(current_path+'/input/7.수원시_버스정류장별_노선현황.csv')
df_08 = pd.read_csv(current_path+'/input/8.수원시_지하철역_위치정보.csv')
df_09 = pd.read_csv(current_path+'/input/9.수원시_지하철역별_이용현황(2017~2019).csv')
df_10 = pd.read_csv(current_path+'/input/10.수원시_옥외광고물현황.csv')
df_11 = pd.read_csv(current_path+'/input/11.수원시_대기오염도_측정현황.csv')
df_12 = pd.read_csv(current_path+'/input/12.수원시_주차장현황.csv')
df_13 = pd.read_csv(current_path+'/input/13.수원시_기상데이터(2020).csv')
df_14 = pd.read_csv(current_path+'/input/14.수원시_시간대별_유동인구(2020).csv')
df_15 = pd.read_csv(current_path+'/input/15.수원시_성연령별_유동인구(2020).csv')
df_16 = pd.read_csv(current_path+'/input/16.수원시_요일별_유동인구(2020).csv')
df_17 = gpd.read_file(current_path+'/input/17.수원시_인구정보(고령)_격자.geojson')
df_18 = gpd.read_file(current_path+'/input/18.수원시_인구정보(생산가능)_격자.geojson')
df_19 = gpd.read_file(current_path+'/input/19.수원시_인구정보(유소년)_격자.geojson')
df_20 = gpd.read_file(current_path+'/input/20.수원시_교통노드.geojson')
df_21 = gpd.read_file(current_path+'/input/21.수원시_교통링크.geojson')
df_22 = gpd.read_file(current_path+'/input/22.수원시_상세도로망_LV6.geojson')
df_23 = pd.read_csv(current_path+'/input/23.수원시_평일_일별_시간대별_추정교통량_LV6.csv')
df_24 = pd.read_csv(current_path+'/input/24.수원시_평일_일별_혼잡빈도강도_LV6.csv')
df_25 = pd.read_csv(current_path+'/input/25.수원시_평일_일별_혼잡시간강도_LV6.csv')
df_26 = gpd.read_file(current_path+'/input/26.수원시_인도(2017).geojson')
df_27 = gpd.read_file(current_path+'/input/27.수원시_도로명주소(건물).geojson')
df_28 = gpd.read_file(current_path+'/input/28.수원시_건물연면적_격자.geojson')
df_29 = gpd.read_file(current_path+'/input/29.수원시_법정경계(시군구).geojson')
df_30 = gpd.read_file(current_path+'/input/30.수원시_법정경계(읍면동).geojson')
df_31 = gpd.read_file(current_path+'/input/31.수원시_행정경계(읍면동).geojson')
df_32 = gpd.read_file(current_path+'/input/32.수원시_지적도.geojson')

In [3]:
# pydeck function 
def line_string_to_coordinates(line_string): 
    if isinstance(line_string, shapely.geometry.linestring.LineString): 
        lon, lat = line_string.xy 
        return [[x, y] for x, y in zip(lon, lat)] 
    elif isinstance(line_string, shapely.geometry.multilinestring.MultiLineString): 
        ret = [] 
        for i in range(len(line_string)): 
            lon, lat = line_string[i].xy 
            for x, y in zip(lon, lat): 
                ret.append([x, y])
        return ret 

def multipolygon_to_coordinates(x): 
    lon, lat = x[0].exterior.xy 
    return [[x, y] for x, y in zip(lon, lat)] 

def polygon_to_coordinates(x): 
    lon, lat = x.exterior.xy 
    return [[x, y] for x, y in zip(lon, lat)] 
 
def multipolygon_to_center_coordinates(x): 
    lon, lat = x[0].centroid.xy 
    return [[x, y] for x, y in zip(lon, lat)]

def polygon_to_center_coordinates(x): 
    lon, lat = x.centroid.xy
    return [[x, y] for x, y in zip(lon, lat)] 

In [4]:
token = "pk.eyJ1IjoiZGx3b3FsczQzMjMiLCJhIjoiY2tscnR3bG95MDJwaDJ2bjUzcTBrc3h4cyJ9.WigDFX0Gm612haaz4zQ2hg"

# 1. Bus Data  
* df_01 + df_07 
* df_02 ~ df_06 
---
* **경기 버스 정보** : http://www.gbis.go.kr/
* GGD_RouteInfo_M.xls : 버스 노선 정보 (기점, 종점 / 주중배차간격, 주말배차간격 / 첫차, 막차 시간) 
* GGD_RouteStationInfo_M.xls : 노선 경유 정보 (버스 노선 순서) 

## 1-1. tmp : df_01 + df_07 
* 버스 정보 합치기 
* 전처리 

In [5]:
# BIS = 1 & 인도 폭 0 or 4 이상인 정류장만 분석에 이용 
sidewalk = gpd.read_file(current_path+'/data/bus_filter_final.geojson')
sidewalk = sidewalk.drop(["layer", "path", "geometry"], axis=1)
sidewalk = sidewalk.drop_duplicates(["정류장ID"])

print(sidewalk.shape)
sidewalk.head()

(519, 14)


Unnamed: 0,정류장ID,정류장명,정류장유형,환승역타입,위치(동),쉘터,BIS설치여부,LED,LCD,LED+LCD복합형,알뜰형,임대형(음성),lon,lat
0,201000345,남수원초등학교,시내,일반,경기도 수원시 세류2동,,1,,,,1.0,,127.016,37.251091
1,201000268,곡반중학교,시내,일반,경기도 수원시 곡선동,1.0,1,1.0,,,,,127.034254,37.246081
2,201000357,수원은혜교회.한양수자인파크원아파트,시내,일반,경기도 수원시 금곡동,1.0,1,,1.0,,,,126.953978,37.266016
3,202000090,화서역,"시내,마을",일반,경기도 수원시 화서2동,1.0,1,,,1.0,,,126.990107,37.283895
4,202000092,월드메르디앙,시내,일반,경기도 수원시 우만2동,,1,,,1.0,,,127.037045,37.280164


In [6]:
tmp = sidewalk.merge(df_07[['정류소ID', '운행노선', '중앙차로여부']], how = 'left', left_on = '정류장ID', right_on = '정류소ID', copy = False)
del tmp['정류소ID']

## 1-2. tmp data preprocessing 

In [7]:
tmp.환승역타입 = '일반'
tmp.쉘터 = tmp.쉘터.fillna(0)

# LED / LCD / LED+LCD 
tmp.LED = tmp.LED.fillna(0)
tmp.LCD = tmp.LCD.fillna(0)
tmp['LED+LCD복합형'] = tmp['LED+LCD복합형'].fillna(0)

# 알뜰형 / 임대형(음성) 
tmp.알뜰형 = tmp.알뜰형.fillna(0)
tmp['임대형(음성)'] = tmp['임대형(음성)'].fillna(0)

# 운행노선 & 중앙차로여부 없는 데이터 drop 
tmp = tmp.dropna(axis=0)

## 1-3. df_02 ~ df_06 

In [8]:
df_02_06 = pd.concat([df_02, df_03, df_04, df_05, df_06], axis=0, ignore_index = True)
df_02_06.shape

(2508607, 12)

In [9]:
# 노선번호 object 형태로 맞춰주기 
tqdm.pandas()
df_02_06["노선번호"] = df_02_06["노선번호"].progress_apply(lambda x : str(x))

# '92' 처럼 용남고속 & 수원여객 동시운행 버스 통합  
df_02_06 = df_02_06.groupby(by=['일자', '정류소ID', '노선번호', '노선유형']).sum([['전체승차건수','초승건수','환승건수','전체하차건수','하차건수','미태그건수']]).reset_index()

100%|██████████| 2508607/2508607 [00:03<00:00, 721791.57it/s]


In [10]:
# 기존 버스 데이터에 합치기 
bus = df_02_06.merge(tmp, how='left', left_on = '정류소ID', right_on = '정류장ID', copy = True).drop(['정류장ID'], axis=1)

# BIS 미설치 정류장 & 인도 폭 기준 미달 정류장 제외 
bus = bus.dropna(axis=0)

In [11]:
print("df_01 unique 정류장 ID 개수 : ", df_01["정류장ID"].nunique())
print("df_02~06 unique 정류장 ID 개수 : ", df_02_06["정류소ID"].nunique())

print("=====" * 20)
print("합치기 전 unique 정류장 ID 개수 : ", tmp["정류장ID"].nunique())
print("합친 후의 unique 정류장 ID 개수 : ", bus["정류소ID"].nunique())

df_01 unique 정류장 ID 개수 :  1179
df_02~06 unique 정류장 ID 개수 :  1114
합치기 전 unique 정류장 ID 개수 :  518
합친 후의 unique 정류장 ID 개수 :  516


## 1-4. 외부데이터 

In [12]:
route = pd.read_excel(current_path+'/data/GGD_RouteInfo_M.xls')
trans = pd.read_excel(current_path+'/data/GGD_RouteStationInfo_M.xls', sheet_name=[0,1,2])

trans = pd.concat([trans[0], trans[1], trans[2]], axis=0, ignore_index = True)

### (1) 버스 노선 순서 데이터 

In [13]:
# 버스 노선 순서 데이터 
trans2 = trans.rename(columns = {"STATION_ID": "정류소ID", "ROUTE_ID": "노선ID", "ROUTE_NM": "노선번호", "STA_ORDER": "노선순서", "STATION_NM": "정류장명"})
trans2 = trans2.drop(["순번", "MOBILE_NO"], axis=1)

# 노선 번호 전처리 
trans2["노선번호"] = trans2["노선번호"].progress_apply(lambda x : str(x))
trans2["노선번호"] = trans2["노선번호"].progress_apply(lambda x : x.replace("-", "_"))

100%|██████████| 192708/192708 [00:00<00:00, 633928.98it/s]
100%|██████████| 192708/192708 [00:00<00:00, 584172.69it/s]


In [14]:
# Data Merge 
tmp2 = bus.merge(trans2.drop(["정류장명"], axis=1), how='left', on=['정류소ID', '노선번호'])
print("bus+trans shape : ", tmp2.shape , '\n') 
print("원래 데이터의 unique bus 개수 : ", bus.정류소ID.nunique())
print("합친 데이터의 unique bus 개수 : ", tmp2.정류소ID.nunique())
print("=====" * 20)
tmp2.head(2)

bus+trans shape :  (1364613, 27) 

원래 데이터의 unique bus 개수 :  516
합친 데이터의 unique bus 개수 :  516


Unnamed: 0,일자,정류소ID,노선번호,노선유형,전체 승차 건수,초승 건수,환승 건수,전체 하차 건수,하차 건수,미태그 건수,정류장명,정류장유형,환승역타입,위치(동),쉘터,BIS설치여부,LED,LCD,LED+LCD복합형,알뜰형,임대형(음성),lon,lat,운행노선,중앙차로여부,노선ID,노선순서
0,20200101,200000006,13,일반형시내버스,166,160,6,67,67,0,광교공원.경기대수원캠퍼스입구.연무시장,시내,일반,경기도 수원시 연무동,0.0,1.0,0.0,0.0,1.0,0.0,0.0,127.029464,37.300014,"13(수원여객),16(수원여객),16-1(수원여객),16-2(수원여객),32-5(수...",노변정류장,200000037.0,13.0
1,20200101,200000006,37,일반형시내버스,36,33,3,0,0,0,광교공원.경기대수원캠퍼스입구.연무시장,시내,일반,경기도 수원시 연무동,0.0,1.0,0.0,0.0,1.0,0.0,0.0,127.029464,37.300014,"13(수원여객),16(수원여객),16-1(수원여객),16-2(수원여객),32-5(수...",노변정류장,200000099.0,1.0


### (2) 버스 노선 정보 데이터 

In [15]:
route["노선번호"] = route["노선번호"].progress_apply(lambda x : str(x))
route["노선번호"] = route["노선번호"].progress_apply(lambda x : x.replace("-", "_"))

route = route[route["하행첫차"] != '1899/1']
print(route.shape)

100%|██████████| 3480/3480 [00:00<00:00, 564299.77it/s]
100%|██████████| 3480/3480 [00:00<00:00, 596663.45it/s]

(3479, 15)





In [16]:
# 상행첫차, 상행막차, 하행첫차, 하행막차 type 바꿔주기 
route['상행첫차'] = route['상행첫차'].progress_apply(lambda x: pd.to_datetime(str(x), format='%H:%M'))
route['상행막차'] = route['상행막차'].progress_apply(lambda x: pd.to_datetime(str(x), format='%H:%M'))
route['하행첫차'] = route['하행첫차'].progress_apply(lambda x: pd.to_datetime(str(x), format='%H:%M'))
route['하행막차'] = route['하행막차'].progress_apply(lambda x: pd.to_datetime(str(x), format='%H:%M'))

100%|██████████| 3479/3479 [00:00<00:00, 4738.72it/s]
100%|██████████| 3479/3479 [00:00<00:00, 5182.45it/s]
100%|██████████| 3479/3479 [00:00<00:00, 5260.64it/s]
100%|██████████| 3479/3479 [00:00<00:00, 5236.43it/s]


In [17]:
# 운행시간 구하기 
def operation_time(first, last) : 
    if last >= first  : 
        return last - first 
    else : 
        last += timedelta(days=1)
        return last - first 

In [18]:
route["상행운행시간"] = route.progress_apply(lambda x : operation_time(x["상행첫차"], x["상행막차"]), axis=1)
route["하행운행시간"] = route.progress_apply(lambda x : operation_time(x["하행첫차"], x["하행막차"]), axis=1)

100%|██████████| 3479/3479 [00:00<00:00, 13496.92it/s]
100%|██████████| 3479/3479 [00:00<00:00, 12438.86it/s]


In [147]:
# Data Merge 
tmp3 = pd.merge(left=tmp2, right=route.drop(["순번"], axis=1).drop_duplicates('노선ID'), how='left', \
                on=['노선ID','노선번호'])

print("bus+trans+route shape : ", tmp3.shape , '\n') 
print("bus 데이터의 unique bus 개수 : ", bus.정류소ID.nunique())
print("tmp2 데이터의 unique bus 개수 : ", tmp2.정류소ID.nunique())
print("합친 데이터의 unique bus 개수 : ", tmp3.정류소ID.nunique())
print("=====" * 20)
tmp3.head()

bus+trans+route shape :  (1364613, 41) 

bus 데이터의 unique bus 개수 :  516
tmp2 데이터의 unique bus 개수 :  516
합친 데이터의 unique bus 개수 :  516


Unnamed: 0,일자,정류소ID,노선번호,노선유형,전체 승차 건수,초승 건수,환승 건수,전체 하차 건수,하차 건수,미태그 건수,정류장명,정류장유형,환승역타입,위치(동),쉘터,BIS설치여부,LED,LCD,LED+LCD복합형,알뜰형,임대형(음성),lon,lat,운행노선,중앙차로여부,노선ID,노선순서,관할관청,운행업체,기점,기점_STATION_ID,종점,종점_STATION_ID,주중배차간격,주말배차간격,상행첫차,상행막차,하행첫차,하행막차,상행운행시간,하행운행시간
0,20200101,200000006,13,일반형시내버스,166,160,6,67,67,0,광교공원.경기대수원캠퍼스입구.연무시장,시내,일반,경기도 수원시 연무동,0.0,1.0,0.0,0.0,1.0,0.0,0.0,127.029464,37.300014,"13(수원여객),16(수원여객),16-1(수원여객),16-2(수원여객),32-5(수...",노변정류장,200000037.0,13.0,수원시,수원여객,상광교동종점,200000275.0,칠보고등학교.칠보중학교.극동스타클래스,201000099.0,5분~7분,5분~7분,1900-01-01 06:00:00,1900-01-01 22:00:00,1900-01-01 05:40:00,1900-01-01 23:10:00,0 days 16:00:00,0 days 17:30:00
1,20200101,200000006,37,일반형시내버스,36,33,3,0,0,0,광교공원.경기대수원캠퍼스입구.연무시장,시내,일반,경기도 수원시 연무동,0.0,1.0,0.0,0.0,1.0,0.0,0.0,127.029464,37.300014,"13(수원여객),16(수원여객),16-1(수원여객),16-2(수원여객),32-5(수...",노변정류장,200000099.0,1.0,수원시,수원여객,광교공원.경기대수원캠퍼스입구.연무시장,200000006.0,한국민속촌.보라해링턴,228001646.0,10분~13분,15분~20분,1900-01-01 04:50:00,1900-01-01 21:40:00,1900-01-01 06:00:00,1900-01-01 23:10:00,0 days 16:50:00,0 days 17:10:00
2,20200101,200000006,7_1,일반형시내버스,187,162,25,190,190,0,광교공원.경기대수원캠퍼스입구.연무시장,시내,일반,경기도 수원시 연무동,0.0,1.0,0.0,0.0,1.0,0.0,0.0,127.029464,37.300014,"13(수원여객),16(수원여객),16-1(수원여객),16-2(수원여객),32-5(수...",노변정류장,200000045.0,55.0,수원시,수원여객,동탄차고지,233001222.0,광교공원.경기대수원캠퍼스입구.연무시장,200000006.0,7분~9분,9분~12분,1900-01-01 05:00:00,1900-01-01 22:40:00,1900-01-01 06:00:00,1900-01-01 23:45:00,0 days 17:40:00,0 days 17:45:00
3,20200101,200000008,13,일반형시내버스,127,127,0,46,46,0,문암골,시내,일반,경기도 수원시 연무동,0.0,1.0,0.0,0.0,1.0,0.0,0.0,127.0277,37.308946,13(수원여객),노변정류장,200000037.0,11.0,수원시,수원여객,상광교동종점,200000275.0,칠보고등학교.칠보중학교.극동스타클래스,201000099.0,5분~7분,5분~7분,1900-01-01 06:00:00,1900-01-01 22:00:00,1900-01-01 05:40:00,1900-01-01 23:10:00,0 days 16:00:00,0 days 17:30:00
4,20200101,200000036,112,일반형시내버스,36,34,2,76,76,0,풍림아파트입구,"시내,마을",일반,경기도 수원시 정자3동,1.0,1.0,0.0,0.0,1.0,0.0,0.0,126.995257,37.297929,112(수원여객),노변정류장,200000049.0,29.0,수원시,수원여객,곡반정동차고지,201000143.0,웅비아파트,201000330.0,7분~9분,9분~12분,1900-01-01 05:00:00,1900-01-01 22:50:00,1900-01-01 06:05:00,1900-01-01 23:55:00,0 days 17:50:00,0 days 17:50:00


## 1-5. dataframe split 
* 결측값 문제를 해결하기 위해 일단 데이터 split 
* 노선 정보가 없는 데이터는 고려하지 않아도 괜찮을 것 같음 

In [148]:
# 결측값 없는 dataframe index 
idx = tmp3.dropna(axis=0).index

final = tmp3.loc[idx]
final_na = tmp3[~tmp3.index.isin(idx)]

# check 
print("* 결측값 제외한 dataframe shape : ", final.shape) 
print("* 결측값 제외한 dataframe unique 버스정류장 개수 :", final.정류소ID.nunique())
print("=====" * 20)
print("* 결측값 있는 dataframe shape : ", final_na.shape)  
print("* 결측값 있는 dataframe unique 버스정류장 개수 :", final_na.정류소ID.nunique())

tmp3.shape[0] == (final.shape[0] + final_na.shape[0])

* 결측값 제외한 dataframe shape :  (1326103, 41)
* 결측값 제외한 dataframe unique 버스정류장 개수 : 516
* 결측값 있는 dataframe shape :  (38510, 41)
* 결측값 있는 dataframe unique 버스정류장 개수 : 173


True

## 1-6. 추가 전처리 

In [150]:
# 일자 data type 바꿔주기 (오래걸림)
final['일자'] = final['일자'].progress_apply(lambda x: pd.to_datetime(str(x), format='%Y%m%d'))

100%|██████████| 1326103/1326103 [04:26<00:00, 4977.13it/s]


In [151]:
# 중앙차로여부 
final["중앙차로여부"] = final["중앙차로여부"].progress_apply(lambda x : 1 if x == '중앙차로 정류장' else 0)

100%|██████████| 1326103/1326103 [00:01<00:00, 684302.92it/s]


In [152]:
# 운행노선 / 정류장유형 
final["운행노선"] = final["운행노선"].progress_apply(lambda x : x.split(','))
final["운행노선수"] = final["운행노선"].progress_apply(lambda x : len(x))

final["정류장유형"] = final["정류장유형"].progress_apply(lambda x : x.split(','))
final["정류장유형수"] = final["정류장유형"].progress_apply(lambda x : len(x))

100%|██████████| 1326103/1326103 [00:09<00:00, 140116.22it/s]
100%|██████████| 1326103/1326103 [00:02<00:00, 649499.48it/s]
100%|██████████| 1326103/1326103 [00:05<00:00, 251755.31it/s]
100%|██████████| 1326103/1326103 [00:01<00:00, 680922.14it/s]


## 1-7. Bus Stop Data 
* **busstop** : tmp (버스 고유 정보) + route + trans 
* 일별 정보 없는 **버스 정류장** 데이터

In [153]:
busstop = final.drop(['일자', '전체 승차 건수', '초승 건수', '환승 건수', '전체 하차 건수', '하차 건수', '미태그 건수'], axis=1)
busstop = busstop.drop_duplicates(["정류소ID", "노선번호", '노선유형'])

# 해당 노선의 총 승차건수 데이터 
busstop_tmp = final.groupby(by=['정류소ID', '노선번호', '노선유형']).mean([['전체승차건수','초승건수','환승건수','전체하차건수','하차건수','미태그건수']]).reset_index()

# Data Merge 
busstop = pd.merge(left=busstop, right=busstop_tmp[['정류소ID', '노선번호', '노선유형', '전체 승차 건수', '초승 건수', '환승 건수', '전체 하차 건수', '하차 건수', '미태그 건수']], 
                   how='left', on=['정류소ID', '노선번호', '노선유형'])

print("bus 노선 정보 dataframe shape : ", busstop.shape , '\n') 
print("합친 데이터의 unique bus 개수 : ", busstop.정류소ID.nunique())
print("=====" * 20)
busstop.head()

bus 노선 정보 dataframe shape :  (3926, 42) 

합친 데이터의 unique bus 개수 :  516


Unnamed: 0,정류소ID,노선번호,노선유형,정류장명,정류장유형,환승역타입,위치(동),쉘터,BIS설치여부,LED,LCD,LED+LCD복합형,알뜰형,임대형(음성),lon,lat,운행노선,중앙차로여부,노선ID,노선순서,관할관청,운행업체,기점,기점_STATION_ID,종점,종점_STATION_ID,주중배차간격,주말배차간격,상행첫차,상행막차,하행첫차,하행막차,상행운행시간,하행운행시간,운행노선수,정류장유형수,전체 승차 건수,초승 건수,환승 건수,전체 하차 건수,하차 건수,미태그 건수
0,200000006,13,일반형시내버스,광교공원.경기대수원캠퍼스입구.연무시장,[시내],일반,경기도 수원시 연무동,0.0,1.0,0.0,0.0,1.0,0.0,0.0,127.029464,37.300014,"[13(수원여객), 16(수원여객), 16-1(수원여객), 16-2(수원여객), 3...",0,200000037.0,13.0,수원시,수원여객,상광교동종점,200000275.0,칠보고등학교.칠보중학교.극동스타클래스,201000099.0,5분~7분,5분~7분,1900-01-01 06:00:00,1900-01-01 22:00:00,1900-01-01 05:40:00,1900-01-01 23:10:00,0 days 16:00:00,0 days 17:30:00,9,1,212.605479,206.279452,6.326027,51.257534,51.257534,0.0
1,200000006,37,일반형시내버스,광교공원.경기대수원캠퍼스입구.연무시장,[시내],일반,경기도 수원시 연무동,0.0,1.0,0.0,0.0,1.0,0.0,0.0,127.029464,37.300014,"[13(수원여객), 16(수원여객), 16-1(수원여객), 16-2(수원여객), 3...",0,200000099.0,1.0,수원시,수원여객,광교공원.경기대수원캠퍼스입구.연무시장,200000006.0,한국민속촌.보라해링턴,228001646.0,10분~13분,15분~20분,1900-01-01 04:50:00,1900-01-01 21:40:00,1900-01-01 06:00:00,1900-01-01 23:10:00,0 days 16:50:00,0 days 17:10:00,9,1,48.726027,43.556164,5.169863,4.813699,4.813699,0.0
2,200000006,7_1,일반형시내버스,광교공원.경기대수원캠퍼스입구.연무시장,[시내],일반,경기도 수원시 연무동,0.0,1.0,0.0,0.0,1.0,0.0,0.0,127.029464,37.300014,"[13(수원여객), 16(수원여객), 16-1(수원여객), 16-2(수원여객), 3...",0,200000045.0,55.0,수원시,수원여객,동탄차고지,233001222.0,광교공원.경기대수원캠퍼스입구.연무시장,200000006.0,7분~9분,9분~12분,1900-01-01 05:00:00,1900-01-01 22:40:00,1900-01-01 06:00:00,1900-01-01 23:45:00,0 days 17:40:00,0 days 17:45:00,9,1,237.380822,213.057534,24.323288,210.950685,210.950685,0.0
3,200000008,13,일반형시내버스,문암골,[시내],일반,경기도 수원시 연무동,0.0,1.0,0.0,0.0,1.0,0.0,0.0,127.0277,37.308946,[13(수원여객)],0,200000037.0,11.0,수원시,수원여객,상광교동종점,200000275.0,칠보고등학교.칠보중학교.극동스타클래스,201000099.0,5분~7분,5분~7분,1900-01-01 06:00:00,1900-01-01 22:00:00,1900-01-01 05:40:00,1900-01-01 23:10:00,0 days 16:00:00,0 days 17:30:00,1,1,77.49863,77.2,0.29863,36.594521,36.594521,0.0
4,200000036,112,일반형시내버스,풍림아파트입구,"[시내, 마을]",일반,경기도 수원시 정자3동,1.0,1.0,0.0,0.0,1.0,0.0,0.0,126.995257,37.297929,[112(수원여객)],0,200000049.0,29.0,수원시,수원여객,곡반정동차고지,201000143.0,웅비아파트,201000330.0,7분~9분,9분~12분,1900-01-01 05:00:00,1900-01-01 22:50:00,1900-01-01 06:05:00,1900-01-01 23:55:00,0 days 17:50:00,0 days 17:50:00,1,2,73.186301,66.775342,6.410959,99.109589,99.109589,0.0


# 2. Bus Data Feature Engineering 
1. **tmp** (518, 16) : 버스 정류장 고유 정보 데이터, df_01 + df_07 
2. **busstop** (3921, 36) : tmp (버스 고유 정보) + route + trans
3. **final** (1326103, 43) : 일별 노선 승하차 정보 포함 데이터 

## 2-1. busstop data Feature Engineering 
* 버스 노선 고유 정보 feature engineering 

### (1) 배차 간격 

In [27]:
# 배차간격 
def interval(x) : 
    try : 
        itv = x.split("~")
        a, b = int(itv[0][:-1]), int(itv[1][:-1])
        return (a+b)/2
    except :
        return int(x[:-1])

In [154]:
# 분 단위 배차간격 
busstop["주중배차간격(분)"] = busstop["주중배차간격"].progress_apply(lambda x : interval(x))
busstop["주말배차간격(분)"] = busstop["주말배차간격"].progress_apply(lambda x : interval(x))

busstop = busstop.drop(["주중배차간격", "주말배차간격"], axis=1)

100%|██████████| 3926/3926 [00:00<00:00, 192385.33it/s]
100%|██████████| 3926/3926 [00:00<00:00, 192626.13it/s]


### (2) 배차 횟수

In [29]:
# 배차횟수 = 운행시간 / 배차간격 
def operation_num(duration, interval) : 
    duration2, interval2 = duration.total_seconds() , interval * 60
    return duration2 / interval2

In [155]:
busstop["주중상행배차횟수"] = busstop.progress_apply(lambda x : operation_num(x["상행운행시간"], x["주중배차간격(분)"]), axis=1)
busstop["주중하행배차횟수"] = busstop.progress_apply(lambda x : operation_num(x["하행운행시간"], x["주중배차간격(분)"]), axis=1)

busstop["주말상행배차횟수"] = busstop.progress_apply(lambda x : operation_num(x["상행운행시간"], x["주말배차간격(분)"]), axis=1)
busstop["주말하행배차횟수"] = busstop.progress_apply(lambda x : operation_num(x["하행운행시간"], x["주말배차간격(분)"]), axis=1)

100%|██████████| 3926/3926 [00:00<00:00, 17041.95it/s]
100%|██████████| 3926/3926 [00:00<00:00, 14409.39it/s]
100%|██████████| 3926/3926 [00:00<00:00, 16066.57it/s]
100%|██████████| 3926/3926 [00:00<00:00, 15761.40it/s]


### (3) 버스 유형 

In [156]:
busstop = pd.get_dummies(busstop, columns=["노선유형"])

### (4) 노선 별 승차 건수 비율 

In [32]:
def div(col1, col2) : 
    try : 
        return col1 / col2 
    except : 
        return 0 

In [157]:
# 초승 환승 전체승차 하차 
busstop["초승_환승비율"] = busstop.progress_apply(lambda x : div(x['초승 건수'], x['환승 건수']), axis=1)
busstop["초승_승차비율"] = busstop.progress_apply(lambda x : div(x['초승 건수'], x['전체 승차 건수']), axis=1)
busstop["초승_하차비율"] = busstop.progress_apply(lambda x : div(x['초승 건수'], x['전체 하차 건수']), axis=1)

busstop["환승_승차비율"] = busstop.progress_apply(lambda x : div(x['환승 건수'], x['전체 승차 건수']), axis=1)
busstop["환승_하차비율"] = busstop.progress_apply(lambda x : div(x['환승 건수'], x['전체 하차 건수']), axis=1)

busstop["승차_하차비율"] = busstop.progress_apply(lambda x : div(x['전체 승차 건수'], x['전체 하차 건수']), axis=1)

100%|██████████| 3926/3926 [00:00<00:00, 17192.80it/s]
100%|██████████| 3926/3926 [00:00<00:00, 15736.96it/s]
100%|██████████| 3926/3926 [00:00<00:00, 15440.80it/s]
100%|██████████| 3926/3926 [00:00<00:00, 16186.20it/s]
100%|██████████| 3926/3926 [00:00<00:00, 15619.76it/s]
100%|██████████| 3926/3926 [00:00<00:00, 16367.18it/s]


### (5) 승차 많은 / 환승 많은 / 하차 많은 버스 노선 
* 분모 : 전체 승차 건수 + 전체 하차 건수 
* 분자 : 승차 건수 / 환승 건수 / 하차 건수 

In [158]:
busstop["전체승하차건수"] = busstop.progress_apply(lambda x : x['전체 승차 건수'] + x['전체 하차 건수'], axis=1)

busstop["승차비율"] = busstop.progress_apply(lambda x : div(x['초승 건수'], x['전체승하차건수']), axis=1)
busstop["환승비율"] = busstop.progress_apply(lambda x : div(x['환승 건수'], x['전체승하차건수']), axis=1)
busstop["하차비율"] = busstop.progress_apply(lambda x : div(x['하차 건수'], x['전체승하차건수']), axis=1)

100%|██████████| 3926/3926 [00:00<00:00, 16768.40it/s]
100%|██████████| 3926/3926 [00:00<00:00, 16746.13it/s]
100%|██████████| 3926/3926 [00:00<00:00, 14280.78it/s]
100%|██████████| 3926/3926 [00:00<00:00, 14974.10it/s]


In [159]:
busstop["승차많은버스"] = busstop["승차비율"].progress_apply(lambda x : 1 if x > 0.7 else 0)
busstop["환승많은버스"] = busstop["환승비율"].progress_apply(lambda x : 1 if x > 0.3 else 0)
busstop["하차많은버스"] = busstop["하차비율"].progress_apply(lambda x : 1 if x > 0.8 else 0)

100%|██████████| 3926/3926 [00:00<00:00, 448223.57it/s]
100%|██████████| 3926/3926 [00:00<00:00, 553414.13it/s]
100%|██████████| 3926/3926 [00:00<00:00, 622000.36it/s]


In [160]:
'''
<mean>
승하차 건수 / 승하차 비율 
배차 간격 / 배차 횟수 

<sum>
노선 유형
승하차 많은 버스 
'''

temp_mean = busstop[['정류소ID'] + list(busstop.columns[busstop.columns.str.contains("건수|비율|배차")])].groupby('정류소ID').mean().reset_index()
temp_sum  = busstop[['정류소ID'] + list(busstop.columns[busstop.columns.str.contains("노선유형|많은")])].groupby('정류소ID').sum().reset_index()

temp_mean.shape, temp_sum.shape

((516, 23), (516, 10))

### (6) 첫차 / 막차 시간 

In [161]:
# 첫차 : 가장 빠른 시간 , 막차 : 가장 늦은 시간 
time1 = busstop.sort_values('상행첫차').groupby('정류소ID').first().reset_index()
time2 = busstop.sort_values('상행막차').groupby('정류소ID').last().reset_index()
time3 = busstop.sort_values('하행첫차').groupby('정류소ID').first().reset_index()
time4 = busstop.sort_values('하행막차').groupby('정류소ID').last().reset_index()

time1.shape, time2.shape, time3.shape, time4.shape

((516, 64), (516, 64), (516, 64), (516, 64))

### (7) Data Merge 
* **busstop** : 버스 정류장 정보, 분석에 필요한 column 만 keep 
* **temp_mean / temp_sum** : 버스 노선 정보 통계량 
* **time** : 첫차 / 막차 시간 

In [213]:
# busstop 
busdf = busstop[['정류소ID', '정류장명', '쉘터', 'LED', 'LCD', 'LED+LCD복합형', '알뜰형', 'lon', 'lat', 
                 '중앙차로여부', '운행노선수', '정류장유형수']].drop_duplicates(['정류소ID']).reset_index(drop=True)  

# temp_mean / temp_sum 
busdf = busdf.merge(temp_mean, how='left', on='정류소ID')
busdf = busdf.merge(temp_sum, how='left', on='정류소ID')

# time 
busdf = busdf.merge(time1[["정류소ID", '상행첫차']], on = '정류소ID', how='left')
busdf = busdf.merge(time2[["정류소ID", '상행막차']], on = '정류소ID', how='left')
busdf = busdf.merge(time3[["정류소ID", '하행첫차']], on = '정류소ID', how='left')
busdf = busdf.merge(time4[["정류소ID", '하행막차']], on = '정류소ID', how='left')

In [214]:
# 평균 운행 시간 
busdf["평균상행운행시간"] = busdf.progress_apply(lambda x : operation_time(x["상행첫차"], x["상행막차"]), axis=1)
busdf["평균하행운행시간"] = busdf.progress_apply(lambda x : operation_time(x["하행첫차"], x["하행막차"]), axis=1)

# 배차 횟수는 이상치에 너무 영향을 많이 받아서 .. 이거로 안 하는게 좋을 것 같아용 ㅠㅠ 
# busdf["평균상행배차횟수"] = busdf.progress_apply(lambda x : operation_num(x["평균상행운행시간"], (x["주중배차간격(분)"]+x['주말배차간격(분)'])/2), axis=1)
# busdf["평균하행배차횟수"] = busdf.progress_apply(lambda x : operation_num(x["평균하행운행시간"], (x["주중배차간격(분)"]+x['주말배차간격(분)'])/2), axis=1)

# 운행 시간 dummy variable 
busdf["평균상행운행시간"] = busdf["평균상행운행시간"].progress_apply(lambda x : 1 if x >  timedelta(hours=18, minutes=20) else 0)
busdf["평균하행운행시간"] = busdf["평균하행운행시간"].progress_apply(lambda x : 1 if x >  timedelta(hours=18, minutes=40) else 0)

busdf = busdf.drop(['상행첫차', '상행막차', '하행첫차', '하행막차'], axis=1)

100%|██████████| 516/516 [00:00<00:00, 16510.99it/s]
100%|██████████| 516/516 [00:00<00:00, 11218.32it/s]
100%|██████████| 516/516 [00:00<00:00, 38906.66it/s]
100%|██████████| 516/516 [00:00<00:00, 28615.68it/s]


In [197]:
busdf.평균상행운행시간.describe()

count                          516
mean     0 days 18:15:41.046511627
std      0 days 00:36:34.557714638
min                0 days 15:15:00
25%                0 days 18:00:00
50%                0 days 18:20:00
75%                0 days 18:40:00
max                0 days 19:50:00
Name: 평균상행운행시간, dtype: object

In [198]:
busdf.평균하행운행시간.describe()

count                          516
mean     0 days 18:24:06.046511627
std      0 days 00:42:00.048209927
min                0 days 15:25:00
25%                0 days 18:00:00
50%                0 days 18:40:00
75%                0 days 18:55:00
max                0 days 19:15:00
Name: 평균하행운행시간, dtype: object

In [215]:
print(busdf.shape)
print("버스 정류장 unique 개수 : ", busdf.정류소ID.nunique())
print("=====" * 20)
busdf.head(3)

(516, 45)
버스 정류장 unique 개수 :  516


Unnamed: 0,정류소ID,정류장명,쉘터,LED,LCD,LED+LCD복합형,알뜰형,lon,lat,중앙차로여부,운행노선수,정류장유형수,전체 승차 건수,초승 건수,환승 건수,전체 하차 건수,하차 건수,미태그 건수,주중배차간격(분),주말배차간격(분),주중상행배차횟수,주중하행배차횟수,주말상행배차횟수,주말하행배차횟수,초승_환승비율,초승_승차비율,초승_하차비율,환승_승차비율,환승_하차비율,승차_하차비율,전체승하차건수,승차비율,환승비율,하차비율,노선유형_경기순환버스(직행좌석형),노선유형_광역급행형시내버스,노선유형_맞춤형시내버스,노선유형_일반형시내버스,노선유형_좌석형시내버스,노선유형_직행좌석형시내버스,승차많은버스,환승많은버스,하차많은버스,평균상행운행시간,평균하행운행시간
0,200000006,광교공원.경기대수원캠퍼스입구.연무시장,0.0,0.0,0.0,1.0,0.0,127.029464,37.300014,0,9,1,75.201863,69.878308,5.323555,33.747443,33.747443,0.0,98.3125,99.625,61.909596,64.017341,52.366504,54.389856,12.771369,0.909413,14.400385,0.090587,1.121787,15.522172,108.949306,0.764578,0.075057,0.160365,0,0,0,8,0,0,7,0,0,0,0
1,200000008,문암골,0.0,0.0,0.0,1.0,0.0,127.0277,37.308946,0,1,1,77.49863,77.2,0.29863,36.594521,36.594521,0.0,6.0,6.0,160.0,175.0,160.0,175.0,258.513761,0.996147,2.109605,0.003853,0.008161,2.117766,114.093151,0.67664,0.002617,0.320742,0,0,0,1,0,0,0,0,0,0,0
2,200000036,풍림아파트입구,1.0,0.0,0.0,1.0,0.0,126.995257,37.297929,0,1,2,73.186301,66.775342,6.410959,99.109589,99.109589,0.0,8.0,10.5,133.75,133.75,101.904762,101.904762,10.415812,0.912402,0.673753,0.087598,0.064686,0.738438,172.29589,0.387562,0.037209,0.575229,0,0,0,1,0,0,0,0,0,0,0


## 2-2. final Data Feature Engineering 
* 일별 정보 반영 feature engineering 

In [None]:
# 향후 추가 예정 ~ 

# 3. 미세먼지 데이터 

In [216]:
# 호매실동 빈값 3개 채워넣기
df_11.iloc[6,3:9]=df_11.iloc[102,3:9]
df_11.iloc[14,3:9]=df_11.iloc[110,3:9]
df_11.iloc[22,3:9]=df_11.iloc[118,3:9]

# 데이터 형변환
df_11[["미세먼지(㎍/㎥)","초미세먼지(㎍/㎥)","오존(ppm)","이산화질소(ppm)","아황산가스(ppm)","일산화탄소(ppm)"]]=df_11[["미세먼지(㎍/㎥)","초미세먼지(㎍/㎥)","오존(ppm)","이산화질소(ppm)","아황산가스(ppm)","일산화탄소(ppm)"]].astype(float)

# 2019년, 2020년 데이터 평균
df_dust = df_11.groupby(df_11["측정소"]).mean().reset_index()

In [217]:
def dist(bus, dust, dong):
    
    ''' 
    측정소랑 버스정류장 사이의 거리 구하는 함수
    dong: '동이름' 으로 설정하기 
    '''
    
    bus[dong+"거리"] = 0
    end = (dust[dust['측정소'] == dong]['lon'], dust[dust['측정소']== dong]['lat'])
    
    for i in range(len(bus)):
        start_lon, start_lat = bus["lon"].iloc[i], bus["lat"].iloc[i]
        start=(start_lon,start_lat)
        km=haversine(start,end,unit="km")
        bus[dong+"거리"].iloc[i]=km

In [218]:
# 각 동의 측정소와 버스정류장 사이의 거리 구하기
for i in df_dust['측정소'].unique():
    dist(busdf, df_dust, i)

In [219]:
# 가장 가까운 측정소의 미세먼지 값 넣기
busdf["미세먼지(㎍/㎥)"], busdf["초미세먼지(㎍/㎥)"], busdf["오존(ppm)"], busdf["이산화질소(ppm)"], busdf["아황산가스(ppm)"], busdf["일산화탄소(ppm)"] = 0, 0, 0, 0, 0, 0

for i in range(len(busdf)):
    idx = np.argmin(busdf[['고색동거리', '광교동거리', '동수원거리', '신풍동거리', '영통동거리', '인계동거리', '천천동거리', '호매실동거리']].iloc[i])
    
    busdf["미세먼지(㎍/㎥)"].iloc[i]=df_dust.iloc[idx]["미세먼지(㎍/㎥)"]
    busdf["초미세먼지(㎍/㎥)"].iloc[i]=df_dust.iloc[idx]["초미세먼지(㎍/㎥)"]
    busdf["오존(ppm)"].iloc[i]=df_dust.iloc[idx]["오존(ppm)"]
    busdf["이산화질소(ppm)"].iloc[i]=df_dust.iloc[idx]["이산화질소(ppm)"]
    busdf["아황산가스(ppm)"].iloc[i]=df_dust.iloc[idx]["아황산가스(ppm)"]
    busdf["일산화탄소(ppm)"].iloc[i]=df_dust.iloc[idx]["일산화탄소(ppm)"]

In [220]:
print(busdf.shape)
busdf.head(3)

(516, 59)


Unnamed: 0,정류소ID,정류장명,쉘터,LED,LCD,LED+LCD복합형,알뜰형,lon,lat,중앙차로여부,운행노선수,정류장유형수,전체 승차 건수,초승 건수,환승 건수,전체 하차 건수,하차 건수,미태그 건수,주중배차간격(분),주말배차간격(분),주중상행배차횟수,주중하행배차횟수,주말상행배차횟수,주말하행배차횟수,초승_환승비율,초승_승차비율,초승_하차비율,환승_승차비율,환승_하차비율,승차_하차비율,전체승하차건수,승차비율,환승비율,하차비율,노선유형_경기순환버스(직행좌석형),노선유형_광역급행형시내버스,노선유형_맞춤형시내버스,노선유형_일반형시내버스,노선유형_좌석형시내버스,노선유형_직행좌석형시내버스,승차많은버스,환승많은버스,하차많은버스,평균상행운행시간,평균하행운행시간,고색동거리,광교동거리,동수원거리,신풍동거리,영통동거리,인계동거리,천천동거리,호매실동거리,미세먼지(㎍/㎥),초미세먼지(㎍/㎥),오존(ppm),이산화질소(ppm),아황산가스(ppm),일산화탄소(ppm)
0,200000006,광교공원.경기대수원캠퍼스입구.연무시장,0.0,0.0,0.0,1.0,0.0,127.029464,37.300014,0,9,1,75.201863,69.878308,5.323555,33.747443,33.747443,0.0,98.3125,99.625,61.909596,64.017341,52.366504,54.389856,12.771369,0.909413,14.400385,0.090587,1.121787,15.522172,108.949306,0.764578,0.075057,0.160365,0,0,0,8,0,0,7,0,0,0,0,6.709683,4.612747,1.562486,2.373598,4.643121,2.437335,6.04108,8.369192,49.458333,24.166667,0.01825,0.034458,0.003042,0.704167
1,200000008,문암골,0.0,0.0,0.0,1.0,0.0,127.0277,37.308946,0,1,1,77.49863,77.2,0.29863,36.594521,36.594521,0.0,6.0,6.0,160.0,175.0,160.0,175.0,258.513761,0.996147,2.109605,0.003853,0.008161,2.117766,114.093151,0.67664,0.002617,0.320742,0,0,0,1,0,0,0,0,0,0,0,6.84948,4.915871,2.178657,2.548656,5.232781,3.036157,5.92447,8.365801,49.458333,24.166667,0.01825,0.034458,0.003042,0.704167
2,200000036,풍림아파트입구,1.0,0.0,0.0,1.0,0.0,126.995257,37.297929,0,1,2,73.186301,66.775342,6.410959,99.109589,99.109589,0.0,8.0,10.5,133.75,133.75,101.904762,101.904762,10.415812,0.912402,0.673753,0.087598,0.064686,0.738438,172.29589,0.387562,0.037209,0.575229,0,0,0,1,0,0,0,0,0,0,0,3.702636,8.391093,4.16688,1.935795,7.598513,4.372229,2.244175,4.754531,39.458333,23.125,0.027917,0.024917,0.002958,0.5625


# 4. 지하철역 데이터 

In [221]:
def dist2(bus, station):
    
    ''' 
    지하철역랑 버스정류장 사이의 거리 구하는 함수
    station: '역사이름'으로 설정하기 
    '''  
    
    bus[station] = 0
    end = (df_08[df_08['역사명']==station]['lon'] , df_08[df_08['역사명']==station]['lat'])
    
    for i in range(len(bus)):
        start_lon, start_lat = bus["lon"].iloc[i], bus["lat"].iloc[i]
        start=(start_lon,start_lat)
        km = haversine(start, end, unit = 'km')
        bus[station].iloc[i] = km

In [222]:
for i in df_08['역사명'].unique():
    dist2(busdf, i)

In [223]:
busdf.head(2)

Unnamed: 0,정류소ID,정류장명,쉘터,LED,LCD,LED+LCD복합형,알뜰형,lon,lat,중앙차로여부,운행노선수,정류장유형수,전체 승차 건수,초승 건수,환승 건수,전체 하차 건수,하차 건수,미태그 건수,주중배차간격(분),주말배차간격(분),주중상행배차횟수,주중하행배차횟수,주말상행배차횟수,주말하행배차횟수,초승_환승비율,초승_승차비율,초승_하차비율,환승_승차비율,환승_하차비율,승차_하차비율,전체승하차건수,승차비율,환승비율,하차비율,노선유형_경기순환버스(직행좌석형),노선유형_광역급행형시내버스,노선유형_맞춤형시내버스,노선유형_일반형시내버스,노선유형_좌석형시내버스,노선유형_직행좌석형시내버스,승차많은버스,환승많은버스,하차많은버스,평균상행운행시간,평균하행운행시간,고색동거리,광교동거리,동수원거리,신풍동거리,영통동거리,인계동거리,천천동거리,호매실동거리,미세먼지(㎍/㎥),초미세먼지(㎍/㎥),오존(ppm),이산화질소(ppm),아황산가스(ppm),일산화탄소(ppm),성균관대역,화서역,수원(분당)역,세류역,청명역,영통역,망포역,매탄권선역,수원시청역,매교역,고색역,오목천역,광교중앙(아주대)역,광교(경기대)역,수원역
0,200000006,광교공원.경기대수원캠퍼스입구.연무시장,0.0,0.0,0.0,1.0,0.0,127.029464,37.300014,0,9,1,75.201863,69.878308,5.323555,33.747443,33.747443,0.0,98.3125,99.625,61.909596,64.017341,52.366504,54.389856,12.771369,0.909413,14.400385,0.090587,1.121787,15.522172,108.949306,0.764578,0.075057,0.160365,0,0,0,8,0,0,7,0,0,0,0,6.709683,4.612747,1.562486,2.373598,4.643121,2.437335,6.04108,8.369192,49.458333,24.166667,0.01825,0.034458,0.003042,0.704167,6.516358,4.562602,3.999946,4.130102,6.135732,5.67792,4.776889,3.426342,2.553497,2.774218,6.679065,8.25395,2.564164,1.675973,3.994276
1,200000008,문암골,0.0,0.0,0.0,1.0,0.0,127.0277,37.308946,0,1,1,77.49863,77.2,0.29863,36.594521,36.594521,0.0,6.0,6.0,160.0,175.0,160.0,175.0,258.513761,0.996147,2.109605,0.003853,0.008161,2.117766,114.093151,0.67664,0.002617,0.320742,0,0,0,1,0,0,0,0,0,0,0,6.84948,4.915871,2.178657,2.548656,5.232781,3.036157,5.92447,8.365801,49.458333,24.166667,0.01825,0.034458,0.003042,0.704167,6.346367,4.555895,4.220473,4.609356,6.59181,6.192714,5.364495,4.054839,3.165719,3.202148,6.844825,8.379205,2.974036,1.921543,4.220959


In [224]:
# 지하철역 500m 안에 있으면 1 아니면 0
for station in df_08['역사명'].unique() : 
    busdf[station] = busdf[station].apply(lambda x : 1 if x <= 0.5 else 0)
    
busdf["500m내지하철역갯수"] = busdf[df_08['역사명'].unique()].apply(sum, axis=1)

# 역 정보 삭제 
busdf = busdf.drop(list(df_08['역사명'].unique()), axis=1)

In [225]:
busdf["500m내지하철역갯수"].value_counts()

0    386
1    118
2     12
Name: 500m내지하철역갯수, dtype: int64

In [226]:
print(busdf.shape)
busdf.head(3)

(516, 60)


Unnamed: 0,정류소ID,정류장명,쉘터,LED,LCD,LED+LCD복합형,알뜰형,lon,lat,중앙차로여부,운행노선수,정류장유형수,전체 승차 건수,초승 건수,환승 건수,전체 하차 건수,하차 건수,미태그 건수,주중배차간격(분),주말배차간격(분),주중상행배차횟수,주중하행배차횟수,주말상행배차횟수,주말하행배차횟수,초승_환승비율,초승_승차비율,초승_하차비율,환승_승차비율,환승_하차비율,승차_하차비율,전체승하차건수,승차비율,환승비율,하차비율,노선유형_경기순환버스(직행좌석형),노선유형_광역급행형시내버스,노선유형_맞춤형시내버스,노선유형_일반형시내버스,노선유형_좌석형시내버스,노선유형_직행좌석형시내버스,승차많은버스,환승많은버스,하차많은버스,평균상행운행시간,평균하행운행시간,고색동거리,광교동거리,동수원거리,신풍동거리,영통동거리,인계동거리,천천동거리,호매실동거리,미세먼지(㎍/㎥),초미세먼지(㎍/㎥),오존(ppm),이산화질소(ppm),아황산가스(ppm),일산화탄소(ppm),500m내지하철역갯수
0,200000006,광교공원.경기대수원캠퍼스입구.연무시장,0.0,0.0,0.0,1.0,0.0,127.029464,37.300014,0,9,1,75.201863,69.878308,5.323555,33.747443,33.747443,0.0,98.3125,99.625,61.909596,64.017341,52.366504,54.389856,12.771369,0.909413,14.400385,0.090587,1.121787,15.522172,108.949306,0.764578,0.075057,0.160365,0,0,0,8,0,0,7,0,0,0,0,6.709683,4.612747,1.562486,2.373598,4.643121,2.437335,6.04108,8.369192,49.458333,24.166667,0.01825,0.034458,0.003042,0.704167,0
1,200000008,문암골,0.0,0.0,0.0,1.0,0.0,127.0277,37.308946,0,1,1,77.49863,77.2,0.29863,36.594521,36.594521,0.0,6.0,6.0,160.0,175.0,160.0,175.0,258.513761,0.996147,2.109605,0.003853,0.008161,2.117766,114.093151,0.67664,0.002617,0.320742,0,0,0,1,0,0,0,0,0,0,0,6.84948,4.915871,2.178657,2.548656,5.232781,3.036157,5.92447,8.365801,49.458333,24.166667,0.01825,0.034458,0.003042,0.704167,0
2,200000036,풍림아파트입구,1.0,0.0,0.0,1.0,0.0,126.995257,37.297929,0,1,2,73.186301,66.775342,6.410959,99.109589,99.109589,0.0,8.0,10.5,133.75,133.75,101.904762,101.904762,10.415812,0.912402,0.673753,0.087598,0.064686,0.738438,172.29589,0.387562,0.037209,0.575229,0,0,0,1,0,0,0,0,0,0,0,3.702636,8.391093,4.16688,1.935795,7.598513,4.372229,2.244175,4.754531,39.458333,23.125,0.027917,0.024917,0.002958,0.5625,0


# 5. 계절별 미세먼지 

In [227]:
df_11["년"] = df_11.측정월.str.split(".").str[0]
df_11["월"] = df_11.측정월.str.split(".").str[1]

df_11['계절'] = ["겨울" if s in ("1월", "2월", "12월") 
               else "봄" if s in ("3월", "4월", "5월")   
               else "여름"  if s in ("6월","7월","8월") 
               else "가을" for s in df_11["월"]]

df_dust2 = df_11.groupby(["측정소","계절"]).mean().reset_index()

In [228]:
busdf["미세_봄"], busdf["미세_여름"], busdf["미세_가을"], busdf["미세_겨울"] = 0, 0, 0, 0
busdf["초미세_봄"], busdf["초미세_여름"], busdf["초미세_가을"], busdf["초미세_겨울"] = 0, 0, 0, 0

In [229]:
for i in range(len(busdf)):
    place = busdf[['고색동거리', '광교동거리', '동수원거리', '신풍동거리', '영통동거리', '인계동거리', '천천동거리', '호매실동거리']].iloc[i].idxmin()[:-2]

    busdf["미세_봄"].iloc[i] = df_dust2[(df_dust2["측정소"] == place) & (df_dust2["계절"] == "봄")]["미세먼지(㎍/㎥)"].values
    busdf["미세_여름"].iloc[i] = df_dust2[(df_dust2["측정소"] == place) & (df_dust2["계절"] == "여름")]["미세먼지(㎍/㎥)"].values
    busdf["미세_가을"].iloc[i] = df_dust2[(df_dust2["측정소"] == place) & (df_dust2["계절"] == "가을")]["미세먼지(㎍/㎥)"].values
    busdf["미세_겨울"].iloc[i] = df_dust2[(df_dust2["측정소"] == place) & (df_dust2["계절"] == "겨울")]["미세먼지(㎍/㎥)"].values

    busdf["초미세_봄"].iloc[i] = df_dust2[(df_dust2["측정소"] == place) & (df_dust2["계절"] == "봄")]["초미세먼지(㎍/㎥)"].values
    busdf["초미세_여름"].iloc[i] = df_dust2[(df_dust2["측정소"] == place) & (df_dust2["계절"] == "여름")]["초미세먼지(㎍/㎥)"].values
    busdf["초미세_가을"].iloc[i] = df_dust2[(df_dust2["측정소"] == place) & (df_dust2["계절"] == "가을")]["초미세먼지(㎍/㎥)"].values
    busdf["초미세_겨울"].iloc[i] = df_dust2[(df_dust2["측정소"] == place) & (df_dust2["계절"] == "겨울")]["초미세먼지(㎍/㎥)"].values

In [231]:
busdf.head(2)

Unnamed: 0,정류소ID,정류장명,쉘터,LED,LCD,LED+LCD복합형,알뜰형,lon,lat,중앙차로여부,운행노선수,정류장유형수,전체 승차 건수,초승 건수,환승 건수,전체 하차 건수,하차 건수,미태그 건수,주중배차간격(분),주말배차간격(분),주중상행배차횟수,주중하행배차횟수,주말상행배차횟수,주말하행배차횟수,초승_환승비율,초승_승차비율,초승_하차비율,환승_승차비율,환승_하차비율,승차_하차비율,전체승하차건수,승차비율,환승비율,하차비율,노선유형_경기순환버스(직행좌석형),노선유형_광역급행형시내버스,노선유형_맞춤형시내버스,노선유형_일반형시내버스,노선유형_좌석형시내버스,노선유형_직행좌석형시내버스,승차많은버스,환승많은버스,하차많은버스,평균상행운행시간,평균하행운행시간,고색동거리,광교동거리,동수원거리,신풍동거리,영통동거리,인계동거리,천천동거리,호매실동거리,미세먼지(㎍/㎥),초미세먼지(㎍/㎥),오존(ppm),이산화질소(ppm),아황산가스(ppm),일산화탄소(ppm),500m내지하철역갯수,미세_봄,미세_여름,미세_가을,미세_겨울,초미세_봄,초미세_여름,초미세_가을,초미세_겨울
0,200000006,광교공원.경기대수원캠퍼스입구.연무시장,0.0,0.0,0.0,1.0,0.0,127.029464,37.300014,0,9,1,75.201863,69.878308,5.323555,33.747443,33.747443,0.0,98.3125,99.625,61.909596,64.017341,52.366504,54.389856,12.771369,0.909413,14.400385,0.090587,1.121787,15.522172,108.949306,0.764578,0.075057,0.160365,0,0,0,8,0,0,7,0,0,0,0,6.709683,4.612747,1.562486,2.373598,4.643121,2.437335,6.04108,8.369192,49.458333,24.166667,0.01825,0.034458,0.003042,0.704167,0,61.0,36.333333,39.666667,60.833333,28.333333,15.666667,17.166667,35.5
1,200000008,문암골,0.0,0.0,0.0,1.0,0.0,127.0277,37.308946,0,1,1,77.49863,77.2,0.29863,36.594521,36.594521,0.0,6.0,6.0,160.0,175.0,160.0,175.0,258.513761,0.996147,2.109605,0.003853,0.008161,2.117766,114.093151,0.67664,0.002617,0.320742,0,0,0,1,0,0,0,0,0,0,0,6.84948,4.915871,2.178657,2.548656,5.232781,3.036157,5.92447,8.365801,49.458333,24.166667,0.01825,0.034458,0.003042,0.704167,0,61.0,36.333333,39.666667,60.833333,28.333333,15.666667,17.166667,35.5


In [234]:
# 미세먼지 측정소 거리 삭제 
busdf  = busdf.drop(list(busdf.columns[busdf.columns.str.contains("거리")]), axis=1)

In [235]:
print(busdf.shape)
busdf.head(3)

(516, 60)


Unnamed: 0,정류소ID,정류장명,쉘터,LED,LCD,LED+LCD복합형,알뜰형,lon,lat,중앙차로여부,운행노선수,정류장유형수,전체 승차 건수,초승 건수,환승 건수,전체 하차 건수,하차 건수,미태그 건수,주중배차간격(분),주말배차간격(분),주중상행배차횟수,주중하행배차횟수,주말상행배차횟수,주말하행배차횟수,초승_환승비율,초승_승차비율,초승_하차비율,환승_승차비율,환승_하차비율,승차_하차비율,전체승하차건수,승차비율,환승비율,하차비율,노선유형_경기순환버스(직행좌석형),노선유형_광역급행형시내버스,노선유형_맞춤형시내버스,노선유형_일반형시내버스,노선유형_좌석형시내버스,노선유형_직행좌석형시내버스,승차많은버스,환승많은버스,하차많은버스,평균상행운행시간,평균하행운행시간,미세먼지(㎍/㎥),초미세먼지(㎍/㎥),오존(ppm),이산화질소(ppm),아황산가스(ppm),일산화탄소(ppm),500m내지하철역갯수,미세_봄,미세_여름,미세_가을,미세_겨울,초미세_봄,초미세_여름,초미세_가을,초미세_겨울
0,200000006,광교공원.경기대수원캠퍼스입구.연무시장,0.0,0.0,0.0,1.0,0.0,127.029464,37.300014,0,9,1,75.201863,69.878308,5.323555,33.747443,33.747443,0.0,98.3125,99.625,61.909596,64.017341,52.366504,54.389856,12.771369,0.909413,14.400385,0.090587,1.121787,15.522172,108.949306,0.764578,0.075057,0.160365,0,0,0,8,0,0,7,0,0,0,0,49.458333,24.166667,0.01825,0.034458,0.003042,0.704167,0,61.0,36.333333,39.666667,60.833333,28.333333,15.666667,17.166667,35.5
1,200000008,문암골,0.0,0.0,0.0,1.0,0.0,127.0277,37.308946,0,1,1,77.49863,77.2,0.29863,36.594521,36.594521,0.0,6.0,6.0,160.0,175.0,160.0,175.0,258.513761,0.996147,2.109605,0.003853,0.008161,2.117766,114.093151,0.67664,0.002617,0.320742,0,0,0,1,0,0,0,0,0,0,0,49.458333,24.166667,0.01825,0.034458,0.003042,0.704167,0,61.0,36.333333,39.666667,60.833333,28.333333,15.666667,17.166667,35.5
2,200000036,풍림아파트입구,1.0,0.0,0.0,1.0,0.0,126.995257,37.297929,0,1,2,73.186301,66.775342,6.410959,99.109589,99.109589,0.0,8.0,10.5,133.75,133.75,101.904762,101.904762,10.415812,0.912402,0.673753,0.087598,0.064686,0.738438,172.29589,0.387562,0.037209,0.575229,0,0,0,1,0,0,0,0,0,0,0,39.458333,23.125,0.027917,0.024917,0.002958,0.5625,0,48.0,29.833333,32.333333,47.666667,26.0,16.333333,17.666667,32.5


# 6. 실거래가 

In [236]:
house = pd.read_csv(current_path+'/data/house.csv', encoding = 'CP949')

house = house.drop_duplicates(['road'], keep = 'last').reset_index() # 가장 최근의 실거래가 이용 
house = house[['road', '건축년도', '전용면적(㎡)', '층', '거래금액', 'price', '경도', '위도']]

In [237]:
def optim_price(station, price):
    for i in range(len(station)):
        # 거리 계산
        distance = []
        for j in range(len(price)):
            station_name = tuple(station[['lon','lat']].iloc[i])
            price_name = tuple(price[['경도','위도']].iloc[j])
            distance.append(haversine(station_name, price_name, unit = 'km'))
            
        # 최소
        station['건축년도'][i] = price['건축년도'][np.argmin(distance)]
        station['전용면적(㎡)'][i] = price['전용면적(㎡)'][np.argmin(distance)]
        station['층'][i] = price['층'][np.argmin(distance)]
        station['거래금액'][i] = price['거래금액'][np.argmin(distance)]
        station['면적당 금액'][i] = price['price'][np.argmin(distance)]
        
    return station

In [238]:
busdf['건축년도'], busdf['전용면적(㎡)'], busdf['층'], busdf['거래금액'], busdf['면적당 금액'] = 0, 0, 0, 0, 0
busdf = optim_price(busdf, house)

In [239]:
print(busdf.shape)
busdf.head(3)

(516, 65)


Unnamed: 0,정류소ID,정류장명,쉘터,LED,LCD,LED+LCD복합형,알뜰형,lon,lat,중앙차로여부,운행노선수,정류장유형수,전체 승차 건수,초승 건수,환승 건수,전체 하차 건수,하차 건수,미태그 건수,주중배차간격(분),주말배차간격(분),주중상행배차횟수,주중하행배차횟수,주말상행배차횟수,주말하행배차횟수,초승_환승비율,초승_승차비율,초승_하차비율,환승_승차비율,환승_하차비율,승차_하차비율,전체승하차건수,승차비율,환승비율,하차비율,노선유형_경기순환버스(직행좌석형),노선유형_광역급행형시내버스,노선유형_맞춤형시내버스,노선유형_일반형시내버스,노선유형_좌석형시내버스,노선유형_직행좌석형시내버스,승차많은버스,환승많은버스,하차많은버스,평균상행운행시간,평균하행운행시간,미세먼지(㎍/㎥),초미세먼지(㎍/㎥),오존(ppm),이산화질소(ppm),아황산가스(ppm),일산화탄소(ppm),500m내지하철역갯수,미세_봄,미세_여름,미세_가을,미세_겨울,초미세_봄,초미세_여름,초미세_가을,초미세_겨울,건축년도,전용면적(㎡),층,거래금액,면적당 금액
0,200000006,광교공원.경기대수원캠퍼스입구.연무시장,0.0,0.0,0.0,1.0,0.0,127.029464,37.300014,0,9,1,75.201863,69.878308,5.323555,33.747443,33.747443,0.0,98.3125,99.625,61.909596,64.017341,52.366504,54.389856,12.771369,0.909413,14.400385,0.090587,1.121787,15.522172,108.949306,0.764578,0.075057,0.160365,0,0,0,8,0,0,7,0,0,0,0,49.458333,24.166667,0.01825,0.034458,0.003042,0.704167,0,61.0,36.333333,39.666667,60.833333,28.333333,15.666667,17.166667,35.5,2003,75,4,21700,288
1,200000008,문암골,0.0,0.0,0.0,1.0,0.0,127.0277,37.308946,0,1,1,77.49863,77.2,0.29863,36.594521,36.594521,0.0,6.0,6.0,160.0,175.0,160.0,175.0,258.513761,0.996147,2.109605,0.003853,0.008161,2.117766,114.093151,0.67664,0.002617,0.320742,0,0,0,1,0,0,0,0,0,0,0,49.458333,24.166667,0.01825,0.034458,0.003042,0.704167,0,61.0,36.333333,39.666667,60.833333,28.333333,15.666667,17.166667,35.5,2003,75,4,21700,288
2,200000036,풍림아파트입구,1.0,0.0,0.0,1.0,0.0,126.995257,37.297929,0,1,2,73.186301,66.775342,6.410959,99.109589,99.109589,0.0,8.0,10.5,133.75,133.75,101.904762,101.904762,10.415812,0.912402,0.673753,0.087598,0.064686,0.738438,172.29589,0.387562,0.037209,0.575229,0,0,0,1,0,0,0,0,0,0,0,39.458333,23.125,0.027917,0.024917,0.002958,0.5625,0,48.0,29.833333,32.333333,47.666667,26.0,16.333333,17.666667,32.5,1989,84,1,29000,341


# 7. Data Save 

In [241]:
# 경로 설정 주의 ~ (xlsx)
busdf.to_excel(current_path+'/busdata/busdata_0318.xlsx', index = False, encoding = 'CP949')

In [242]:
# xlsx
a = pd.read_excel(current_path+'/busdata/busdata_0318.xlsx')

In [250]:
a.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 516 entries, 0 to 515
Data columns (total 65 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   정류소ID               516 non-null    int64  
 1   정류장명                516 non-null    object 
 2   쉘터                  516 non-null    int64  
 3   LED                 516 non-null    int64  
 4   LCD                 516 non-null    int64  
 5   LED+LCD복합형          516 non-null    int64  
 6   알뜰형                 516 non-null    int64  
 7   lon                 516 non-null    float64
 8   lat                 516 non-null    float64
 9   중앙차로여부              516 non-null    int64  
 10  운행노선수               516 non-null    int64  
 11  정류장유형수              516 non-null    int64  
 12  전체 승차 건수            516 non-null    float64
 13  초승 건수               516 non-null    float64
 14  환승 건수               516 non-null    float64
 15  전체 하차 건수            516 non-null    float64
 16  하차 건수   