In [15]:
import pandas as pd
import os
import geopy.distance
import numpy as np
from tqdm import tqdm
from pathlib import Path

In [16]:
# Search radius in metres: a bike station counts as "near" an MRT station when
# its geodesic distance (computed in metres below) is strictly less than this.
# The value is also embedded in the output CSV file name.
RADIUS = 300

In [18]:
# Raw CSV column -> short name used by the rest of the notebook.
# The key order below is also the final column order of `mrt_station`.
# Expected contents (as noted by the original author):
#   MRT_station_name: str
#   city_code: str
#   district: str
#   station_lat: decimal
#   station_lng: decimal
mrt_column_map = {
    'StationName.Zh_tw': 'MRT_station_name',
    'LocationCityCode': 'city_code',
    'LocationTown': 'district',
    'StationPosition.PositionLat': 'station_lat',
    'StationPosition.PositionLon': 'station_lng',
}

# Load the station list, keep only the mapped columns, and rename them.
mrt_station = (
    pd.read_csv('mrt_station.csv')
    .loc[:, list(mrt_column_map)]
    .rename(columns=mrt_column_map)
)

In [19]:
# Keep one row per station name (the first occurrence wins). The index is not
# reset here; later cells address rows by position (`iloc` / `to_numpy`).
mrt_station = mrt_station.drop_duplicates(subset=['MRT_station_name'], keep='first')

In [20]:
# Station names in the same row order as `mrt_station`, so position `i` in this
# list refers to row `i` of the de-duplicated frame.
target_stations = mrt_station.loc[:, 'MRT_station_name'].tolist()

In [21]:
# Raw bike-station CSV column -> name used by the rest of the notebook.
bike_column_map = {
    'name_tw': 'station_name',
    'district_tw': 'district',
    'lat': 'station_lat',
    'lng': 'station_lng',
    'parking_spaces': 'total',
}

# Load the bike stations, relabel district values of '臺大公館校區' to '大安區',
# rename the columns, and keep only the fields used downstream.
# `city_code` is taken from the CSV under its original name.
bike_station = (
    pd.read_csv('TPE_bike_station.csv')
    .assign(district_tw=lambda df: df['district_tw'].replace('臺大公館校區', '大安區'))
    .rename(columns=bike_column_map)
    .loc[:, ['station_name', 'district', 'city_code', 'total', 'station_lat', 'station_lng']]
)

In [22]:
# (lat, lng) pairs as plain arrays, one row per station.
coord_columns = ['station_lat', 'station_lng']
bike_coords = bike_station[coord_columns].to_numpy()
mrt_coords = mrt_station[coord_columns].to_numpy()

# dist_matrix[i, j] = geodesic distance in metres between MRT station i
# and bike station j (rows follow `mrt_station`, columns follow `bike_station`).
dist_matrix = np.zeros((len(mrt_coords), len(bike_coords)))

for mrt_pos, mrt_point in enumerate(tqdm(mrt_coords)):
    for bike_pos, bike_point in enumerate(bike_coords):
        dist_matrix[mrt_pos, bike_pos] = geopy.distance.geodesic(mrt_point, bike_point).m

100%|██████████| 108/108 [00:07<00:00, 14.79it/s]


In [23]:
# Load the availability snapshots, drop exact duplicate rows, relabel the
# '臺大公館校區' area to '大安區', and reduce `sna` to the part after the first
# underscore so it can be matched against `bike_station['station_name']`.
ubike_rt = (
    pd.read_csv('ubike_0501.csv', index_col=0)
    .drop_duplicates()
    .assign(
        sarea=lambda df: df['sarea'].replace('臺大公館校區', '大安區'),
        # Takes the second '_'-separated field; raises if a name has no '_'.
        sna=lambda df: df['sna'].map(lambda name: name.split('_')[1]),
    )
)


In [24]:
# Per-row threshold for "almost empty": 10% of `tot`, capped at 5.
ubike_rt['thres'] = np.minimum((ubike_rt['tot'] * 0.1).to_numpy(), 5)
def almost_empty(row):
    """Return True when the row's `sbi` value is strictly below its `thres`.

    `row` is any mapping-like object exposing the keys 'sbi' and 'thres'
    (for example a DataFrame row). Equality with the threshold is not
    considered almost empty.
    """
    return bool(row['sbi'] < row['thres'])

In [25]:
# Flag snapshots whose `sbi` is strictly below the row's threshold. This is the
# column-wise equivalent of calling `almost_empty` on every row, without the
# per-row Python overhead of `DataFrame.apply(axis=1)`; it also yields a proper
# (empty) boolean column when the frame has no rows.
ubike_rt['almost_empty'] = ubike_rt['sbi'] < ubike_rt['thres']

# Parse the timestamp column once and derive all time buckets from it.
update_time = pd.to_datetime(ubike_rt['updateTime'])
ubike_rt['date'] = update_time.dt.date
ubike_rt['hour'] = update_time.dt.hour
# Minute floored to the start of its 10-minute bucket (0, 10, ..., 50).
ubike_rt['min'] = update_time.dt.minute // 10 * 10

In [26]:
# Columns that are not needed once the time buckets and the almost-empty flag
# have been derived.
unused_columns = [
    'ar', 'aren', 'infoDate', 'infoTime', 'mday', 'updateTime',
    'srcUpdateTime', 'sarea', 'sareaen', 'snaen', 'sno',
]
ubike_rt = ubike_rt.drop(columns=unused_columns)

In [40]:
# For every MRT station, collect the bike stations strictly within RADIUS
# metres, and compute per (date, hour) the share of their 10-minute snapshots
# that were flagged almost empty. Each station contributes one frame; the
# frames are concatenated once at the end instead of growing `result` inside
# the loop, which would re-copy all accumulated rows on every iteration.
per_station_frames = []
for idx in range(len(target_stations)):
    # Row `idx` of dist_matrix holds the distances from MRT station `idx`
    # to every bike station, in `bike_station` row order.
    youbike_index = np.where(dist_matrix[idx] < RADIUS)[0]
    bike_station_in_area = bike_station.iloc[youbike_index]

    # Snapshots of the nearby bike stations, at most one per station per
    # 10-minute bucket.
    subset = ubike_rt[ubike_rt['sna'].isin(bike_station_in_area['station_name'])]
    subset = subset.drop_duplicates(subset=['sna', 'date', 'hour', 'min'])

    # NOTE(review): 'sna' keeps only the first bike station name seen in each
    # (date, hour) group, not every station inside the radius -- confirm this
    # is what downstream consumers expect.
    subset = subset.groupby(['date', 'hour'], as_index=False).agg(
        {'almost_empty': 'mean', 'sna': 'first'}
    )

    # Attach the MRT station's attributes to each hourly row via a constant
    # join key (a cross join). The `key` column is kept in the output frame.
    single_mrt = mrt_station.iloc[idx].to_frame().T.assign(key=0)
    per_station_frames.append(
        single_mrt.merge(subset.assign(key=0), on='key', how='inner')
    )

# `result` stays None when there are no target stations, as before.
result = pd.concat(per_station_frames, ignore_index=True) if per_station_frames else None
    

In [41]:
# Inspect the merged column set before trimming it to the export schema.
result.columns

Index(['MRT_station_name', 'city_code', 'district', 'station_lat',
       'station_lng', 'key', 'date', 'hour', 'almost_empty', 'sna'],
      dtype='object')

In [43]:
# Keep only the columns that are written to the output file, in export order.
output_columns = ['sna', 'MRT_station_name', 'date', 'hour', 'almost_empty']
result = result.loc[:, output_columns]

In [44]:
# Write the hourly table to a CSV in the working directory; the file name
# embeds the RADIUS value, and the DataFrame index is not written.
result.to_csv(f'pipeline_F5_R{RADIUS}_result.csv',index=False)

In [45]:
# Sanity check: (number of rows, number of exported columns).
result.shape

(22776, 5)