In [1]:
import pandas as pd
import os
import geopy.distance
import numpy as np
from tqdm import tqdm
from pathlib import Path

In [2]:
# Search radius in metres: a bike station whose geodesic distance to an MRT
# station is below this value is treated as belonging to that MRT station.
# The value is also embedded in the name of the output CSV.
RADIUS = 300

In [3]:
# Load the raw MRT station table; the columns needed later are selected and
# renamed in the following cell.
mrt_station = pd.read_csv('../MRT/mrt_station.csv')



In [4]:
# Keep only the columns used downstream and give them short names.
# Resulting schema (in this column order):
#   MRT_station_name: str
#   city_code: str
#   district: str
#   station_lat: decimal
#   station_lng: decimal
mrt_column_names = {
    'StationName.Zh_tw': 'MRT_station_name',
    'LocationCityCode': 'city_code',
    'LocationTown': 'district',
    'StationPosition.PositionLat': 'station_lat',
    'StationPosition.PositionLon': 'station_lng',
}

# Selecting with the mapping's keys fixes the column order, so a single
# rename yields the final layout.
mrt_station = mrt_station[list(mrt_column_names)].rename(columns=mrt_column_names)

In [5]:
# Keep only the first row for each station name so that every station name
# appears exactly once in the table.
mrt_station = mrt_station.drop_duplicates(subset='MRT_station_name')

In [6]:
# MRT stations to analyse, given by their Chinese names; they are matched
# exactly against the 'MRT_station_name' column of mrt_station.
target_stations = ['民權西路','中山','台北車站','中正紀念堂',
      '東門','大安','西門','忠孝新生','忠孝復興',
      '南京復興','松江南京','古亭','西湖','港墘','南港']

In [7]:
# Sanity check: number of stations requested.
len(target_stations)

15

In [8]:
# Restrict the MRT table to the requested stations (exact name match).
mrt_station = mrt_station[mrt_station['MRT_station_name'].isin(target_stations)]

In [9]:
# Re-derive the list from the filtered table so that its order follows the
# table's row order. Later cells use the position in this list to index both
# the rows of dist_matrix and the rows of mrt_station.
target_stations = mrt_station['MRT_station_name'].to_list()

In [10]:
# Load the bike-station table and bring it to the same naming scheme as the
# MRT table (district / city_code / station_lat / station_lng).
bike_station = (
    pd.read_csv('../bike/bike_usage_realtime.csv', index_col=0)
    # Fold the campus area label into its surrounding district.
    .assign(sarea=lambda frame: frame['sarea'].replace('臺大公館校區', '大安區'))
    .rename(columns={
        'sna': 'Bike_station_name',
        'sarea': 'district',
        'latitude': 'station_lat',
        'longitude': 'station_lng',
    })
    .assign(
        city_code='TPE',
        # The raw name is '_'-separated; keep its second segment.
        Bike_station_name=lambda frame: frame['Bike_station_name'].map(
            lambda raw_name: raw_name.split('_')[1]
        ),
    )
    .loc[:, ['Bike_station_name', 'district',
             'city_code', 'total', 'station_lat', 'station_lng']]
)

In [11]:
# (lat, lng) pairs, one row per station, in the row order of each table.
mrt_coords = mrt_station[['station_lat', 'station_lng']].to_numpy()
bike_coords = bike_station[['station_lat', 'station_lng']].to_numpy()

# dist_matrix[i, j] is the geodesic distance in metres between MRT station i
# and bike station j.
dist_matrix = np.zeros((len(mrt_coords), len(bike_coords)))

# Fill the matrix one MRT station (row) at a time.
for mrt_row, mrt_point in enumerate(tqdm(mrt_coords)):
    dist_matrix[mrt_row] = [
        geopy.distance.geodesic(mrt_point, bike_point).m
        for bike_point in bike_coords
    ]

100%|██████████| 15/15 [00:01<00:00, 13.66it/s]


In [12]:
# Load the bike readings, drop fully identical rows, then normalise the area
# label and station name the same way as for the bike-station table.
ubike_rt = (
    pd.read_csv('ubike_0501.csv', index_col=0)
    .drop_duplicates()
    .assign(
        sarea=lambda frame: frame['sarea'].replace('臺大公館校區', '大安區'),
        # The raw name is '_'-separated; keep its second segment.
        sna=lambda frame: frame['sna'].map(lambda raw_name: raw_name.split('_')[1]),
    )
)


In [13]:
# Per-row threshold: 10% of the 'tot' value, capped at 5.
ubike_rt['thres'] = np.minimum((ubike_rt['tot'] * 0.1).to_numpy(), 5)
def almost_empty(row):
    """Tell whether a reading shows fewer than two bikes.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide the bike count under the key 'sbi'.

    Returns
    -------
    bool
        True when the count is 0 or 1, False otherwise.

    Notes
    -----
    The cutoff is the fixed value 2, which matches the '_0_or_1' suffix of
    the output file name. The 'thres' column is not consulted, so it is no
    longer looked up here and rows without that column are accepted.
    """
    return bool(row['sbi'] < 2)

In [14]:
# Parse the timestamp column once and reuse it for every derived field
# (it was previously parsed three separate times).
update_time = pd.to_datetime(ubike_rt['updateTime'])

# Availability flags per reading.
ubike_rt['almost_empty'] = ubike_rt.apply(almost_empty, axis=1)
ubike_rt['is_empty'] = ubike_rt['sbi'] == 0

# Time keys: calendar date, hour, and the minute floored to a 10-minute slot
# (0, 10, ..., 50).
ubike_rt['date'] = update_time.dt.date
ubike_rt['hour'] = update_time.dt.hour
ubike_rt['min'] = update_time.dt.minute // 10 * 10

In [15]:
# Columns that are not needed once the flags and time keys have been derived.
unused_columns = [
    'ar', 'aren', 'infoDate', 'infoTime', 'mday', 'updateTime',
    'srcUpdateTime', 'sarea', 'sareaen', 'snaen', 'sno',
]
ubike_rt = ubike_rt.drop(columns=unused_columns)

In [16]:
# Report, for each target MRT station, the shape of the bike-station slice
# that lies within RADIUS of it (first number = how many bike stations).
for row_no, station_name in enumerate(target_stations):
    nearby_positions = np.flatnonzero(dist_matrix[row_no] < RADIUS)
    print(station_name, bike_station.iloc[nearby_positions].shape)

西門 (7, 6)
台北車站 (4, 6)
忠孝新生 (9, 6)
忠孝復興 (7, 6)
南港 (7, 6)
大安 (7, 6)
南京復興 (8, 6)
西湖 (7, 6)
港墘 (7, 6)
古亭 (9, 6)
中正紀念堂 (4, 6)
中山 (3, 6)
松江南京 (11, 6)
東門 (4, 6)
民權西路 (7, 6)


In [17]:
# For every target MRT station, compute per (date, hour, 10-minute slot) the
# share of nearby bike stations that are almost empty / empty, attach the MRT
# station's attributes to each slot, and stack all stations into one frame.
#
# The per-station frames are collected in a list and concatenated once at the
# end; concatenating inside the loop re-copies the accumulated frame on every
# iteration.
station_frames = []
for idx, station in enumerate(target_stations):
    # Row idx of dist_matrix holds the distances from this MRT station to
    # every bike station, in bike_station row order.
    youbike_index = np.where(dist_matrix[idx] < RADIUS)[0]
    nearby_names = bike_station.iloc[youbike_index]['Bike_station_name']

    availability = (
        ubike_rt[ubike_rt['sna'].isin(nearby_names)]
        # Keep one reading per bike station and 10-minute slot so every
        # station has the same weight in the mean below.
        .drop_duplicates(subset=['sna', 'date', 'hour', 'min'])
        .groupby(['date', 'hour', 'min'], as_index=False)[['almost_empty', 'is_empty']]
        .mean()
    )

    # Pair the single MRT row with every slot via a constant helper key.
    # The outer join keeps the MRT row (with missing slot values) even when
    # no reading matched; the helper column 'key' stays in the output.
    single_mrt = mrt_station.iloc[idx].to_frame().T
    single_mrt['key'] = 0
    availability['key'] = 0
    station_frames.append(single_mrt.merge(availability, on='key', how='outer'))

# As before, result is None when there is no target station at all.
result = pd.concat(station_frames, ignore_index=True) if station_frames else None
    

In [19]:
import datetime

In [20]:
def create_dt_from_row(row):
    """Combine a row's 'date', 'hour' and 'min' fields into one datetime.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        'date' is a date-like object with year/month/day attributes;
        'hour' and 'min' are whole numbers, possibly stored as floats.

    Returns
    -------
    datetime.datetime or pandas.NaT
        The combined timestamp, or NaT when 'date' is missing.

    Notes
    -----
    A row with missing slot values appears when an MRT station had no
    matching bike readings (the outer merge keeps the station). Such a row
    also turns the 'hour' and 'min' columns into floats, which
    datetime.datetime() rejects, hence the explicit int() casts.
    """
    day = row['date']
    if pd.isna(day):
        return pd.NaT
    return datetime.datetime(
        day.year, day.month, day.day, int(row['hour']), int(row['min'])
    )

In [21]:
# Build one timestamp per row from its date / hour / min fields.
result['timestamp'] = result.apply(create_dt_from_row, axis=1)

In [23]:
# Write the combined table without the row index; the file name records the
# radius that was used.
result.to_csv(f'pipeline_8_R{RADIUS}_0_or_1.csv',index=False)