In [1]:
import pandas as pd
import os
import geopy.distance
import numpy as np
from tqdm import tqdm
from pathlib import Path

In [2]:
tpe_station = pd.read_csv('bike_usage_realtime.csv',index_col=0)
nwt_station = pd.read_csv('nwt_ubike_realtime.csv')
tpe_station = tpe_station[['sna','sarea','latitude','longitude']]
tpe_station['sna'] = tpe_station['sna'].apply(lambda x: x.split('_')[1])
tpe_station['sarea'] = tpe_station['sarea'].replace('臺大公館校區','大安區')

tpe_station['city'] = 'TPE'
tpe_station.rename(
    {
        'latitude':'lat',
        'longitude':'lng'
    },axis=1,inplace=True
)
nwt_station = nwt_station[['sna','sarea','lat','lng']]
nwt_station['city']='NWT'

In [3]:
all_station = pd.concat([tpe_station,nwt_station]).reset_index(drop=True)
del tpe_station,nwt_station

In [4]:
#time complexity O(n**2) but it's n(n-1)/2
coords = all_station[['lat', 'lng']].to_numpy()
    
# Initialize a matrix to store distances
dist_matrix = np.zeros((len(coords), len(coords)))

# Compute geodesic distance between each pair of points
for i in tqdm(range(len(coords))):
    for j in range(i + 1, len(coords)):
        dist = geopy.distance.geodesic(coords[i], coords[j]).m
        dist_matrix[i, j] = dist
        dist_matrix[j, i] = dist  # since distance is symmetric

100%|██████████| 1767/1767 [01:12<00:00, 24.30it/s] 


In [5]:
all_station = all_station.reset_index()
all_station.head()

Unnamed: 0,index,sna,sarea,lat,lng,city
0,0,捷運科技大樓站,大安區,25.02605,121.5436,TPE
1,1,復興南路二段273號前,大安區,25.02565,121.54357,TPE
2,2,國北教大實小東側門,大安區,25.02429,121.54124,TPE
3,3,和平公園東側,大安區,25.02351,121.54282,TPE
4,4,辛亥復興路口西北側,大安區,25.02153,121.54299,TPE


In [6]:
tpe_districts = ['中山區', '松山區', '大同區', '大安區', '中正區', '信義區', '萬華區', '文山區', '南港區',
       '內湖區', '士林區', '北投區']

In [7]:
file_list = os.listdir('./history')
file_list = [file for file in file_list if 'bike_usage_history' in file]
extracted_all = None

In [8]:
for path in file_list:
    ubike_hist = pd.read_csv(Path('./history', path), index_col=0)
    ubike_hist.drop_duplicates(inplace=True)
    ubike_hist.columns = ['lend_time', 'lend_station_name', 'return_time',
                    'return_station_name', 'usage_time', 'source_date']
    ubike_hist = ubike_hist.drop(['lend_time','return_time','usage_time'],axis=1)
    ubike_hist = ubike_hist.merge(all_station,how='inner',
                 left_on='lend_station_name',
                 right_on='sna')
    ubike_hist = ubike_hist.merge(all_station,how='inner',
                 left_on='return_station_name',
                 right_on='sna',suffixes=['_lend','_return'])
    ubike_hist = ubike_hist.drop(['lend_station_name', 'return_station_name',
                 'lat_lend', 'lng_lend',
                 'lat_return', 'lng_return',
                 'sna_lend','sna_return'],axis=1)
    ubike_hist['move_distance'] = ubike_hist.apply(lambda row: dist_matrix[row['index_lend'],row['index_return']],axis=1)
    ubike_hist['sarea_lend'] = ubike_hist['sarea_lend'].replace('臺大公館校區','大安區')
    ubike_hist['sarea_return'] = ubike_hist['sarea_return'].replace('臺大公館校區','大安區')
    ubike_hist = ubike_hist[ubike_hist['sarea_lend'].isin(tpe_districts)]
    extracted = ubike_hist.groupby('index_lend').agg({'move_distance':'mean','index_return':'size'}).reset_index()
    extracted = extracted.rename({'index_return':'traffic_count'},axis=1)
    if extracted_all is None:
        extracted_all = extracted.copy(deep=True)
    else:
        extracted_all = pd.concat([extracted_all,extracted], ignore_index=True)
        print(f"finish processing: {path}")
        print(extracted_all.shape)
        # break

finish processing: 2023_3_bike_usage_history.csv
(2300, 3)
finish processing: 2023_9_bike_usage_history.csv
(3587, 3)
finish processing: 2021_11_bike_usage_history.csv
(4376, 3)
finish processing: 2022_2_bike_usage_history.csv
(5294, 3)
finish processing: 2023_4_bike_usage_history.csv
(6546, 3)
finish processing: 2022_8_bike_usage_history.csv
(7664, 3)
finish processing: 2021_12_bike_usage_history.csv
(8493, 3)
finish processing: 2023_7_bike_usage_history.csv
(9759, 3)
finish processing: 2023_10_bike_usage_history.csv
(11048, 3)
finish processing: 2022_1_bike_usage_history.csv
(11948, 3)
finish processing: 2022_6_bike_usage_history.csv
(13017, 3)
finish processing: 2023_1_bike_usage_history.csv
(14228, 3)
finish processing: 2022_7_bike_usage_history.csv
(15320, 3)
finish processing: 2023_11_bike_usage_history.csv
(16612, 3)
finish processing: 2023_6_bike_usage_history.csv
(17871, 3)
finish processing: 2022_9_bike_usage_history.csv
(19020, 3)
finish processing: 2022_3_bike_usage_history

In [12]:
extracted_all

Unnamed: 0,index_lend,move_distance,traffic_count
0,0,971.821864,9675
1,1,1037.227229,2641
2,2,1157.642653,1293
3,3,1128.482899,2033
4,4,944.137985,2565
...,...,...,...
32740,1407,706.339382,244
32741,1408,572.020329,523
32742,1409,593.568366,1883
32743,1410,629.798351,529


In [26]:
result = extracted_all.groupby('index_lend').apply(lambda df_sub: (df_sub['move_distance']*df_sub['traffic_count']).sum()/df_sub['traffic_count'].sum()).rename('avg_distance').reset_index()

In [27]:
result.head()

Unnamed: 0,index_lend,avg_distance
0,0,925.516234
1,1,994.615146
2,2,997.808214
3,3,992.893592
4,4,868.998602


In [28]:
all_station.head()

Unnamed: 0,index,sna,sarea,lat,lng,city
0,0,捷運科技大樓站,大安區,25.02605,121.5436,TPE
1,1,復興南路二段273號前,大安區,25.02565,121.54357,TPE
2,2,國北教大實小東側門,大安區,25.02429,121.54124,TPE
3,3,和平公園東側,大安區,25.02351,121.54282,TPE
4,4,辛亥復興路口西北側,大安區,25.02153,121.54299,TPE


In [30]:
result = result.merge(all_station,how='inner',left_on='index_lend',right_on='index')

In [32]:
result =result[['sna','sarea','lat','lng','avg_distance']]

In [33]:
result.to_csv('pipeline5_single_variable_radius.csv',index=False)