In [3]:
# lend_city	str	
# return_city	str	
# lend_district	str	
# return_district	str	
# move_distance	 int	單位公尺
# traffic_count int
# source_date	date	租借日期

In [4]:
import pandas as pd
import os
import geopy.distance
import numpy as np
from tqdm import tqdm
from pathlib import Path

In [5]:
tpe_station = pd.read_csv('bike_usage_realtime.csv',index_col=0)
nwt_station = pd.read_csv('nwt_ubike_realtime.csv')
tpe_station = tpe_station[['sna','sarea','latitude','longitude']]
tpe_station['sna'] = tpe_station['sna'].apply(lambda x: x.split('_')[1])
tpe_station['city'] = 'TPE'
tpe_station.rename(
    {
        'latitude':'lat',
        'longitude':'lng'
    },axis=1,inplace=True
)
nwt_station = nwt_station[['sna','sarea','lat','lng']]
nwt_station['city']='NWT'

In [6]:
all_station = pd.concat([tpe_station,nwt_station]).reset_index(drop=True)
del tpe_station,nwt_station

In [7]:
#time complexity O(n**2) but it's n(n-1)/2
coords = all_station[['lat', 'lng']].to_numpy()
    
# Initialize a matrix to store distances
dist_matrix = np.zeros((len(coords), len(coords)))

# Compute geodesic distance between each pair of points
for i in tqdm(range(len(coords))):
    for j in range(i + 1, len(coords)):
        dist = geopy.distance.geodesic(coords[i], coords[j]).m
        dist_matrix[i, j] = dist
        dist_matrix[j, i] = dist  # since distance is symmetric

100%|██████████| 1767/1767 [01:14<00:00, 23.68it/s] 


In [8]:
all_station = all_station.reset_index()
all_station.head()

Unnamed: 0,index,sna,sarea,lat,lng,city
0,0,捷運科技大樓站,大安區,25.02605,121.5436,TPE
1,1,復興南路二段273號前,大安區,25.02565,121.54357,TPE
2,2,國北教大實小東側門,大安區,25.02429,121.54124,TPE
3,3,和平公園東側,大安區,25.02351,121.54282,TPE
4,4,辛亥復興路口西北側,大安區,25.02153,121.54299,TPE


In [9]:
tpe_districts = ['中山區', '松山區', '大同區', '大安區', '中正區', '信義區', '萬華區', '文山區', '南港區',
       '內湖區', '士林區', '北投區']

In [10]:
file_list = os.listdir('./history')
file_list = [file for file in file_list if 'bike_usage_history' in file]
extracted_all = None

In [14]:
avg_dist_25_35_min = []
avg_dist_4_hour_plus = []
avg_dist_4_hour_plus_exclude_same_district_return = []
all_distance = []
for path in file_list:
    print(f'analyzing {path} .....')
    ubike_hist = pd.read_csv(Path('./history', path), index_col=0)
    ubike_hist.drop_duplicates(inplace=True)
    ubike_hist.columns = ['lend_time', 'lend_station_name', 'return_time',
                    'return_station_name', 'usage_time', 'source_date']
    ubike_hist = ubike_hist.drop(['lend_time','return_time'],axis=1)
    ubike_hist = ubike_hist.merge(all_station,how='inner',
                 left_on='lend_station_name',
                 right_on='sna')
    ubike_hist = ubike_hist.merge(all_station,how='inner',
                 left_on='return_station_name',
                 right_on='sna',suffixes=['_lend','_return'])
    ubike_hist = ubike_hist.drop(['lend_station_name', 'return_station_name',
                 'lat_lend', 'lng_lend',
                 'lat_return', 'lng_return',
                 'sna_lend','sna_return'],axis=1)
    ubike_hist['move_distance'] = ubike_hist.apply(lambda row: dist_matrix[row['index_lend'],row['index_return']],axis=1)
    ubike_hist['sarea_lend'] = ubike_hist['sarea_lend'].replace('臺大公館校區','大安區')
    ubike_hist['sarea_return'] = ubike_hist['sarea_return'].replace('臺大公館校區','大安區')
    ubike_hist['usage_time'] = pd.to_timedelta(ubike_hist['usage_time']).dt.total_seconds()
    all_distance.append(ubike_hist['move_distance'].mean())
    print('25~35 minutes average distance:')
    avg_dist_25_35_min.append(ubike_hist[ubike_hist['usage_time'].between(60*25,60*35)]['move_distance'].mean())
    print(avg_dist_25_35_min[-1])
    print(ubike_hist[ubike_hist['usage_time'].between(60*25,60*35)].shape)
    print('4+ hours average distance:')
    avg_dist_4_hour_plus.append(ubike_hist[ubike_hist['usage_time']>60*60*4]['move_distance'].mean())
    print(avg_dist_4_hour_plus[-1])
    print(ubike_hist[ubike_hist['usage_time']>60*60*4].shape)
    print('4+ hours average distance (exclude same district_return):')
    avg_dist_4_hour_plus_exclude_same_district_return.append(ubike_hist[(ubike_hist['usage_time']>60*60*4) & (ubike_hist['sarea_lend']!=ubike_hist['sarea_return'])]['move_distance'].mean())
    print(avg_dist_4_hour_plus_exclude_same_district_return[-1])
    print('========================')
    
    break
    # ubike_hist = ubike_hist[ubike_hist['sarea_lend'].isin(tpe_districts)]
    # groupby = ubike_hist.groupby(by=['sarea_lend','sarea_return','source_date'])
    # extracted = groupby.agg(
    #     {'move_distance':'mean',
    #     'city_lend':'first',
    #     'city_return':'first',
    #     }
    # ).reset_index()
    # extracted['traffic_count'] = groupby.size().values
    # extracted.rename(
    #     {
    #         'sarea_lend':'lend_district', 
    #         'sarea_return':'return_district', 
    #     'city_lend':'lend_city', 
    #     'city_return':'return_city', 
    #     },axis=1,inplace=True
    # )
    # if extracted_all is None:
    #     extracted_all = extracted.copy(deep=True)
    # else:
    #     extracted_all = pd.concat([extracted_all,extracted], ignore_index=True)
    #     print(f"finish processing: {path}")
    #     # print(extracted_all.shape)
    #     print(extracted_all['move_distance'].min())
    #     # break

analyzing 2022_5_bike_usage_history.csv .....
25~35 minutes average distance:
1872.6391863803224
(117855, 9)
4+ hours average distance:
2047.0348674015318
(3180, 9)
4+ hours average distance (exclude same district_return):
5791.983959312853


In [29]:
import numpy as np

In [31]:
np.median(avg_dist_25_35_min)

1877.9027350907236

In [33]:
np.median(avg_dist_4_hour_plus_exclude_same_district_return)

5385.6723535982765

In [40]:
# extracted_nwt.to_csv('pipeline_3_1_v1.csv',index=False)
# extracted_tpe.to_csv('pipeline_3_2_v1.csv',index=False)


In [34]:
np.median(avg_dist_4_hour_plus)

1795.6785794327434