In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
# Load the November 2023 trip-history export; the first CSV column becomes the row index.
df = pd.read_csv(
    '2023_11_bike_usage_history.csv',
    index_col=0,
)

In [3]:
# Give the raw columns stable English names (positional rename, so the CSV column order matters).
df.columns = [
    'lend_time',
    'lend_station_name',
    'return_time',
    'return_station_name',
    'usage_time',
    'source_date',
]
df['source_date'] = pd.to_datetime(df['source_date'])

# Parse each timestamp column once, then derive the calendar date and hour-of-day from it.
lend_ts = pd.to_datetime(df['lend_time'])
df['lend_date'] = lend_ts.dt.date
df['lend_hour'] = lend_ts.dt.hour
return_ts = pd.to_datetime(df['return_time'])
df['return_date'] = return_ts.dt.date
df['return_hour'] = return_ts.dt.hour

# The raw timestamps are no longer needed once date/hour have been extracted.
df = df.drop(columns=['lend_time', 'return_time'])

# Trip duration as a whole number of seconds.
df['usage_time'] = pd.to_timedelta(df['usage_time']).dt.total_seconds().astype('int')

In [4]:
# Station reference table, trimmed to the identifier, name and district columns.
tpe_stations = pd.read_csv('TPE_bike_station.csv')[
    ['station_no', 'name_tw', 'district_tw']
]

In [6]:
# average daily use (weekday)
# average daily use (weekend)


In [7]:
# Day-of-week index of source_date (pandas convention: Monday=0 ... Sunday=6);
# later cells treat 5 and 6 as the weekend.
df['day_of_week'] = pd.to_datetime(df['source_date']).dt.dayofweek

In [8]:
# Weekday trips only (day_of_week not in {5, 6}); just the two station columns are needed here.
weekday = df.loc[
    ~df['day_of_week'].isin([5, 6]),
    ['lend_station_name', 'return_station_name'],
]
# Count every appearance of a station as either a lend or a return point,
# then keep only stations present in the station reference table.
weekday = (
    pd.concat([weekday['lend_station_name'], weekday['return_station_name']])
    .rename('station')
    .value_counts()
    .to_frame()
    .reset_index()
    .loc[lambda counts: counts['station'].isin(tpe_stations['name_tw'])]
)

In [9]:
# Weekend trips only (day_of_week not in 0..4); just the two station columns are needed here.
weekend = df.loc[
    ~df['day_of_week'].isin(range(0, 5)),
    ['lend_station_name', 'return_station_name'],
]
# Count every appearance of a station as either a lend or a return point,
# then keep only stations present in the station reference table.
weekend = (
    pd.concat([weekend['lend_station_name'], weekend['return_station_name']])
    .rename('station')
    .value_counts()
    .to_frame()
    .reset_index()
    .loc[lambda counts: counts['station'].isin(tpe_stations['name_tw'])]
)

In [10]:
# Turn the station totals into per-day averages.
# Each total must be divided by the number of distinct dates of its own kind:
# dividing by every date in the data set (weekdays + weekends) understates both
# averages, and the weekend one by far the most.
is_weekend_day = df['day_of_week'].isin([5, 6])
n_weekday_dates = df.loc[~is_weekend_day, 'source_date'].nunique()
n_weekend_dates = df.loc[is_weekend_day, 'source_date'].nunique()

# .assign returns new frames, so nothing is written into the filtered slices
# produced by the previous cells.
weekday = weekday.assign(count=weekday['count'] / n_weekday_dates)
weekend = weekend.assign(count=weekend['count'] / n_weekend_dates)


In [11]:
# One row per station with its weekday and weekend figure side by side;
# a station missing from one side gets 0 for that side.
avg_use = weekday.merge(
    weekend,
    how='outer',
    on='station',
    suffixes=('_weekday', '_weekend'),
).fillna(0)

# Free the intermediate frames (the names are reused further down).
del weekday, weekend

In [12]:
# Peak usage hours per station, top 3 (weekday)
# Peak usage hours per station, top 3 (weekend)

In [13]:
# Weekday trips: every row whose day_of_week is not 5 or 6.
weekday = df.loc[~df['day_of_week'].isin([5, 6])]

In [14]:
# Stack lend events and return events into one long (station, hour) table,
# so each trip contributes one row per end point.
col_name = ['station', 'hour']
weekday = pd.concat(
    [
        weekday[list(source_cols)].set_axis(col_name, axis=1)
        for source_cols in (
            ('lend_station_name', 'lend_hour'),
            ('return_station_name', 'return_hour'),
        )
    ]
)

In [15]:
# Tally rows per (station, hour) pair; the tally lands in a 'counts' column.
# NOTE(review): groupby(...).size() is the more direct spelling of this tally — confirm before swapping.
weekday_traffic_counts = (
    weekday
    .groupby(['station', 'hour'])
    .value_counts()
    .reset_index(name='counts')
)


In [16]:
# Within every station keep only the three rows with the highest counts.
weekday_traffic_counts = weekday_traffic_counts.groupby('station', group_keys=False).apply(
    lambda station_rows: station_rows.sort_values(by='counts', ascending=False).head(3)
)

In [17]:
# Rank the retained hours inside each station (1 = busiest); method='first'
# breaks ties by row order so the ranks are always distinct.
weekday_traffic_counts['rank'] = (
    weekday_traffic_counts
    .groupby('station')['counts']
    .rank(method='first', ascending=False)
)

# Reshape to one row per station, with an 'hour' and a 'counts' column for each rank.
weekday_pivot_df = weekday_traffic_counts.pivot(
    index='station',
    columns='rank',
    values=['hour', 'counts'],
)

In [18]:
# Collapse the (value, rank) column pairs into flat names such as 'hour_top1' / 'counts_top3'.
weekday_pivot_df.columns = [
    f'{col[0]}_top{int(col[1])}'
    for col in weekday_pivot_df.columns
]
# Bring 'station' back as an ordinary column.
weekday_pivot_df = weekday_pivot_df.reset_index()

In [19]:
# Weekend counterpart of the weekday peak-hour cells above.
col_name = ['station', 'hour']

# Weekend trips (day_of_week not in 0..4), stacked into a long (station, hour)
# table with one row per lend event and one per return event.
weekend = df.loc[~df['day_of_week'].isin(range(0, 5))]
weekend = pd.concat(
    [
        weekend[list(source_cols)].set_axis(col_name, axis=1)
        for source_cols in (
            ('lend_station_name', 'lend_hour'),
            ('return_station_name', 'return_hour'),
        )
    ]
)

# Tally per (station, hour), then keep the three busiest hours of every station.
weekend_traffic_counts = (
    weekend
    .groupby(['station', 'hour'])
    .value_counts()
    .reset_index(name='counts')
    .groupby('station', group_keys=False)
    .apply(lambda station_rows: station_rows.sort_values(by='counts', ascending=False).head(3))
)

# Rank the retained hours inside each station (1 = busiest, ties broken by row order).
weekend_traffic_counts['rank'] = (
    weekend_traffic_counts
    .groupby('station')['counts']
    .rank(method='first', ascending=False)
)

# Wide format: one row per station, flat column names such as 'hour_top1' / 'counts_top3'.
weekend_pivot_df = weekend_traffic_counts.pivot(
    index='station',
    columns='rank',
    values=['hour', 'counts'],
)
weekend_pivot_df.columns = [
    f'{col[0]}_top{int(col[1])}'
    for col in weekend_pivot_df.columns
]
weekend_pivot_df = weekend_pivot_df.reset_index()

In [20]:
# Put the weekday and weekend peak-hour columns side by side, one row per station.
pivot_data = weekday_pivot_df.merge(
    weekend_pivot_df,
    how='outer',
    on='station',
    suffixes=('_weekday', '_weekend'),
)

# Drop the intermediates now that the combined table exists.
del weekday, weekend, weekday_pivot_df, weekend_pivot_df

In [21]:
# Keep only stations whose name appears in the station reference table.
pivot_data = pivot_data.loc[pivot_data['station'].isin(tpe_stations['name_tw'])]

In [22]:
# Sanity check: both per-station tables should have the same number of rows.
(avg_use.shape, pivot_data.shape)

((1294, 3), (1294, 13))

In [102]:
# Usage variability (daily): coefficient of variation
# Usage variability (hourly): coefficient of variation
# Usage variability (weekend/weekday): coefficient of variation
#
# Long table of every station access: one row per lend event and one per
# return event, each tagged with its hour and source_date.
col_name = ['station', 'hour', 'source_date']
all_access = pd.concat(
    [
        df[list(source_cols)].set_axis(col_name, axis=1)
        for source_cols in (
            ('lend_station_name', 'lend_hour', 'source_date'),
            ('return_station_name', 'return_hour', 'source_date'),
        )
    ]
)

In [171]:
# Hourly variability: station x hour matrix of access counts (missing pairs -> 0).
groupby_hour = (
    all_access
    .groupby(['station', 'hour'])
    .size()
    .reset_index(name='traffic_count')
    .pivot(index='station', columns='hour', values='traffic_count')
    .fillna(0)
    .astype('int')
)

# Coefficient of variation per station = population std / mean across the hour columns.
# The right-hand side is evaluated before 'cv' exists, so 'cv' is not part of its own input.
groupby_hour['cv'] = (
    groupby_hour.to_numpy().std(axis=1)
    / groupby_hour.to_numpy().mean(axis=1)
)

hourly_cv = groupby_hour.reset_index()[['station', 'cv']]
hourly_cv.columns.name = None  # drop the leftover 'hour' label from the pivot
del groupby_hour
hourly_cv.head()


Unnamed: 0,station,cv
0,3樓客服中心,2.828427
1,?公公園,0.674676
2,?寮公園,4.795832
3,一壽橋,0.761308
4,一江公園,0.768149


In [172]:
# Daily variability: station x date matrix of access counts (missing pairs -> 0).
groupby_date = (
    all_access
    .groupby(['station', 'source_date'])
    .size()
    .reset_index(name='traffic_count')
    .pivot(index='station', columns='source_date', values='traffic_count')
    .fillna(0)
    .astype('int')
)

# Coefficient of variation per station = population std / mean across the date columns.
# The right-hand side is evaluated before 'cv' exists, so 'cv' is not part of its own input.
groupby_date['cv'] = (
    groupby_date.to_numpy().std(axis=1)
    / groupby_date.to_numpy().mean(axis=1)
)

daily_cv = groupby_date.reset_index()[['station', 'cv']]
daily_cv.columns.name = None  # drop the leftover 'source_date' label from the pivot
del groupby_date
daily_cv.head()


Unnamed: 0,station,cv
0,3樓客服中心,3.201562
1,?公公園,0.203533
2,?寮公園,5.385165
3,一壽橋,0.312968
4,一江公園,0.257237


In [173]:
# Tag every access with its day-of-week index and a coarse weekday/weekend label
# (indices 0..4 are labelled 'weekday', anything else 'weekend').
all_access['day_of_week'] = pd.to_datetime(all_access['source_date']).dt.dayofweek
all_access['weekday_weekend'] = np.where(
    all_access['day_of_week'].isin(range(0, 5)),
    'weekday',
    'weekend',
)

In [175]:
# Weekday-vs-weekend variability: coefficient of variation (population std / mean)
# between each station's weekday access total and its weekend access total.
# NOTE(review): these are raw totals, not per-day averages, so the unequal number
# of weekday and weekend dates feeds into the ratio — confirm this is intended.
groupby_weekday_weekend = all_access.groupby(['station','weekday_weekend']).size().reset_index(name='traffic_count')
groupby_weekday_weekend = groupby_weekday_weekend.pivot(index='station',columns='weekday_weekend',values='traffic_count').fillna(0).astype('int')
groupby_weekday_weekend['cv'] = np.std(groupby_weekday_weekend.to_numpy(),axis=1)/np.mean(groupby_weekday_weekend.to_numpy(),axis=1)

# Stored under its own name: writing this into `daily_cv` would silently
# overwrite the per-date CV computed in the previous cell.
weekday_weekend_cv = groupby_weekday_weekend.reset_index()[['station','cv']]
weekday_weekend_cv.columns.name=None  # drop the leftover 'weekday_weekend' label from the pivot
del groupby_weekday_weekend
weekday_weekend_cv.head()


Unnamed: 0,station,cv
0,3樓客服中心,0.5
1,?公公園,0.433641
2,?寮公園,1.0
3,一壽橋,0.52873
4,一江公園,0.648204


In [98]:
# Duration of use: mean usage_time (seconds) per lending station,
# computed separately for weekday and weekend trips.
weekday_duration = (
    df.loc[~df['day_of_week'].isin([5, 6])]
    .groupby('lend_station_name')['usage_time']
    .mean()
    .reset_index()
)
weekend_duration = (
    df.loc[~df['day_of_week'].isin(range(0, 5))]
    .groupby('lend_station_name')['usage_time']
    .mean()
    .reset_index()
)

# Side-by-side table keyed on 'station'; a station missing from one side gets NaN there.
duration_data = weekday_duration.merge(
    weekend_duration,
    how='outer',
    on='lend_station_name',
    suffixes=('_weekday', '_weekend'),
).rename(columns={'lend_station_name': 'station'})
duration_data.head()

Unnamed: 0,station,usage_time_weekday,usage_time_weekend
0,3樓客服中心,478.333333,589.0
1,?公公園,1162.664787,1189.630844
2,一壽橋,1729.235751,2475.45082
3,一江公園,1001.648606,1125.544118
4,三張犁,1000.225486,1102.718121


In [None]:
# percentage of top3 linking stations

In [25]:
# Usage Variability (Seasonality) 目前只有一個月沒辦法有這個資料


In [None]:
# Turnover Rate (TODO: not yet sure how to compute this)

### next nearest station distance

In [81]:
# next nearest station distance
# geopy is imported here, at first use, rather than in the notebook's first cell.
import geopy.distance
# Reload the full station table: the earlier load kept only
# station_no / name_tw / district_tw, while the distance cells below read lat / lng.
tpe_stations = pd.read_csv('TPE_bike_station.csv')
# Display the reloaded table as the cell output.
tpe_stations

Unnamed: 0,station_no,name_tw,district_tw,address_tw,parking_spaces,lat,lng,city_code
0,500101001,捷運科技大樓站,大安區,復興南路二段235號前,28,25.02605,121.54360,TPE
1,500101002,復興南路二段273號前,大安區,復興南路二段273號西側,21,25.02565,121.54357,TPE
2,500101003,國北教大實小東側門,大安區,和平東路二段96巷7號,16,25.02429,121.54124,TPE
3,500101004,和平公園東側,大安區,和平東路二段118巷33號,11,25.02351,121.54282,TPE
4,500101005,辛亥復興路口西北側,大安區,復興南路二段368號,16,25.02153,121.54299,TPE
...,...,...,...,...,...,...,...,...
1408,500119087,臺大總圖書館西南側,臺大公館校區,臺大圖書館西南側,30,25.01690,121.54031,TPE
1409,500119088,臺大黑森林西側,臺大公館校區,臺大霖澤館南側,20,25.01995,121.54347,TPE
1410,500119089,臺大獸醫館南側,臺大公館校區,臺大獸醫系館南側,24,25.01791,121.54242,TPE
1411,500119090,臺大新體育館東南側,臺大公館校區,臺大體育館東側,40,25.02112,121.53591,TPE


In [82]:
def find_closest_distance(row, base_point):
    """Return the geodesic distance in metres between ``base_point`` and ``row``.

    Despite the name, this measures a single pair of points; picking the
    closest one is left to the caller.

    Args:
        row: Record exposing ``'lat'`` and ``'lng'`` entries (e.g. one row of
            the station table).
        base_point: The point to measure from, in the same (lat, lng) order.

    Returns:
        The ``.m`` value of ``geopy.distance.geodesic`` for the two points.
    """
    target_point = row[['lat', 'lng']].to_numpy()
    separation = geopy.distance.geodesic(base_point, target_point)
    return separation.m
    

In [83]:
#time complexity O(n**2)
# distance = []
# for idx,row in tqdm(tpe_stations.iterrows()):
#     point_A = row[['lat','lng']].to_numpy()
#     distance_list = tpe_stations.apply(find_closest_distance,base_point=point_A,axis=1)
#     closest_distance = distance_list.sort_values().values[1]
#     distance.append(closest_distance)

In [84]:
# Pairwise geodesic distances between stations. Still O(n**2), but only the
# n(n-1)/2 unordered pairs are evaluated; each result is mirrored across the diagonal.
coords = tpe_stations[['lat', 'lng']].to_numpy()

# Square matrix, zero-initialised, so the diagonal (self-distance) stays 0.
dist_matrix = np.zeros((len(coords), len(coords)))

for i in tqdm(range(len(coords))):
    # Only stations after i: earlier pairs were filled in by symmetry.
    for j, other in enumerate(coords[i + 1:], start=i + 1):
        dist = geopy.distance.geodesic(coords[i], other).m
        dist_matrix[i, j] = dist
        dist_matrix[j, i] = dist

# Find the minimum distance for each point, ignoring the zero (self-distance)
# closest_distances = np.min(np.where(dist_matrix > 0, dist_matrix, np.inf), axis=1)
# return closest_distances

100%|██████████| 1413/1413 [00:48<00:00, 29.36it/s] 


In [85]:
# Nearest-neighbour distance for every station (row-wise minimum of dist_matrix).
# Only the diagonal — a station's distance to itself — is masked out. Masking
# every zero entry (`dist_matrix > 0`) would also hide a genuinely co-located
# station, which the radius count in a later cell does treat as a neighbour.
closest_distances = np.min(
    np.where(np.eye(len(dist_matrix), dtype=bool), np.inf, dist_matrix),
    axis=1,
)

In [86]:
# Attach each station's nearest-neighbour distance. The array lines up with the
# table positionally: dist_matrix was built from tpe_stations' own lat/lng rows.
tpe_stations['closest_distances'] = closest_distances

In [87]:
# Number of other stations whose distance is below 500 (same unit as dist_matrix);
# the "- 1" removes the station itself, whose self-distance of 0 always qualifies.
tpe_stations['R500_station_count'] = (dist_matrix < 500).sum(axis=0) - 1

### 用即時資料萃取

In [None]:
# 缺車/正常/缺位比例 （缺車風險）還沒看風險怎麼算 先以即時資料的比例來看


In [99]:
# delta value percentile
# NOTE(review): nothing is computed here yet; this cell only displays the trip table.
df

Unnamed: 0,lend_station_name,return_station_name,usage_time,source_date,lend_date,lend_hour,return_date,return_hour,day_of_week
0,捷運南京復興站(1號出口),松江路77巷口,516,2023-11-22,2023-11-22,18,2023-11-22,18,2
1,河堤國小,捷運古亭站(2號出口),16665,2023-11-22,2023-11-22,12,2023-11-22,17,2
2,萬華國中_1,和平金山路口,1161,2023-11-22,2023-11-22,11,2023-11-22,12,2
3,萬華國中_1,萬華國中_1,1491,2023-11-22,2023-11-22,11,2023-11-22,12,2
4,木新公園,樟樹公園,26265,2023-11-22,2023-11-22,14,2023-11-22,21,2
...,...,...,...,...,...,...,...,...,...
2941060,長春吉林路口,中原民生路口,864,2023-11-01,2023-11-01,23,2023-11-01,23,2
2941061,捷運松江南京站(7號出口),捷運行天宮站(3號出口),419,2023-11-01,2023-11-01,18,2023-11-01,18,2
2941062,新生德惠街口,捷運中山國小站(4號出口),205,2023-11-01,2023-11-01,20,2023-11-01,20,2
2941063,捷運中山站(2號出口),中山農安街口,977,2023-11-01,2023-11-01,18,2023-11-01,18,2
