In [1]:
# lend_station_district	str	
# date	date	
# hour	int	
# day_of_week	str	Monday, Tuesday, Wednesday
# weekend	str	平日、假日
# traffic_count	int	
# usage_time	int	

In [2]:
import pandas as pd
import os
import numpy as np
from tqdm import tqdm
from pathlib import Path

In [3]:
tpe_station = pd.read_csv('bike_usage_realtime.csv',index_col=0)
tpe_station = tpe_station[['sna','sarea','latitude','longitude']]
tpe_station['sna'] = tpe_station['sna'].apply(lambda x: x.split('_')[1])
tpe_station['sarea'] = tpe_station['sarea'].replace('臺大公館校區','大安區')


In [4]:
file_list = os.listdir('./history')
file_list = [file for file in file_list if 'bike_usage_history' in file]
extracted_all = None

In [5]:
for path in file_list:

    ubike_hist = pd.read_csv(Path('./history', path), index_col=0)
    ubike_hist.drop_duplicates(inplace=True)
    ubike_hist.columns = ['lend_time', 'lend_station_name', 'return_time',
                    'return_station_name', 'usage_time', 'source_date']
    ubike_hist['lend_hour'] = pd.to_datetime(ubike_hist['lend_time']).dt.hour
    ubike_hist = ubike_hist[['lend_station_name','lend_hour','usage_time','source_date']]
    ubike_hist.loc[:,'usage_time'] = pd.to_timedelta(ubike_hist['usage_time']).dt.total_seconds()
    groupby = ubike_hist.groupby(['lend_station_name','source_date','lend_hour'])
    extracted = groupby['usage_time'].mean().reset_index()
    extracted['traffic_count'] = groupby.size().values
    extracted = extracted.merge(tpe_station,how='inner',left_on='lend_station_name',right_on='sna')
    extracted = extracted.groupby(['sarea','source_date','lend_hour'],
                    as_index=False).agg(
        {'usage_time':'mean',
        'traffic_count':'sum'
        }
        )
    extracted['day_of_week'] = pd.to_datetime(extracted['source_date']).dt.day_name()
    extracted['weekend'] = pd.to_datetime(extracted['source_date']).dt.dayofweek.isin([5,6])
    extracted['weekend'] = np.where(extracted['weekend'],'假日','平日')
    if extracted_all is None:
        extracted_all = extracted.copy(deep=True)
    else:
        extracted_all = pd.concat([extracted_all,extracted], ignore_index=True)
        print(f"finish processing: {path}")
        print(extracted_all.shape)
        # break

finish processing: 2023_3_bike_usage_history.csv
(17826, 7)
finish processing: 2023_9_bike_usage_history.csv
(26465, 7)
finish processing: 2021_11_bike_usage_history.csv
(35085, 7)
finish processing: 2022_2_bike_usage_history.csv
(43095, 7)
finish processing: 2023_4_bike_usage_history.csv
(51732, 7)
finish processing: 2022_8_bike_usage_history.csv
(60659, 7)
finish processing: 2021_12_bike_usage_history.csv
(69291, 7)
finish processing: 2023_7_bike_usage_history.csv
(78219, 7)
finish processing: 2023_10_bike_usage_history.csv
(87146, 7)
finish processing: 2022_1_bike_usage_history.csv
(96040, 7)
finish processing: 2022_6_bike_usage_history.csv
(104669, 7)
finish processing: 2023_1_bike_usage_history.csv
(113595, 7)
finish processing: 2022_7_bike_usage_history.csv
(122523, 7)
finish processing: 2023_11_bike_usage_history.csv
(131159, 7)
finish processing: 2023_6_bike_usage_history.csv
(139799, 7)
finish processing: 2022_9_bike_usage_history.csv
(148430, 7)
finish processing: 2022_3_bike

In [6]:
extracted_all.shape

(278253, 7)

In [7]:
extracted_all.columns

Index(['sarea', 'source_date', 'lend_hour', 'usage_time', 'traffic_count',
       'day_of_week', 'weekend'],
      dtype='object')

In [8]:
extracted_all.rename({
    'sarea':'lend_station_district',
    'source_date':'date',
    'lend_hour':'hour',
    }
    ,axis=1,inplace=True
)

In [9]:
# lend_station_district	str	
# date	date	
# hour	int	
# day_of_week	str	Monday, Tuesday, Wednesday
# weekend	str	平日、假日
# traffic_count	int	
# usage_time	int	

In [15]:
extracted_all['usage_time'] = extracted_all['usage_time'].astype('int')

In [16]:
extracted_all.head()

Unnamed: 0,lend_station_district,date,hour,usage_time,traffic_count,day_of_week,weekend
0,中山區,2022-05-01,0,12221,118,Sunday,假日
1,中山區,2022-05-01,1,1201,58,Sunday,假日
2,中山區,2022-05-01,2,886,49,Sunday,假日
3,中山區,2022-05-01,3,1146,38,Sunday,假日
4,中山區,2022-05-01,4,1003,36,Sunday,假日


In [17]:
extracted_all.to_csv('pipeline_4_v1.csv',index=False)
