In [1]:
# Import packages
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from mpl_toolkits.axes_grid1 import make_axes_locatable
import seaborn as sns
import datetime
from datetime import datetime, timedelta
from shapely.geometry import Point
from shapely.geometry import shape
from statsmodels.tsa.arima.model import ARIMA
from keras.models import Sequential
from keras.layers import Dense
import xgboost as xgb
import math
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize, MinMaxScaler
import libpysal as ps
from spreg import GM_Lag
from sklearn.linear_model import LassoCV
import statsmodels.api as sm
import os
from zipfile import ZipFile
import random
import networkx as nx
import warnings

In [2]:
# Load the Washington, DC boundary from GeoJSON; its first geometry is used
# further down to keep only the transit stops that fall inside the city.

DC_boundary = gpd.read_file('Data/Shapefile/Washington_DC_Boundary.geojson')

def update_datetime(row):
    """Convert a 'YYYY-MM-DD H:MM:SS' string to a datetime, allowing hours >= 24.

    Hours of 24 or more roll over into the following day(s), e.g.
    '2019-01-01 25:10:00' becomes 2019-01-02 01:10:00. The hour may be written
    with any number of digits ('6:58:06' and '06:58:06' are both accepted), so
    the parsing does not depend on fixed character positions.

    Parameters
    ----------
    row : str
        Calendar date, whitespace, then hours:minutes:seconds.

    Returns
    -------
    datetime.datetime
        The date at midnight plus the given hours, minutes and seconds.

    Raises
    ------
    TypeError
        If ``row`` is not a string (for example a missing value).
    ValueError
        If the text does not have the expected layout, the hour is negative,
        or the date, minutes or seconds are invalid.
    """
    if not isinstance(row, str):
        raise TypeError('expected a date-time string, got ' + type(row).__name__)

    # Split on whitespace and ':' instead of slicing fixed columns, so an
    # unpadded hour ('6:58:06') or a three-digit hour cannot shift the fields.
    date_text, clock_text = row.split()
    hour_text, minute_text, second_text = clock_text.split(':')

    hours = int(hour_text)
    if hours < 0:
        raise ValueError('negative hour in time string: ' + repr(row))

    # strptime still validates the date, minutes and seconds; the hour is added
    # afterwards as a timedelta, which carries any overflow into the next day(s).
    midnight_based = datetime.strptime(
        date_text + ' 00:' + minute_text + ':' + second_text,
        '%Y-%m-%d %H:%M:%S',
    )
    return midnight_based + timedelta(hours=hours)

directory = "Data/GTFS/"

# Collect every file under the GTFS directory, subdirectories included.
# os.walk makes no promise about the order in which entries are returned, and
# the feeds are later picked by position in this list, so sort the paths to
# make that selection reproducible across runs and machines.
file_paths = sorted(
    os.path.join(root, filename)
    for root, directories, files in os.walk(directory)
    for filename in files
)

# One 'YYYY-MM-DD ' prefix per feed, aligned index-for-index with file_paths.
# The date is the second '_'-separated token of the file name (without its
# extension) and must read YYYYMMDD; the trailing space separates it from the
# time-of-day text it is prepended to later.
date_list = []
for filepath in file_paths:
    date_str = os.path.splitext(os.path.basename(filepath))[0].split('_')[1]
    date_obj = datetime.strptime(date_str, '%Y%m%d')
    date_list.append(date_obj.strftime('%Y-%m-%d') + ' ')

def _read_gtfs_table(archive, member):
    """Read one CSV member of an open zip archive into a DataFrame."""
    # Open the member in a with-block so its handle is released as soon as
    # the table has been parsed.
    with archive.open(member) as handle:
        return pd.read_csv(handle)


# Number of feeds, taken from the front of file_paths, processed in this cell.
# NOTE(review): the result is saved as "..._part1.csv" further down, so the
# remaining feeds are presumably handled elsewhere -- confirm before changing.
N_FEEDS = 77

# Columns of the merged stop_times/stops/trips table that are discarded.
UNUSED_COLUMNS = [
    "trip_id", "stop_sequence", "pickup_type", "drop_off_type",
    "shape_dist_traveled", "route_id", "service_id", "stop_code",
    "stop_desc", "stop_name", "zone_id", "block_id", "scheduled_trip_id",
    "trip_headsign", "direction_id", "shape_id",
]

# The boundary geometry is the same for every feed, so look it up once.
dc_geometry = DC_boundary.geometry[0]

transit_stops_info = []
for i in range(N_FEEDS):
    # The with-block closes the archive once its three tables are loaded
    # (the original left every archive open).
    with ZipFile(file_paths[i], "r") as archive:
        trips = _read_gtfs_table(archive, 'trips.txt')
        stops = _read_gtfs_table(archive, 'stops.txt')
        stops_time = _read_gtfs_table(archive, 'stop_times.txt')

    ## add geometry for stops
    stops['geometry'] = [
        Point(lon, lat) for lon, lat in zip(stops['stop_lon'], stops['stop_lat'])
    ]

    # select stops within DC
    inside_dc = np.array(
        [point.within(dc_geometry) for point in stops['geometry']], dtype=bool
    )
    stops = stops.loc[inside_dc]

    # Keep only stop_times rows at DC stops *before* merging: the result has
    # the same rows, in the same order, as merging everything and filtering
    # afterwards, but the intermediate tables are much smaller.
    at_dc_stop = np.isin(stops_time['stop_id'], stops['stop_id'].unique())

    ## merge, then delete unnecessary columns
    transit_stops_time = (
        stops_time.loc[at_dc_stop]
        .merge(stops, how='left', on="stop_id")
        .merge(trips, how='left', on="trip_id")
        .drop(columns=UNUSED_COLUMNS)
    )

    # add date information for the dataframe
    transit_stops_time['arrival_time'] = date_list[i] + transit_stops_time['arrival_time']
    transit_stops_time['departure_time'] = date_list[i] + transit_stops_time['departure_time']

    transit_stops_info.append(transit_stops_time)
    print(i)

# Stack the per-feed tables into one frame with a fresh 0..n-1 index.
transit_stops_info = pd.concat(transit_stops_info, ignore_index = True)

# Turn the 'YYYY-MM-DD HH:MM:SS' text of both time columns into timestamps;
# update_datetime takes care of hours of 24 and above.
for time_column in ("arrival_time", "departure_time"):
    parsed_times = transit_stops_info[time_column].apply(update_datetime)
    transit_stops_info[time_column] = pd.to_datetime(parsed_times)

# Report the total row count, then preview the first rows.
print(len(transit_stops_info))
transit_stops_info.head()

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
141256194


Unnamed: 0,arrival_time,departure_time,stop_id,stop_lat,stop_lon,geometry
0,2019-01-01 06:58:06,2019-01-01 06:58:06,5570,38.886957,-77.031738,POINT (-77.031738 38.886957)
1,2019-01-01 06:58:54,2019-01-01 06:58:54,7718,38.888462,-77.031799,POINT (-77.031799 38.888462)
2,2019-01-01 07:01:24,2019-01-01 07:01:24,30097,38.894203,-77.031794,POINT (-77.031794 38.894203)
3,2019-01-01 07:02:12,2019-01-01 07:02:12,27264,38.89597,-77.03182,POINT (-77.03182 38.89597)
4,2019-01-01 07:02:48,2019-01-01 07:02:48,5249,38.897189,-77.031821,POINT (-77.031821 38.897189)


In [3]:
# Save the combined stop-time table as CSV, leaving out the row index.
transit_stops_info.to_csv("Data/transit_stops_info_part1.csv", index = False)