In [1]:
#########################################
# Run once - setup functions            #
# Download .zip files to data folder    #
#########################################
import os
import time
import urllib
import urllib.request

import numpy as np
import pandas as pd
import simplejson

import parameter
import query


In [1]:
# Normalise the folder layout of the unzipped Citi Bike archives:
#   data/<year>-citibike-tripdata/<m>_<MonthName>/<yyyymm>-citibike-tripdata_<n>.csv
# becomes
#   data/<year>/<m>/<n>.csv
years = [str(year) for year in range(2014, 2025)]
months = [str(month) for month in range(1, 13)]
months_name = ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"]


def normalize_tripdata_layout(data_dir="data/"):
    """Rename unzipped trip-data folders and files under `data_dir` in place.

    For every year in `years`, `<year>-citibike-tripdata` is renamed to
    `<year>`; inside it every `<m>_<MonthName>` folder is renamed to `<m>`, and
    every `<yyyymm>-citibike-tripdata_<n>.csv` file in that folder is renamed
    to `<n>.csv`. Anything that does not exist under the expected name is left
    untouched, so the function can be re-run safely.

    Parameters
    ----------
    data_dir : str
        Root folder holding the unzipped archives; created if missing.
    """
    os.makedirs(data_dir, exist_ok=True)

    for year in years:
        year_dir = os.path.join(data_dir, year)
        zipped_name = os.path.join(data_dir, year + "-citibike-tripdata")
        if os.path.exists(zipped_name):
            os.rename(zipped_name, year_dir)
            print(f"Renaming {year}-citibike-tripdata")

        for month in months:
            named_month_dir = os.path.join(year_dir, month + "_" + months_name[int(month) - 1])
            if not os.path.exists(named_month_dir):
                continue
            month_dir = os.path.join(year_dir, month)
            os.rename(named_month_dir, month_dir)

            # Keep the folder path and the per-file target in separate
            # variables: reusing one name for both made every file after the
            # first resolve to a non-existent path, so it was never renamed.
            file_prefix = str(int(year) * 100 + int(month)) + "-citibike-tripdata_"
            for n in range(1, len(os.listdir(month_dir)) + 1):
                source_file = os.path.join(month_dir, file_prefix + str(n) + ".csv")
                if os.path.exists(source_file):
                    os.rename(source_file, os.path.join(month_dir, str(n) + ".csv"))


normalize_tripdata_layout()

In [86]:
# Build two lookup tables from one month of trips:
#   data/stations.csv           - every distinct start station name
#   data/station_locations.csv  - one (lat, lng) per station name
df = pd.read_csv('data/2023/12/1.csv', low_memory=False)

# Trips without a start station name cannot be attributed to a station, so the
# NaN entry that unique() would otherwise return is dropped here.
stations = df['start_station_name'].dropna().unique()
pd.DataFrame(stations, columns=['station_name']).to_csv('data/stations.csv')

# `name in series` tests the Series *index*, not its values, so a per-station
# membership check against the column never matches. Taking the first row per
# station name directly yields the same "first seen coordinates" without it.
new_df = (
    df.dropna(subset=['start_station_name'])
    .drop_duplicates(subset='start_station_name', keep='first')
    [['start_station_name', 'start_lat', 'start_lng']]
    .rename(columns={
        'start_station_name': 'station_name',
        'start_lat': 'lat',
        'start_lng': 'lng',
    })
    .reset_index(drop=True)
)
# Kept as a plain list of [station_name, lat, lng] rows, as before.
locations = new_df.values.tolist()
new_df.to_csv('data/station_locations.csv')

In [2]:
# Query the Google Distance Matrix API for cycling times between stations and
# save them as a station-by-station table in data/test_dist.csv.

# Read the key from the environment (GOOGLE_MAPS_API_KEY) so it never has to be
# saved inside the notebook; falls back to an empty key as before.
API_KEY = os.environ.get('GOOGLE_MAPS_API_KEY', '')  # PRIVATE

# NOTE(review): this reads 'data/station_information.csv' and expects the
# columns station_name / lat / lng - confirm which cell or script produces it.
station_locations = pd.read_csv('data/station_information.csv')
stations = pd.read_csv('data/stations.csv', index_col=0)
stations = stations['station_name'][:10]
done = {station: {} for station in stations}

# One coordinate row per station name (first occurrence wins), so each lookup
# below is an index access instead of a scan of the whole table.
station_coords = (
    station_locations.drop_duplicates(subset='station_name', keep='first')
    .set_index('station_name')[['lat', 'lng']]
)

# Only the first 5 origins are queried against the first 10 stations.
for orig_station in stations[:5]:
    orig_lat, orig_lng = station_coords.loc[orig_station]
    for dest_station in stations:
        # Reuse the opposite direction when it has already been fetched.
        if orig_station in done[dest_station]:
            done[orig_station][dest_station] = done[dest_station][orig_station]
            continue

        dest_lat, dest_lng = station_coords.loc[dest_station]
        # The API's travel mode for cycling is "bicycling"; "biking" is not an
        # accepted value and the request is rejected.
        url = f"https://maps.googleapis.com/maps/api/distancematrix/json?origins={orig_lat},{orig_lng}&destinations={dest_lat},{dest_lng}&mode=bicycling&language=en-EN&key={API_KEY}"
        with urllib.request.urlopen(url) as response:
            result = simplejson.load(response)

        # Fail with the API's own status instead of an IndexError on the
        # empty `rows` list that accompanies a rejected request.
        if result.get('status') != 'OK':
            raise RuntimeError(
                f"Distance Matrix request failed for {orig_station!r} -> {dest_station!r}: "
                f"{result.get('status')} {result.get('error_message', '')}"
            )

        element = result['rows'][0]['elements'][0]
        # An element without a duration (no route found) is recorded as NaN.
        if 'duration' in element:
            trip_time = element['duration']['value']
        else:
            trip_time = np.nan
        done[orig_station][dest_station] = trip_time

df = pd.DataFrame(done)
df.to_csv('data/test_dist.csv')
    
    

In [2]:
# Split one year of trip data into one CSV per start station
# (data/<year>/by_station/<station>.csv). Stations are handled in windows of
# `window_size` names; every window re-reads all monthly CSVs of the year and
# keeps only the rows whose start station belongs to the window.
years = [str(year) for year in range(2014, 2025)]
months = [str(month) for month in range(1, 13)]
year = '2023'
stations = pd.read_csv('data/stations.csv')
# columns = ['ride_id', 'rideable_type', 'started_at', 'ended_at',
#        'start_station_name', 'start_station_id', 'end_station_name',
#        'end_station_id', 'start_lat', 'start_lng', 'end_lat', 'end_lng',
#        'member_casual']
start = 0  # position of the first station to process; raise it to resume an interrupted run
end = len(stations)
window_size = 20

out_dir = f'data/{year}/by_station'
os.makedirs(out_dir, exist_ok=True)

while start < end:
    total_time = time.time()
    end_window = min(start + window_size, end)
    # The window's station names do not change per file, so slice them once.
    stations_to_add = stations['station_name'][start:end_window]

    stations_list = []
    for month in months:
        print('Loading month', month)
        path = f'data/{year}/{month}/'
        # os.listdir order is arbitrary; sorting keeps the row order of the
        # exported files the same from run to run.
        for file in sorted(os.listdir(path)):
            if file.endswith(".csv"):
                now = time.time()
                df = pd.read_csv(path + file, low_memory=False)
                print('Loading dataframe took: ', round(time.time() - now, 4))
                stations_list.append(df.loc[df['start_station_name'].isin(stations_to_add)])

    now = time.time()
    stations_df = pd.concat(stations_list, ignore_index=True, sort=False)
    print('Concat took,', round(time.time() - now, 4))
    print("Length of stations_df: ", len(stations_df['start_station_name']))
    print('Saving stations,', stations_df['start_station_name'].unique())

    # A single groupby pass replaces one full-frame scan per station;
    # sort=False keeps the stations in order of first appearance.
    for station, to_save in stations_df.groupby('start_station_name', sort=False):
        to_save.reset_index(drop=True).to_csv(f'{out_dir}/{station}.csv')

    start = end_window
    print('Loop took,', round(time.time() - total_time, 4))

Loading month 1
Loading dataframe took:  4.032
Loading dataframe took:  3.4642
Loading month 2
Loading dataframe took:  4.1055
Loading dataframe took:  2.8265
Loading month 3
Loading dataframe took:  3.9943
Loading dataframe took:  0.7627
Loading dataframe took:  3.5195
Loading month 4
Loading dataframe took:  4.0008
Loading dataframe took:  3.669
Loading dataframe took:  4.0454
Loading month 5
Loading dataframe took:  2.2882
Loading dataframe took:  3.9855
Loading dataframe took:  3.929
Loading dataframe took:  4.1385
Loading month 6
Loading dataframe took:  2.2014
Loading dataframe took:  3.901
Loading dataframe took:  3.9956
Loading dataframe took:  3.8385
Loading month 7
Loading dataframe took:  2.925
Loading dataframe took:  3.9438
Loading dataframe took:  3.9
Loading dataframe took:  3.8278
Loading month 8
Loading dataframe took:  0.4421
Loading dataframe took:  3.9283
Loading dataframe took:  4.1051
Loading dataframe took:  3.9041
Loading dataframe took:  4.0603
Loading month 9


In [20]:
# Build and pickle one station object per station name for the period
# 2023-05-01 to 2023-06-30, reporting the time taken per station and a
# progress line after every 10 stations.
# `query.get_datetime`, `parameter.get_station` and `parameter.pickle_station`
# are project helpers whose implementation is not visible in this notebook.
start_date = query.get_datetime(2023, 5, 1, 0, 0, 0)
end_date = query.get_datetime(2023, 6, 30, 0, 0, 0)

stations = pd.read_csv('data/stations.csv', index_col=0)
total_station = len(stations)

done_stations = 0  # stays 0 when the station list is empty
for done_stations, station_name in enumerate(stations['station_name'], start=1):
    started = time.time()
    station = parameter.get_station(
        station=station_name,
        start_date=start_date,
        end_date=end_date,
        tph=1,
        weekday=True,
    )
    parameter.pickle_station(station)
    elapsed = round(time.time() - started, 2)
    print(station_name, 'pickled in', elapsed, 'seconds')
    if done_stations % 10 == 0:
        print(done_stations, 'completed out of', total_station)

yes


In [19]:
# Download the live GBFS station feeds and list each station's name and
# coordinates.
def _fetch_json(url):
    """Download `url` and return the parsed JSON body, closing the HTTP response afterwards."""
    with urllib.request.urlopen(url) as response:
        return simplejson.load(response)


station_information = _fetch_json('https://gbfs.lyft.com/gbfs/2.3/bkn/en/station_information.json')
station_status = _fetch_json('https://gbfs.lyft.com/gbfs/2.3/bkn/en/station_status.json')

for station in station_information['data']['stations']:
    print(station['name'], station['lat'], station['lon'])

['Allen St & Stanton St',
 'Carlton Ave & Dean St',
 'W 84 St & Amsterdam Ave',
 'E 85 St & York Ave',
 'Bergen St & 4 Ave',
 'Central Park W & W 91 St',
 'E 15 St & 3 Ave',
 'Riverside Dr & W 78 St',
 'E 44 St & Lexington Ave',
 'Wythe Ave & N 13 St',
 'E 53 St & 3 Ave',
 'Spring St & Hudson St',
 'Willoughby Ave & Tompkins Ave',
 'Bridge St & Water St',
 'W 41 St & 8 Ave',
 'Cooper Square & Astor Pl',
 'Broadway & W 53 St',
 '10 St & 7 Ave',
 'E 7 St & Ave B',
 'Vernon Blvd & 47 Rd',
 '46 Ave & 5 St',
 'Bayard St & Baxter St',
 'Old Slip & South St',
 'Lexington Ave & E 111 St',
 'Macon St & Nostrand Ave',
 'W 54 St & 11 Ave',
 'Clarkson Ave & E 37 St',
 'Grand Concourse & E Mount Eden Ave',
 'Sterling Pl & Franklin Ave',
 'Broadway & W 41 St',
 'Delancey St & Eldridge St',
 'W 111 St & 5 Ave',
 'Clermont Ave & Lafayette Ave',
 '23 Rd & 31 St',
 'Cherry St',
 'Caton Ave & Bedford Ave',
 'W 59 St & 10 Ave',
 'Court St & State St',
 'W Broadway & Watts St',
 'Meserole Ave & Manhattan A

In [15]:
"""
Length of stations_df:  197764
Saving stations, ['Brevoort Pl & Bedford Ave' '19 St & 24 Ave' 'Valentine Ave & E 181 St'
 '5 Ave & E 87 St' 'Henry St & Degraw St' 'Steinway St & 21 Ave'
 'W 163 St & Edgecombe Ave' 'Steinway St & Ditmars Blvd'
 'Lexington Ave & E 120 St' 'Cathedral Pkwy & Broadway'
 'Jerome Ave & Ogden Ave' 'Nelson Ave & 167 St'
 'Louis Nine Blvd & Intervale Ave' 'Aqueduct Ave & W 190 St'
 'Bedford Ave & Fenimore St' '61 St & Borden Ave' '85 St & Northern Blvd'
 'Arthur Ave & E Tremont Ave' 'Washington Ave & Park Pl'
 '60 Ave & Junction Blvd']
"""