In [24]:
#########################################
# Run once - setup functions            #
# Download .zip files to data folder    #
#########################################
import os
import time
import urllib
import urllib.request

import numpy as np
import pandas as pd
import simplejson


In [1]:
years = [str(year) for year  in range(2014, 2025)]
months = [str(month) for month in range(1, 13)]
months_name = ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"]

if not os.path.exists("data/"):
  os.makedirs("data/")
  
for year in years:
  path = "data/" + year + "-citibike-tripdata"
  if os.path.exists(path):
    os.rename(path, "data/" + year + "/")
    print(f"Renaming {year}-citibike-tripdata")
  for month in months:
    path = "data/" + year + "/" + month + "_" + months_name[int(month)-1] + "/"
    if os.path.exists(path):
      new_path = "data/" + year + "/" + month + "/"
      os.rename(path, new_path)
      for n in range(1, len(os.listdir(new_path))+1):
        path = new_path + str(int(year)*100+int(month)) + "-citibike-tripdata_" + str(n) + ".csv"
        if os.path.exists(path):
          new_path = new_path + str(n) + ".csv"
          os.rename(path, new_path)          

In [86]:
df = pd.read_csv('data/2023/12/1.csv', low_memory=False)
stations = df['start_station_name'].unique()
pd.DataFrame(stations, columns=['station_name']).to_csv('data/stations.csv')

locations = []
for station in stations:
  if station in df['start_station_name']:
    lat = df.loc[df['start_station_name'] == station]['start_lat'].iloc[0]
    lng = df.loc[df['start_station_name'] == station]['start_lng'].iloc[0]
    locations.append([station, lat, lng])
    print(station, lat, lng)
new_df = pd.DataFrame(locations, columns=['station_name', 'lat', 'lng'])
new_df.to_csv('data/station_locations.csv')

In [2]:
API_KEY = 'AIzaSyDq9a1qwhdfHONTthcvM2AOHMOuogWAkAY' # PRIVATE

station_locations = pd.read_csv('data/station_information.csv')
stations = pd.read_csv('data/stations.csv', index_col=0)
stations = stations['station_name'][:10]
done = {station: {} for station in stations}

for orig_station in stations[:5]:
  station_info = station_locations.loc[station_locations['station_name'] == orig_station]
  orig_lat = station_info['lat'].iloc[0]
  orig_lng = station_info['lng'].iloc[0]
  for dest_station in stations:
    if orig_station in done[dest_station]:
      done[orig_station][dest_station] = done[dest_station][orig_station]
    else:
      station_info = station_locations.loc[station_locations['station_name'] == dest_station]
      dest_lat = station_info['lat'].iloc[0]
      dest_lng = station_info['lng'].iloc[0]
      url = f"https://maps.googleapis.com/maps/api/distancematrix/json?origins={orig_lat},{orig_lng}&destinations={dest_lat},{dest_lng}&mode=biking&language=en-EN&key={API_KEY}"
      result = simplejson.load(urllib.request.urlopen(url))
      if 'duration' in result['rows'][0]['elements'][0]:
        trip_time = result['rows'][0]['elements'][0]['duration']['value']
      else:
        trip_time = np.nan
      done[orig_station][dest_station] = trip_time
          
df = pd.DataFrame(done)
df.to_csv('data/test_dist.csv')
    
    

In [3]:
years = [str(year) for year  in range(2014, 2025)]
months = [str(month) for month in range(1, 13)]
year = '2023'
stations = pd.read_csv('data/stations.csv')
# columns = ['ride_id', 'rideable_type', 'started_at', 'ended_at',
#        'start_station_name', 'start_station_id', 'end_station_name',
#        'end_station_id', 'start_lat', 'start_lng', 'end_lat', 'end_lng',
#        'member_casual']
start = 1060 # stopped somewhere around here 
end = len(stations)
window_size = 20

while start < end:
  total_time = time.time()
  end_window = start + window_size
  if start + window_size > end:
    end_window = end
  stations_list = []
  for month in months:
    print('Loading month', month)
    path = f'data/{year}/{month}/'
    for file in os.listdir(path):
      if file.endswith(".csv"):
        now = time.time()
        df = pd.read_csv(path + file, low_memory=False)
        print('Loading dataframe took: ', round(time.time() - now, 4))
        unique_stations = df['start_station_name'].unique()
        stations_to_add = stations['station_name'][start:end_window]
        stations_list.append(df.loc[df['start_station_name'].isin(stations_to_add)])
  now = time.time()
  stations_df = pd.concat(stations_list, ignore_index=True, sort=False)     
  print('Concat took,', round(time.time() - now, 4))
  print("Length of stations_df: ", len(stations_df['start_station_name']))
  if not os.path.exists(f'data/{year}/by_station'):
    os.makedirs(f'data/{year}/by_station')
  print('Saving stations,', stations_df['start_station_name'].unique())
  for station in stations_df['start_station_name'].unique():
    to_save = stations_df.loc[stations_df['start_station_name'] == station].reset_index(drop=True)
    to_save.to_csv(f'data/{year}/by_station/{station}.csv')
  start = end_window
  print('Loop took,', round(time.time() - total_time, 4))

Loading month 1
Loading dataframe took:  4.0694
Loading dataframe took:  3.1144
Loading month 2
Loading dataframe took:  3.4114
Loading dataframe took:  3.1054
Loading month 3
Loading dataframe took:  3.8536
Loading dataframe took:  0.9629
Loading dataframe took:  3.9409
Loading month 4
Loading dataframe took:  4.0615
Loading dataframe took:  3.6312
Loading dataframe took:  4.2488
Loading month 5
Loading dataframe took:  2.5032
Loading dataframe took:  4.0351
Loading dataframe took:  3.8721
Loading dataframe took:  4.0465
Loading month 6
Loading dataframe took:  2.3938
Loading dataframe took:  3.8262
Loading dataframe took:  4.2167
Loading dataframe took:  4.2442
Loading month 7
Loading dataframe took:  3.1235
Loading dataframe took:  3.5604
Loading dataframe took:  4.162
Loading dataframe took:  4.0486
Loading month 8
Loading dataframe took:  0.4312
Loading dataframe took:  4.4189
Loading dataframe took:  3.9193
Loading dataframe took:  4.022
Loading dataframe took:  4.0823
Loading mo

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [15]:
"""
Length of stations_df:  197764
Saving stations, ['Brevoort Pl & Bedford Ave' '19 St & 24 Ave' 'Valentine Ave & E 181 St'
 '5 Ave & E 87 St' 'Henry St & Degraw St' 'Steinway St & 21 Ave'
 'W 163 St & Edgecombe Ave' 'Steinway St & Ditmars Blvd'
 'Lexington Ave & E 120 St' 'Cathedral Pkwy & Broadway'
 'Jerome Ave & Ogden Ave' 'Nelson Ave & 167 St'
 'Louis Nine Blvd & Intervale Ave' 'Aqueduct Ave & W 190 St'
 'Bedford Ave & Fenimore St' '61 St & Borden Ave' '85 St & Northern Blvd'
 'Arthur Ave & E Tremont Ave' 'Washington Ave & Park Pl'
 '60 Ave & Junction Blvd']
"""