In [24]:
import os
import pandas as pd
import sys
from pathlib import Path
import numpy as np
import json

In [25]:
path = os.getcwd()
p = Path(path).parents[1]
directory_path = os.path.abspath(p) + "\\data"
file_path = directory_path.replace("\\", "/")

In [26]:
def get_year_files():
    
    files_name = os.listdir(directory_path)
    files = []
    for file_name in files_name:
        if "Mobi_System_Data" in file_name:
            file = file_path + "/" + file_name
            files.append(file)
    return files

In [27]:
def get_year_data(year, files):
    year = str(year)
    print("read " + year)
    for file in files:
        if year in file:
            return pd.read_csv(file, compression="zip")

In [28]:
data_2018 = get_year_data(2018, get_year_files())
data_2019 = get_year_data(2019, get_year_files())
data_2020 = get_year_data(2020, get_year_files())
data_2021 = get_year_data(2021, get_year_files())


read 2018
read 2019
read 2020
read 2021


In [29]:
def get_date_range_departure(year_df, start_date, end_date):
    return year_df.loc[(year_df["Departure"] >= str(start_date)) & (year_df["Departure"] <= str(end_date))]


In [30]:
def group_by_departure_station(_df):
    return _df.groupby(['Departure station']).size().reset_index(name='ride_count').sort_values(["ride_count"], ascending=False)


In [31]:
def normalize_ride_count(_df):
    norm = np.linalg.norm(_df["ride_count"])
    _df["ride_count"] = round(((_df["ride_count"]/norm) * 1000), 4)
    return _df

In [32]:
def get_station_geo():
    geo_file = file_path + "/tmp/geocodings.pickle"
    gps_data = pd.read_pickle(geo_file)
    stationid, station, zip, lon, lat = [], [], [], [], []
    # print(type(gps_data))
    for row in gps_data.items():
        station.append(row[0])
        stationid.append(row[0].split(" ")[0])
        zip.append(row[1][0])
        lat.append(float(str(row[1][1])[0:10]))
        lon.append(float(str(row[1][2])[0:12]))
        
        # print("station id: {},    station name: {}".format(station, station))
    # print("length station {}, zip: {}, lon: {}, lat:{}".format(len(station), len(zip), len(lon), len(lat)))
    station_df = pd.DataFrame()
    station_df['station_id'] = stationid
    station_df['station'] = station
    station_df['zip'] = zip
    station_df["lon"] = lon
    station_df["lat"] = lat
    station_df = station_df.sort_values(["station_id"], ascending=True)
    return station_df.reset_index()


In [13]:
df = get_date_range_departure(data_2020, "2020-10-01", "2020-11-01")
df_sorted = group_by_departure_station(df)
df_sorted_normalized = normalize_ride_count(df_sorted)
df_station_geo = get_station_geo()


In [40]:
df = get_date_range_departure(data_2018, "2018-07-01", "2018-08-01")
df

Unnamed: 0.1,Unnamed: 0,Departure,Return,Bike,Departure station,Return station,Formula,Covered distance (m),Duration (sec.),Departure battery voltage (mV),Return battery voltage (mV),Departure temperature (°C),Return temperature (°C),Stopover duration (sec.),Number of stopovers
298062,298062,2018-07-01 00:00:00,2018-07-01 00:00:00,1040.0,0012 Dunsmuir & Richards,0138 Richards & Helmcken,365 Day Pass Plus SALE,884.0,280,3800,3910,17,18,0,0
298063,298063,2018-07-01 00:00:00,2018-07-01 00:00:00,94.0,0055 Keefer & Columbia,0053 Keefer & Abbott,365 Plus,514.0,177,3785,3877,18,19,0,0
298064,298064,2018-07-01 00:00:00,2018-07-01 00:00:00,1030.0,0039 Helmcken & Burrard,0064 Expo & Smithe,24 Hour,1899.0,1346,4021,4084,17,17,0,0
298065,298065,2018-07-01 00:00:00,2018-07-01 00:00:00,1096.0,0039 Helmcken & Burrard,0064 Expo & Smithe,24 Hour,1878.0,652,4019,4126,17,19,0,0
298066,298066,2018-07-01 00:00:00,2018-07-01 00:00:00,983.0,0039 Helmcken & Burrard,0064 Expo & Smithe,24 Hour,1931.0,621,3995,4080,18,20,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
412500,412500,2018-07-01 00:00:00,2018-07-01 00:00:00,1030.0,0064 Expo & Smithe,0030 Abbott & Cordova,24 Hour,3120.0,1310,4016,4137,18,18,0,0
412501,412501,2018-07-01 00:00:00,2018-07-01 00:00:00,1307.0,0063 Robson & Granville,0196 Drake & Hornby,Archived Monthly Standard,1076.0,365,3653,3727,18,20,0,0
412502,412502,2018-07-01 00:00:00,2018-07-01 00:00:00,950.0,0064 Expo & Smithe,0030 Abbott & Cordova,24 Hour,3193.0,1335,4042,4150,18,19,0,0
412503,412503,2018-07-01 00:00:00,2018-07-01 00:00:00,744.0,0044 Richards & Beach,0027 Beatty & Nelson,365 Day Founding Plus,1302.0,460,3960,4059,17,18,0,0


In [14]:
df_merged = pd.merge(left=df_sorted_normalized, right=df_station_geo, how='left', left_on='Departure station', right_on='station').dropna()
df_merged


Unnamed: 0,Departure station,ride_count,index,station_id,station,zip,lon,lat
0,0209 Stanley Park - Information Booth,179.7039,33.0,0209,0209 Stanley Park - Information Booth,V6G3E2,-123.131478,49.297769
1,0105 Stanley Park - Totem Poles,178.2777,32.0,0105,0105 Stanley Park - Totem Poles,V6G,-123.121176,49.298657
2,0028 Davie & Beach,175.9007,40.0,0028,0028 Davie & Beach,V6G,-123.142139,49.287528
3,0174 1st & Manitoba,174.9499,205.0,0174,0174 1st & Manitoba,R7A6C3,-99.938347,49.838303
4,0050 Bute & Comox,171.1466,50.0,0050,0050 Bute & Comox,V6E1K6,-123.131099,49.283343
...,...,...,...,...,...,...,...,...
197,0233 Thornton & National,9.7458,197.0,0233,0233 Thornton & National,V6A3X2,-123.092723,49.273777
198,0985 Quebec Yard - To Service,9.7458,241.0,0985,0985 Quebec Yard - To Service,H2Z1X4,-73.563392,45.506058
199,0981 Workshop - Service Complete,7.8442,219.0,0981,0981 Workshop - Service Complete,L7L5H7,-79.763537,43.390667
200,0285 Commercial & Pandora,4.5164,191.0,0285,0285 Commercial & Pandora,V5L0A3,-123.070443,49.283236


In [15]:
station_list = []

for idx, row in df_merged.iterrows():
    v1, v2 = str(row['lat']), str(row['lon'])
    v3 = row['ride_count']
    st = "location: new google.maps.LatLng({}, {}), weight: {}".format(v1, v2, v3)
    station_list.append("{" +st + "},")

with open('station_google_cmd.txt', 'w') as f:
    for station in station_list:
        f.write(station)
        f.write('\n')

In [35]:
for year in [2018, 2019, 2020, 2021]:
    for month in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]:
# for year in [2018]:
#     for month in [11, 12]:
        start_date = str(year) + "-" + str(month) + "-01"
        end_date = str(year) + "-" + str(month + 1) + "-01"
        
        if month < 10:
            start_date = str(year) + "-0" + str(month) + "-01"
            end_date = str(year) + "-0" + str(month + 1) + "-01"
        if month == 9:
            end_date = str(year) + "-" + str(month + 1) + "-01"
        if month == 12:
            end_date = str(year) + "-" + str(month) + "-31"
        print("get {} - {}".format(start_date, end_date))
        if year == 2018:
            data = data_2018
        elif year == 2019:
            data = data_2019
        elif year == 2020:
            data = data_2020
        elif year == 2021:
            data = data_2021
        df = get_date_range_departure(data, start_date, end_date)
        df_sorted = group_by_departure_station(df)
        df_sorted_normalized = normalize_ride_count(df_sorted)
        df_station_geo = get_station_geo()
        df_merged = pd.merge(left=df_sorted_normalized, right=df_station_geo, how='left', left_on='Departure station', right_on='station').dropna()
        
        if month < 10:
            filename = file_path + "/headmap_monthly/" + str(year) + "-0" + str(month) + '.json'
        else:
            filename = file_path + "/headmap_monthly/" + str(year) + "-" + str(month) + '.json'
        df_filtered = df_merged[['Departure station', 'lat', 'lon', 'ride_count']]
        df_filtered.to_json(filename, orient='index')
  

get 2018-1-01 - 2018-2-01
get 2018-2-01 - 2018-3-01
get 2018-3-01 - 2018-4-01
get 2018-4-01 - 2018-5-01
get 2018-5-01 - 2018-6-01
get 2018-6-01 - 2018-7-01
get 2018-7-01 - 2018-8-01
get 2018-8-01 - 2018-9-01
get 2018-9-01 - 2018-10-01
get 2018-10-01 - 2018-11-01
get 2018-11-01 - 2018-12-01
get 2018-12-01 - 2018-12-31
get 2019-1-01 - 2019-2-01
get 2019-2-01 - 2019-3-01
get 2019-3-01 - 2019-4-01
get 2019-4-01 - 2019-5-01
get 2019-5-01 - 2019-6-01
get 2019-6-01 - 2019-7-01
get 2019-7-01 - 2019-8-01
get 2019-8-01 - 2019-9-01
get 2019-9-01 - 2019-10-01
get 2019-10-01 - 2019-11-01
get 2019-11-01 - 2019-12-01
get 2019-12-01 - 2019-12-31
get 2020-1-01 - 2020-2-01
get 2020-2-01 - 2020-3-01
get 2020-3-01 - 2020-4-01
get 2020-4-01 - 2020-5-01
get 2020-5-01 - 2020-6-01
get 2020-6-01 - 2020-7-01
get 2020-7-01 - 2020-8-01
get 2020-8-01 - 2020-9-01
get 2020-9-01 - 2020-10-01
get 2020-10-01 - 2020-11-01
get 2020-11-01 - 2020-12-01
get 2020-12-01 - 2020-12-31
get 2021-1-01 - 2021-2-01
get 2021-2-01 - 2

In [37]:
filename = file_path + "/headmap_monthly/" + str(year) + "-" + str(month) + '.json'
print(filename)
f = open(filename)
data = json.load(f)

d:/Class/projects/733project/733_Final_Project/data/headmap_monthly/2018-12.json


In [17]:
for year in [2018, 2019, 2020, 2021]:
    start_date = str(year) + "-01-01"
    end_date = str(year + 1) + "-01-01"
    if year == 2021:
        end_date = str(year) + "-"+ "12-31"
    print("get {} - {}".format(start_date, end_date))
    if year == 2018:
        data = data_2018
    elif year == 2019:
        data = data_2019
    elif year == 2020:
        data = data_2020
    elif year == 2021:
        data = data_2021
    df = get_date_range_departure(data, start_date, end_date)
    df_sorted = group_by_departure_station(df)
    df_sorted_normalized = normalize_ride_count(df_sorted)
    df_station_geo = get_station_geo()
    df_merged = pd.merge(left=df_sorted_normalized, right=df_station_geo, how='left', left_on='Departure station', right_on='station').dropna()
    
    filename = file_path + "/headmap_monthly/" + str(year) + "-0" + '.json'
    df_filtered = df_merged[['Departure station', 'lat', 'lon', 'ride_count']]
    df_filtered.to_json(filename, orient='index')

get 2018-01-01 - 2019-01-01
get 2019-01-01 - 2020-01-01
get 2020-01-01 - 2021-01-01
get 2021-01-01 - 2021-12-31


In [7]:
# import urllib.request, json 
# with urllib.request.urlopen("https://vancouver-gbfs.smoove.pro/gbfs/en/station_information.json") as url:
#     s_data = json.loads(url.read().decode())
# s_df = pd.json_normalize(s_data['data']['stations'])
# s_df['station'] = s_df['station_id'] + " " + s_df['name'] 
# s_df.to_csv(r'np.txt', header=None, index=None, sep='\t', mode='a')

Unnamed: 0,station_id,name,lat,lon,capacity,station
0,0001,10th & Cambie,49.262487,-123.114397,35,0001 10th & Cambie
1,0002,Burrard Station (Melville & Dunsmuir),49.285871,-123.121050,28,0002 Burrard Station (Melville & Dunsmuir)
2,0004,Yaletown-Roundhouse Station,49.274566,-123.121817,16,0004 Yaletown-Roundhouse Station
3,0005,Dunsmuir & Beatty,49.279764,-123.110154,26,0005 Dunsmuir & Beatty
4,0006,Olympic Village Station,49.266314,-123.116011,26,0006 Olympic Village Station
...,...,...,...,...,...,...
204,0297,6th & Prince Edward,49.265242,-123.095577,16,0297 6th & Prince Edward
205,0298,6th & Carolina,49.265213,-123.090785,16,0298 6th & Carolina
206,0300,Bute & Barclay,49.284893,-123.128685,14,0300 Bute & Barclay
207,0305,Georgia & Homer,49.280787,-123.115271,18,0305 Georgia & Homer
