In [136]:
import googlemaps
import os
import dotenv
from datetime import datetime
import pandas as pd
import numpy as np

dotenv.load_dotenv()

True

#### Some Initial setup details

- [As of Feb 22, 2024], Be in aware that it costs \$ $\frac{5}{1000}$ per request. 
  - example, 118 origin x 118 destination = 13924 requests = 69.62 USD

In [142]:
def get_variable_name(curr_hour: datetime, mode: str) -> str:

    hour = f"{curr_hour.hour:02d}"
    minute = f"{curr_hour.minute:02d}"

    column_name_for_time_at_this_time = f"T{hour}{minute}_{mode}_(In Minutes)"
    output_file_name = f"T{hour}{minute}_{mode}.parquet"

    return column_name_for_time_at_this_time, output_file_name


def print_report(curr_hour: datetime, mode: str, output_file_name: str, col_name: str) -> None:
    print("Current Run of the Program with the following parameters:")
    print(f"Current Hour: {curr_hour}")
    print(f"Mode: {mode}")
    print(f"Column Name: {col_name}")
    print(f"Output File Name: {output_file_name}")

## Program Setup

In [141]:
hours = [
    datetime(2024, 2, 26, 7, 30, 0),
    datetime(2024, 2, 26, 12, 00, 0),
    datetime(2024, 2, 26, 16, 00, 0),
    datetime(2024, 2, 26, 20, 15, 0)
]


time_compute = hours[0]
mode = "trasit"
# mode = "bicycling"




run = False # Prevent accidental runs

# Test random sample of data from the resulting matrix if they are correct
test = False # Prevent accidental runs
num_sample_size = 20 #tests random rows of data if the distance and time are correct

api_key = os.getenv("GOOGLE_MAPS_API_KEY_2")

col_name, output_file_name = get_variable_name(time_compute, mode)

print_report(time_compute, mode, output_file_name, col_name)

Current Run of the Program with the following parameters:
Current Hour: 2024-02-26 07:30:00
Mode: trasit
Column Name: T0730_trasit_(In Minutes)
Output File Name: T0730_trasit.parquet


In [105]:
# initialize the google maps client
gmaps = googlemaps.Client(key=api_key)

#### Get Kiosk Data

In [106]:
kiosk_data = pd.read_csv(os.path.join(os.environ['MOHAMMAD_SHARED_PATH'], "Google Trip Data","Kiosks_Data.csv"))

In [131]:
kiosk_data_filterd = kiosk_data[['Kiosk Name', "Latitude", "Longitude", 'Address']].copy()
kiosk_data_filterd['Coordinates'] = kiosk_data_filterd['Latitude'].astype(str) + ',' + kiosk_data_filterd['Longitude'].astype(str)

# remove rows with coordinates 0.0,0.0
kiosk_data_filterd = kiosk_data_filterd[kiosk_data_filterd['Coordinates']!="0.0,0.0"]

# remove this coordinate 39.97233,-75.145 = 1330 N 5th St #2, Philadelphia, PA 19122, USA
kiosk_data_filterd = kiosk_data_filterd[kiosk_data_filterd['Coordinates']!="39.97233,-75.145"]

# remove duplicate rows
kiosk_data_filterd = kiosk_data_filterd.drop_duplicates().reset_index(drop=True)
kiosk_data_filterd.sort_values(by='Coordinates', inplace=True)

In [130]:
kiosk_data_filterd

Unnamed: 0,Kiosk Name,Latitude,Longitude,Address,Coordinates
0,Mahoney State Park,41.03038,-96.31184,Mahoney State Park,"41.03038,-96.31184"
1,Walnut Creek Recreation Area (Papillion),41.13997,-96.06433,11601 S 96th St,"41.13997,-96.06433"
2,36th & Raynor Parkway (Bellevue),41.14896,-95.96814,W Papio Trail,"41.14896,-95.96814"
3,Prairie Queen Recreation Area (Papillion),41.15487,-96.11240,Lincoln Rd,"41.15487,-96.1124"
4,Papillion Landing,41.15593,-96.05576,1046 W Lincoln St,"41.15593,-96.05576"
...,...,...,...,...,...
111,24th & Wirt St,41.28661,-95.94713,3014 N 24th Street,"41.28661,-95.94713"
112,NOTC 31st Ave & Taylor,41.29800,-95.95830,4308 N 31st Ave,"41.298,-95.9583"
113,MCC Fort Bookstore N 32nd St,41.30424,-95.95923,N 32nd St,"41.30424,-95.95923"
114,MCC North 30th St,41.30981,-95.95684,N 30th St,"41.30981,-95.95684"


In [132]:
coordinates = kiosk_data_filterd['Coordinates'].values  

len(coordinates)

116

result is in shape of 

origin x destination

In [110]:
res_dict = {}

count = 0
if run:
    for i in range(0, len(coordinates)-1, 10):
        for j in range(0, len(coordinates)-1, 10):
            org_start = i
            org_end = min(i+10, len(coordinates)) ### its logical to subtract 1 from len here but, when slicing, the right is exclusive
            dest_start = j
            dest_end = min(j+10, len(coordinates))

            if (org_end -org_start == 0) or (dest_end - dest_start == 0):
                print("No more data to compute. Exiting... this loop")
            else:
                origins = coordinates[org_start:org_end]
                destinations = coordinates[dest_start:dest_end]



                res = gmaps.distance_matrix(origins, destinations, mode=mode, departure_time=time_compute)

                
                ### result is in shape of org x dest
                ### Result is in res['rows'][org]['elements'][dest]
                for org in range(len(origins)):

                    for dest in range(len(destinations)):

                        if res['rows'][org]['elements'][dest]['status'] == 'OK':
                            res_dict[count] = {
                                "Destination_Coordinates": destinations[dest],
                                "Origin_Coordinates": origins[org],
                                "Origin_Address": res['origin_addresses'][org],
                                "Destination_Address": res['destination_addresses'][dest],
                                "Duration (In Seconds)": res['rows'][org]['elements'][dest]['duration']['value'],
                                "Distance (In Meters)": res['rows'][org]['elements'][dest]['distance']['value']
                            }
                        else:
                            res_dict[count] = {
                                "Destination_Coordinates": destinations[dest],
                                "Origin_Coordinates": origins[org],
                                "Origin_Address": res['origin_addresses'][org],
                                "Destination_Address": res['destination_addresses'][dest],
                                "Duration (In Seconds)": np.nan,
                                "Distance (In Meters)": np.nan
                            }
                        count += 1         

In [137]:
if run:
    # Convert resulting dictionary to a dataframe
    df_res = pd.DataFrame(res_dict).T

    # convert the duration to minutes
    df_res[col_name] = df_res[f'Duration (In Seconds)']/60

    df_res.to_parquet(os.path.join(os.environ['OUTPUT_PATH'], "google_hourly_data", output_file_name))

    df_res.head()

##### Get random rows and test it individually, if the data is correct or not

In [None]:
num_passed = 0

if test:
    if run:
        df_test = df_res.sample(num_sample_size).reset_index(drop=True)

        test_dict = df_test.to_dict(orient='records')
        for i in test_dict:
            res = gmaps.distance_matrix([i['Origin_Coordinates']], [i['Destination_Coordinates']], mode=mode, departure_time=time_compute)

            if res['rows'][0]['elements'][0]['duration']['value'] == i['Duration (In Seconds)'] and res['rows'][0]['elements'][0]['distance']['value'] == i['Distance (In Meters)']:
                num_passed += 1

        print(f"Number of tests passed: {num_passed}/{num_sample_size}")
    else:
        print("No tests were run. Since no data was computed.")
