In [1]:
# import dependencies
import gspread
import json
import pandas as pd
import uuid
from geopy.geocoders import Nominatim
from oauth2client.service_account import ServiceAccountCredentials
from time import sleep

In [2]:
# authenticate and connect to google sheets
# OAuth scopes requested for the service account: the Sheets feed and Drive.
scope = [
    "https://spreadsheets.google.com/feeds",
    "https://www.googleapis.com/auth/drive",
]
# Build service-account credentials from the local key file "api_keys.json",
# authorize a gspread client with them, and open the spreadsheet by its key.
# `spreadsheet` is the handle the later cells read their worksheets from.
# NOTE(review): the spreadsheet key is hard-coded — consider moving it to a
# config constant or environment variable if this notebook is shared.
creds = ServiceAccountCredentials.from_json_keyfile_name("api_keys.json", scope)
client = gspread.authorize(creds)
spreadsheet = client.open_by_key("12L4EkdRqaQ_e42fGHWaTmgCeqQrNgjTfoeAEc5AB6tw")

## Overview

In [3]:
# get data from google sheet
# Fetch the "Overview" worksheet and read all of its records into
# `overview_data`. The processing cell below treats each record as a dict
# (it calls .get / .setdefault on every entry).
overview_sheet = spreadsheet.worksheet("Overview")
overview_data = overview_sheet.get_all_records()

In [4]:
# split a comma-separated string into a list of trimmed items
def parse_list(value):
    """Split a comma-separated string into whitespace-trimmed items.

    Anything that is not a string, and any string that is empty or only
    whitespace, yields an empty list. Empty items between commas are kept
    as empty strings.
    """
    if not isinstance(value, str):
        return []
    if not value.strip():
        return []
    return list(map(str.strip, value.split(",")))

In [5]:
# fallback values applied to overview entries that lack a field
defaults = dict(photos=[], description="")

In [6]:
# process data
for entry in overview_data:
    # Split the raw "photos" cell on commas, then strip the bracket and quote
    # characters left over from a list-style value such as '["a.jpg", "b.jpg"]'.
    raw_photos = entry.get("photos", "")
    cleaned_photos = (photo.strip('[]"') for photo in parse_list(raw_photos))
    # Drop names that are empty after cleaning, so a cell holding "[]" (or a
    # trailing comma) yields [] rather than a list containing "".
    entry["photos"] = [photo for photo in cleaned_photos if photo]

    # set default values for missing keys
    for key, default in defaults.items():
        entry.setdefault(key, default)

In [7]:
# write the processed overview entries to the JSON file used by the JavaScript map
with open("./data/overview.json", "w") as file:
    file.write(json.dumps(overview_data, indent=2))

print("Travel data successfully saved!")

Travel data successfully saved!


## Location and Activity

In [8]:
# load Activity data
activity_sheet = spreadsheet.worksheet("Activity")
activity_data = pd.DataFrame(activity_sheet.get_all_records())

# add activity IDs
# Use the existing column when the sheet has one; otherwise start from an
# all-blank column of the right length so every row receives a fresh ID.
# (An empty-list fallback would make the assignment below fail with a length
# mismatch on any non-empty sheet.)
existing_activity_ids = activity_data.get(
    'activity_id', pd.Series("", index=activity_data.index, dtype=object)
)
# Keep IDs that are already set; generate a UUID string for blank/NaN ones.
activity_data['activity_id'] = [
    str(uuid.uuid4()) if pd.isna(current_id) or current_id == "" else current_id
    for current_id in existing_activity_ids
]

In [9]:
# load Location data
location_sheet = spreadsheet.worksheet("Location")
location_data = pd.DataFrame(location_sheet.get_all_records())

# add unique location IDs if missing
# Fall back to an all-blank column when the sheet has no 'location_id' header,
# so the cell assigns fresh IDs instead of raising KeyError.
existing_location_ids = location_data.get(
    'location_id', pd.Series("", index=location_data.index, dtype=object)
)
# Keep IDs that are already set; generate a UUID string for blank/NaN ones.
location_data['location_id'] = [
    str(uuid.uuid4()) if pd.isna(current_id) or current_id == "" else current_id
    for current_id in existing_location_ids
]

In [10]:
# add any new locations from Activity data to the Location sheet

# get new locations: activity rows whose location_name has no Location row yet
existing_locations = set(location_data['name'])
new_locations = activity_data[~activity_data['location_name'].isin(existing_locations)]

# Build one Location row per distinct new name. Several activities can share a
# location, and each name must end up with a single location_id. Blank or
# missing names are skipped so they do not become an empty location that
# would later be sent to the geocoder.
new_location_names = [
    name
    for name in new_locations['location_name'].drop_duplicates()
    if not pd.isna(name) and str(name).strip() != ""
]

new_locations_to_add = [
    {
        'name': name,
        'location_id': str(uuid.uuid4()),  # new unique location_id
        'location': name,  # geocoding query defaults to the name itself
        'lat': "",  # null placeholder
        'lng': "",  # null placeholder
    }
    for name in new_location_names
]

# append new locations to the existing DataFrame (skipped when there are none,
# so an empty frame is never concatenated)
new_locations_df = pd.DataFrame(
    new_locations_to_add, columns=['name', 'location_id', 'location', 'lat', 'lng']
)
if not new_locations_df.empty:
    location_data = pd.concat([location_data, new_locations_df], ignore_index=True)

In [11]:
# assign Activity.location_id, by matching Activity.location_name with Location.name

# create dict for mapping location names to IDs
# (if a name occurs more than once in Location, the last row's ID wins)
location_name_to_id = dict(zip(location_data['name'], location_data['location_id']))
# assign location IDs to activities, using the mapping dict; names without a
# Location row come back as NaN
activity_data['location_id'] = activity_data['location_name'].map(location_name_to_id)

# print unmatched location names
unmatched_activities = activity_data[activity_data['location_id'].isna()]
if not unmatched_activities.empty:
    print("Unmatched location names:")
    print(unmatched_activities[['activity_id', 'location_name']])
    # default blank string for unmatched location IDs.
    # Assign the result back: an in-place fillna on a column selection is a
    # chained operation that pandas deprecates and that does not update the
    # frame under Copy-on-Write.
    activity_data['location_id'] = activity_data['location_id'].fillna("")

In [12]:
# geocode locations
geolocator = Nominatim(user_agent="geoapi", timeout=10)

# cache for geocoding results, to avoid repeated requests / rate limits
# The cache file is rewritten in place while geocoding runs, so an interrupted
# run can leave it truncated. Treat an unreadable or malformed file like a
# missing one instead of crashing the cell.
try:
    with open('./data/geocode_cache.json', 'r') as cache_file:
        geocode_cache = json.load(cache_file)
except FileNotFoundError:
    geocode_cache = {}
except json.JSONDecodeError as e:
    print(f"Ignoring unreadable geocode cache: {e}")
    geocode_cache = {}

# The lookup code indexes the cache by location name, so it must be a dict.
if not isinstance(geocode_cache, dict):
    print("Ignoring geocode cache with unexpected format")
    geocode_cache = {}

def geocode_location(location_name):
    """Return ``(lat, lng)`` for a place name, or ``(None, None)`` if unresolved.

    The module-level ``geocode_cache`` is consulted first. On a miss the name
    is sent to ``geolocator``; a successful result is stored in the cache
    before being returned. Lookups that find nothing, or that raise, are not
    cached. Errors are printed rather than raised.

    Args:
        location_name: Place name used both as cache key and geocoder query.

    Returns:
        tuple: ``(latitude, longitude)`` on success, ``(None, None)`` when the
        geocoder finds nothing or raises. Every path returns a plain tuple so
        callers can always unpack the result the same way.
    """
    # check cache first
    if location_name in geocode_cache:
        cached = geocode_cache[location_name]
        return cached['lat'], cached['lng']
    try:
        location = geolocator.geocode(location_name)
        if not location:
            return None, None
        lat_lng = {'lat': location.latitude, 'lng': location.longitude}
        geocode_cache[location_name] = lat_lng  # cache result
        return lat_lng['lat'], lat_lng['lng']
    except Exception as e:
        print(f"Error geocoding {location_name}: {e}")
        return None, None

# geocode if lat/lng are missing
def _is_blank(value):
    """True for NaN/None and empty or whitespace-only strings; 0 is a valid coordinate."""
    return pd.isna(value) or (isinstance(value, str) and not value.strip())


lookups_done = 0  # geocoder requests actually sent (cache hits excluded)
for index, row in location_data.iterrows():
    if not (_is_blank(row['lat']) or _is_blank(row['lng'])):
        continue

    place = row['location']
    # announce the lookup before it starts, so a slow or failing request is attributable
    print(f"Geocoding {place}...")
    was_cached = place in geocode_cache
    lat, lng = geocode_location(place)
    location_data.at[index, 'lat'] = lat
    location_data.at[index, 'lng'] = lng

    if was_cached:
        # no request was sent, so no checkpoint or throttling is needed
        continue

    lookups_done += 1
    # save progress every 5 requests (counted by requests made, not by row
    # index, so the checkpoint really fires on every fifth lookup)
    if lookups_done % 5 == 0:
        location_data.to_csv('./data/Location.csv', index=False)
        with open('./data/geocode_cache.json', 'w') as cache_file:
            json.dump(geocode_cache, cache_file)

    # Nominatim's usage policy allows at most one request per second
    sleep(1)

# save final cache
with open('./data/geocode_cache.json', 'w') as cache_file:
    json.dump(geocode_cache, cache_file)

In [13]:
# write both tables to the CSV files used by the JavaScript map (no index column)
for _frame, _csv_path in (
    (location_data, './data/Location.csv'),
    (activity_data, './data/Activity.csv'),
):
    _frame.to_csv(_csv_path, index=False)

print("Location and Activity data successfully saved!")

Location and Activity data successfully saved!


In [14]:
# upload updated data to Google Sheets

# replace NaN or None values with empty strings for Google Sheets compatibility
location_data = location_data.fillna("")
# Same clean-up for the Activity rows, applied to an upload-only copy so the
# in-memory activity_data frame is left as it was.
activity_upload = activity_data.fillna("")

# convert dfs to lists of lists (header row first, then one list per row)
location_data_list = [location_data.columns.values.tolist()] + location_data.values.tolist()
activity_data_list = [activity_upload.columns.values.tolist()] + activity_upload.values.tolist()

# names of sheets whose upload raised, so the final message reflects reality
failed_sheets = []

# upload Location sheet
location_sheet = spreadsheet.worksheet("Location")
try:
    # NOTE: the sheet is cleared before writing; if the update then fails the
    # sheet stays empty until the next successful run.
    location_sheet.clear()  # clear existing data
    location_sheet.update(values=location_data_list, range_name='A1')  # upload new data
except Exception as e:
    print(f"Error updating Location sheet: {e}")
    failed_sheets.append("Location")

# upload Activity sheet
activity_sheet = spreadsheet.worksheet("Activity")
try:
    activity_sheet.clear()  # clear existing data
    activity_sheet.update(values=activity_data_list, range_name='A1')  # upload new data
except Exception as e:
    print(f"Error updating Activity sheet: {e}")
    failed_sheets.append("Activity")

# only claim success when both uploads went through
if failed_sheets:
    print(f"Upload incomplete; failed sheets: {', '.join(failed_sheets)}")
else:
    print("Data successfully uploaded to Google Sheets!")

Data successfully uploaded to Google Sheets!


In [15]:
# # # bounding box for St. John, USVI
# # viewbox = [[18.3839, -64.7922], [18.3080, -64.6823]]

# # # bounding box for Anegada, British Virgin Islands
# # viewbox = [[18.7666, -64.4050], [18.6833, -64.2694]]

# # Bounding box for Cabo Pulmo, Mexico
# viewbox = [[23.4700, -109.4500], [23.4000, -109.3500]]

# # example queries
# queries = ["beach", "bay", "park", "trail"]

# # geocode each query within the bounding box
# for query in queries:
#     location = geolocator.geocode(query, viewbox=viewbox, bounded=True)
#     if location:
#         print(f"{query}: {location.latitude}, {location.longitude} - {location.address}")
#     else:
#         print(f"{query}: Location not found")

In [16]:
# import overpy

# # initialize Overpass API
# api = overpy.Overpass()

# # all points of interest in St. John, USVI
# query = """
# [out:json];
# (
#   node["name"](18.3080,-64.7922,18.3839,-64.6823);
#   way["name"](18.3080,-64.7922,18.3839,-64.6823);
#   relation["name"](18.3080,-64.7922,18.3839,-64.6823);
# );
# out center;
# """

# # run the query
# result = api.query(query)

# # print results
# for node in result.nodes:
#     print(f"Node: {node.tags.get('name', 'Unnamed')} - {node.lat}, {node.lon}")

# for way in result.ways:
#     print(f"Way: {way.tags.get('name', 'Unnamed')}")

# for relation in result.relations:
#     print(f"Relation: {relation.tags.get('name', 'Unnamed')}")