# Imports

In [96]:
import time
import json
import pickle
from collections import Counter

import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import folium
from folium.plugins import HeatMap

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.subplots as sp
from plotly.graph_objects import Scattergeo

import googlemaps
from google.maps import places_v1

from sklearn.cluster import DBSCAN, HDBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity

from geopy.distance import geodesic
from geopy.point import Point

DATAPATH = 'data/'

google_api_key = 'AIzaSyAlkd93UPS_DUWNMronHGGNIwj3gwBbgfo'

In [None]:
# !uv pip install --upgrade googlemaps

[2mUsing Python 3.11.9 environment at tiletracker[0m
[2mResolved [1m6 packages[0m [2min 658ms[0m[0m
[2mPrepared [1m1 package[0m [2min 5.74s[0m[0m
[2mInstalled [1m1 package[0m [2min 49ms[0m[0m
 [32m+[39m [1mgooglemaps[0m[2m==4.10.0[0m


# Helper Function

In [43]:
def create_folium_heatmap(df: pd.DataFrame) -> folium.Map:
    """
    Generates an interactive heatmap using the Folium library from a Pandas DataFrame.
    The map's initial zoom and center are automatically set to fit the data.

    Args:
        df (pd.DataFrame): A Pandas DataFrame expected to contain the following columns:
                           - 'latitude': Numerical column for latitude coordinates.
                           - 'longitude': Numerical column for longitude coordinates.
                           - 'datetime': (Optional, not directly used for heatmap intensity but good for context)
                                         Datetime column.

    Returns:
        folium.Map: A Folium Map object with the heatmap layer added.
                    The map can be saved to an HTML file or displayed in a Jupyter notebook.

    Raises:
        ValueError: If 'latitude' or 'longitude' columns are missing from the DataFrame.
    """
    # Validate required columns
    required_columns = ['latitude', 'longitude']
    if not all(col in df.columns for col in required_columns):
        missing_cols = [col for col in required_columns if col not in df.columns]
        raise ValueError(f"DataFrame must contain the following columns: {required_columns}. Missing: {missing_cols}")

    # Prepare data for heatmap: list of [latitude, longitude] pairs
    # If you wanted to add intensity, you could add a third element [lat, lon, intensity]
    heat_data = [[row['latitude'], row['longitude']] for index, row in df.iterrows()]

    # Calculate the bounds of the data for automatic zooming
    min_lat, max_lat = df['latitude'].min(), df['latitude'].max()
    min_lon, max_lon = df['longitude'].min(), df['longitude'].max()
    bounds = [[min_lat, min_lon], [max_lat, max_lon]]

    # Create a base Folium map, setting the initial view to fit the data bounds
    # The 'fit_bounds' parameter automatically calculates the best center and zoom level
    m = folium.Map(tiles="OpenStreetMap") # Using default tiles, can specify others like 'Stamen Terrain'

    # Add the HeatMap layer to the map
    # You can customize parameters like radius, blur, min_opacity, max_zoom
    HeatMap(heat_data).add_to(m)

    # Fit the map to the calculated bounds after adding the heatmap
    # This ensures the map view is adjusted to show all data points
    m.fit_bounds(bounds)

    return m

# Check Most Recent

In [156]:
file = r'raw\data_2025-06-04.json'
fdf = pd.read_json(DATAPATH + file, orient='columns')
fdf = pd.DataFrame.from_dict(fdf['06c5863b0ea97d00'].result['location_updates'])

fdf['datetime'] = pd.to_datetime(fdf['location_timestamp'], unit='ms', utc=True)
fdf['date'] = fdf['datetime'].dt.date
fdf['time'] = fdf['datetime'].dt.strftime("%H:%M:%S")
fdf = fdf.groupby(['latitude']).last().reset_index().sort_values(by='datetime')
fdf

Unnamed: 0,latitude,tile_uuid,location_timestamp,raw_precision,longitude,precision,datetime,date,time
6717,13.349930,06c5863b0ea97d00,1746559392956,9.651686,103.857286,9.651686,2025-05-06 19:23:12.956000+00:00,2025-05-06,19:23:12
6719,13.349934,06c5863b0ea97d00,1746560917967,9.650000,103.857286,9.650000,2025-05-06 19:48:37.967000+00:00,2025-05-06,19:48:37
6750,13.350070,06c5863b0ea97d00,1746582746828,5.063683,103.857042,5.063683,2025-05-07 01:52:26.828000+00:00,2025-05-07,01:52:26
6721,13.349945,06c5863b0ea97d00,1746582886240,35.000000,103.857295,35.000000,2025-05-07 01:54:46.240000+00:00,2025-05-07,01:54:46
6718,13.349934,06c5863b0ea97d00,1746583553361,9.651686,103.857286,9.651686,2025-05-07 02:05:53.361000+00:00,2025-05-07,02:05:53
...,...,...,...,...,...,...,...,...,...
3681,-34.576145,06c5863b0ea97d00,1748961565275,14.383521,-58.438703,14.383521,2025-06-03 14:39:25.275000+00:00,2025-06-03,14:39:25
3491,-34.577426,06c5863b0ea97d00,1748961719208,13.761205,-58.438666,13.761205,2025-06-03 14:41:59.208000+00:00,2025-06-03,14:41:59
3505,-34.577223,06c5863b0ea97d00,1748961811855,6.217190,-58.439284,6.217190,2025-06-03 14:43:31.855000+00:00,2025-06-03,14:43:31
3503,-34.577230,06c5863b0ea97d00,1748965314229,34.863200,-58.439344,34.863200,2025-06-03 15:41:54.229000+00:00,2025-06-03,15:41:54


# Combine Data

In [158]:
tilenames = { # From pytile
    '0287c8181aa557e7': 'Maya', # On Maya's Camera
    '02df4813aa180c3a': "Maya's Backpack",
    '06c5863b0ea97d00': 'John', # On Sling Backpack
    '06e9828702df2f1f': "John's Backpack",
    'p!0028e4d51b64dafa7db22c75e373903b': "John's iPhone", # No location recorded
    'p!27a7386a743b1de5fd19cf5c3873dea8': "Maya's iPhone", # No location recorded
    }
tilenames_reverse = {val:key for key,val in tilenames.items()}
tile_uuid = tilenames_reverse['John']
RAWDATAPATH = r'C:\Users\joyam\Documents\JohnProjects\TileTracking\data\raw\\'
files = Path(RAWDATAPATH).glob('*.json')
df = []
for file in files:
    fdf = pd.read_json(file, orient='columns')
    fdf = pd.DataFrame.from_dict(fdf[tile_uuid].result['location_updates'])
    df.append(fdf)
df = pd.concat(df, ignore_index=True)
df['datetime'] = pd.to_datetime(df['location_timestamp'], unit='ms', utc=True)
df['date'] = df['datetime'].dt.date
df['time'] = df['datetime'].dt.strftime("%H:%M:%S")
df['tile_name'] = df['tile_uuid'].map(tilenames)
df = df.groupby(['latitude']).last().reset_index().sort_values(by='datetime')
col_order = ['tile_name', 'tile_uuid',
            'location_timestamp','datetime','date','time',
            'latitude','longitude','raw_precision','precision',]
df = df[col_order]
df

Unnamed: 0,tile_name,tile_uuid,location_timestamp,datetime,date,time,latitude,longitude,raw_precision,precision
47297,John,06c5863b0ea97d00,1730098465999,2024-10-28 06:54:25.999000+00:00,2024-10-28,06:54:25,42.096405,-86.488891,12.001812,12.001812
47231,John,06c5863b0ea97d00,1730098924925,2024-10-28 07:02:04.925000+00:00,2024-10-28,07:02:04,42.096386,-86.488880,16.893646,16.893646
47419,John,06c5863b0ea97d00,1730099378101,2024-10-28 07:09:38.101000+00:00,2024-10-28,07:09:38,42.096461,-86.488998,24.000000,24.000000
47206,John,06c5863b0ea97d00,1730100281000,2024-10-28 07:24:41+00:00,2024-10-28,07:24:41,42.096375,-86.488851,8.001208,8.001208
47381,John,06c5863b0ea97d00,1730101200939,2024-10-28 07:40:00.939000+00:00,2024-10-28,07:40:00,42.096440,-86.488870,12.001812,12.001812
...,...,...,...,...,...,...,...,...,...,...
3681,John,06c5863b0ea97d00,1748961565275,2025-06-03 14:39:25.275000+00:00,2025-06-03,14:39:25,-34.576145,-58.438703,14.383521,14.383521
3491,John,06c5863b0ea97d00,1748961719208,2025-06-03 14:41:59.208000+00:00,2025-06-03,14:41:59,-34.577426,-58.438666,13.761205,13.761205
3505,John,06c5863b0ea97d00,1748961811855,2025-06-03 14:43:31.855000+00:00,2025-06-03,14:43:31,-34.577223,-58.439284,6.217190,6.217190
3503,John,06c5863b0ea97d00,1748965314229,2025-06-03 15:41:54.229000+00:00,2025-06-03,15:41:54,-34.577230,-58.439344,34.863200,34.863200


In [159]:
df.to_csv(DATAPATH+f"{tilenames[tile_uuid]}-tile_data_combined.csv")

# Load Data

In [2]:
df = pd.read_csv(DATAPATH + 'John-tile_data_combined.csv')
df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,tile_name,tile_uuid,location_timestamp,datetime,date,time,latitude,longitude,raw_precision,precision,datetime_delta,distance_delta,speed
0,1,47231,John,06c5863b0ea97d00,1730098924925,2024-10-28 07:02:04,2024-10-28,07:02:04,42.096386,-86.488880,16.893646,16.893646,459.0,0.002304,0.000005
1,2,47419,John,06c5863b0ea97d00,1730099378101,2024-10-28 07:09:38,2024-10-28,07:09:38,42.096461,-86.488998,24.000000,24.000000,454.0,0.012819,0.000028
2,3,47206,John,06c5863b0ea97d00,1730100281000,2024-10-28 07:24:41,2024-10-28,07:24:41,42.096375,-86.488851,8.001208,8.001208,903.0,0.015471,0.000017
3,4,47381,John,06c5863b0ea97d00,1730101200939,2024-10-28 07:40:00,2024-10-28,07:40:00,42.096440,-86.488870,12.001812,12.001812,919.0,0.007415,0.000008
4,5,47325,John,06c5863b0ea97d00,1730102094916,2024-10-28 07:54:54,2024-10-28,07:54:54,42.096417,-86.488872,16.000000,16.000000,894.0,0.002556,0.000003
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44149,47481,3681,John,06c5863b0ea97d00,1748961565275,2025-06-03 14:39:25,2025-06-03,14:39:25,-34.576145,-58.438703,14.383521,14.383521,166.0,0.218227,0.001315
44150,47482,3491,John,06c5863b0ea97d00,1748961719208,2025-06-03 14:41:59,2025-06-03,14:41:59,-34.577426,-58.438666,13.761205,13.761205,154.0,0.142432,0.000925
44151,47483,3505,John,06c5863b0ea97d00,1748961811855,2025-06-03 14:43:31,2025-06-03,14:43:31,-34.577223,-58.439284,6.217190,6.217190,92.0,0.060945,0.000662
44152,47484,3503,John,06c5863b0ea97d00,1748965314229,2025-06-03 15:41:54,2025-06-03,15:41:54,-34.577230,-58.439344,34.863200,34.863200,3503.0,0.005495,0.000002


# Data Cleaning

In [3]:
df = pd.read_csv(DATAPATH + 'John-tile_data_combined.csv')
# convert datetime to datetime format
df['datetime'] = pd.to_datetime(df['date'] + ' ' + df['time'])

# add time delta and speed to remove extreme values
df['datetime_delta'] = df['datetime'].diff()[1:].dt.total_seconds()

df['lat_rad'] = df['latitude'] * np.pi/180
df['lat_diff'] = df['lat_rad'].diff()
df['lat_shift'] = df['lat_rad'].shift()
df['lon_rad'] = df['longitude'] * np.pi/180
df['lon_diff'] = df['lon_rad'].diff()
df['lon_shift'] = df['lon_rad'].shift()
df = df.dropna(axis=0)
a = np.sin(df['lat_diff']/2)**2 + np.cos(df['lat_rad']) * np.cos(df['lat_shift']) * np.sin(df['lon_diff']/2)**2
earth_R = 6371 # radius in km
df.loc[:,'distance_delta'] = earth_R * 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
df.loc[:,'speed'] = df['distance_delta'] / df['datetime_delta']
df = df.drop(columns=['lat_rad', 'lat_diff', 'lat_shift', 'lon_rad', 'lon_diff', 'lon_shift'])
# print(df['speed'].quantile(.999))
prelen = len(df)
df = df[df['datetime_delta'] < df['datetime_delta'].quantile(.99)]
print(f"removed {prelen - len(df)} rows for extreme time delta values")

prelen = len(df)
df['speed'] = df['speed'].replace([np.inf, -np.inf], np.nan)*60*60
df = df.dropna(subset='speed')
df = df[df['speed'] < df['speed'].quantile(.999)]
print(f"removed {prelen - len(df)} rows for extreme speed values")

prelen = len(df)
df = df[df['precision'] < 50]
print(f"removed {prelen - len(df)} rows for low precision")

# remove weird mexico city point -- [DEPRECATED] got fixed with speed values
# mexico_city_lat = 19.4326
# mexico_city_lon = -99.1332
# window = 1
# mexcity_index = df[ df['latitude'].between(mexico_city_lat - window, mexico_city_lat + window) \
#          & df['longitude'].between(mexico_city_lon - window, mexico_city_lon + window)].index
# prelen = len(df)
# df = df.drop(index=mexcity_index)
# print(f"removed {prelen - len(df)} rows for mexico city")

df

removed 442 rows for extreme time delta values
removed 44 rows for extreme speed values
removed 0 rows for low precision


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,tile_name,tile_uuid,location_timestamp,datetime,date,time,latitude,longitude,raw_precision,precision,datetime_delta,distance_delta,speed
1,2,47419,John,06c5863b0ea97d00,1730099378101,2024-10-28 07:09:38,2024-10-28,07:09:38,42.096461,-86.488998,24.000000,24.000000,454.0,0.012819,0.101649
2,3,47206,John,06c5863b0ea97d00,1730100281000,2024-10-28 07:24:41,2024-10-28,07:24:41,42.096375,-86.488851,8.001208,8.001208,903.0,0.015471,0.061679
3,4,47381,John,06c5863b0ea97d00,1730101200939,2024-10-28 07:40:00,2024-10-28,07:40:00,42.096440,-86.488870,12.001812,12.001812,919.0,0.007415,0.029047
4,5,47325,John,06c5863b0ea97d00,1730102094916,2024-10-28 07:54:54,2024-10-28,07:54:54,42.096417,-86.488872,16.000000,16.000000,894.0,0.002556,0.010291
5,6,47432,John,06c5863b0ea97d00,1730102988904,2024-10-28 08:09:48,2024-10-28,08:09:48,42.096469,-86.488894,16.000000,16.000000,894.0,0.006095,0.024542
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44148,47480,3684,John,06c5863b0ea97d00,1748961399522,2025-06-03 14:36:39,2025-06-03,14:36:39,-34.574939,-58.440583,14.869825,14.869825,138.0,0.192026,5.009376
44149,47481,3681,John,06c5863b0ea97d00,1748961565275,2025-06-03 14:39:25,2025-06-03,14:39:25,-34.576145,-58.438703,14.383521,14.383521,166.0,0.218227,4.732643
44150,47482,3491,John,06c5863b0ea97d00,1748961719208,2025-06-03 14:41:59,2025-06-03,14:41:59,-34.577426,-58.438666,13.761205,13.761205,154.0,0.142432,3.329579
44151,47483,3505,John,06c5863b0ea97d00,1748961811855,2025-06-03 14:43:31,2025-06-03,14:43:31,-34.577223,-58.439284,6.217190,6.217190,92.0,0.060945,2.384818


In [162]:
df.to_csv(DATAPATH+f"{tilenames[tile_uuid]}-tile_data_combined.csv")

In [142]:
# create_folium_heatmap(tdf[ (pd.to_datetime(tdf['date']) > '11-16-2024') & (pd.to_datetime(tdf['date']) < '12-16-2024') ])

# DBScan for clustering

In [114]:
df.head(2)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,tile_name,tile_uuid,location_timestamp,datetime,date,time,latitude,longitude,raw_precision,precision,datetime_delta,distance_delta,speed,cluster_label
0,1,47231,John,06c5863b0ea97d00,1730098924925,2024-10-28 07:02:04,2024-10-28,07:02:04,42.096386,-86.48888,16.893646,16.893646,459.0,0.002304,5e-06,0
1,2,47419,John,06c5863b0ea97d00,1730099378101,2024-10-28 07:09:38,2024-10-28,07:09:38,42.096461,-86.488998,24.0,24.0,454.0,0.012819,2.8e-05,0


In [4]:
cols = ['latitude','longitude']
scaler = StandardScaler()
coords = scaler.fit_transform(df[cols])
# coords = df[cols]
db = HDBSCAN(min_cluster_size=5, n_jobs=-1).fit(coords) # eps of .01 is around 1.11km
df['place_cluster_label'] = db.labels_
print(len(set(db.labels_)), len(df[df['place_cluster_label'] == -1]))

1919 10608


In [None]:
# Narrow cluster labels down further using logic
# if all of the direction of a cluster is the same, its likely transit

# define bearing -- [DEPRECATED] found direciton similarity to be better, keeping column b/c I like it
def get_bearing(row):
    lat1, lon1 = row['prev_latitude'], row['prev_longitude']
    lat2, lon2 = row['latitude'], row['longitude']
    dLon = lon2 - lon1;
    y = np.sin(dLon) * np.cos(lat2);
    x = np.cos(lat1)*np.sin(lat2) - np.sin(lat1)*np.cos(lat2)*np.cos(dLon);
    brng = np.rad2deg(np.arctan2(y, x));
    if brng < 0: brng+= 360
    return brng
df['prev_latitude'] = df['latitude'].shift(1)
df['prev_longitude'] = df['longitude'].shift(1)
df['bearing_degrees'] = df.apply(get_bearing, axis=1)

# define columns for difference vectors
df['diff_lat'] = df['latitude'] - df['prev_latitude']
df['prev_diff_lat'] = df['diff_lat'].shift(1)
df['diff_lon'] = df['longitude'] - df['prev_longitude']
df['prev_diff_lon'] = df['diff_lon'].shift(1)

# find direction similarity for difference vectors
def direction_similarity(row):
    a = np.array(row[['prev_diff_lat','prev_diff_lon']]).reshape(1,-1)
    b = np.array(row[['diff_lat','diff_lon']]).reshape(1,-1)
    return cosine_similarity(a,b)[0,0]
df = df.dropna(subset='prev_diff_lat')
df['direction_similarity'] = df.apply(direction_similarity, axis=1)

prev_len = df['place_cluster_label'].nunique()
for cluster_label in list(df['place_cluster_label'].unique()):
    if (cluster_label == -1) | (cluster_label == -2): continue
    cluster = df[df['place_cluster_label'] == cluster_label]
    
    if cluster['direction_similarity'].iloc[1:].mean() > .25: #remove first point because it will reference a point not in the cluster
        cluster_idx = df[df['place_cluster_label'] == cluster_label].index
        df.loc[cluster_idx,'place_cluster_label'] = -3
print(f"{len(df[df['place_cluster_label']==-3])} points labelled as transit (-3)")
print(f"reduced clusters by {prev_len - df['place_cluster_label'].nunique()} from {prev_len} to {df['place_cluster_label'].nunique()}")

# drop unneccessary columns
unnamed = [col for col in df.columns if 'unnamed' in col.lower()]
df = df.drop(columns=['prev_latitude','prev_longitude','diff_lat','prev_diff_lat','diff_lon','prev_diff_lon'] + unnamed)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['direction_similarity'] = df.apply(direction_similarity, axis=1)


8860 points labelled as transit (-3)
reduced clusters by 833 from 1919 to 1086


In [6]:
oneday = df[df.date.isin(list(df.date.unique())[-110:-90])] # -110:-90 , -60:-40
oneday = oneday[oneday['place_cluster_label'] > -1]
oneday.loc[:,'color'] = 'red'

fig = make_subplots(rows=1, cols=2) 
fig = px.scatter_geo(oneday, 
                     lat="latitude",
                     lon='longitude',
                     color="place_cluster_label",
                     scope='world',
                     fitbounds='locations',
                     hover_data=['direction_similarity','speed']
                    )
bounds_add = 10
fig.add_traces(Scattergeo(lat=[oneday.latitude.min()-bounds_add, oneday.latitude.max()+bounds_add],
                                               lon=[oneday.longitude.min()-bounds_add, oneday.longitude.max()+bounds_add],
                                               mode = 'markers', marker = dict(size = 2,color = 'rgba(0, 0, 0, 0)')))
fig.update_geos(resolution=50)


fig.show()

# Google Maps API

In [None]:
"""
To run:
Open 'Google Cloud SDK Shell' and run: 
gcloud init

then run: 
gcloud auth application-default login

sign in
"""

In [7]:
# Setup Places Client
places = places_v1.PlacesAsyncClient(
  # Instantiates the Places client, passing the API key
  client_options={"api_key": f"{google_api_key}"}
)
print(places.__getstate__())

# Set up client that includes Reverse Geocoding
gmaps = googlemaps.Client(key=f"{google_api_key}")
gmaps.__getstate__()

{'_client': <google.maps.places_v1.services.places.client.PlacesClient object at 0x00000268322EFC50>}


{'session': <requests.sessions.Session at 0x26832386ed0>,
 'key': 'AIzaSyAlkd93UPS_DUWNMronHGGNIwj3gwBbgfo',
 'timeout': None,
 'client_id': None,
 'client_secret': None,
 'channel': None,
 'retry_timeout': datetime.timedelta(seconds=60),
 'requests_kwargs': {'headers': {'User-Agent': 'GoogleGeoApiClientPython/4.10.0'},
  'timeout': None,
  'verify': True},
 'queries_per_second': 60,
 'queries_per_minute': 6000,
 'queries_quota': 60,
 'retry_over_query_limit': True,
 'sent_times': deque([], maxlen=60),
 'base_url': 'https://maps.googleapis.com'}

In [455]:
df[df['place_cluster_label']==list(df['place_cluster_label'].unique())[0]][['latitude','longitude']].mean()

latitude     42.096262
longitude   -86.488917
dtype: float64

In [8]:
# look up an address
address_descriptor_result = gmaps.reverse_geocode((42.096262, -86.488917)) # , enable_address_descriptor=True

In [None]:
# Request reverse geocoding from google api
total_len = df['place_cluster_label'].nunique()
geocode_results = {}
for i, cluster_label in enumerate(list(df['place_cluster_label'].unique())):
    if i%50 == 0:
        print(f"{100*(i/total_len):.1f}% Complete")
    # reverse geocode the mean lat and lon of the cluster
    lat, lon = df[df['place_cluster_label'] == cluster_label][['latitude','longitude']].mean().values
    geocode_results[str(cluster_label)] = gmaps.reverse_geocode((lat, lon))
    time.sleep(.02) # to stay under the 3000 requests per minute ~ .02 sec per request

# Save Result immediately so we dont have to do it again
with open(DATAPATH+'cluster_geocoding.json','w+') as f:
    json.dump(geocode_results, f)

0.0% Complete
4.6040515653775325% Complete
9.208103130755065% Complete
13.812154696132598% Complete
18.41620626151013% Complete
23.02025782688766% Complete
27.624309392265197% Complete
32.22836095764273% Complete
36.83241252302026% Complete
41.43646408839779% Complete
46.04051565377532% Complete
50.64456721915286% Complete
55.24861878453039% Complete
59.852670349907925% Complete
64.45672191528546% Complete
69.06077348066299% Complete
73.66482504604052% Complete
78.26887661141805% Complete
82.87292817679558% Complete
87.47697974217311% Complete
92.08103130755065% Complete
96.68508287292818% Complete


In [126]:
# key = list(geocode_results.keys())[401]
parse = {}
for key in list(geocode_results.keys()):
    # location_types = [geocode_results[key][i]['geometry']['location_type']for i in range(len(geocode_results[key]))]
    for address in geocode_results[key]:
        loc_type = address['geometry']['location_type']        
        if loc_type in parse.keys():
            parse[loc_type].update(address['types'])
        else:
            parse[loc_type] = Counter(address['types'])
parse # looks like we can remove ['RANGE_INTERPOLATED', 'APPROXIMATE'] since the types don't add much value

{'ROOFTOP': Counter({'street_address': 1927,
          'establishment': 779,
          'point_of_interest': 779,
          'premise': 761,
          'subpremise': 450,
          'food': 192,
          'store': 143,
          'restaurant': 103,
          'lodging': 87,
          'airport': 71,
          'cafe': 57,
          'health': 50,
          'tourist_attraction': 37,
          'transit_station': 23,
          'bar': 19,
          'park': 18,
          'place_of_worship': 16,
          'hair_care': 16,
          'museum': 15,
          'bakery': 15,
          'parking': 15,
          'clothing_store': 11,
          'shopping_mall': 11,
          'grocery_or_supermarket': 10,
          'storage': 10,
          'subway_station': 10,
          'home_goods_store': 9,
          'travel_agency': 9,
          'liquor_store': 9,
          'jewelry_store': 6,
          'finance': 6,
          'supermarket': 6,
          'electronics_store': 5,
          'night_club': 5,
          'laundry'

In [152]:
for cluster_label in list(df['place_cluster_label'].unique()):
        if cluster_label == -3: # labelled as transit from above
                cluster_idx = df[df['place_cluster_label']==cluster_label].index
                df.loc[cluster_idx,'possible_tags'] = str(['transit'])
                df.loc[cluster_idx,'possible_place_ids'] = str(['none'])
                df.loc[cluster_idx,'possible_formatted_addresses'] = str(['none'])
        elif cluster_label == -1: # anything unlabelled
                cluster_idx = df[df['place_cluster_label']==cluster_label].index
                df.loc[cluster_idx,'possible_tags'] = str(['outlier'])
                df.loc[cluster_idx,'possible_place_ids'] = str(['none'])
                df.loc[cluster_idx,'possible_formatted_addresses'] = str(['none'])
        else:
                cluster_idx = df[df['place_cluster_label']==cluster_label].index
                # retrieve the address tags from geocode results for each cluster
                tags = [geocode_results[str(cluster_label)][i]['types'] for i in range(len(geocode_results[str(cluster_label)])) \
                        if geocode_results[str(cluster_label)][i]['geometry']['location_type'] not in ['RANGE_INTERPOLATED', 'APPROXIMATE']]
                tags = [item[i] for item in tags for i in range(len(item))]
                df.loc[cluster_idx,'possible_tags'] = str(tags)

                # Get possible place_ids
                place_ids = [geocode_results[str(cluster_label)][i]['place_id'] for i in range(len(geocode_results[str(cluster_label)])) \
                        if geocode_results[str(cluster_label)][i]['geometry']['location_type'] not in ['RANGE_INTERPOLATED', 'APPROXIMATE']]
                df.loc[cluster_idx,'possible_place_ids'] = str(place_ids)

                # get possible addresses
                addresses = [geocode_results[str(cluster_label)][i]['formatted_address'] for i in range(len(geocode_results[str(cluster_label)])) \
                        if geocode_results[str(cluster_label)][i]['geometry']['location_type'] not in ['RANGE_INTERPOLATED', 'APPROXIMATE']]
                df.loc[cluster_idx,'possible_formatted_addresses'] = str(addresses)
try:
        unnamed = [col for col in df.columns if 'unnamed' in col.lower()]
        df = df.drop(columns=['prev_latitude','prev_longitude','diff_lat','prev_diff_lat','diff_lon','prev_diff_lon'] + unnamed)      
except:
        donothing = 0          
df

Unnamed: 0,tile_name,tile_uuid,location_timestamp,datetime,date,time,latitude,longitude,raw_precision,precision,datetime_delta,distance_delta,speed,place_cluster_label,bearing_degrees,direction_similarity,possible_tags,possible_place_ids,possible_formatted_addresses
3,John,06c5863b0ea97d00,1730101200939,2024-10-28 07:40:00,2024-10-28,07:40:00,42.096440,-86.488870,12.001812,12.001812,919.0,0.007415,0.029047,0,5.109931,-0.724454,"['premise', 'street_address', 'plus_code', 'ro...","['ChIJVanwJxvEEIgRwJ4UKzCpkfE', 'GhIJU22BTlIMR...","['536 La Salle Ave, St Joseph, MI 49085, USA',..."
4,John,06c5863b0ea97d00,1730102094916,2024-10-28 07:54:54,2024-10-28,07:54:54,42.096417,-86.488872,16.000000,16.000000,894.0,0.002556,0.010291,0,178.622805,-0.936471,"['premise', 'street_address', 'plus_code', 'ro...","['ChIJVanwJxvEEIgRwJ4UKzCpkfE', 'GhIJU22BTlIMR...","['536 La Salle Ave, St Joseph, MI 49085, USA',..."
5,John,06c5863b0ea97d00,1730102988904,2024-10-28 08:09:48,2024-10-28,08:09:48,42.096469,-86.488894,16.000000,16.000000,894.0,0.006095,0.024542,0,7.661282,-0.883735,"['premise', 'street_address', 'plus_code', 'ro...","['ChIJVanwJxvEEIgRwJ4UKzCpkfE', 'GhIJU22BTlIMR...","['536 La Salle Ave, St Joseph, MI 49085, USA',..."
6,John,06c5863b0ea97d00,1730104784777,2024-10-28 08:39:44,2024-10-28,08:39:44,42.096453,-86.489019,16.002417,16.002417,1796.0,0.010467,0.020981,0,112.607823,0.277553,"['premise', 'street_address', 'plus_code', 'ro...","['ChIJVanwJxvEEIgRwJ4UKzCpkfE', 'GhIJU22BTlIMR...","['536 La Salle Ave, St Joseph, MI 49085, USA',..."
7,John,06c5863b0ea97d00,1730105461973,2024-10-28 08:51:01,2024-10-28,08:51:01,42.096386,-86.488880,16.893646,16.893646,677.0,0.013721,0.072965,0,212.867165,-0.838858,"['premise', 'street_address', 'plus_code', 'ro...","['ChIJVanwJxvEEIgRwJ4UKzCpkfE', 'GhIJU22BTlIMR...","['536 La Salle Ave, St Joseph, MI 49085, USA',..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44148,John,06c5863b0ea97d00,1748961399522,2025-06-03 14:36:39,2025-06-03,14:36:39,-34.574939,-58.440583,14.869825,14.869825,138.0,0.192026,5.009376,-1,238.837318,0.901037,['outlier'],['none'],['none']
44149,John,06c5863b0ea97d00,1748961565275,2025-06-03 14:39:25,2025-06-03,14:39:25,-34.576145,-58.438703,14.383521,14.383521,166.0,0.218227,4.732643,-1,237.325226,0.999652,['outlier'],['none'],['none']
44150,John,06c5863b0ea97d00,1748961719208,2025-06-03 14:41:59,2025-06-03,14:41:59,-34.577426,-58.438666,13.761205,13.761205,154.0,0.142432,3.329579,-1,181.639633,0.563664,['outlier'],['none'],['none']
44151,John,06c5863b0ea97d00,1748961811855,2025-06-03 14:43:31,2025-06-03,14:43:31,-34.577223,-58.439284,6.217190,6.217190,92.0,0.060945,2.384818,-1,71.871699,-0.338168,['outlier'],['none'],['none']


In [None]:
"""
Google Places API (Newer version of above)
- subject to more rate and usage limits
- need to use geocoding api to get 'place_id' anyway
"""
# [geocode_results[key][0]['place_id']] # each result has many possible place_id
# async def place_details(place_id):
#   client = places_v1.PlacesAsyncClient()
#   # Build the request
#   request = places_v1.GetPlaceRequest(
#       name=f"places/{place_id}",
#   )
#   # Set the field mask
#   fieldMask = "*"
#   # Make the request
#   response = await client.get_place(request=request, metadata=[("x-goog-fieldmask",fieldMask)])
#   return response
# place_deets = await place_details(place_id=place_id)

# Save Versioned Data

In [161]:
# save versioned data -- only uncomment when new version
version = len(list(Path('data/versioned').glob("*.csv")))
filename = f"tiledata_v{version}.csv"
df.to_csv(f"data/versioned/{filename}")
print(f"Successfully saved data: {filename}")

# save geocoding data
filename = f"tiledata_v{version}_cluster_geocoding.json"
with open(f"data/versioned/{filename}",'w+') as f:
    json.dump(geocode_results, f)
print(f"Successfully saved geocode: {filename}")

# Save versioned HDBSCAN model
with open(f"models/hdbscan_v{version}.pkl",'wb+') as f:
    pickle.dump(db, f)
print(f"Successfully saved model: hdbscan_v{version}.pkl")

Successfully saved data: tiledata_v0.csv
Successfully saved geocode: tiledata_v0_cluster_geocoding.json
Successfully saved model: hdbscan_v0.pkl


# Bottom