# Imports

In [1]:
# Third Party Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import folium
from folium.plugins import HeatMap
import plotly.express as px
from plotly.subplots import make_subplots
from plotly.graph_objects import Scattergeo
import cartopy.crs as ccrs
import cartopy
from anyascii import anyascii
from sqlalchemy import create_engine
from dotenv import load_dotenv
import imageio

# Native
import json
import time
import importlib
import os
import datetime

# Custom
# Reload the local modules below every time their source changes
import data_utils.utils
importlib.reload(data_utils.utils)
from data_utils.utils import cluster_data, reduce_clusters

import data_utils.weather_api
importlib.reload(data_utils.weather_api)
from data_utils.weather_api import Weather_API

# Directory prefix for staged data files. Built with os.path.join so the
# separator matches the host OS; the trailing '' keeps a trailing separator,
# so `STAGEDDATAPATH + 'file.csv'` keeps working.
STAGEDDATAPATH = os.path.join('..', 'data', 'staged', '')

hello from data_utils/__init__.py


In [7]:
# Tiny wide-to-long example: every non-'tag' column becomes a (variable, value) row.
demo_columns = {
    'tag': ['a', 'b'],
    'total': [100, 200],
    'delta': [10, 20],
}
df = pd.DataFrame(demo_columns, index=[0, 1])
df.melt(id_vars='tag')

Unnamed: 0,tag,variable,value
0,a,total,100
1,b,total,200
2,a,delta,10
3,b,delta,20


# Load Data

In [111]:
# Staged address table; the saved index column ('Unnamed: 0') is discarded.
df = pd.read_csv(f"{STAGEDDATAPATH}addresses.csv").drop(columns=['Unnamed: 0'])

# Geocoding results stored as JSON next to the address table.
with open(f"{STAGEDDATAPATH}geocode_results.json", 'r') as geocode_file:
    geocode_results = json.load(geocode_file)

df.head(10)

Unnamed: 0,cluster_label,address
0,-3,none
1,-1,none
2,2778,"536 La Salle Ave, St Joseph, MI 49085, USA"
3,2778,"3GW6+HC St. Joseph, MI, USA"
4,2778,"599-501 La Salle Ave, St Joseph, MI 49085, USA"
5,2794,"536 La Salle Ave, St Joseph, MI 49085, USA"
6,2794,"3GW6+HC St. Joseph, MI, USA"
7,2794,"599-501 La Salle Ave, St Joseph, MI 49085, USA"
8,2310,"534 La Salle Ave, St Joseph, MI 49085, USA"
9,2310,"599-501 La Salle Ave, St Joseph, MI 49085, USA"


In [168]:
# For every address, find its most frequent cluster_label. pd.Series.mode can
# return several values on a tie; explode() turns those into one row each.
address_modes = df.groupby('address')['cluster_label'].agg(pd.Series.mode)
tdf = (
    address_modes
    .to_frame()
    .reset_index()
    .rename(columns={'cluster_label': 'norm_cluster_label'})
    .explode('norm_cluster_label')
)

# Attach the normalized label ('norm_cluster_label') to every row of df.
df_with_both_labels = df.merge(tdf, on='address', how='left')

# Lookup dict: original cluster_label -> normalized label.
# A cluster_label that appears on several rows keeps the value of its last row.
cluster_map = (
    df_with_both_labels
    .set_index('cluster_label')['norm_cluster_label']
    .to_dict()
)
# The special labels -1 and -3 always map to themselves.
cluster_map.update({-1: -1, -3: -3})

In [170]:
# Load the per-cluster address components and add the normalized label
# looked up through cluster_map (labels missing from the map become NaN).
cdf = pd.read_csv(f"{STAGEDDATAPATH}cluster_address.csv").assign(
    new_cluster_label=lambda frame: frame['cluster_label'].map(cluster_map)
)
cdf
# print(f"Original Clusters: {df['cluster_label'].nunique()}\nNew Clusters: {cdf['cluster_label'].nunique()}")

Unnamed: 0.1,Unnamed: 0,cluster_label,administrative_area_level_1,administrative_area_level_2,administrative_area_level_3,administrative_area_level_4,street_number,route,neighborhood,locality,country,postal_code,postal_code_suffix,plus_code,norm_cluster_label,new_cluster_label
0,0,2778,Michigan,Berrien County,,,536,La Salle Avenue,,St. Joseph,United States,49085,1631,,,2844
1,1,2794,Michigan,Berrien County,,,536,La Salle Avenue,,St. Joseph,United States,49085,1631,,,2844
2,2,2310,Michigan,Berrien County,,,534,La Salle Avenue,,St. Joseph,United States,49085,1631,,,2815
3,3,2790,Michigan,Berrien County,,,536,La Salle Avenue,,St. Joseph,United States,49085,1631,,,2844
4,4,-1,,,,,,,,,,,,,,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2104,2104,10,Ciudad Autonoma de Buenos Aires,Comuna 14,,,5162,Avenida Santa Fe,Palermo Hollywood,Buenos Aires,Argentina,C1425,,,,10
2105,2105,1613,Ciudad Autonoma de Buenos Aires,Comuna 14,,,37,Amenabar,,Buenos Aires,Argentina,C1426,AIA,,,1681
2106,2106,1678,Ciudad Autonoma de Buenos Aires,Comuna 14,,,37,Amenabar,,Buenos Aires,Argentina,C1426,AIA,,,1681
2107,2107,1679,Ciudad Autonoma de Buenos Aires,Comuna 14,,,37,Amenabar,,Buenos Aires,Argentina,C1426,AIA,,,1681


# Extract City, Country from addresses

In [9]:
# Work on a copy so df itself is left untouched.
cdf = df.copy(deep=True)

# Display one row per distinct address (first non-null value of each column).
(
    cdf
    .groupby('address')
    .first()
    .reset_index()
)

Unnamed: 0.1,address,Unnamed: 0,cluster_label
0,"0-1-12, Khu văn phòng Tầng 1 Nhà ga hành khách...",4736,1586
1,"01 Fansipan, TT. Sa Pa, Sa Pa, Lào Cai 19000, ...",5307,1461
2,"01 Fansipan, TT. Sa Pa, Sa Pa, Lào Cai, Vietnam",5308,1461
3,"01 Lê Duẩn, Phú Hoà, Huế, Thành phố Huế, Vietnam",3868,1413
4,"01 Lê Đình Thám, Cẩm Sơn, Hội An, Quảng Nam, V...",4331,2213
...,...,...,...
3607,"百老匯美食街，H-G004-G005, Macao",7956,226
3608,"金鐘金鐘道88號太古廣場二座LG1地, 庫009號舖, 88 Queensway, Admi...",7793,1385
3609,"산1-2 yongsan-dong 2(i)-ga, Yongsan District, S...",727,907
3610,"산132-1 Bulgwang-dong, Eunpyeong District, Seou...",751,123


In [48]:
# Address-component types kept for each geocoded location.
loc_tags = [
    'administrative_area_level_1', 'administrative_area_level_2',
    'administrative_area_level_3', 'administrative_area_level_4',
    'street_number', 'route', 'neighborhood', 'locality', 'country',
    'postal_code', 'postal_code_suffix', 'plus_code',
]

# Time the extraction for a single sample entry (index 1000).
key = list(geocode_results)[1000]
start = time.time()

# Collect long_name by the first type of each component of the first result.
found_components = {}
for comp in geocode_results[key][0]['address_components']:
    primary_type = comp['types'][0]
    if primary_type in loc_tags:
        found_components[primary_type] = comp['long_name']

# One entry per tag, in loc_tags order; tags that were not found become NaN.
loc_info = {tag: found_components.get(tag, np.nan) for tag in loc_tags}
print(f"Took {time.time() - start} seconds")
loc_info

Took 0.0008561611175537109 seconds


{'administrative_area_level_1': 'Quảng Nam',
 'administrative_area_level_2': nan,
 'administrative_area_level_3': nan,
 'administrative_area_level_4': nan,
 'street_number': '102',
 'route': 'Đường Bạch Đằng',
 'neighborhood': nan,
 'locality': 'Thành phố Hội An',
 'country': 'Vietnam',
 'postal_code': nan,
 'postal_code_suffix': nan,
 'plus_code': nan}

In [None]:
key = list(geocode_results)[1000]
start = time.time()

# Pass 1: keep only the components whose first type is one of loc_tags.
parsed_components = {}
for comp in geocode_results[key][0]['address_components']:
    if comp['types'][0] in loc_tags:
        parsed_components[comp['types'][0]] = comp['long_name']

# Pass 2: start from an all-NaN dict in loc_tags order, then overlay what was found.
loc_info = {**dict.fromkeys(loc_tags, np.nan), **parsed_components}

print(f"Took {time.time() - start} seconds")
loc_info

TypeError: 'float' object is not iterable

In [56]:
# NOTE(review): `key` is not set in this cell — it is whatever an earlier cell
# (or loop) left behind; confirm the intended entry before relying on the output.
address_components = geocode_results[key][0]['address_components']

# Transliterate each kept long_name to ASCII (via str(), so non-string values
# are accepted); a None long_name is stored as NaN.
parsed_components = {
    comp['types'][0]: (
        anyascii(str(comp['long_name']))
        if comp['long_name'] is not None
        else np.nan
    )
    for comp in address_components
    if comp['types'][0] in loc_tags
}

# Final record: one entry per tag in loc_tags order, NaN where nothing was found.
loc_info = dict(zip(loc_tags, (parsed_components.get(tag, np.nan) for tag in loc_tags)))

loc_info

{'administrative_area_level_1': 'Quang Nam',
 'administrative_area_level_2': nan,
 'administrative_area_level_3': nan,
 'administrative_area_level_4': nan,
 'street_number': '102',
 'route': 'Duong Bach Dang',
 'neighborhood': nan,
 'locality': 'Thanh pho Hoi An',
 'country': 'Vietnam',
 'postal_code': nan,
 'postal_code_suffix': nan,
 'plus_code': nan}

In [53]:
# Gather the first type of every address component of each entry's first result.
types = set()
for key, results in geocode_results.items():
    types |= {comp['types'][0] for comp in results[0]['address_components']}
# types

# Weather API

In [21]:
# Replace df with the staged tile location history.
tile_csv_path = f"{STAGEDDATAPATH}tile_data_John.csv"
df = pd.read_csv(tile_csv_path)
df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,tile_name,tile_uuid,location_timestamp,datetime,date,time,latitude,longitude,raw_precision,precision,cluster_label,bearing,direction_similarity,norm_cluster_label
0,0,0,John,06c5863b0ea97d00,1730098465999,2024-10-28 06:54:25.999000+00:00,2024-10-28,06:54:25,42.096405,-86.488891,12.001812,12.001812,2778,,,2844
1,1,1,John,06c5863b0ea97d00,1730098924925,2024-10-28 07:02:04.925000+00:00,2024-10-28,07:02:04,42.096386,-86.488880,16.893646,16.893646,2794,156.676291,,2844
2,2,2,John,06c5863b0ea97d00,1730098936999,2024-10-28 07:02:16.999000+00:00,2024-10-28,07:02:16,42.096405,-86.488891,12.001812,12.001812,2778,336.676298,-1.000000,2844
3,3,3,John,06c5863b0ea97d00,1730099378101,2024-10-28 07:09:38.101000+00:00,2024-10-28,07:09:38,42.096461,-86.488998,24.000000,24.000000,2310,304.928871,0.843880,2815
4,4,4,John,06c5863b0ea97d00,1730100281000,2024-10-28 07:24:41+00:00,2024-10-28,07:24:41,42.096375,-86.488851,8.001208,8.001208,2790,128.250727,-0.998689,2844
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65593,65593,65593,John,06c5863b0ea97d00,1749045577311,2025-06-04 13:59:37.311000+00:00,2025-06-04,13:59:37,-34.577209,-58.439284,6.217190,6.217190,1210,0.000000,0.000000,1211
65594,65594,65594,John,06c5863b0ea97d00,1749046410085,2025-06-04 14:13:30.085000+00:00,2025-06-04,14:13:30,-34.577209,-58.439284,6.217190,6.217190,1210,0.000000,0.000000,1211
65595,65595,65595,John,06c5863b0ea97d00,1749046569985,2025-06-04 14:16:09.985000+00:00,2025-06-04,14:16:09,-34.577223,-58.439284,6.217190,6.217190,1211,180.000000,0.000000,1681
65596,65596,65596,John,06c5863b0ea97d00,1749046840618,2025-06-04 14:20:40.618000+00:00,2025-06-04,14:20:40,-34.577223,-58.439284,6.217190,6.217190,1211,0.000000,0.000000,1681


In [131]:
weather = Weather_API()

# Restrict to rows dated before 2024-11-02 (string comparison on 'date') and
# keep only the columns handed to the weather lookup.
before_cutoff = df['date'] < '2024-11-02'
smalldf = df.loc[before_cutoff, ['date', 'time', 'latitude', 'longitude']].copy()

weather.get_weather(smalldf)
weather.weather_df

Unnamed: 0,date,time,weather_hour,latitude,longitude,elevation_meters_asl,temperature_2m,relative_humidity_2m,rain,weather_code,wind_speed_10m,cloud_cover,cloud_cover_low,cloud_cover_high,cloud_cover_mid,is_day,sunshine_duration,precipitation,snowfall,apparent_temperature,pressure_msl,surface_pressure,wind_direction_10m,wind_gusts_10m
0,2024-10-28,06:54:25,2024-10-28 06:00:00,42.096405,-86.488891,205.0,6.9655,64.447319,0.0,1.0,14.348156,28.0,0.0,28.0,0.0,0.0,0.000000,0.0,0.0,2.861774,1023.700012,998.482300,160.201035,23.400000
1,2024-10-28,07:02:04,2024-10-28 07:00:00,42.096386,-86.488880,205.0,6.7155,64.622864,0.0,3.0,15.244842,98.0,0.0,98.0,0.0,0.0,0.000000,0.0,0.0,2.448909,1023.500000,998.265259,157.067871,25.199999
2,2024-10-28,07:02:16,2024-10-28 07:00:00,42.096405,-86.488891,205.0,6.7155,64.622864,0.0,3.0,15.244842,98.0,0.0,98.0,0.0,0.0,0.000000,0.0,0.0,2.448909,1023.500000,998.265259,157.067871,25.199999
3,2024-10-28,07:09:38,2024-10-28 07:00:00,42.096461,-86.488998,205.0,6.7155,64.622864,0.0,3.0,15.244842,98.0,0.0,98.0,0.0,0.0,0.000000,0.0,0.0,2.448909,1023.500000,998.265259,157.067871,25.199999
4,2024-10-28,07:24:41,2024-10-28 07:00:00,42.096375,-86.488851,205.0,6.7155,64.622864,0.0,3.0,15.244842,98.0,0.0,98.0,0.0,0.0,0.000000,0.0,0.0,2.448909,1023.500000,998.265259,157.067871,25.199999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
702,2024-11-01,18:06:50,2024-11-01 18:00:00,42.096386,-86.488880,205.0,8.6655,70.084183,0.0,3.0,13.501200,100.0,100.0,0.0,0.0,1.0,0.000000,0.0,0.0,5.179919,1026.500000,1001.363647,317.161102,29.160000
703,2024-11-01,18:09:51,2024-11-01 18:00:00,42.096386,-86.488880,205.0,8.6655,70.084183,0.0,3.0,13.501200,100.0,100.0,0.0,0.0,1.0,0.000000,0.0,0.0,5.179919,1026.500000,1001.363647,317.161102,29.160000
704,2024-11-01,18:19:32,2024-11-01 18:00:00,42.096375,-86.488880,205.0,8.6655,70.084183,0.0,3.0,13.501200,100.0,100.0,0.0,0.0,1.0,0.000000,0.0,0.0,5.179919,1026.500000,1001.363647,317.161102,29.160000
705,2024-11-01,18:22:02,2024-11-01 18:00:00,42.096375,-86.488880,205.0,8.6655,70.084183,0.0,3.0,13.501200,100.0,100.0,0.0,0.0,1.0,0.000000,0.0,0.0,5.179919,1026.500000,1001.363647,317.161102,29.160000


In [133]:
# Inspect the raw API responses held by the Weather_API instance
# (the recorded output shows a dict keyed by date string).
weather.responses

{'2024-10-28': <openmeteo_sdk.WeatherApiResponse.WeatherApiResponse at 0x1599ed76a70>,
 '2024-10-29': <openmeteo_sdk.WeatherApiResponse.WeatherApiResponse at 0x1599b263760>,
 '2024-10-30': <openmeteo_sdk.WeatherApiResponse.WeatherApiResponse at 0x1599e234400>,
 '2024-10-31': <openmeteo_sdk.WeatherApiResponse.WeatherApiResponse at 0x159a7d817b0>,
 '2024-11-01': <openmeteo_sdk.WeatherApiResponse.WeatherApiResponse at 0x159a7d81de0>}

In [None]:
# Truncate every observation to its hour so it can be joined to hourly weather.
smalldf['hour'] = pd.to_datetime(smalldf['time'], format="%H:%M:%S").dt.hour
hour_text = smalldf['date'].astype(str) + ' ' + smalldf['hour'].astype(str)
smalldf['weather_hour'] = pd.to_datetime(hour_text, format="%Y-%m-%d %H")

# Left join keeps every observation; clashing right-hand columns get '_right'.
merged = smalldf.merge(
    weather.hourly_df,
    how='left',
    left_on='weather_hour',
    right_on='date_hour',
    suffixes=[None, '_right'],
)

# Drop the join helpers and every duplicated right-hand column.
right_duplicates = [name for name in merged if '_right' in name.lower()]
merged = merged.drop(columns=['date_hour', 'datetime', 'hour', *right_duplicates])

# Identification columns first, weather variables afterwards in their existing order.
leading_cols = ['date', 'time', 'weather_hour', 'latitude', 'longitude']
trailing_cols = [name for name in merged.columns if name not in leading_cols]
smalldf = merged[leading_cols + trailing_cols]
smalldf

Unnamed: 0,date,time,weather_hour,latitude,longitude,elevation_meters_asl,temperature_2m,relative_humidity_2m,rain,weather_code,wind_speed_10m,cloud_cover,cloud_cover_low,cloud_cover_high,cloud_cover_mid,is_day,sunshine_duration,precipitation,snowfall,apparent_temperature,pressure_msl,surface_pressure,wind_direction_10m,wind_gusts_10m
0,2024-10-28,06:54:25,2024-10-28 06:00:00,42.096405,-86.488891,205.0,6.9655,64.447319,0.0,1.0,14.348156,28.0,0.0,28.0,0.0,0.0,0.000000,0.0,0.0,2.861774,1023.700012,998.482300,160.201035,23.400000
1,2024-10-28,07:02:04,2024-10-28 07:00:00,42.096386,-86.488880,205.0,6.7155,64.622864,0.0,3.0,15.244842,98.0,0.0,98.0,0.0,0.0,0.000000,0.0,0.0,2.448909,1023.500000,998.265259,157.067871,25.199999
2,2024-10-28,07:02:16,2024-10-28 07:00:00,42.096405,-86.488891,205.0,6.7155,64.622864,0.0,3.0,15.244842,98.0,0.0,98.0,0.0,0.0,0.000000,0.0,0.0,2.448909,1023.500000,998.265259,157.067871,25.199999
3,2024-10-28,07:09:38,2024-10-28 07:00:00,42.096461,-86.488998,205.0,6.7155,64.622864,0.0,3.0,15.244842,98.0,0.0,98.0,0.0,0.0,0.000000,0.0,0.0,2.448909,1023.500000,998.265259,157.067871,25.199999
4,2024-10-28,07:24:41,2024-10-28 07:00:00,42.096375,-86.488851,205.0,6.7155,64.622864,0.0,3.0,15.244842,98.0,0.0,98.0,0.0,0.0,0.000000,0.0,0.0,2.448909,1023.500000,998.265259,157.067871,25.199999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
702,2024-11-01,18:06:50,2024-11-01 18:00:00,42.096386,-86.488880,205.0,8.6655,70.084183,0.0,3.0,13.501200,100.0,100.0,0.0,0.0,1.0,0.000000,0.0,0.0,5.179919,1026.500000,1001.363647,317.161102,29.160000
703,2024-11-01,18:09:51,2024-11-01 18:00:00,42.096386,-86.488880,205.0,8.6655,70.084183,0.0,3.0,13.501200,100.0,100.0,0.0,0.0,1.0,0.000000,0.0,0.0,5.179919,1026.500000,1001.363647,317.161102,29.160000
704,2024-11-01,18:19:32,2024-11-01 18:00:00,42.096375,-86.488880,205.0,8.6655,70.084183,0.0,3.0,13.501200,100.0,100.0,0.0,0.0,1.0,0.000000,0.0,0.0,5.179919,1026.500000,1001.363647,317.161102,29.160000
705,2024-11-01,18:22:02,2024-11-01 18:00:00,42.096375,-86.488880,205.0,8.6655,70.084183,0.0,3.0,13.501200,100.0,100.0,0.0,0.0,1.0,0.000000,0.0,0.0,5.179919,1026.500000,1001.363647,317.161102,29.160000


In [36]:
# Only the first response is summarized here; loop over `responses` to handle
# several locations or weather models.
# NOTE(review): `responses` is not defined in any visible cell — confirm its source.
response = responses[0]
summary_lines = (
    f"Coordinates {response.Latitude()}°N {response.Longitude()}°E",
    f"Elevation {response.Elevation()} m asl",
    f"Timezone {response.Timezone()}{response.TimezoneAbbreviation()}",
    f"Timezone difference to GMT+0 {response.UtcOffsetSeconds()} s",
)
for line in summary_lines:
    print(line)

Coordinates 42.07381057739258°N -86.45773315429688°E
Elevation 205.0 m asl
Timezone NoneNone
Timezone difference to GMT+0 0 s


In [93]:
# Build a DataFrame of hourly weather values from `response`.
# The variables are addressed by position, so this list must stay in exactly
# the order in which they were requested from the API.
hourly_variable_names = [
    "temperature_2m",
    "relative_humidity_2m",
    "rain",
    "weather_code",
    "wind_speed_10m",
    "cloud_cover",
    "cloud_cover_low",
    "cloud_cover_high",
    "cloud_cover_mid",
    "is_day",
    "sunshine_duration",
    "precipitation",
    "snowfall",
    "apparent_temperature",
    "pressure_msl",
    "surface_pressure",
    "wind_direction_10m",
    "wind_gusts_10m",
]

hourly = response.Hourly()

# One UTC timestamp per interval in [Time(), TimeEnd()).
hourly_index = pd.date_range(
    start=pd.to_datetime(hourly.Time(), unit="s", utc=True),
    end=pd.to_datetime(hourly.TimeEnd(), unit="s", utc=True),
    freq=pd.Timedelta(seconds=hourly.Interval()),
    inclusive="left",
)

hourly_data = {"datetime": hourly_index}
# Timezone-naive timestamp truncated to the hour; used as the join key for
# observations. Computed with datetime operations instead of string
# concatenation, which depends on NumPy's string-array `+` support.
hourly_data['date_hour'] = hourly_index.tz_localize(None).floor('h')
hourly_data['date'] = hourly_index.date
hourly_data['hour'] = hourly_index.hour  # addition to map to hours
hourly_data['elevation_meters_asl'] = response.Elevation()

# Columns are added in request order, straight after the time/elevation columns.
for position, variable_name in enumerate(hourly_variable_names):
    hourly_data[variable_name] = hourly.Variables(position).ValuesAsNumpy()

hourly_df = pd.DataFrame(data=hourly_data)
hourly_df.head(3)

Unnamed: 0,datetime,date_hour,date,hour,elevation_meters_asl,temperature_2m,relative_humidity_2m,rain,weather_code,wind_speed_10m,cloud_cover,cloud_cover_low,cloud_cover_high,cloud_cover_mid,is_day,sunshine_duration,precipitation,snowfall,apparent_temperature,pressure_msl,surface_pressure,wind_direction_10m,wind_gusts_10m
0,2024-10-28 00:00:00+00:00,2024-10-28 00:00:00,2024-10-28,0,205.0,9.1655,67.520706,0.0,0.0,8.825508,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.352202,1024.800049,999.749146,168.231735,14.04
1,2024-10-28 01:00:00+00:00,2024-10-28 01:00:00,2024-10-28,1,205.0,7.2155,77.074921,0.0,0.0,9.568824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.293655,1024.5,999.284912,163.610382,13.32
2,2024-10-28 02:00:00+00:00,2024-10-28 02:00:00,2024-10-28,2,205.0,7.8655,67.482475,0.0,0.0,8.431228,2.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,4.878764,1024.5,999.342163,163.886505,13.679999


Unnamed: 0,date,time,weather_hour,latitude,longitude,elevation_meters_asl,temperature_2m,relative_humidity_2m,rain,weather_code,wind_speed_10m,cloud_cover,cloud_cover_low,cloud_cover_high,cloud_cover_mid,is_day,sunshine_duration,precipitation,snowfall,apparent_temperature,pressure_msl,surface_pressure,wind_direction_10m,wind_gusts_10m
0,2024-10-28,06:54:25,2024-10-28 06:00:00,42.096405,-86.488891,205.0,6.9655,64.447319,0.0,1.0,14.348156,28.0,0.0,28.0,0.0,0.0,0.0,0.0,0.0,2.861774,1023.700012,998.482300,160.201035,23.400000
1,2024-10-28,07:02:04,2024-10-28 07:00:00,42.096386,-86.488880,205.0,6.7155,64.622864,0.0,3.0,15.244842,98.0,0.0,98.0,0.0,0.0,0.0,0.0,0.0,2.448909,1023.500000,998.265259,157.067871,25.199999
2,2024-10-28,07:02:16,2024-10-28 07:00:00,42.096405,-86.488891,205.0,6.7155,64.622864,0.0,3.0,15.244842,98.0,0.0,98.0,0.0,0.0,0.0,0.0,0.0,2.448909,1023.500000,998.265259,157.067871,25.199999
3,2024-10-28,07:09:38,2024-10-28 07:00:00,42.096461,-86.488998,205.0,6.7155,64.622864,0.0,3.0,15.244842,98.0,0.0,98.0,0.0,0.0,0.0,0.0,0.0,2.448909,1023.500000,998.265259,157.067871,25.199999
4,2024-10-28,07:24:41,2024-10-28 07:00:00,42.096375,-86.488851,205.0,6.7155,64.622864,0.0,3.0,15.244842,98.0,0.0,98.0,0.0,0.0,0.0,0.0,0.0,2.448909,1023.500000,998.265259,157.067871,25.199999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65593,2025-06-04,13:59:37,2025-06-04 13:00:00,-34.577209,-58.439284,,,,,,,,,,,,,,,,,,,
65594,2025-06-04,14:13:30,2025-06-04 14:00:00,-34.577209,-58.439284,,,,,,,,,,,,,,,,,,,
65595,2025-06-04,14:16:09,2025-06-04 14:00:00,-34.577223,-58.439284,,,,,,,,,,,,,,,,,,,
65596,2025-06-04,14:20:40,2025-06-04 14:00:00,-34.577223,-58.439284,,,,,,,,,,,,,,,,,,,


# Plotting

In [None]:
# Re-cluster the coordinates with alternative settings.
metric, min_cluster_size = 'haversine', 8
coordinates = df[['latitude', 'longitude']]
_, df['cluster_new'] = cluster_data(coordinates, metric, min_cluster_size)

prev_len = df['cluster_new'].nunique()
df['new_reduced_clusters'] = reduce_clusters(df=df)

# NOTE(review): both counts below read df['cluster_new'], so the reported
# reduction is non-zero only if reduce_clusters modifies that column in place —
# confirm against data_utils.utils.
transit_points = len(df[df['cluster_new'] == -3])
new_len = df['cluster_new'].nunique()
print(f"{transit_points} points labelled as transit (-3)")
print(f"reduced clusters by {prev_len - new_len} from {prev_len} to {new_len}")

7921 points labelled as transit (-3)
reduced clusters by 408 from 1480 to 1072


In [9]:
# Scatter the points of a window of dates on a world map, coloured by cluster.
plot_col = 'cluster_label'  # alternative: 'cluster_new'

# Window of distinct dates, counted from the end (other windows tried: -110:-90, -60:-40).
window_dates = list(df.date.unique())[-60:-20]
oneday = df[df.date.isin(window_dates)]
# Keep non-negative labels only; .copy() makes the column assignment below act
# on an independent frame instead of a slice of df (avoids SettingWithCopyWarning).
oneday = oneday[oneday[plot_col] > -1].copy()
oneday.loc[:, 'color'] = 'red'

fig = px.scatter_geo(oneday,
                     lat="latitude",
                     lon='longitude',
                     color=plot_col,
                     scope='world',
                     fitbounds='locations',
                     hover_data=['direction_similarity']
                    )

# Two fully transparent markers placed bounds_add degrees beyond the data
# extent; presumably they widen the view computed by fitbounds='locations'.
bounds_add = 10
fig.add_traces(Scattergeo(lat=[oneday.latitude.min() - bounds_add, oneday.latitude.max() + bounds_add],
                          lon=[oneday.longitude.min() - bounds_add, oneday.longitude.max() + bounds_add],
                          mode='markers', marker=dict(size=2, color='rgba(0, 0, 0, 0)')))
fig.update_geos(resolution=50)

fig.show()

In [158]:
# NOTE(review): another cell reads 'tile_data_John.csv' (capital J); on a
# case-sensitive file system only one spelling can exist — confirm the file name.
plotdf = pd.read_csv(f"{STAGEDDATAPATH}tile_data_john.csv").assign(
    cluster_label_norm=lambda frame: frame['cluster_label'].map(cluster_map)
)

In [159]:
# Collapse plotdf to one row per normalized cluster: the mean latitude/longitude.
plotdf = (
    plotdf
    .groupby('cluster_label_norm')[['latitude', 'longitude']]
    .mean()
    .reset_index()
)

In [160]:
# Scatter the rows of plotdf on a world map, coloured by normalized cluster label.
fig = px.scatter_geo(plotdf,
                     lat="latitude",
                     lon='longitude',
                     color='cluster_label_norm',
                     scope='world',
                     fitbounds='locations',
                    )

# Two fully transparent markers placed bounds_add degrees beyond the data
# extent; presumably they widen the view computed by fitbounds='locations'.
bounds_add = 10
fig.add_traces(Scattergeo(lat=[plotdf.latitude.min() - bounds_add, plotdf.latitude.max() + bounds_add],
                          lon=[plotdf.longitude.min() - bounds_add, plotdf.longitude.max() + bounds_add],
                          mode='markers', marker=dict(size=2, color='rgba(0, 0, 0, 0)')))
fig.update_geos(resolution=50)

fig.show()

# Folium Heatmap

In [115]:
def create_folium_heatmap(df: pd.DataFrame) -> folium.Map:
    """
    Build an interactive Folium heatmap from latitude/longitude columns.

    Rows with a missing latitude or longitude are ignored. When at least one
    valid point remains, the map view is fitted to the bounding box of the
    valid points; otherwise an empty base map is returned.

    Args:
        df (pd.DataFrame): Frame containing numeric 'latitude' and 'longitude'
                           columns. Other columns are ignored.

    Returns:
        folium.Map: Map with an OpenStreetMap base layer and, if there are
                    valid points, a HeatMap layer. It can be saved to HTML or
                    displayed directly in a notebook.

    Raises:
        ValueError: If 'latitude' or 'longitude' columns are missing from the DataFrame.
    """
    # Validate required columns
    required_columns = ['latitude', 'longitude']
    missing_cols = [col for col in required_columns if col not in df.columns]
    if missing_cols:
        raise ValueError(f"DataFrame must contain the following columns: {required_columns}. Missing: {missing_cols}")

    # HeatMap rejects NaN coordinates, so drop incomplete rows up front.
    coords = df[required_columns].dropna()

    m = folium.Map(tiles="OpenStreetMap")
    if coords.empty:
        # Nothing to draw and no bounds to fit: return the plain base map.
        return m

    # [[lat, lon], ...] pairs; a third element per pair could carry an intensity.
    heat_data = coords.to_numpy(dtype=float).tolist()
    HeatMap(heat_data).add_to(m)

    # Fit the view to the south-west / north-east corners of the data.
    south_west = [float(coords['latitude'].min()), float(coords['longitude'].min())]
    north_east = [float(coords['latitude'].max()), float(coords['longitude'].max())]
    m.fit_bounds([south_west, north_east])

    return m

In [None]:
# Render the heatmap for cdf; the function raises ValueError unless the frame
# has 'latitude' and 'longitude' columns.
# NOTE(review): the recorded outputs of the cells that assign `cdf` show no such
# columns — confirm which frame is meant to be plotted here.
create_folium_heatmap(cdf)

# Dashboard Plots

In [10]:
# PostgreSQL credentials and database details
load_dotenv() # take environment variables from .env.
db_user = os.getenv("POSTGRESQL_USERNAME")
db_password = os.getenv("POSTGRESQL_PWD")
db_host = 'localhost' # Or your PostgreSQL server IP/hostname
db_port = '5432'      # Default PostgreSQL port
db_name = 'tile_db'

In [None]:
# Load the tracked positions for an inclusive date range from PostgreSQL.
start = datetime.date(2024, 11, 16)
end = datetime.date(2024, 11, 18)
# start/end are datetime.date objects defined above, so interpolating them
# yields plain ISO dates (no external input reaches the SQL text).
query = f"""
SELECT
    datetime,
    latitude,
    longitude
FROM
    tile_data_john
WHERE
    date BETWEEN '{start}' AND '{end}'
;
"""
# NOTE(review): the credentials are placed in the URL unescaped; a password
# containing characters such as '@' or '/' would need URL-quoting first.
engine = create_engine(f'postgresql+psycopg2://{db_user}:{db_password}@{db_host}:{db_port}/{db_name}')
df = pd.read_sql(query, con = engine)

# The column holds ISO-8601 text, with or without fractional seconds and with a
# '+00:00' offset. A fixed '%S.%f' format would turn whole-second timestamps
# into NaT (and dropna would then discard those rows), so parse as ISO-8601,
# normalize to UTC and drop the timezone to get naive UTC timestamps.
parsed_datetime = pd.to_datetime(df['datetime'], format='ISO8601', utc=True, errors='coerce')
df['datetime'] = parsed_datetime.dt.tz_localize(None)

# Remove rows with any missing value (including unparseable timestamps).
df = df.dropna(axis=0)
df

Unnamed: 0,datetime,latitude,longitude
0,2024-11-16 01:10:12.476,37.569943,126.976895
1,2024-11-16 01:10:38.478,37.570463,126.976998
2,2024-11-16 01:46:04.506,37.572083,126.988201
3,2024-11-16 01:48:08.329,37.572144,126.988293
4,2024-11-16 01:50:29.019,37.572083,126.988170
...,...,...,...
648,2024-11-18 23:39:29.822,37.572083,126.988158
649,2024-11-18 23:41:36.579,37.572097,126.988158
650,2024-11-18 23:43:41.114,37.572166,126.988199
651,2024-11-18 23:45:44.428,37.572097,126.988158


In [55]:
# Preview only (result is not stored): average positions per 15-minute bin,
# then fill empty bins by interpolating along the time index.
(
    df.set_index('datetime')
      .resample('15 min')
      .mean()
      .resample('15 min')
      .interpolate(method='index')
)

Unnamed: 0_level_0,latitude,longitude
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2024-11-16 01:00:00,37.570203,126.976946
2024-11-16 01:15:00,37.570834,126.980696
2024-11-16 01:30:00,37.571466,126.984446
2024-11-16 01:45:00,37.572097,126.988196
2024-11-16 02:00:00,37.572091,126.988170
...,...,...
2024-11-18 22:45:00,37.572097,126.988158
2024-11-18 23:00:00,37.572096,126.988158
2024-11-18 23:15:00,37.572090,126.988158
2024-11-18 23:30:00,37.572108,126.988166


## Gif with Plotly

In [56]:
# Animated scatter on a tile map: each distinct 'datetime' value is one frame.
scatter_options = dict(
    lat="latitude",
    lon="longitude",
    animation_frame="datetime",
    # Only takes effect once a `color` column (e.g. "point_type") is supplied.
    color_discrete_map={
        'path': 'sienna',
        'city_dot': 'darkred'
    },
    size_max=8,
    zoom=7,
    height=600,
    map_style="carto-positron",  # alternatives: "open-street-map", "satellite-streets", ...
)
# Fading path lines are not covered by this call; they would likely need
# plotly.graph_objects with traces defined per frame.
fig = px.scatter_map(df, **scatter_options)
fig.show()

In [None]:
# Update layout for better map appearance
fig.update_layout(
    mapbox_accesstoken="YOUR_MAPBOX_TOKEN", # Replace with your actual Mapbox token for satellite maps
    # If not using Mapbox, comment out the token and use 'open-street-map' or similar style
    mapbox_bounds={"west": -73.8, "east": -72.5, "south": -42.5, "north": -41.6}, # Set initial map bounds
    margin={"r":0,"t":0,"l":0,"b":0},
    hovermode="closest",
    # Add animation settings
    updatemenus=[dict(type="buttons",
                      buttons=[dict(label="Play", method="animate", args=[None])])]
)

# Customizing line appearance (opacity) with px requires a bit more work or using go.Figure.
# With px.scatter_mapbox, the 'color' argument primarily determines line color if `line_group` is used.
# To get the fading, you might need to iterate through frames and explicitly set line properties in `fig.frames`.

# If the `opacity` column doesn't directly translate to line opacity in px.scatter_mapbox during animation,
# you would need to use `plotly.graph_objects` and manually define `data` and `layout` for each `frame`.

# Example of how to add fixed red dots (if not integrated into the main animation_df)
# For static elements, you can add them to the initial figure
fig.add_trace(
    px.scatter_mapbox(fixed_cities,
                      lat="latitude",
                      lon="longitude",
                      color_discrete_sequence=['darkred'],
                      size_max=8,
                      opacity=1.0,
                      ).data[0] # Get the first trace from the px figure
)

# Output the plot
fig.show()

# To save as HTML:
# fig.write_html("travel_animation_plotly.html")

# To save as GIF (requires kaleido):
# pip install kaleido
# fig.write_image("travel_animation_plotly.gif", engine="kaleido", scale=2) # scale increases resolution

## Gif with Cartopy

In [None]:
# Render one PNG per time step; each frame shows the path travelled during the
# preceding `fading_window`. The file names are collected in `frames`.
start_time = df['datetime'].min()
end_time = df['datetime'].max()
time_step = datetime.timedelta(minutes=15) # Adjust for desired animation smoothness
fading_window = datetime.timedelta(hours=6) # How long a path segment remains visible

# savefig does not create missing directories, so make sure the target exists.
os.makedirs("../data/gif_files", exist_ok=True)

# Sort once so every frame's line connects the points in chronological order
# (the SQL query that loads df has no ORDER BY).
path_df = df.sort_values('datetime')

frames = []
current_time = start_time

while current_time <= end_time:
    fig = plt.figure(figsize=(10, 8))
    # Using Plate Carree for simplicity, but consider more appropriate projections for your region
    ax = fig.add_subplot(1, 1, 1, projection=ccrs.PlateCarree())

    # Set map extent (adjust to your data's geographic range)
    ax.set_extent([126.7, 127.3, 37.3, 37.7], crs=ccrs.PlateCarree()) # Example extent

    # Add features (land and ocean - adjust colors as needed)
    ax.add_feature(cartopy.feature.LAND, facecolor='lightgrey', edgecolor='black')
    ax.add_feature(cartopy.feature.OCEAN, facecolor='lightblue')

    # Points recorded within the last `fading_window` up to the current frame time.
    visible_path_df = path_df[
        (path_df['datetime'] <= current_time) &
        (path_df['datetime'] > current_time - fading_window)
    ]

    if not visible_path_df.empty:
        # The whole recent path is drawn with one alpha. A real fade would plot
        # each segment separately with alpha = 1.0 - (age / fading_window).
        ax.plot(visible_path_df['longitude'], visible_path_df['latitude'],
                color='sienna', linewidth=2, alpha=0.8, transform=ccrs.PlateCarree())

    # Add current timestamp
    ax.text(0.02, 0.95, current_time.strftime('%Y-%m-%d %H:%M:%S'), transform=ax.transAxes,
            fontsize=12, color='black', bbox=dict(facecolor='white', alpha=0.7, edgecolor='none'))

    # Add North Arrow (manual example)
    ax.text(0.95, 0.05, 'N\n▲', transform=ax.transAxes, fontsize=14, ha='center', va='bottom', color='black')

    plt.title('') # No title
    plt.xlabel('Longitude')
    plt.ylabel('Latitude')
    plt.grid(True, linestyle='--', alpha=0.6)

    # Save the frame; only the bare file name is stored in `frames`, so readers
    # of the list must prepend the "../data/gif_files/" directory themselves.
    frame_filename = f"frame_{current_time.strftime('%Y%m%d%H%M%S')}.png"
    plt.savefig(f"../data/gif_files/{frame_filename}", dpi=100)
    frames.append(frame_filename)
    plt.close(fig) # Close the figure to free memory

    current_time += time_step


Creating GIF...






FileNotFoundError: No such file: 'c:\Users\joyam\Documents\JohnProjects\tile_project\data_handling\frame_20241116011012.png'

In [None]:

# Assemble the saved frame images into a single animated GIF.
print("Creating GIF...")
gif_path = '../data/gif_files/travel_animation.gif'
with imageio.get_writer(gif_path, mode='I', duration=0.1) as gif_writer:
    # `frames` holds bare file names, so the directory is prepended on read.
    for frame_name in frames:
        gif_writer.append_data(imageio.imread(f"../data/gif_files/{frame_name}"))
print("GIF created: travel_animation.gif")


Creating GIF...






GIF created: travel_animation.gif


In [None]:

# Delete the intermediate frame images. `frames` stores bare file names, so the
# directory they were saved to must be prepended; removing the bare name looks
# in the current working directory and raises FileNotFoundError.
for frame_file in frames:
    os.remove(f"../data/gif_files/{frame_file}")

# Bottom