# Imports

In [57]:
# Third Party Imports
import pandas as pd
import numpy as np
import folium
from folium.plugins import HeatMap
import plotly.express as px
from plotly.subplots import make_subplots
from plotly.graph_objects import Scattergeo
from anyascii import anyascii

# Native
import json
import time

# Custom
# Custom Functions
import importlib
import data_utils.utils
importlib.reload(data_utils.utils)
from data_utils.utils import cluster_data, reduce_clusters

# Directory holding the staged data files, relative to the notebook.
# Forward slashes are accepted by Python's file APIs on Windows as well as on
# macOS/Linux, so the path no longer only resolves on Windows.
STAGEDDATAPATH = '../data/staged/'

# Load Data

In [111]:
# Cluster label / address pairs
df = pd.read_csv(STAGEDDATAPATH + 'addresses.csv')

# The geocoder results contain non-ASCII place names, so decode the file
# explicitly as UTF-8 instead of relying on the platform default encoding.
with open(STAGEDDATAPATH + 'geocode_results.json', 'r', encoding='utf-8') as f:
    geocode_results = json.load(f)

# 'Unnamed: 0' carries no information of its own
# (presumably the index written by an earlier to_csv call -- TODO confirm)
df = df.drop(columns=['Unnamed: 0'])
df.head(10)

Unnamed: 0,cluster_label,address
0,-3,none
1,-1,none
2,2778,"536 La Salle Ave, St Joseph, MI 49085, USA"
3,2778,"3GW6+HC St. Joseph, MI, USA"
4,2778,"599-501 La Salle Ave, St Joseph, MI 49085, USA"
5,2794,"536 La Salle Ave, St Joseph, MI 49085, USA"
6,2794,"3GW6+HC St. Joseph, MI, USA"
7,2794,"599-501 La Salle Ave, St Joseph, MI 49085, USA"
8,2310,"534 La Salle Ave, St Joseph, MI 49085, USA"
9,2310,"599-501 La Salle Ave, St Joseph, MI 49085, USA"


In [None]:
# Mapping address to most frequent cluster_label.
# Series.mode() returns *all* tied values in sorted order, so a bare
# agg(pd.Series.mode) produces an array for any address with a tie. Taking the
# first entry keeps every normalised label a scalar (the smallest tied label).
tdf = (
    df.groupby('address')['cluster_label']
    .agg(lambda labels: labels.mode().iloc[0])
    .to_frame()
    .reset_index()
    .rename(columns={'cluster_label': 'norm_cluster_label'})
)
# This adds the address-level 'norm_cluster_label' column to each row in df
df_with_both_labels = pd.merge(df, tdf, on='address', how='left')

# Convert to dictionary: original cluster_label -> normalised label.
# A cluster_label that occurs on several rows keeps the value of its last row.
cluster_map = pd.Series(df_with_both_labels['norm_cluster_label'].values, index=df_with_both_labels['cluster_label']).to_dict()
# Labels -1 and -3 always map to themselves
cluster_map[-1] = -1
cluster_map[-3] = -3

In [157]:
# Relabel on a new frame so the raw labels in df stay available for comparison
cdf = df.assign(cluster_label=df['cluster_label'].map(cluster_map))
print(f"Original Clusters: {df['cluster_label'].nunique()}\nNew Clusters: {cdf['cluster_label'].nunique()}")

Original Clusters: 2109
New Clusters: 695


# Extract City, Country from addresses

In [9]:
# One row per distinct address, holding the first non-null value of every other column.
# groupby() does not modify df, so no defensive copy is needed; not rebinding `cdf`
# here also keeps the normalised labels stored in `cdf` by the earlier cell intact.
df.groupby('address').first().reset_index()

Unnamed: 0.1,address,Unnamed: 0,cluster_label
0,"0-1-12, Khu văn phòng Tầng 1 Nhà ga hành khách...",4736,1586
1,"01 Fansipan, TT. Sa Pa, Sa Pa, Lào Cai 19000, ...",5307,1461
2,"01 Fansipan, TT. Sa Pa, Sa Pa, Lào Cai, Vietnam",5308,1461
3,"01 Lê Duẩn, Phú Hoà, Huế, Thành phố Huế, Vietnam",3868,1413
4,"01 Lê Đình Thám, Cẩm Sơn, Hội An, Quảng Nam, V...",4331,2213
...,...,...,...
3607,"百老匯美食街，H-G004-G005, Macao",7956,226
3608,"金鐘金鐘道88號太古廣場二座LG1地, 庫009號舖, 88 Queensway, Admi...",7793,1385
3609,"산1-2 yongsan-dong 2(i)-ga, Yongsan District, S...",727,907
3610,"산132-1 Bulgwang-dong, Eunpyeong District, Seou...",751,123


In [48]:
# Address-component types to extract from each geocoding result
loc_tags = [
    'administrative_area_level_1',
    'administrative_area_level_2',
    'administrative_area_level_3',
    'administrative_area_level_4',
    'street_number',
    'route',
    'neighborhood',
    'locality',
    'country',
    'postal_code',
    'postal_code_suffix',
    'plus_code',
]

# Use a single geocoded entry as a worked example
key = list(geocode_results.keys())[1000]
# perf_counter() is the high-resolution clock intended for timing short snippets;
# time.time() is a wall clock and can be too coarse for sub-millisecond work.
start = time.perf_counter()
# Keep only the components whose first listed type is one of the tags of interest
loc_info = {
    comp['types'][0]: comp['long_name']
    for comp in geocode_results[key][0]['address_components']
    if comp['types'][0] in loc_tags
}
# Re-key by loc_tags so every tag is present; tags without a component become NaN.
# The comprehension variable is named `tag` so it does not shadow the outer `key`.
loc_info = {tag: loc_info.get(tag, np.nan) for tag in loc_tags}
print(f"Took {time.perf_counter() - start} seconds")
loc_info

Took 0.0008561611175537109 seconds


{'administrative_area_level_1': 'Quảng Nam',
 'administrative_area_level_2': nan,
 'administrative_area_level_3': nan,
 'administrative_area_level_4': nan,
 'street_number': '102',
 'route': 'Đường Bạch Đằng',
 'neighborhood': nan,
 'locality': 'Thành phố Hội An',
 'country': 'Vietnam',
 'postal_code': nan,
 'postal_code_suffix': nan,
 'plus_code': nan}

In [None]:
# Same worked example as above, with the lookup split into two explicit steps
key = list(geocode_results.keys())[1000]
# perf_counter() is the high-resolution clock intended for timing short snippets
start = time.perf_counter()

# Step 1: index the relevant components by their first listed type for quick lookups
parsed_components = {
    comp['types'][0]: comp['long_name']
    for comp in geocode_results[key][0]['address_components']
    if comp['types'][0] in loc_tags
}

# Step 2: build loc_info with one entry per tag in loc_tags,
# defaulting to np.nan when the result has no component of that type
loc_info = {
    tag: parsed_components.get(tag, np.nan)
    for tag in loc_tags
}
print(f"Took {time.perf_counter() - start} seconds")
loc_info

TypeError: 'float' object is not iterable

In [56]:
# Collect the components of interest for the entry selected by `key`,
# transliterating every name to plain ASCII.
parsed_components = {}
for comp in geocode_results[key][0]['address_components']:
    component_type = comp['types'][0]
    if component_type not in loc_tags:
        continue

    long_name_value = comp['long_name']
    # pd.isna() is True for both None and float NaN. A plain `is not None` test
    # lets NaN through, and str(nan) would then store the literal text 'nan'
    # instead of a missing value.
    if pd.isna(long_name_value):
        parsed_components[component_type] = np.nan
    else:
        # str() converts integers, floats, etc. to text before transliteration
        parsed_components[component_type] = anyascii(str(long_name_value))

# Now, build the final loc_info dictionary: one entry per tag, NaN when absent
loc_info = {
    tag: parsed_components.get(tag, np.nan)
    for tag in loc_tags
}

loc_info

{'administrative_area_level_1': 'Quang Nam',
 'administrative_area_level_2': nan,
 'administrative_area_level_3': nan,
 'administrative_area_level_4': nan,
 'street_number': '102',
 'route': 'Duong Bach Dang',
 'neighborhood': nan,
 'locality': 'Thanh pho Hoi An',
 'country': 'Vietnam',
 'postal_code': nan,
 'postal_code_suffix': nan,
 'plus_code': nan}

In [53]:
# Set of the first listed type of every address component, taken from the first
# result of each geocoded entry.
# A comprehension keeps its loop variables local, so the notebook-level `key`
# used by the single-entry example cells is not overwritten by this scan.
types = {
    comp['types'][0]
    for results in geocode_results.values()
    for comp in results[0]['address_components']
}
# types

# Plotting

In [None]:
# try different dbscan model
metric = 'haversine'
min_cluster_size = 8
_, df['cluster_new'] = cluster_data(df[['latitude','longitude']], metric, min_cluster_size)

prev_len = df['cluster_new'].nunique()
df['new_reduced_clusters'] = reduce_clusters(df=df)
print(f"{len(df[df['cluster_new']==-3])} points labelled as transit (-3)")
print(f"reduced clusters by {prev_len - df['cluster_new'].nunique()} from {prev_len} to {df['cluster_new'].nunique()}")

7921 points labelled as transit (-3)
reduced clusters by 408 from 1480 to 1072


In [9]:
plot_col = 'cluster_label' # cluster_label cluster_new

# Select a window of dates counted from the end of the data, keep only rows with
# a label above -1, and add the constant 'color' column. assign() returns a new
# frame, which avoids writing into a filtered slice (SettingWithCopyWarning).
oneday = (
    df[df.date.isin(list(df.date.unique())[-60:-20])]  # -110:-90 , -60:-40
    .loc[lambda window: window[plot_col] > -1]
    .assign(color='red')
)

fig = px.scatter_geo(
    oneday,
    lat="latitude",
    lon='longitude',
    color=plot_col,
    scope='world',
    fitbounds='locations',
    hover_data=['direction_similarity'],
)

# Two fully transparent markers placed bounds_add degrees outside the data extent
# (presumably to pad the view that fitbounds='locations' computes).
bounds_add = 10
fig.add_traces(
    Scattergeo(
        lat=[oneday.latitude.min() - bounds_add, oneday.latitude.max() + bounds_add],
        lon=[oneday.longitude.min() - bounds_add, oneday.longitude.max() + bounds_add],
        mode='markers',
        marker=dict(size=2, color='rgba(0, 0, 0, 0)'),
    )
)
fig.update_geos(resolution=50)

fig.show()

In [158]:
# Load the tile data and attach the normalised cluster label to every row
plotdf = (
    pd.read_csv(STAGEDDATAPATH + 'tile_data_john.csv')
    .assign(cluster_label_norm=lambda tiles: tiles['cluster_label'].map(cluster_map))
)

In [159]:
# Collapse each normalised cluster to its mean latitude / longitude
plotdf = (
    plotdf
    .groupby('cluster_label_norm')[['latitude', 'longitude']]
    .agg('mean')
    .reset_index()
)

In [160]:
# Scatter plot of the rows in plotdf, coloured by normalised cluster label.
# (The make_subplots figure was dropped: it was overwritten on the next line.)
fig = px.scatter_geo(
    plotdf,
    lat="latitude",
    lon='longitude',
    color='cluster_label_norm',
    scope='world',
    fitbounds='locations',
)

# Two fully transparent markers placed bounds_add degrees outside the data extent
# (presumably to pad the view that fitbounds='locations' computes).
bounds_add = 10
fig.add_traces(
    Scattergeo(
        lat=[plotdf.latitude.min() - bounds_add, plotdf.latitude.max() + bounds_add],
        lon=[plotdf.longitude.min() - bounds_add, plotdf.longitude.max() + bounds_add],
        mode='markers',
        marker=dict(size=2, color='rgba(0, 0, 0, 0)'),
    )
)
fig.update_geos(resolution=50)

fig.show()

# Folium Heatmap

In [115]:
def create_folium_heatmap(df: pd.DataFrame) -> folium.Map:
    """
    Generates an interactive heatmap using the Folium library from a Pandas DataFrame.

    Rows with a missing latitude or longitude are ignored. The map view is fitted
    to the bounding box of the remaining points.

    Args:
        df (pd.DataFrame): A Pandas DataFrame expected to contain the following columns:
                           - 'latitude': Numerical column for latitude coordinates.
                           - 'longitude': Numerical column for longitude coordinates.
                           Any other columns are ignored.

    Returns:
        folium.Map: A Folium Map object with the heatmap layer added. If df holds
                    no usable coordinates, the plain base map is returned without
                    a heat layer and without fitted bounds.

    Raises:
        ValueError: If 'latitude' or 'longitude' columns are missing from the DataFrame.
    """
    # Validate required columns
    required_columns = ['latitude', 'longitude']
    missing_cols = [col for col in required_columns if col not in df.columns]
    if missing_cols:
        raise ValueError(f"DataFrame must contain the following columns: {required_columns}. Missing: {missing_cols}")

    # Base map; the view is adjusted by fit_bounds() below once the extent is known
    m = folium.Map(tiles="OpenStreetMap")

    # HeatMap rejects NaN coordinates, so incomplete rows are dropped up front
    points = df[required_columns].dropna()
    if points.empty:
        # Nothing to draw, and min()/max() of an empty column would be NaN
        return m

    # List of [latitude, longitude] pairs, built in one vectorised step.
    # To add intensity, a third element [lat, lon, intensity] could be supplied.
    heat_data = points.to_numpy().tolist()

    # You can customize parameters like radius, blur, min_opacity, max_zoom
    HeatMap(heat_data).add_to(m)

    # Fit the view to the south-west / north-east corners of the data
    bounds = [
        [float(points['latitude'].min()), float(points['longitude'].min())],
        [float(points['latitude'].max()), float(points['longitude'].max())],
    ]
    m.fit_bounds(bounds)

    return m

In [None]:
# Display the heatmap for cdf; create_folium_heatmap raises ValueError unless
# cdf has both a 'latitude' and a 'longitude' column.
create_folium_heatmap(cdf)