### Environment Setup

In [None]:
# Install Necessary packages
!pip install geopy
!pip install googlemaps
!pip install haversine

Collecting haversine
  Downloading haversine-2.9.0-py2.py3-none-any.whl.metadata (5.8 kB)
Downloading haversine-2.9.0-py2.py3-none-any.whl (7.7 kB)
Installing collected packages: haversine
Successfully installed haversine-2.9.0


In [None]:
# import libraries
import json
import pandas as pd
import numpy as np
import time
from datetime import datetime
from dateutil import parser
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
from sklearn.cluster import DBSCAN
import googlemaps
from haversine import haversine_vector


## Data Preprocessing

In [None]:
# Load Google Timeline JSON Data
def _parse_lat_lon(geo_text):
    """Turn a coordinate string from the export into a (lat, lon) pair of floats.

    The first four characters are skipped (presumably a "geo:" prefix -- TODO confirm
    against the export format); the rest is split on "," and parsed as two floats.
    """
    lat, lon = map(float, geo_text[4:].split(","))
    return lat, lon


# Read the whole export up front so the file is closed before parsing starts
with open('google-timeline.json') as f:
    raw_data = json.load(f)

# Parse JSON and get relevant information
records = []
skipped_entries = 0
for entry in raw_data:

    if "activity" in entry:
        activity = entry["activity"]
        event = "activity"
        event_type = activity["topCandidate"]["type"]
        start_lat, start_lon = _parse_lat_lon(activity["start"])
        end_lat, end_lon = _parse_lat_lon(activity["end"])
    elif "visit" in entry:
        event = "visit"
        event_type = "visit"
        start_lat, start_lon = _parse_lat_lon(entry["visit"]["topCandidate"]["placeLocation"])
        end_lat, end_lon = start_lat, start_lon  # Visits have the same start and end
    else:
        # Neither an activity nor a visit: skip it instead of silently reusing
        # the event type and coordinates left over from the previous entry.
        skipped_entries += 1
        continue

    start_time = parser.parse(entry["startTime"])
    end_time = parser.parse(entry["endTime"])

    records.append({
        "Date": start_time.date(),
        "start_dt": start_time,
        "end_dt": end_time,
        "Duration (minutes)": (end_time - start_time).total_seconds() / 60,  # in minutes
        "Event": event,
        "Event_Type": event_type,
        "start_lat": start_lat,
        "start_long": start_lon,
        "Start_Location": None,  # filled in later by the geocoding step
        "end_lat": end_lat,
        "end_long": end_lon,
        "End_Location": None,  # filled in later by the geocoding step
    })

if skipped_entries:
    print(f"Skipped {skipped_entries} entries that are neither an activity nor a visit")

# Convert parsed data into dataframe
df = pd.DataFrame(records)

# Ensure datetime conversion (everything is normalized to UTC)
df["start_dt"] = pd.to_datetime(df["start_dt"], errors="coerce", utc=True)
df["end_dt"] = pd.to_datetime(df["end_dt"], errors="coerce", utc=True)

# Human-readable timestamp columns, formatted from the UTC datetimes above
df["Start_Time"] = df["start_dt"].dt.strftime("%Y-%m-%d | %I:%M:%S %p")
df["End_Time"] = df["end_dt"].dt.strftime("%Y-%m-%d | %I:%M:%S %p")

# Reorder DF columns
df = df[["Date", "Start_Time", "End_Time", "Duration (minutes)", "Event", "Event_Type", "Start_Location", "End_Location",
         "start_dt", "end_dt", "start_lat", "start_long",  "end_lat", "end_long"]]

# Write DF to csv
df.to_csv("google-timeline-without-locations.csv", index=False)

print(df.head())
print(df.columns)


         Date                Start_Time                  End_Time  \
0  2025-02-06  2025-02-06 | 03:20:48 PM  2025-02-07 | 10:32:51 PM   
1  2025-02-07  2025-02-07 | 10:32:51 PM  2025-02-07 | 10:44:19 PM   
2  2025-02-07  2025-02-07 | 10:44:19 PM  2025-02-08 | 12:40:56 PM   
3  2025-02-08  2025-02-08 | 12:40:56 PM  2025-02-08 | 01:11:02 PM   
4  2025-02-08  2025-02-08 | 01:11:02 PM  2025-02-08 | 01:14:01 PM   

   Duration (minutes)     Event            Event_Type Start_Location  \
0         1872.050333     visit                 visit           None   
1           11.482967  activity  in passenger vehicle           None   
2          836.600017  activity  in passenger vehicle           None   
3           30.100000  activity  in passenger vehicle           None   
4            2.983333     visit                 visit           None   

  End_Location                         start_dt  \
0         None 2025-02-06 15:20:48.001000+00:00   
1         None 2025-02-07 22:32:51.021000+00:00   

In [None]:
# Read the timeline table that was saved without location names.
# NOTE(review): this reads from "data/", while the cell that creates this file writes it
# to the working directory -- confirm the file is moved there (or align the two paths).
df = pd.read_csv("data/google-timeline-without-locations.csv")

# Initialize Google Maps client.
# NOTE(review): the key below is a placeholder; supply a real key at run time and
# avoid saving it in the notebook.
geolocator = googlemaps.Client(key="<GOOGLE MAPS API KEY>")

# Dictionary to store cached results, so repeated coordinates are only looked up once
location_cache = {}

# Function to get location name from latitude and longitude
def get_location(lat, lon):
    """Reverse-geocode a coordinate pair into a formatted address.

    Results are memoized in the module-level ``location_cache`` dict, keyed by
    ``(lat, lon)``, so each distinct coordinate triggers at most one successful
    API request.

    Args:
        lat: Latitude; may be missing (None/NaN).
        lon: Longitude; may be missing (None/NaN).

    Returns:
        The ``formatted_address`` of the first geocoding result, or None when a
        coordinate is missing, the service returns no result, or the request fails.
    """
    # Missing coordinates can never be geocoded: return before sleeping or caching
    if pd.isna(lat) or pd.isna(lon):
        return None

    key = (lat, lon)
    if key in location_cache:
        return location_cache[key]  # Use cached result

    time.sleep(0.1)  # Delaying to avoid rate limits
    try:
        result = geolocator.reverse_geocode((lat, lon))
        address = result[0]["formatted_address"] if result else None
    except Exception as e:
        # Best effort: report the failure and leave the key uncached so that a
        # later call with the same coordinates can retry.
        print(f"Error: {e}")
        return None

    # Cache empty results too, so a coordinate with no address is not re-queried
    location_cache[key] = address
    return address

# Fill Start_Location and End_Location by geocoding each row's coordinate pair
df["Start_Location"] = [
    get_location(lat, lon) for lat, lon in zip(df["start_lat"], df["start_long"])
]
df["End_Location"] = [
    get_location(lat, lon) for lat, lon in zip(df["end_lat"], df["end_long"])
]

# Show the two freshly filled columns
location_columns = ["Start_Location", "End_Location"]
print(df[location_columns])

# Persist the geocoded table
df.to_csv("google-timeline-with-locations.csv", index=False)


                                        Start_Location  \
0      1600 Emmet St N, Charlottesville, VA 22901, USA   
1      923 E Market St, Charlottesville, VA 22902, USA   
2         44 Swartzel Shop Rd, Staunton, VA 24401, USA   
3      1085 Red Mill Rd, Natural Bridge, VA 24578, USA   
4               15162 Lee Hwy, Buchanan, VA 24066, USA   
..                                                 ...   
885  2402 Smithfield Rd, Charlottesville, VA 22901,...   
886  2402 Smithfield Rd, Charlottesville, VA 22901,...   
887  2402 Smithfield Rd, Charlottesville, VA 22901,...   
888  2402 Smithfield Rd, Charlottesville, VA 22901,...   
889  2402 Smithfield Rd, Charlottesville, VA 22901,...   

                                          End_Location  
0      1600 Emmet St N, Charlottesville, VA 22901, USA  
1        1601 Ricky Dr, Charlottesville, VA 22901, USA  
2                       I-81, Fairfield, VA 24435, USA  
3               15166 Lee Hwy, Buchanan, VA 24066, USA  
4               15

## Data Preprocessing 2: Clustering and Place Names

In [None]:
# Reload the geocoded timeline (the table with Start_Location / End_Location filled in).
# NOTE(review): this reads from "data/", whereas the geocoding cell saves the same file
# name to the working directory -- confirm the file is moved there before running this.
df = pd.read_csv("data/google-timeline-with-locations.csv")


In [71]:
# Function to cluster locations considering both start & end points
def cluster_events(df, eps_meters=100, time_threshold="30min"):
    """Assign a spatial and a temporal cluster id to every timeline event.

    Spatial clustering: two events are neighbours when BOTH their start points and
    their end points lie within ``eps_meters`` of each other (the larger of the two
    great-circle distances is used). DBSCAN with ``min_samples=1`` then groups
    neighbours, so every event receives a label and none is marked as noise.

    Temporal clustering: events are sorted by ``start_dt``; a new time cluster
    starts whenever the gap between consecutive start times exceeds
    ``time_threshold``.

    Args:
        df: DataFrame with columns ``start_dt``, ``end_dt``, ``start_lat``,
            ``start_long``, ``end_lat`` and ``end_long`` (coordinates in decimal
            degrees). The input is not modified.
        eps_meters: Neighbourhood radius for DBSCAN, in meters.
        time_threshold: Largest gap that keeps two consecutive events in the same
            time cluster; anything accepted by ``pd.to_timedelta``.

    Returns:
        A copy of ``df`` sorted by ``start_dt`` with a fresh 0..n-1 index and two
        added integer columns, ``location_cluster`` and ``time_cluster``. Rows with
        missing/unparseable coordinates or ``start_dt`` are dropped.
    """
    df = df.copy()

    # Ensure datetime conversion
    df["start_dt"] = pd.to_datetime(df["start_dt"], errors="coerce", utc=True)
    df["end_dt"] = pd.to_datetime(df["end_dt"], errors="coerce", utc=True)

    # Convert coordinates to numeric
    coord_cols = ["start_lat", "start_long", "end_lat", "end_long"]
    for col in coord_cols:
        df[col] = pd.to_numeric(df[col], errors="coerce")

    # Drop rows with NaN values in essential columns
    df = df.dropna(subset=coord_cols + ["start_dt"])

    # Nothing left to cluster: return an empty frame that still has both columns
    if df.empty:
        no_labels = np.array([], dtype=int)
        return df.assign(location_cluster=no_labels, time_cluster=no_labels).reset_index(drop=True)

    ## --------- 1. Location Clustering (Considering Start & End) ---------
    # haversine_vector expects (lat, lon) in decimal degrees and converts to
    # radians itself, so the coordinates must NOT be pre-converted here.
    start_coords = df[["start_lat", "start_long"]].to_numpy()
    end_coords = df[["end_lat", "end_long"]].to_numpy()

    # Pairwise great-circle distances (comb=True -> full matrix), km -> meters
    start_dist = haversine_vector(start_coords, start_coords, comb=True) * 1000
    end_dist = haversine_vector(end_coords, end_coords, comb=True) * 1000

    # Conservative approach: two events are only as close as their farther pair of points
    combined_dist = np.maximum(start_dist, end_dist)

    loc_db = DBSCAN(eps=eps_meters, min_samples=1, metric="precomputed").fit(combined_dist)
    df = df.assign(location_cluster=loc_db.labels_)  # Assign spatial clusters

    ## --------- 2. Time Clustering (Sequential Check) ---------
    df = df.sort_values(by=["start_dt"]).reset_index(drop=True)  # Sort by time

    # Gap to the previous event's start; the first row has no predecessor (NaT),
    # which compares False and therefore opens cluster 0.
    starts_new_cluster = df["start_dt"].diff() > pd.to_timedelta(time_threshold)
    df["time_cluster"] = starts_new_cluster.cumsum()

    return df

# Cluster the events using the function's default thresholds
clustered_df = cluster_events(df)

# Write data to csv
# NOTE(review): saved to the working directory, but the next cell reads this file name
# from "data/" -- confirm the file is moved there (or align the two paths).
clustered_df.to_csv("google-timeline-with-locations-clustered.csv", index=False)

In [None]:
# Reload the clustered timeline from disk, replacing any in-memory clustered_df.
# NOTE(review): read from "data/", whereas the clustering cell writes this file name to
# the working directory -- confirm the file is moved there before running this.
clustered_df = pd.read_csv("data/google-timeline-with-locations-clustered.csv")

In [73]:
# Initialize Google Maps client.
# This rebinds `geolocator`, which an earlier cell also creates.
# NOTE(review): the key is a placeholder; supply a real key at run time and avoid
# saving it in the notebook.
geolocator = googlemaps.Client(key="<GOOGLE API KEY>")

# Function to get Place ID using Geocoding API
def get_place_id(address):
    """Look up the Place ID of an address with the module-level ``geolocator`` client.

    Args:
        address: Address text; may be missing (None/NaN) or blank.

    Returns:
        The ``place_id`` of the first geocoding result, or None when the address is
        missing/blank, nothing is found, or the request times out.
    """
    # Missing or blank addresses cannot be geocoded -- skip the API call
    if pd.isna(address) or not str(address).strip():
        return None

    try:
        geocode_result = geolocator.geocode(address)
    except (GeocoderTimedOut, googlemaps.exceptions.Timeout):
        # `geolocator` is a googlemaps client, which signals timeouts with
        # googlemaps.exceptions.Timeout; the geopy exception is kept so existing
        # behaviour for that type is unchanged.
        return None

    if geocode_result:
        return geocode_result[0].get('place_id', None)
    return None

# Function to get business/building name using Places API
def get_business_name(place_id):
    """Fetch the name stored for a Place ID with the module-level ``geolocator`` client.

    Args:
        place_id: Place ID to look up; falsy values (None, "") mean "no place".

    Returns:
        The ``name`` field of the place details, "Unknown" when there is no Place ID
        or the response carries no result/name, or "Timeout" when the request timed out.
    """
    if not place_id:
        return "Unknown"

    try:
        place_details = geolocator.place(place_id)
    except (GeocoderTimedOut, googlemaps.exceptions.Timeout):
        # The googlemaps client raises googlemaps.exceptions.Timeout on timeouts;
        # the geopy exception is kept so existing behaviour for that type is unchanged.
        return "Timeout"

    if 'result' in place_details:
        return place_details['result'].get('name', "Unknown")
    return "Unknown"

# Function to get the business name for an address
def get_location_name(address):
    """Resolve an address to a business/building name.

    Chains the two lookups: the address is first turned into a Place ID, and that
    Place ID is then turned into a name.
    """
    return get_business_name(get_place_id(address))

# Label every start/end address with the business/building name found for it
for address_col, name_col in (
    ("Start_Location", "Start_Location_Name"),
    ("End_Location", "End_Location_Name"),
):
    clustered_df[name_col] = clustered_df[address_col].apply(get_location_name)


In [75]:
# Final column layout: readable fields first, raw timestamps/coordinates next, cluster ids last
column_order = [
    "Date", "Start_Time", "End_Time", "Duration (minutes)", "Event", "Event_Type",
    "Start_Location", "Start_Location_Name", "End_Location", "End_Location_Name",
    "start_dt", "end_dt", "start_lat", "start_long", "end_lat", "end_long",
    "location_cluster", "time_cluster",
]
clustered_df = clustered_df[column_order]

# Persist the labeled, clustered table
clustered_df.to_csv("labeled-google-timeline-with-locations-clustered.csv", index=False)

## Data Analysis