**Project 2 Code**
==================

***Places API Clustering*** (No Visualisation)

In [None]:
import json
import pandas as pd
import numpy as np
from dateutil import parser
from haversine import haversine
from sklearn.cluster import DBSCAN
import requests
import time
import os
from dotenv import load_dotenv

# Load API key from .env
# load_dotenv() runs first so the key can come from a local .env file;
# os.getenv returns None when GOOGLE_API_KEY is not set, so the constant
# below may be None.
load_dotenv()
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

# Load stops from location-history.json
def load_stops(filename):
    """Load visits of at least 10 minutes from a location-history export.

    The file must contain a JSON list of entries. Only entries with a
    "visit" key are considered; each must provide
    ``visit.topCandidate.placeLocation`` as a ``"geo:<lat>,<lng>"`` string
    and parseable ``startTime`` / ``endTime`` values. Entries that are
    malformed in any of those respects are reported and skipped instead of
    aborting the whole load.

    Args:
        filename: Path to the JSON file.

    Returns:
        pandas.DataFrame with columns ``lat``, ``lng`` and ``duration_min``
        (visit length in minutes). The columns exist even when no entry
        qualifies, so callers can always select them.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        data = json.load(f)

    records = []
    for entry in data:
        if "visit" not in entry:
            continue
        try:
            loc = entry["visit"]["topCandidate"]["placeLocation"]
            if not loc.startswith("geo:"):
                continue
            # Coordinates are parsed inside the try so that a bad "geo:"
            # payload skips this entry rather than crashing the loop.
            lat, lng = map(float, loc[4:].split(","))
            start = parser.parse(entry["startTime"])
            end = parser.parse(entry["endTime"])
            duration = (end - start).total_seconds() / 60
        except (AttributeError, KeyError, TypeError, ValueError, OverflowError) as e:
            print(f"Skipping bad entry: {e}")
            continue
        if duration >= 10:
            records.append({"lat": lat, "lng": lng, "duration_min": duration})

    return pd.DataFrame(records, columns=["lat", "lng", "duration_min"])

# Cluster using DBSCAN + haversine
def cluster_locations(df, eps_meters=100):
    """Group stops that lie close together using DBSCAN on haversine distance.

    Args:
        df: DataFrame with ``lat`` and ``lng`` columns in degrees.
        eps_meters: Neighbourhood radius passed to DBSCAN, in metres.

    Returns:
        A new DataFrame holding the rows of ``df`` that were assigned to a
        cluster, with an added integer ``cluster`` column. Rows DBSCAN
        labelled as noise (-1) are dropped. The input frame is not modified.
    """
    if df.empty:
        # There is nothing to fit; return the same shape of result
        # (input columns + "cluster") without calling DBSCAN on zero samples.
        return df.assign(cluster=np.array([], dtype=int))

    # The haversine metric expects [lat, lng] pairs in radians.
    radians = np.radians(df[['lat', 'lng']].to_numpy(dtype=float))

    # Convert the radius from metres to radians of arc.
    kms_per_radian = 6371.0088
    eps = (eps_meters / 1000.0) / kms_per_radian

    db = DBSCAN(eps=eps, min_samples=2, metric='haversine')
    # assign() builds a new frame, so the caller's DataFrame is left untouched.
    clustered = df.assign(cluster=db.fit_predict(radians))
    return clustered[clustered['cluster'] != -1]  # drop noise

# Call Google Places API
def reverse_geocode(lat, lng, radius=50, timeout=10):
    """Return the first Places Nearby Search result around a coordinate.

    Args:
        lat: Latitude in degrees.
        lng: Longitude in degrees.
        radius: Search radius sent to the API, in metres.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A dict with ``name``, ``types`` and ``vicinity`` of the first result,
        or ``{}`` when the request fails, the API reports a non-OK status, or
        there are no results. Failures are printed, never raised.
    """
    url = "https://maps.googleapis.com/maps/api/place/nearbysearch/json"
    params = {
        'location': f"{lat},{lng}",
        'radius': radius,
        'key': GOOGLE_API_KEY,
    }
    try:
        # Without a timeout a stalled connection would block the run forever.
        response = requests.get(url, params=params, timeout=timeout)
        response.raise_for_status()
        payload = response.json()
    except (requests.RequestException, ValueError) as e:
        print(f"API error for ({lat}, {lng}): {e}")
        return {}

    if not isinstance(payload, dict):
        print(f"API error for ({lat}, {lng}): unexpected response payload")
        return {}

    status = payload.get('status')
    if status != 'OK':
        # ZERO_RESULTS is an ordinary outcome; anything else (e.g. a denied
        # or over-quota request) is reported so it is not mistaken for
        # "nothing nearby".
        if status != 'ZERO_RESULTS':
            detail = payload.get('error_message', '')
            print(f"API error for ({lat}, {lng}): status={status} {detail}".rstrip())
        return {}

    results = payload.get('results') or []
    if not results:
        return {}

    top = results[0]
    return {
        "name": top.get("name", ""),
        "types": top.get("types", []),
        "vicinity": top.get("vicinity", ""),
    }

def label_clusters(df, pause_seconds=1):
    """Attach a Places label to the centroid of every cluster.

    Args:
        df: DataFrame with ``cluster``, ``lat`` and ``lng`` columns.
        pause_seconds: Delay inserted between consecutive API requests to
            avoid rate limiting.

    Returns:
        pandas.DataFrame with one row per cluster and the columns
        ``cluster``, ``lat``, ``lng`` (centroid), ``name``, ``types`` and
        ``vicinity``. The columns are present even when ``df`` has no rows.
    """
    columns = ["cluster", "lat", "lng", "name", "types", "vicinity"]
    if df.empty:
        # Keep the schema stable so callers can still select columns.
        return pd.DataFrame(columns=columns)

    centroids = df.groupby('cluster')[['lat', 'lng']].mean().reset_index()
    labels = []

    for i, row in enumerate(centroids.itertuples(index=False)):
        if i:
            # Pause only between requests; no wait is needed after the last.
            time.sleep(pause_seconds)
        place = reverse_geocode(row.lat, row.lng)
        labels.append({
            "cluster": int(row.cluster),
            "lat": row.lat,
            "lng": row.lng,
            "name": place.get("name", ""),
            "types": place.get("types", []),
            "vicinity": place.get("vicinity", ""),
        })

    return pd.DataFrame(labels, columns=columns)

if __name__ == '__main__':
    # Pipeline: load stops -> cluster them -> label each cluster -> save/print.
    stops_df = load_stops("location-history.json")
    print(f"Loaded {len(stops_df)} valid stop entries")

    if stops_df.empty:
        # Clustering needs at least one stop; stop here instead of failing.
        print("No valid stops found; nothing to cluster.")
    else:
        clustered_df = cluster_locations(stops_df)
        print(f"Found {clustered_df['cluster'].nunique()} significant location clusters")

        labeled_df = label_clusters(clustered_df)
        if labeled_df.empty:
            # With no clusters the label frame has nothing to select or save.
            print("No significant location clusters found; nothing to save.")
        else:
            labeled_df.to_csv("significant_locations.csv", index=False)
            print(labeled_df[['cluster', 'name', 'types', 'vicinity']])


Loaded 44 valid stop entries
Found 6 significant location clusters
   cluster                     name    types         vicinity
0        0     16581-16473 Hayes Ln  [route]       Woodbridge
1        1  113-105 Observatory Ave  [route]  Charlottesville
2        2         130 Chemistry Dr  [route]       University
3        3     284-294 McCormick Rd  [route]  Charlottesville
4        4    Jefferson Park Avenue  [route]  Charlottesville
5        5          Michigan Avenue  [route]          Chicago


***Places API Clustering*** (Visualisation, Altered Cluster Parameters)

In [None]:
import json
import os
import time
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import folium
from folium.plugins import HeatMap, TimestampedGeoJson
from dateutil import parser
from dotenv import load_dotenv
from sklearn.cluster import DBSCAN

# --- Load API Key from .env ---
# load_dotenv() runs first so the key can come from a local .env file;
# os.getenv returns None when GOOGLE_API_KEY is not set, so the constant
# below may be None.
load_dotenv()
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

# --- Load and parse location-history.json ---
def parse_location_history(filepath):
    """Parse visits of at least 5 minutes from a location-history export.

    The file must contain a JSON list of entries. Only entries with
    ``visit.topCandidate`` are considered; ``placeLocation`` must be a
    ``"geo:<lat>,<lng>"`` string and ``startTime`` / ``endTime`` must be
    parseable. Malformed entries are reported and skipped.

    Args:
        filepath: Path to the JSON file.

    Returns:
        pandas.DataFrame with columns ``start_time``, ``end_time``,
        ``duration_min``, ``lat``, ``lng``, ``date`` and ``hour`` (the last
        two taken from the parsed start time). The columns exist even when
        no entry qualifies.
    """
    columns = ["start_time", "end_time", "duration_min", "lat", "lng", "date", "hour"]

    with open(filepath, 'r', encoding='utf-8') as f:
        raw_data = json.load(f)

    visits = []
    for entry in raw_data:
        if "visit" not in entry or "topCandidate" not in entry["visit"]:
            continue
        try:
            loc = entry["visit"]["topCandidate"].get("placeLocation", "")
            if not loc.startswith("geo:"):
                continue
            # Parsed inside the try so a bad "geo:" payload only skips
            # this entry instead of aborting the whole parse.
            lat, lng = map(float, loc[4:].split(","))
            start = parser.parse(entry["startTime"])
            end = parser.parse(entry["endTime"])
            duration = (end - start).total_seconds() / 60
        except (AttributeError, KeyError, TypeError, ValueError, OverflowError) as e:
            print(f"Skipping bad entry: {e}")
            continue
        if duration >= 5:
            visits.append({
                "start_time": start,
                "end_time": end,
                "duration_min": duration,
                "lat": lat,
                "lng": lng,
                "date": start.date(),
                "hour": start.hour,
            })

    return pd.DataFrame(visits, columns=columns)

# --- Cluster locations to reduce API calls ---
def cluster_locations(df, eps_meters=15):
    """Group visits that lie close together using DBSCAN on haversine distance.

    Args:
        df: DataFrame with ``lat`` and ``lng`` columns in degrees.
        eps_meters: Neighbourhood radius passed to DBSCAN, in metres.

    Returns:
        A new DataFrame holding the rows of ``df`` that were assigned to a
        cluster, with an added integer ``cluster`` column. Rows DBSCAN
        labelled as noise (-1) are dropped. The input frame is not modified.
    """
    if df.empty:
        # There is nothing to fit; return the same shape of result
        # (input columns + "cluster") without calling DBSCAN on zero samples.
        return df.assign(cluster=np.array([], dtype=int))

    # The haversine metric expects [lat, lng] pairs in radians.
    radians = np.radians(df[["lat", "lng"]].to_numpy(dtype=float))

    # Convert the radius from metres to radians of arc.
    kms_per_radian = 6371.0088
    eps = (eps_meters / 1000.0) / kms_per_radian

    db = DBSCAN(eps=eps, min_samples=2, metric='haversine')
    # assign() builds a new frame, so the caller's DataFrame is left untouched.
    clustered = df.assign(cluster=db.fit_predict(radians))
    return clustered[clustered['cluster'] != -1]  # drop noise

# --- Reverse geocode using Google Places API ---
def reverse_geocode(lat, lng, radius=35, timeout=10):
    """Return the first Places Nearby Search result around a coordinate.

    Args:
        lat: Latitude in degrees.
        lng: Longitude in degrees.
        radius: Search radius sent to the API, in metres.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A dict with ``name``, ``types`` and ``vicinity`` of the first result,
        or ``{}`` when the request fails, the API reports a non-OK status, or
        there are no results. Failures are printed, never raised.
    """
    url = "https://maps.googleapis.com/maps/api/place/nearbysearch/json"
    params = {
        'location': f"{lat},{lng}",
        'radius': radius,
        'key': GOOGLE_API_KEY,
    }
    try:
        # Without a timeout a stalled connection would block the run forever.
        response = requests.get(url, params=params, timeout=timeout)
        response.raise_for_status()
        payload = response.json()
    except (requests.RequestException, ValueError) as e:
        print(f"API error for ({lat}, {lng}): {e}")
        return {}

    if not isinstance(payload, dict):
        print(f"API error for ({lat}, {lng}): unexpected response payload")
        return {}

    status = payload.get('status')
    if status != 'OK':
        # ZERO_RESULTS is an ordinary outcome; anything else (e.g. a denied
        # or over-quota request) is reported so it is not mistaken for
        # "nothing nearby".
        if status != 'ZERO_RESULTS':
            detail = payload.get('error_message', '')
            print(f"API error for ({lat}, {lng}): status={status} {detail}".rstrip())
        return {}

    results = payload.get('results') or []
    if not results:
        return {}

    top = results[0]
    return {
        "name": top.get("name", ""),
        "types": top.get("types", []),
        "vicinity": top.get("vicinity", ""),
    }

# --- Label clustered locations ---
def label_clusters(df, pause_seconds=1):
    """Attach a Places label to the centroid of every cluster.

    Args:
        df: DataFrame with ``cluster``, ``lat`` and ``lng`` columns.
        pause_seconds: Delay inserted between consecutive API requests to
            avoid rate limiting.

    Returns:
        pandas.DataFrame with one row per cluster and the columns
        ``cluster``, ``centroid_lat``, ``centroid_lng``, ``name``, ``types``
        and ``vicinity``. The columns are present even when ``df`` has no
        rows.
    """
    columns = ["cluster", "centroid_lat", "centroid_lng", "name", "types", "vicinity"]
    if df.empty:
        # Keep the schema stable so callers can still select columns.
        return pd.DataFrame(columns=columns)

    centroids = df.groupby('cluster')[['lat', 'lng']].mean().reset_index()
    labels = []

    for i, row in enumerate(centroids.itertuples(index=False)):
        if i:
            # Pause only between requests; no wait is needed after the last.
            time.sleep(pause_seconds)
        place = reverse_geocode(row.lat, row.lng)
        labels.append({
            "cluster": int(row.cluster),
            "centroid_lat": row.lat,
            "centroid_lng": row.lng,
            "name": place.get("name", ""),
            "types": place.get("types", []),
            "vicinity": place.get("vicinity", ""),
        })

    return pd.DataFrame(labels, columns=columns)

# --- Google Directions API route matching ---
def get_directions_route(start_lat, start_lng, end_lat, end_lng, timeout=10):
    """Fetch a walking route between two coordinates from the Directions API.

    Args:
        start_lat: Origin latitude in degrees.
        start_lng: Origin longitude in degrees.
        end_lat: Destination latitude in degrees.
        end_lng: Destination longitude in degrees.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A list of ``(lat, lng)`` tuples: the start point of every step of the
        first leg of the first route, followed by the end point of the last
        step. An empty list is returned when the request fails, the API
        reports a non-OK status, or the response lacks the expected fields.
        Failures are printed, never raised.
    """
    url = "https://maps.googleapis.com/maps/api/directions/json"
    params = {
        "origin": f"{start_lat},{start_lng}",
        "destination": f"{end_lat},{end_lng}",
        "mode": "walking",
        "key": GOOGLE_API_KEY,
    }
    where = f"({start_lat}, {start_lng}) -> ({end_lat}, {end_lng})"
    try:
        # Without a timeout a stalled connection would block the run forever.
        response = requests.get(url, params=params, timeout=timeout)
        response.raise_for_status()
        payload = response.json()

        status = payload['status']
        if status != 'OK':
            # ZERO_RESULTS simply means no route; other statuses are reported.
            if status != 'ZERO_RESULTS':
                print(f"Directions API error for {where}: status={status}")
            return []

        steps = payload['routes'][0]['legs'][0]['steps']
        path = [
            (step['start_location']['lat'], step['start_location']['lng'])
            for step in steps
        ]
        last_end = steps[-1]['end_location']
        path.append((last_end['lat'], last_end['lng']))
        return path
    except (requests.RequestException, ValueError, KeyError, IndexError, TypeError) as e:
        # Best effort: a failed leg is reported and skipped by the caller.
        print(f"Directions API error for {where}: {e!r}")
    return []

# --- Generate visualizations ---
def _plot_average_time(df):
    """Save a bar chart of mean visit duration per location name."""
    avg_time = (
        df.groupby('name')['duration_min']
        .mean()
        .sort_values(ascending=False)
        .reset_index()
    )
    plt.figure(figsize=(12, 6))
    sns.barplot(x="duration_min", y="name", data=avg_time, palette="Blues_d")
    plt.xlabel("Average Time Spent (minutes)")
    plt.ylabel("Location")
    plt.title("Average Time Spent per Significant Location")
    plt.tight_layout()
    plt.savefig("avg_time_per_cluster.png")
    plt.close()


def _save_route_map(df, labels_df):
    """Save a folium map with per-cluster visit markers and walking routes."""
    m = folium.Map(location=[df["lat"].mean(), df["lng"].mean()], zoom_start=13)

    # One marker per cluster, sized by how many visits fell into it.
    cluster_counts = df.groupby('cluster').size().reset_index(name='count')
    cluster_counts = cluster_counts.merge(labels_df, on='cluster')
    for _, row in cluster_counts.iterrows():
        folium.CircleMarker(
            location=(row["centroid_lat"], row["centroid_lng"]),
            radius=5 + row["count"] * 0.5,
            popup=f"{row['name']}<br>Visits: {row['count']}",
            color="red",
            fill=True,
            fill_color="red"
        ).add_to(m)

    # Connect visits in chronological order.
    coords = df.sort_values(by="start_time")[["lat", "lng"]].values.tolist()
    for origin, destination in zip(coords, coords[1:]):
        if origin == destination:
            # Same coordinates twice in a row: there is no leg to draw, so
            # skip the Directions request entirely.
            continue
        path = get_directions_route(*origin, *destination)
        if path:
            folium.PolyLine(path, color="blue", weight=3).add_to(m)

    m.save("routes_with_frequencies.html")


def _plot_time_of_day(df):
    """Save a heatmap of total minutes per location name and start hour."""
    time_cluster = df.groupby(["name", "hour"])["duration_min"].sum().unstack(fill_value=0)
    plt.figure(figsize=(14, 8))
    sns.heatmap(time_cluster, cmap="YlOrRd", linewidths=.5, linecolor='gray', cbar_kws={'label': 'Minutes'})
    plt.title("Time-of-Day Activity per Significant Location")
    plt.xlabel("Hour of Day")
    plt.ylabel("Location")
    plt.tight_layout()
    plt.savefig("time_of_day_by_location.png")
    plt.close()


def generate_visualizations(df, labels_df):
    """Write the three output artefacts for the clustered visits.

    Produces ``avg_time_per_cluster.png``, ``routes_with_frequencies.html``
    and ``time_of_day_by_location.png`` in the current working directory.

    Args:
        df: Clustered visits with ``cluster``, ``lat``, ``lng``,
            ``start_time``, ``hour`` and ``duration_min`` columns.
        labels_df: Cluster labels with ``cluster``, ``name``,
            ``centroid_lat`` and ``centroid_lng`` columns.
    """
    # Bring the human-readable cluster name onto every visit row.
    df = df.merge(labels_df[['cluster', 'name']], on='cluster', how='left')

    _plot_average_time(df)
    _save_route_map(df, labels_df)
    _plot_time_of_day(df)

# --- Main Execution ---
if __name__ == "__main__":
    # Parse the export, cluster the visits, then label each cluster.
    df = parse_location_history("location-history.json")
    clustered_df = cluster_locations(df)
    labeled_clusters = label_clusters(clustered_df)

    # Handle the "nothing to report" case first; otherwise persist the
    # labels and render the charts and map.
    if labeled_clusters is None or labeled_clusters.empty:
        print("No labeled clusters to save or visualize.")
    else:
        labeled_clusters.to_csv("labeled_clusters.csv", index=False)
        generate_visualizations(clustered_df, labeled_clusters)
        print("Visualizations and labeled clusters saved, including time-of-day and route maps.")

✅ Visualizations and labeled clusters saved, including time-of-day and route maps.
