# Location Entropy

### Load Required Libraries

In [1]:
import os
import pandas as pd
import numpy as np

### Load EPFL Dataset

In [2]:
def load_epfl_dataset(folder_path):
    """Load every per-user trace file in *folder_path* into one DataFrame.

    Each ``*.txt`` file except ``_cabs.txt`` is parsed as a space-separated,
    header-less table with the columns ``latitude``, ``longitude``,
    ``occupancy`` and ``timestamp``. The file name without its extension is
    stored in an extra ``user_id`` column.

    Args:
        folder_path: Directory containing the trace files.

    Returns:
        A single DataFrame with a fresh 0..n-1 index holding the rows of all
        files, concatenated in alphabetical file-name order.

    Raises:
        ValueError: If the folder contains no matching ``.txt`` file.
    """
    all_records = []

    # os.listdir returns names in arbitrary order; sorting keeps the row order
    # of the combined frame reproducible across machines and runs.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".txt") or filename == "_cabs.txt":
            continue

        # splitext strips only the trailing extension, whereas
        # str.replace would also remove ".txt" from the middle of a name.
        user_id = os.path.splitext(filename)[0]
        file_path = os.path.join(folder_path, filename)

        df = pd.read_csv(
            file_path,
            sep=" ",
            header=None,
            names=["latitude", "longitude", "occupancy", "timestamp"],
        )

        df["user_id"] = user_id
        all_records.append(df)

    if not all_records:
        raise ValueError("No .txt files found. Check folder path.")

    return pd.concat(all_records, ignore_index=True)


### Convert GPS Coordinates to Discrete Locations

In [3]:
def gps_to_grid(df, lat_col="latitude", lon_col="longitude", precision=3):
    """Map GPS coordinates to discrete grid-cell identifiers.

    Coordinates are coerced to numbers, rows whose latitude or longitude
    cannot be parsed are dropped, and a ``location_id`` column of the form
    ``"<lat>_<lon>"`` is added, where both values are rounded to *precision*
    decimal places.

    The input frame is left untouched; a new DataFrame is returned.

    Args:
        df: DataFrame holding the coordinate columns.
        lat_col: Name of the latitude column.
        lon_col: Name of the longitude column.
        precision: Number of decimal places kept when rounding.

    Returns:
        A copy of the valid rows of *df* (original index labels preserved)
        with numeric coordinate columns and the added ``location_id`` column.
    """
    lat = pd.to_numeric(df[lat_col], errors="coerce")
    lon = pd.to_numeric(df[lon_col], errors="coerce")

    # Positional mask, so the selection below does not depend on index labels.
    valid = (lat.notna() & lon.notna()).to_numpy()

    # Work on an explicit copy: assigning into the caller's frame (or into a
    # filtered slice of it) would mutate the input and can trigger
    # SettingWithCopyWarning.
    out = df.loc[valid].copy()
    out[lat_col] = lat.to_numpy()[valid]
    out[lon_col] = lon.to_numpy()[valid]

    out["location_id"] = (
        out[lat_col].round(precision).astype(str) + "_" +
        out[lon_col].round(precision).astype(str)
    )

    return out


### Compute Location Entropy

In [4]:
def compute_location_entropy(df):
    """Compute the Shannon entropy (in bits) of each user's visited locations.

    For every user, the share ``p`` of records falling in each
    ``location_id`` is computed and the entropy is ``-sum(p * log2(p))``.
    A user whose records all fall in one location gets an entropy of 0.

    Args:
        df: DataFrame with ``user_id`` and ``location_id`` columns, one row
            per recorded visit.

    Returns:
        DataFrame with one row per user and the columns ``user_id`` and
        ``location_entropy``, sorted by ``user_id``.
    """
    # Number of records per (user, location) pair.
    visits = df.groupby(["user_id", "location_id"]).size()

    # Broadcast each user's total back onto its rows instead of merging.
    total_visits = visits.groupby(level="user_id").transform("sum")
    p = visits / total_visits

    # Vectorised -p*log2(p), summed per user. Adding 0.0 turns the IEEE
    # negative zero produced for single-location users into a plain 0.0.
    entropy = (-(p * np.log2(p))).groupby(level="user_id").sum() + 0.0

    return entropy.reset_index(name="location_entropy")


### Execute the Full Pipeline

In [13]:
# Folder holding the trace files, given relative to the working directory.
folder_path = r"cabspottingdata"

# Uncomment to check which files the folder contains:
# os.listdir(folder_path)

In [9]:
# Folder holding the trace files, given relative to the working directory.
folder_path = r"cabspottingdata"

# Read every trace file into one DataFrame and show its (rows, columns) shape.
df = load_epfl_dataset(folder_path)
df.shape

(11219955, 5)

In [10]:
# Preview the first rows of the combined dataset.
df.head()

Unnamed: 0,latitude,longitude,occupancy,timestamp,user_id
0,37.75134,-122.39488,0,1213084687,new_abboip
1,37.75136,-122.39527,0,1213084659,new_abboip
2,37.75199,-122.3946,0,1213084540,new_abboip
3,37.7508,-122.39346,0,1213084489,new_abboip
4,37.75015,-122.39256,0,1213084237,new_abboip


In [11]:
# Add a location_id column by rounding coordinates to 3 decimal places, then preview.
df = gps_to_grid(df, precision=3)
df.head()

Unnamed: 0,latitude,longitude,occupancy,timestamp,user_id,location_id
0,37.75134,-122.39488,0,1213084687,new_abboip,37.751_-122.395
1,37.75136,-122.39527,0,1213084659,new_abboip,37.751_-122.395
2,37.75199,-122.3946,0,1213084540,new_abboip,37.752_-122.395
3,37.7508,-122.39346,0,1213084489,new_abboip,37.751_-122.393
4,37.75015,-122.39256,0,1213084237,new_abboip,37.75_-122.393


In [12]:
# One row per user_id with the entropy (in bits) of that user's location_id distribution.
entropy_df = compute_location_entropy(df)
entropy_df.head()

Unnamed: 0,user_id,location_entropy
0,new_abboip,10.253027
1,new_abcoij,2.885279
2,new_abdremlu,11.128554
3,new_abgibo,10.851199
4,new_abjoolaw,10.603504


### Insights and Discussion
##### Behavioral Segmentation
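Location entropy allows users to be classified into:
- Routine-based movers (low entropy: visits concentrated in a few locations)
- Exploratory or high-coverage movers (high entropy: visits spread across many locations)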

This segmentation is valuable for both operational optimization and strategic decision-making.

##### Urban Mobility Insights
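The entropy computed above is per user. If entropy is instead computed per location (over the distribution of users visiting it), high-entropy areas often correspond to: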
- Commercial districts
- Tourist zones
- Transport hubs

Low-entropy areas may indicate residential or specialized zones.