# Path Crossing
I started "WarDriving" on my commute to work. I just use the Android app from [WiGLE](https://wigle.net/). The thought crossed my mind that I likely "cross paths" with the same people at different locations on different days, so I wanted to analyze the data to find out.

I started by exporting the local database from the WiGLE app, then exported its network and location tables as CSVs.

In [None]:
import math
import folium #A mapping library
import pandas as pd
from tqdm.notebook import tqdm
from folium.plugins import MarkerCluster

In [None]:
# CSV exports of the network and location tables.
network_file, location_file = (
    "network_202312062229.csv",
    "location_202312062239.csv",
)

In [None]:
# Load both exported tables into DataFrames (network first, then location).
df_network, df_location = (
    pd.read_csv(csv_path) for csv_path in (network_file, location_file)
)

In [None]:
# Columns that are not needed for the analysis, listed per table.
network_columns_to_drop = [
    "frequency",
    "capabilities",
    "type",
    "bestlevel",
    "bestlat",
    "bestlon",
    "rcois",
    "mfgrid",
    "service",
]
location_columns_to_drop = [
    "level",
    "altitude",
    "accuracy",
    "external",
    "mfgrid",
]

In [None]:
# Remove the unneeded columns from the network table, then preview the result.
network_dropped_df = df_network.drop(network_columns_to_drop, axis="columns")
network_dropped_df.head()

In [None]:
# Report the row count next to the number of distinct bssids in the network table.
print(f"Total rows in network table: {len(network_dropped_df)}")
network_bssids = network_dropped_df['bssid']
unique_bssids_network = network_bssids.nunique()
print(f"Total unique bssids in network table: {unique_bssids_network}")

In [None]:
# Remove the unneeded columns from the location table, then preview the result.
location_dropped_df = df_location.drop(location_columns_to_drop, axis="columns")
location_dropped_df.head()

In [None]:
# Report the row count next to the number of distinct bssids in the location table.
print(f"Total rows in location table: {len(location_dropped_df)}")
location_bssids = location_dropped_df['bssid']
unique_bssids_location = location_bssids.nunique()
print(f"Total unique bssids in location table: {unique_bssids_location}")

# Mapping
We use the folium library to map the data points. We center the map using the mean values of our latitude and longitude data points. Then we create a marker cluster. This makes the map more responsive. Without this step, there are too many data points and the browser will crash.

In [None]:
# Centre the overview map on the mean latitude and longitude of all observations.
interactive_map = folium.Map(
    location=[location_dropped_df[axis].mean() for axis in ('lat', 'lon')],
    zoom_start=10,
)

In [None]:
# Attach a marker cluster layer to the map; individual markers are added to
# this cluster rather than to the map directly.
marker_cluster = MarkerCluster().add_to(interactive_map)

In [None]:
# Add a marker for every row whose index is a multiple of 4, using the bssid as popup text.
# iterrows() is a generator with no length, so pass `total` explicitly; without
# it tqdm can only show a running count, not a percentage or ETA.
for index, row in tqdm(location_dropped_df.iterrows(), total=len(location_dropped_df)):
    if index % 4 == 0:  # higher number = fewer data points, lower number = more data points
        folium.Marker([row['lat'], row['lon']], popup=row['bssid']).add_to(marker_cluster)

In [None]:
# Leave the map as the last expression of the cell so the notebook renders it.
interactive_map

### Drop rows with bssids that appear only once.

In [None]:
# Keep only the rows whose bssid occurs more than once in the location table.
sightings_per_bssid = location_dropped_df['bssid'].value_counts()
repeated_bssids = sightings_per_bssid[sightings_per_bssid > 1].index
is_repeated = location_dropped_df['bssid'].isin(repeated_bssids)
multi_location_df = location_dropped_df[is_repeated]
print("Number of rows after filtering:", multi_location_df.shape[0])

### Implement Haversine
This function calculates the distance between two coordinates.

In [None]:
def haversine(lat1, lon1, lat2, lon2, radius=6371.0):
    """Return the great-circle distance between two coordinates.

    Args:
        lat1, lon1: Latitude and longitude of the first point, in degrees.
        lat2, lon2: Latitude and longitude of the second point, in degrees.
        radius: Sphere radius; the result is expressed in the same unit.
            Defaults to 6371.0 (Earth's mean radius in kilometers). Use
            roughly 3958.8 for miles.

    Returns:
        The distance along the sphere's surface, in the unit of ``radius``.
    """
    # Convert latitude and longitude from degrees to radians
    lat1, lon1, lat2, lon2 = map(math.radians, (lat1, lon1, lat2, lon2))

    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = math.sin(dlat / 2) ** 2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2) ** 2

    # Floating-point rounding can push `a` marginally outside [0, 1] (e.g. for
    # near-antipodal points), which would make sqrt(1 - a) raise ValueError.
    # Plain comparisons are used so a NaN input still propagates as NaN.
    if a > 1.0:
        a = 1.0
    elif a < 0.0:
        a = 0.0

    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    return radius * c

In [None]:
# Example usage: distance between two fixed (latitude, longitude) pairs
lat1, lon1, lat2, lon2 = (
    40.7128, -74.0060,   # Coordinates of Location 1
    34.0522, -118.2437,  # Coordinates of Location 2
)

distance = haversine(lat1, lon1, lat2, lon2)
print(f"Distance: {distance} kilometers")

Testing with our data.

In [None]:
# Use positional access (.iloc): the filtered frame keeps the original row
# labels, so labels 0 and 100 may have been dropped and a label lookup such as
# multi_location_df['lat'][0] can raise KeyError.
distance1 = haversine(
    multi_location_df['lat'].iloc[0], multi_location_df['lon'].iloc[0],
    multi_location_df['lat'].iloc[100], multi_location_df['lon'].iloc[100],
)
print(distance1)

In [None]:
# Order the repeated-bssid rows by 'bssid' (ascending)
df_sorted = multi_location_df.sort_values('bssid', ascending=True)

### Calculate max distance
The function below finds the bssid with the maximum distance between two measurements. The call is commented out because it takes a while to run and was only used for testing.

In [None]:
# Initialize variables for the maximum distance and corresponding bssid.
# find_max_distance() updates both of these module-level names.
max_distance = 0
max_distance_bssid = None

def find_max_distance(): # wrapping this in function because it takes forever
    """Find the bssid with the largest distance between any two of its rows.

    Reads ``df_sorted`` and compares every pair of rows within each bssid
    group using ``haversine``. The result is stored in the module-level
    ``max_distance`` / ``max_distance_bssid`` variables and also returned.

    Returns:
        A ``(max_distance, max_distance_bssid)`` tuple.
    """
    # Without this declaration the assignments below would make both names
    # local, and the first comparison would raise UnboundLocalError.
    global max_distance, max_distance_bssid

    # Iterate through the DataFrame, grouped by 'bssid'
    for bssid, group in tqdm(df_sorted.groupby('bssid')):
        # Extract the coordinates once per group; looking rows up with .iloc
        # for every pair is far slower and yields the same values.
        coords = list(zip(group['lat'], group['lon']))

        # Skip groups with less than 2 entries
        if len(coords) < 2:
            continue

        # Calculate pairwise distances within each group
        for i, (lat_a, lon_a) in enumerate(coords):
            for lat_b, lon_b in coords[i + 1:]:
                distance = haversine(lat_a, lon_a, lat_b, lon_b)
                if distance > max_distance:
                    max_distance = distance
                    max_distance_bssid = bssid

    return max_distance, max_distance_bssid
# find_max_distance()

In [None]:
# Print the results
#print("Maximum Distance:", max_distance, "kilometers")
#print("BSSID with furthest distance between two entries:", max_distance_bssid)

# Functionize Mapping
This function takes in a dataframe and a bssid and plots everywhere that bssid was seen.

In [None]:
def create_clustered_map(df, bssid):
    """Build a folium map with a clustered marker for every row of ``bssid``.

    Args:
        df: DataFrame with 'bssid', 'lat', 'lon' and 'time' columns.
        bssid: The bssid value whose rows should be plotted.

    Returns:
        A ``folium.Map`` centered on the first matching row, with one marker
        per matching row (popup text is the row's 'time' value), or ``None``
        when no row matches (a message is printed in that case).
    """
    sightings = df.loc[df['bssid'] == bssid]

    # Nothing to plot for this bssid
    if len(sightings) == 0:
        print(f"No data points found for bssid: {bssid}")
        return None

    # Center the map on the first matching row
    first_sighting = sightings.iloc[0]
    bssid_map = folium.Map(
        location=[first_sighting['lat'], first_sighting['lon']],
        zoom_start=12,
    )

    # All markers go into a single cluster layer attached to the map
    cluster = MarkerCluster()
    cluster.add_to(bssid_map)

    for _, sighting in sightings.iterrows():
        marker = folium.Marker([sighting['lat'], sighting['lon']], popup=sighting['time'])
        marker.add_to(cluster)

    return bssid_map

### Get top 100 max distances
I carry a lot of different devices. As a result, the bssids with the most measurements are devices I carry with me. I'm not interested in those, so we find the top 100 distances between two data points of the same bssid. By plotting the first few, it's clear which ones are devices that belong to me and which ones aren't.

### Warning: This is computationally expensive; it will take a bit.

In [None]:
max_distances = {}  # Dictionary to hold bssid and its maximum distance

for bssid, group in tqdm(df_sorted.groupby('bssid')):
    # Extract the coordinates once per group. Looking rows up with .iloc for
    # every pair repeats the same work inside the O(n^2) loop and is far
    # slower, while producing the same values.
    coords = list(zip(group['lat'], group['lon']))

    # Skip groups with less than 2 entries
    if len(coords) < 2:
        continue

    # Initialize the max distance for the current bssid
    max_distance_for_bssid = 0

    # Calculate pairwise distances within each group (each unordered pair once)
    for i, (lat_a, lon_a) in enumerate(coords):
        for lat_b, lon_b in coords[i + 1:]:
            distance = haversine(lat_a, lon_a, lat_b, lon_b)
            if distance > max_distance_for_bssid:
                max_distance_for_bssid = distance

    # Add the maximum distance for the current bssid to the dictionary
    max_distances[bssid] = max_distance_for_bssid

# Sorting the dictionary by maximum distance (largest first) and getting the top 100
top_100_bssids = dict(sorted(max_distances.items(), key=lambda item: item[1], reverse=True)[:100])

# Display the top 100 bssids as the cell output
top_100_bssids

### Mapping the results
Now we can pass in the bssids we found. Once you get through the first few, which might be devices you carry with you, you will start to see bssids with only a few data points on the map, but that are far away from each other. This is likely a person you've crossed paths with more than once in different locations. Pretty cool, huh?!

In [None]:
# Presumably a redacted placeholder: substitute a bssid from the results above.
target_bssid = 'xx:xx:xx:xx:xx:xx'
map_output = create_clustered_map(df_sorted, target_bssid)
map_output