In [1]:
import pandas as pd
import numpy as np
import folium

### Helper functions

In [2]:
""" Categories """
def interval_mon_fri_08_17(row):
    # Monday to Friday from 8:00 to 17:00
    hour_of_day: int = row['timestamp'] % 24
    day: int = int(row['timestamp'] / 24) % 7
    if 8 <= hour_of_day < 17 and day < 5:
        return 1
    else:
        return 0

def interval_mon_fri_17_00(row):
    # Monday to Friday from 17:00 to 00:00
    hour_of_day: int = row['timestamp'] % 24
    day: int = int(row['timestamp'] / 24) % 7
    if 17 <= hour_of_day <= 23 and day < 5:
        return 1
    else:
        return 0
    
def interval_sat_sun_08_00(row):
    # Weekend from 8:00 to 00:00
    hour_of_day: int = row['timestamp'] % 24
    day: int = int(row['timestamp'] / 24) % 7
    if 8 <= hour_of_day <= 23 and day >= 5:
        return 1
    else:
        return 0
    
def interval_night_00_08(row):
    # Night (00:00 to 8:00)
    hour_of_day: int = row['timestamp'] % 24
    if 0 <= hour_of_day < 8:
        return 1
    else:
        return 0
    
def days_from_start(row):
    return int(row['timestamp'] / 24)

def day_and_hour(row):
    hours = round(row['timestamp'])
    days = hours // 24
    remaining_hours = hours % 24
    day_of_week = (days) % 7 
    time = f"{remaining_hours:02d}:00"
    day_mapping = {
        0: "Monday",
        1: "Tuesday",
        2: "Wednesday",
        3: "Thursday",
        4: "Friday",
        5: "Saturday",
        6: "Sunday"
    }
    return day_mapping[day_of_week], time

### Load data

In [3]:
""" Load data """
queries = pd.read_csv('queries.csv', sep=" ")


In [4]:
""" Add columns """
queries['mon_fri_08_17'] = queries.apply(interval_mon_fri_08_17, axis=1)
queries['mon_fri_17_00'] = queries.apply(interval_mon_fri_17_00, axis=1)
queries['sat_sun_08_00'] = queries.apply(interval_sat_sun_08_00, axis=1)
queries['night_00_08'] = queries.apply(interval_night_00_08, axis=1)
queries['days_from_start'] = queries.apply(days_from_start, axis=1)
queries['day_and_hour'] = queries.apply(day_and_hour, axis=1)
queries.head()

Unnamed: 0,ip_address,lat,lon,timestamp,poi_type_query,mon_fri_08_17,mon_fri_17_00,sat_sun_08_00,night_00_08,days_from_start,day_and_hour
0,34.101.177.245,46.532942,6.591174,14.912448,cafeteria,1,0,0,0,0,"(Monday, 15:00)"
1,34.101.177.245,46.532942,6.591174,14.912448,restaurant,1,0,0,0,0,"(Monday, 15:00)"
2,34.101.177.245,46.550342,6.602852,18.024657,restaurant,0,1,0,0,0,"(Monday, 18:00)"
3,34.101.177.245,46.550342,6.602852,18.024657,cafeteria,0,1,0,0,0,"(Monday, 18:00)"
4,34.101.177.245,46.532942,6.591174,36.334539,cafeteria,1,0,0,0,1,"(Tuesday, 12:00)"


### Analysis of User X

In [5]:
# IP address of the user we want to analyze
IP_ADDRESS = "146.71.112.211"

Infer where the user lives and works. 

In [6]:
# Get all queries of user X
user_x = queries[queries["ip_address"]==IP_ADDRESS]
# Group by lat, lon and count the number of unique days, total number of queries, and the number of queries in each category
user_x = user_x.groupby(["lat", "lon"]).agg({"days_from_start": lambda x: len(set(x)), "timestamp": "count", "mon_fri_08_17": "sum", "mon_fri_17_00": "sum", "sat_sun_08_00": "sum", "night_00_08": "sum", "poi_type_query": lambda x: list(set(x))})
user_x = user_x.sort_values(by=["days_from_start"], ascending=False)
user_x = user_x.reset_index()
# Rename columns
user_x = user_x.rename(columns={"lat": "Latitude", "lon":"Longitude", "timestamp": "Total number of queries", "days_from_start": "Number of days", "mon_fri_08_17": "Mon-Fri 08:00 - 17:00", "mon_fri_17_00": "Mon-Fri 17:00 - 00:00", "sat_sun_08_00": "Sat-Sun 08:00 - 00:00", "night_00_08": "Night 00:00 - 08:00", "poi_type_query": "POI types"})
# Latex table
user_x.style.to_latex("latex/user_x_1.tex")
# Show
user_x.head(20)


Unnamed: 0,Latitude,Longitude,Number of days,Total number of queries,Mon-Fri 08:00 - 17:00,Mon-Fri 17:00 - 00:00,Sat-Sun 08:00 - 00:00,Night 00:00 - 08:00,POI types
0,46.530865,6.623209,20,35,0,30,5,0,"[supermarket, club, dojo, gym]"
1,46.535919,6.575488,15,30,30,0,0,0,"[cafeteria, restaurant]"
2,46.529156,6.627356,13,22,0,18,4,0,"[supermarket, club, dojo, gym]"
3,46.532748,6.628007,6,11,0,10,1,0,"[supermarket, club, dojo, gym]"
4,46.509751,6.641607,1,1,0,0,1,0,[club]
5,46.522632,6.564536,1,1,0,0,1,0,[dojo]
6,46.526904,6.562937,1,1,0,0,1,0,[gym]
7,46.564457,6.5668,1,1,0,0,1,0,[gym]
8,46.566729,6.550642,1,1,0,0,1,0,[gym]


Find user's interests based on types of POIs that were queried

In [7]:
# Get all queries of user X
user_x = queries[queries["ip_address"]==IP_ADDRESS]
user_x = user_x.groupby(["poi_type_query"]).agg({"days_from_start": lambda x: len(set(x)), "timestamp": "count", "mon_fri_08_17": "sum", "mon_fri_17_00": "sum", "sat_sun_08_00": "sum", "night_00_08": "sum"})
user_x = user_x.sort_values(by=["days_from_start"], ascending=False)
user_x = user_x.reset_index()
# Rename columns
user_x = user_x.rename(columns={"timestamp": "Total number of queries", "days_from_start": "Number of days", "mon_fri_08_17": "Mon-Fri 08:00 - 17:00", "mon_fri_17_00": "Mon-Fri 17:00 - 00:00", "sat_sun_08_00": "Sat-Sun 08:00 - 00:00", "night_00_08": "Night 00:00 - 08:00"})
# Latex table
user_x.style.to_latex("latex/user_x_2.tex")
# Show
user_x.head(20)



Unnamed: 0,poi_type_query,Number of days,Total number of queries,Mon-Fri 08:00 - 17:00,Mon-Fri 17:00 - 00:00,Sat-Sun 08:00 - 00:00,Night 00:00 - 08:00
0,supermarket,18,20,0,16,4,0
1,club,17,19,0,14,5,0
2,gym,16,19,0,15,4,0
3,cafeteria,15,15,15,0,0,0
4,restaurant,15,15,15,0,0,0
5,dojo,13,15,0,13,2,0


### Detailed analysis of User X

In [50]:
# get all queries of user X
user_x = queries[queries["ip_address"]==IP_ADDRESS]
# create map
map = folium.Map(location=[user_x['lat'].mean(), user_x['lon'].mean()], zoom_start=13)
# create a dictionary to store data for each location
day_and_hour_dict = {}
for index, row in user_x.iterrows():
    location = (row['lat'], row['lon'])
    day_and_hour = row['day_and_hour']
    poi_type_query = row['poi_type_query']

    day = day_and_hour[0]
    hour = day_and_hour[1]

    if location not in day_and_hour_dict:
        day_and_hour_dict[location] = []
    popup = "{}, {} - {}".format(day, hour, poi_type_query)
    day_and_hour_dict[location].append(popup)

# add markers to the map using the timestamps in the dictionary
for location, popups in day_and_hour_dict.items():
    iframe = folium.IFrame('<br>'.join(popups))
    popup = folium.Popup(iframe,
                     min_width=300,
                     max_width=300)
    marker = folium.Marker(location=location,
                       popup=popup).add_to(map)
# view map
map

### Defence: Spatial obfuscation (Geo-indistinguishability)

In [51]:
""" Helper functions """
def get_epsilon_star(p_epsilon_star):
    return np.log(1/p_epsilon_star - 1)

def get_epsilon(epsilon_star, r_star):
    return epsilon_star/r_star

def get_r_bar(epsilon):
    # average loss (utility)
    return 2/epsilon

def get_r_95(r_bar):
    return 2.37*r_bar

def add_laplace_noise(x, epsilon):
    return x + np.random.laplace(loc=0, scale=epsilon)

In [52]:
# lower bound of the adversary error
p_epsilon_star = 0.4
# radius of geo-indistinguishability in meters
r_star = 200

In [56]:
epsilon_star = get_epsilon_star(p_epsilon_star)
epsilon = get_epsilon(epsilon_star, r_star)
r_bar = get_r_bar(epsilon)
r_95 = get_r_95(r_bar)

print("epsilon_star = {}".format(round(epsilon_star, 5)))
print("epsilon = {}".format(round(epsilon, 5)))
print("r_bar = {} meters".format(round(r_bar)))
print("r_95 = {} meters".format(round(r_95)))

epsilon_star = 0.40547
epsilon = 0.00203
r_bar = 987 meters
r_95 = 2338 meters


In [54]:
map_defense = folium.Map(location=[user_x['lat'].mean(), user_x['lon'].mean()], zoom_start=13)
for index, row in user_x.iterrows():
    # get user's location
    location = (row['lat'], row['lon'])
    # add noise to the location
    noisy_location = (add_laplace_noise(location[0], epsilon), add_laplace_noise(location[1], epsilon))
    # add marker to the map
    folium.Marker(location=noisy_location, popup=row['poi_type_query']).add_to(map_defense)
# view map
map_defense