In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from grid import location_to_cell_id
from math import ceil



# Notebook for attacks 1 and 2 of paper

## 0. Read Dataframes

In [2]:
# Load the two space-separated inputs: the POI catalogue and the raw query log.
pois_df, queries_df = (
    pd.read_csv(csv_path, sep=' ') for csv_path in ('pois.csv', 'queries.csv')
)

for loaded_frame in (pois_df, queries_df):
    print(loaded_frame)

      poi_id  cell_id     poi_type        lat       lon
0        307        1          bar  46.504486  6.559631
1        331        1          bar  46.500259  6.554721
2        578        1  supermarket  46.506017  6.551165
3        897        1         club  46.504494  6.553889
4        972        1  supermarket  46.502984  6.550519
...      ...      ...          ...        ...       ...
1055     372      100   restaurant  46.563852  6.648454
1056     649      100          bar  46.564047  6.640250
1057     730      100          gym  46.565961  6.642228
1058     828      100         club  46.569177  6.642401
1059     990      100          bar  46.565290  6.640855

[1060 rows x 5 columns]
           ip_address        lat       lon   timestamp poi_type_query
0      34.101.177.245  46.532942  6.591174   14.912448      cafeteria
1      34.101.177.245  46.532942  6.591174   14.912448     restaurant
2      34.101.177.245  46.550342  6.602852   18.024657     restaurant
3      34.101.177.245  

## 1. Feature Engineering and Extraction

In [3]:
#Extract cell_id from which query happened
def get_cell_id(row):
    """Return the grid cell for the location stored in one query row.

    :param row: mapping/Series exposing 'lat' and 'lon'
    :return: the value ``location_to_cell_id`` yields for that coordinate pair
    """
    latitude, longitude = row['lat'], row['lon']
    return location_to_cell_id(latitude, longitude)

# apply() hands each row straight to get_cell_id, so no lambda wrapper is needed
queries_df['cell_id'] = queries_df.apply(get_cell_id, axis=1)
print(queries_df)

           ip_address        lat       lon   timestamp poi_type_query  cell_id
0      34.101.177.245  46.532942  6.591174   14.912448      cafeteria       45
1      34.101.177.245  46.532942  6.591174   14.912448     restaurant       45
2      34.101.177.245  46.550342  6.602852   18.024657     restaurant       76
3      34.101.177.245  46.550342  6.602852   18.024657      cafeteria       76
4      34.101.177.245  46.532942  6.591174   36.334539      cafeteria       45
...               ...        ...       ...         ...            ...      ...
20438     11.173.13.2  46.524410  6.625246  449.159554    supermarket       38
20439     11.173.13.2  46.527363  6.628705  453.426750    supermarket       38
20440     11.173.13.2  46.527363  6.628705  453.426750            gym       38
20441     11.173.13.2  46.524410  6.625246  464.420041    supermarket       38
20442     11.173.13.2  46.527363  6.628705  464.420041     restaurant       38

[20443 rows x 6 columns]


In [4]:
#Get day of the query and time
def get_day(row):
    """Return the 1-based day number of a query.

    ``row['timestamp']`` is divided into 24-unit days: [0, 24) is day 1,
    [24, 48) is day 2, and so on.  Floor division keeps the day consistent
    with ``get_hour_of_day``: a timestamp of exactly 24.0 is hour 0 of day 2,
    whereas ``ceil(24 / 24)`` placed it on day 1 and ``ceil(0 / 24)`` produced
    a non-existent day 0.

    :param row: mapping/Series exposing 'timestamp'
    :return: day number as a Python int, starting at 1
    """
    return int(row['timestamp'] // 24) + 1

def get_hour_of_day(row):
    """Return the whole hour within the day: the timestamp modulo 24, truncated to int."""
    hour_with_fraction = row['timestamp'] % 24
    return int(hour_with_fraction)

# Derive both calendar features row by row; the helpers are passed to apply() directly.
for feature_name, extractor in (('day', get_day), ('time', get_hour_of_day)):
    queries_df[feature_name] = queries_df.apply(extractor, axis=1)

#Get daytime
def get_daytime(row):
    """Map the hour in ``row['time']`` to a labelled part of the day.

    Slots are half-open intervals [start, end).  An hour that falls in none of
    them (e.g. 24 or a negative value) yields None.

    :param row: mapping/Series exposing 'time'
    :return: one of '1.Early', '2.Morning', '3.Afternoon', '4.Evening',
             '5.Night', or None
    """
    hour = row['time']
    slots = (
        (0, 9, '1.Early'),
        (9, 12, '2.Morning'),
        (12, 17, '3.Afternoon'),
        (17, 20, '4.Evening'),
        (20, 24, '5.Night'),
    )
    for start, end, label in slots:
        if start <= hour < end:
            return label
    return None
queries_df['daytime'] = queries_df.apply(get_daytime, axis=1)
# Order the log per user, then by day and hour, before printing and exporting it.
sort_keys = ['ip_address', 'day', 'time']
queries_df = queries_df.sort_values(by=sort_keys)
print(queries_df)
queries_df.to_csv('queries_extended.csv')

          ip_address        lat       lon   timestamp poi_type_query  cell_id  \
14110    0.98.248.97  46.546740  6.577377   11.516781      cafeteria       63   
14111    0.98.248.97  46.546740  6.577377   11.516781     restaurant       63   
14112    0.98.248.97  46.510700  6.628843   18.995261           dojo       18   
14113    0.98.248.97  46.510700  6.628843   18.995261    supermarket       18   
14114    0.98.248.97  46.546740  6.577377   34.866475      cafeteria       63   
...              ...        ...       ...         ...            ...      ...   
4151   97.138.146.97  46.520518  6.568483  447.896803            gym       22   
4152   97.138.146.97  46.520518  6.568483  465.134875    supermarket       22   
4153   97.138.146.97  46.503018  6.642813  465.134875            bar       10   
4154   97.138.146.97  46.501978  6.645289  465.134875            gym       10   
4155   97.138.146.97  46.521116  6.640636  465.134875           dojo       40   

       day  time      dayti

## 2.Statistics

In [5]:
!rm queries_grouped.csv
queries_grouped = queries_df.groupby(['ip_address', 'daytime', 'day','cell_id'])\
    .size().sort_index(level=[0,2,1])
#queries_grouped = queries_grouped.where(queries_grouped['count'] > 4).dropna()

queries_grouped = queries_grouped.to_frame(name='count').reset_index()
print(queries_grouped)
queries_grouped.to_csv('queries_grouped.csv')

         ip_address      daytime  day  cell_id  count
0       0.98.248.97    2.Morning    1       63      2
1       0.98.248.97    4.Evening    1       18      2
2       0.98.248.97    2.Morning    2       63      2
3       0.98.248.97      5.Night    2       18      4
4       0.98.248.97    2.Morning    3       63      2
...             ...          ...  ...      ...    ...
9672  97.138.146.97  3.Afternoon   19       22      2
9673  97.138.146.97  3.Afternoon   19       53      2
9674  97.138.146.97    2.Morning   20       10      2
9675  97.138.146.97    2.Morning   20       22      1
9676  97.138.146.97    2.Morning   20       40      1

[9677 rows x 5 columns]


queries_grouped contains, for each ip, for each day and daytime,
the number of queries launched from a certain cell_id.
It could be useful for inferring movement patterns.

In [6]:
!rm max_cell_daytime.csv
queries_grouped_day_removed = queries_grouped.drop(['day'],axis = 1)
max_cell_daytime = queries_grouped_day_removed.groupby(['ip_address','daytime'])\
    .agg(['max'])
max_cell_daytime.to_csv('max_cell_daytime.csv')

max_cell_daytime contains, for each ip and each daytime, the cell
from which the most queries have been made.
It can be used to identify the home/work cell ids in the grid.
The assumption here is that users are, on average,
creatures of habit, so that for each daytime
the set of visited cells does not vary much from day to day.

In [7]:
# Flatten the two-level column header of max_cell_daytime in pandas instead of
# patching the CSV with sed/echo: `sed -i` and `echo -e` are not portable, and
# the in-place edit removed three more data rows every time this cell was re-run.
# The guard makes the cell idempotent: once the frame is flat nothing is rewritten.
if isinstance(max_cell_daytime.columns, pd.MultiIndex):
    max_cell_daytime = max_cell_daytime.reset_index()
    max_cell_daytime.columns = ['ip_address', 'daytime', 'cell_id', 'count']
    # same on-disk result as before: one header line followed by the data rows
    max_cell_daytime.to_csv('max_cell_daytime.csv', index=False)

max_cell_daytime = pd.read_csv('max_cell_daytime.csv', sep=',', header='infer')

In [8]:
# Delete the export of a previous run; the to_csv call at the end of this cell writes it again.
! rm queries_filtered.csv
class Filter:
    """Checks whether a query was issued from the cell recorded for its user and daytime."""

    def __init__(self, max_cell_daytime):
        # Lookup frame; filter_locations reads its 'ip_address', 'daytime'
        # and 'cell_id' columns.
        self.max_cell_daytime = max_cell_daytime

    def filter_locations(self, user, daytime, cell_id):
        """Return True when ``cell_id`` equals the cell stored for (user, daytime).

        The first matching row of the lookup frame is used; an IndexError is
        raised when the frame has no row for the pair.
        """
        lookup = self.max_cell_daytime
        same_user = lookup['ip_address'] == user
        same_slot = lookup['daytime'] == daytime
        reference_cell = lookup.loc[same_user & same_slot, 'cell_id'].to_numpy()[0]
        return not (cell_id != reference_cell)
custom_filter = Filter(max_cell_daytime)
# Element-wise check of every query against the cell stored for its user/daytime.
is_reference_cell = np.vectorize(custom_filter.filter_locations)
queries_df['filter'] = is_reference_cell(
    queries_df['ip_address'], queries_df['daytime'], queries_df['cell_id'])

# Keep the matching rows, drop the helper column and restore chronological order.
kept_rows = queries_df[queries_df['filter'].eq(True)]
queries_df_filtered = kept_rows.drop(columns=['filter']).sort_values(
    by=['ip_address', 'day', 'time', 'daytime'], axis=0)
print(queries_df_filtered)
queries_df_filtered.to_csv('queries_filtered.csv')

          ip_address        lat       lon   timestamp poi_type_query  cell_id  \
14112    0.98.248.97  46.510700  6.628843   18.995261           dojo       18   
14113    0.98.248.97  46.510700  6.628843   18.995261    supermarket       18   
14116    0.98.248.97  46.510700  6.628843   44.419632     restaurant       18   
14117    0.98.248.97  46.510700  6.628843   44.419632           dojo       18   
14118    0.98.248.97  46.513656  6.629130   44.484873    supermarket       18   
...              ...        ...       ...         ...            ...      ...   
4145   97.138.146.97  46.535919  6.575488  420.080602     restaurant       53   
4146   97.138.146.97  46.520518  6.568483  426.834979            gym       22   
4147   97.138.146.97  46.520518  6.568483  426.834979           dojo       22   
4148   97.138.146.97  46.535919  6.575488  444.091082      cafeteria       53   
4149   97.138.146.97  46.535919  6.575488  444.091082     restaurant       53   

       day  time      dayti

Here we keep only the queries issued from the cell that
turned out to be the most interesting one for each user and daytime.

## 3. Infer users' co location using trajectory similarities

### 3.1
Here we try to study, for each user, on a daily basis, their
trajectories and how they differ from day to day. For
computing the similarity, Frechet distance is used.

In [29]:
!pip install similaritymeasures
!rm -r ./daily_trajectories
!mkdir ./daily_trajectories

from similaritymeasures import frechet_dist
import matplotlib.pyplot as plt
plt.ioff()
from sklearn.cluster import DBSCAN

# Distinct users, in order of first appearance in queries_df.
users = queries_df['ip_address'].drop_duplicates().tolist()
# Day numbers 1..20.
days = list(range(1, 21))
user_trajectories_per_day, user_trajectories_cluster = [], []

def compute_distance_matrix(trajectories):
    """Build the symmetric matrix of pairwise Frechet distances.

    Every trajectory is passed through ``np.radians`` before being handed to
    ``frechet_dist``.  The conversion is done once per trajectory instead of
    once per pair, and only pairs with i < j are evaluated: the diagonal
    (distance of a trajectory to itself) keeps the 0 from ``np.zeros``.

    :param trajectories: sequence of (points, 2) arrays of coordinates
    :return: (n, n) numpy array with dist[i, j] == dist[j, i]
    """
    n = len(trajectories)
    dist_m = np.zeros((n, n))
    in_radians = [np.radians(trajectory) for trajectory in trajectories]
    for i in range(n):
        for j in range(i + 1, n):
            dist_m[i, j] = dist_m[j, i] = frechet_dist(in_radians[i], in_radians[j])
    return dist_m

def clustering_by_dbscan(distance_matrix, eps=100):
    """
    Cluster items with DBSCAN from a precomputed pairwise distance matrix.

    :param distance_matrix: square matrix of pairwise distances
    :param eps: neighbourhood radius handed to DBSCAN. Originally documented
        as metres for the Frechet distance, to be converted from radians.
        NOTE(review): presumably it has to be in the same unit as the matrix
        entries; confirm that the default of 100 is meaningful.
    :return: the ``labels_`` array of the fitted model
    """
    model = DBSCAN(eps=eps, min_samples=1, metric='precomputed')
    model.fit(distance_matrix)
    return model.labels_

for user in users:
    user_queries = queries_df[queries_df['ip_address'] == user]
    # Length of this user's longest daily trajectory; shorter days are padded up to it.
    max_points = user_queries.groupby('day').size().max()
    trajectories_per_day = []
    ticks = []
    for day in days:
        points = user_queries.loc[user_queries['day'] == day, ['lat', 'lon']].to_numpy()
        if len(points) == 0:
            # no queries from this user on that day
            continue
        ticks.append(str(day))
        # fill trajectory with last known position
        padding = np.repeat(points[-1:], max_points - len(points), axis=0)
        trajectory = np.vstack((points, padding))
        assert trajectory.shape == (max_points, 2)
        trajectories_per_day.append(trajectory)

    # One heatmap per user: day-by-day distance between trajectories.
    dist_m = compute_distance_matrix(trajectories_per_day)
    sns.set(font_scale = 2)
    fig = plt.figure(figsize=(42,21))
    ax = sns.heatmap(dist_m,
                     linewidths=1,
                     linecolor='white',
                     xticklabels=ticks,
                     yticklabels=ticks)
    plt.savefig(f'./daily_trajectories/{user}-matrix.png')
    plt.close()

You should consider upgrading via the '/home/intx/PycharmProjects/Privacy_Enhancing_Tech_Project/project2/venv/bin/python -m pip install --upgrade pip' command.[0m


Results: User trajectories
are almost the same on a daily basis. In order to validate the results,
we used dbscan clustering and checked if the number of clusters
of trajectories per user was 1.
Frechet distance is a good metric.

### 3.2 - De-anonymization of trajectories attack
Here we compare users' trajectories to see
whether some users have common patterns. In order to reduce the
number of users compared, we consider one day at a time.

### Adversarial Model
The adversary is the service provider.
#### Passive adversary (Honest but curious).
The adversary is in possession of
the logs of all the queries made by users to the service,
containing query contents and metadata.
#### Global adversary:
the adversary view is the whole network (i.e all grids
covered by the service).
#### Computational power:
the adversary is bounded by polynomial time complexity.
#### Background knowledge:
The adversary has some background knowledge on a subset of users, for example
she knows the mapping between IP address and names.
She has also access to side information on users, for example she
can observe their social network profiles.
#### Adversarial Goal:
Gain knowledge about daily similarities in users' trajectories, so as
to identify users through co-location information.
For example, if the adversary learns that on day 5 user 1, Bob, and user 2,
unknown, had very similar trajectories, and she furthermore knows Bob name and surname
(thus having access to a public profile for this user on a social network), if
Bob shares information about co-location (for example being at lunch
with his friend Alice), then the adversary can infer that user 2 is Alice.
(From https://ieeexplore.ieee.org/document/8228621)

In [22]:
def generate_user_id(user):
    """Return the position of ``user`` in the global ``users`` list, used as its numeric id."""
    position = users.index(user)
    return position

queries_df['user_id'] = np.vectorize(generate_user_id)(queries_df['ip_address'])
print(queries_df)
users_trajectories = []
cells = queries_df['cell_id'].drop_duplicates().tolist()
data = []
for day in days:
    day_queries = queries_df[queries_df['day'] == day]
    if day_queries.empty:
        # nothing to compare on a day without queries
        continue
    #find longest trajectory
    max_points = day_queries.groupby('ip_address').size().max()
    trajectories_per_user = []
    # Users that actually own a row of the distance matrix, in matrix order.
    # Only users with enough points are compared, so position k in the
    # clustering labels is compared_users[k], NOT users[k].
    compared_users = []
    ticks = []
    for user in users:
        points = day_queries.loc[day_queries['ip_address'] == user, ['lat', 'lon']].to_numpy()
        if len(points) > max_points/2:
            ticks.append(str(users.index(user))) #user id
            compared_users.append(user)
            #fill trajectory with last known position
            padding = np.repeat(points[-1:], max_points - len(points), axis=0)
            trajectory = np.vstack((points, padding))
            assert trajectory.shape == (max_points, 2)
            trajectories_per_user.append(trajectory)
    if len(trajectories_per_user) >= 2:
        dist_m = compute_distance_matrix(trajectories_per_user)
        km = 0.1 #100 meter difference between trajectories
        km_per_radians = 6371.0088 #mean Earth radius in km (was mistyped as 6731)
        labels = clustering_by_dbscan(dist_m,eps=km/km_per_radians)
        for i in range(len(labels)):
            # the inner range runs to len(labels) so the last trajectory is compared too
            for j in range(i + 1, len(labels)):
                if labels[i] == labels[j] and labels[i] != -1:
                    print(f'{compared_users[i]} similar to {compared_users[j]} on day {day}')
                    data.append( (compared_users[i], compared_users[j], day))
clusterized_users = pd.DataFrame(data, columns=["user_A", "user_B", "day"])
clusterized_users = clusterized_users.sort_values(by=['user_A','day','user_B'])
clusterized_users.to_csv('users_trajectories_similarities.csv')

          ip_address        lat       lon   timestamp poi_type_query  cell_id  \
14110    0.98.248.97  46.546740  6.577377   11.516781      cafeteria       63   
14111    0.98.248.97  46.546740  6.577377   11.516781     restaurant       63   
14112    0.98.248.97  46.510700  6.628843   18.995261           dojo       18   
14113    0.98.248.97  46.510700  6.628843   18.995261    supermarket       18   
14114    0.98.248.97  46.546740  6.577377   34.866475      cafeteria       63   
...              ...        ...       ...         ...            ...      ...   
4151   97.138.146.97  46.520518  6.568483  447.896803            gym       22   
4152   97.138.146.97  46.520518  6.568483  465.134875    supermarket       22   
4153   97.138.146.97  46.503018  6.642813  465.134875            bar       10   
4154   97.138.146.97  46.501978  6.645289  465.134875            gym       10   
4155   97.138.146.97  46.521116  6.640636  465.134875           dojo       40   

       day  time      dayti

## 4. Infer Users' Top Locations - Identity linkage attack

What we are trying to do here is: for each user, for each daytime group,
we want to cluster their locations. By doing so, we can identify,
for example, their top locations. Their house would be for
example the centroid of the cluster of locations with most points in daytime group = home.

### Adversarial Model
The adversary is the service provider.
#### Passive adversary (Honest but curious).
The adversary is in possession of
the logs of all the queries made by users to the service, containing query contents
and metadata.
#### Global adversary:
the adversary view is the whole network (i.e all grids
covered by the service).
#### Computational power:
the adversary is bounded by polynomial time complexity.
#### Background knowledge:
If the adversary has some background knowledge on users, for example
she knows the mapping between IP address and names, an identity linkage attack
is possible.
#### Adversarial Goal:
Infer users' top locations (i.e home, workplace,
frequent places visited during leisure time) to later be able
to re-identify users (identity inference attack).

In [24]:
!rm -r ./location_clusters
!mkdir ./location_clusters

daytime = {}
daytime['home'] = ['1.Early', '5.Night']
daytime['work'] = ['2.Morning', '3.Afternoon']
daytime['leisure'] = ['4.Evening']

data = []
for user in users:
    entry = []
    entry.append(user)
    for dt in daytime.keys():
        for moment in daytime[dt]:
            serie = queries_df[(queries_df['ip_address'] == user) & (queries_df['daytime'] == moment)][['lat','lon']]
        x = []
        y = []
        for row in serie.itertuples():
            x.append(row[1])
            y.append(row[2])
        n = len(x)
        if n > 2:
            coords = np.zeros((n,2))
            coords[:,0] = x
            coords[:,1] = y

            #convert to randians
            coords = np.radians(coords)
            km = 0.01 #10 meter accuracy
            km_per_radians = 6731.0
            db = DBSCAN(eps=km/km_per_radians, min_samples=2, algorithm='ball_tree', metric='haversine').fit(coords)
            fig = plt.figure()
            fig.set_size_inches(10,8)
            plt.scatter(x,y,c=db.labels_)
            plt.savefig(f"./location_clusters/{user}_{dt}.png")
            plt.close()

            clusters_noise = np.array(db.labels_)
            clusters = np.array([c for c in clusters_noise if c >= 0])
            count = np.bincount(clusters)
            top_cluster = clusters[np.argmax(count)]
            x_top = []
            y_top = []
            for i in range(0,len(clusters_noise)):
                if clusters_noise[i] == top_cluster:
                    x_top.append(x[i])
                    y_top.append(y[i])

            lat = np.array(x_top).mean()
            lon = np.array(y_top).mean()

            entry.append(lat)
            entry.append(lon)
    data.append(entry)
top_loc = pd.DataFrame(data, columns=
                       ['ip_address', 'home_lat','home_lon','work_lat','work_lon', 'leisure_lan', 'leisure_lon'])
print(top_loc)
top_loc.to_csv('top_locations.csv')

          ip_address   home_lat  home_lon   work_lat  work_lon  leisure_lan  \
0        0.98.248.97  46.510700  6.628843  46.510700  6.628843    46.510700   
1      10.229.150.53  46.562529  6.596980  46.546377  6.575353    46.562529   
2      100.255.65.73  46.549880  6.609449  46.527792  6.597571    46.555607   
3    101.193.212.180  46.537596  6.627838  46.535992  6.622526          NaN   
4     103.107.27.105  46.539250  6.629559  46.546377  6.575353    46.539250   
..               ...        ...       ...        ...       ...          ...   
195   94.220.204.193  46.557610  6.563681  46.567488  6.622811    46.557039   
196    94.223.68.192  46.538914  6.604344  46.527792  6.597571    46.536827   
197   95.146.245.203  46.562945  6.584247  46.521844  6.582789    46.562945   
198   96.128.153.246  46.560172  6.648720  46.525781  6.600312    46.557310   
199    97.138.146.97  46.517607  6.560419  46.520518  6.568483    46.520518   

     leisure_lon  
0       6.628843  
1       6.596