In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from grid import location_to_cell_id
from math import ceil



## 0. Read Dataframes

In [None]:
# Load the two space-separated input tables.
pois_df = pd.read_csv('pois.csv', sep=' ')
queries_df = pd.read_csv('queries.csv', sep=' ')

# Show both frames, POIs first, then queries.
for loaded_frame in (pois_df, queries_df):
    print(loaded_frame)

## 1. Data cleaning

In [None]:
# Attach to every query the grid cell it was issued from.
def get_cell_id(row):
    """Return the grid cell id computed from the row's 'lat' and 'lon' values."""
    latitude, longitude = row['lat'], row['lon']
    return location_to_cell_id(latitude, longitude)

queries_df['cell_id'] = queries_df.apply(get_cell_id, axis=1)
print(queries_df)

In [None]:
#Get day of the query and time
def get_day(row):
    """Return the 1-based day number that the row's 'timestamp' falls in.

    The timestamp is treated as a number of hours (get_hour_of_day takes it
    modulo 24), so day 1 covers [0, 24), day 2 covers [24, 48), and so on.

    Floor-division is used instead of ceil(timestamp / 24): ceil maps a
    timestamp of exactly 0 to day 0 and an exact multiple of 24 to the
    previous day, even though get_hour_of_day reports hour 0 (the start of
    the next day) for that same timestamp.

    :param row: mapping-like object exposing a numeric 'timestamp' entry
    :return: day number as an int, starting at 1
    """
    return int(row['timestamp'] // 24) + 1

def get_hour_of_day(row):
    """Return the row's 'timestamp' modulo 24, truncated to an int."""
    hour_in_day = row['timestamp'] % 24
    return int(hour_in_day)

# Derive the day number and the hour of day of every query, in that order.
for derived_column, extractor in (('day', get_day), ('time', get_hour_of_day)):
    queries_df[derived_column] = queries_df.apply(extractor, axis=1)

#Get daytime
def get_daytime(row):
    """Map the row's 'time' (hour of day) to a labelled daytime band.

    Bands are half-open [start, end) hour ranges. A value outside 0..24
    matches no band and yields None.
    """
    hour = row['time']
    bands = (
        (0, 9, '1.Early'),
        (9, 12, '2.Morning'),
        (12, 17, '3.Afternoon'),
        (17, 20, '4.Evening'),
        (20, 24, '5.Night'),
    )
    for start, end, label in bands:
        if start <= hour < end:
            return label
    return None
# Label each query with its daytime band, then order rows per user chronologically.
queries_df['daytime'] = queries_df.apply(get_daytime, axis=1)
sort_columns = ['ip_address', 'day', 'time']
queries_df = queries_df.sort_values(by=sort_columns)
print(queries_df)
queries_df.to_csv('queries_extended.csv', index=False)

## 2.Statistics

In [None]:
!rm queries_grouped.csv
queries_grouped = queries_df.groupby(['ip_address', 'daytime', 'day','cell_id'])\
    .size().sort_index(level=[0,2,1])
#queries_grouped = queries_grouped.where(queries_grouped['count'] > 4).dropna()

queries_grouped = queries_grouped.to_frame(name='count').reset_index()
print(queries_grouped)
queries_grouped.to_csv('queries_grouped.csv')

queries_grouped contains, for each ip, for each day and daytime,
the number of queries launched from a certain cell_id.
It could be useful for inferring movement patterns.

In [None]:
!rm max_cell_daytime.csv
queries_grouped_day_removed = queries_grouped.drop(['day'],axis = 1)
max_cell_daytime = queries_grouped_day_removed.groupby(['ip_address','daytime'])\
    .agg(['max'])
max_cell_daytime.to_csv('max_cell_daytime.csv')

max_cell_daytime contains, for each ip, for each daytime, the cell
from which most queries have been made.
It could be useful to identify the home/work cell id in the grid.
The assumption here is that users are, on average,
creatures of habit, so that each day, for each daytime,
the set of cells does not vary too much.

In [None]:
# Flatten max_cell_daytime: drop the second header level ('max') and turn the
# (ip_address, daytime) index back into ordinary columns.
# This is done in pandas instead of rewriting the CSV with sed/echo, whose
# behaviour depends on the shell and platform and which removes data rows
# when the cell is executed a second time. The isinstance guard makes a
# re-run of this cell a no-op on an already flattened frame.
if isinstance(max_cell_daytime.columns, pd.MultiIndex):
    max_cell_daytime = max_cell_daytime.droplevel(1, axis=1).reset_index()
max_cell_daytime = max_cell_daytime[['ip_address', 'daytime', 'cell_id', 'count']]

# Leave the same flat file on disk that the shell commands used to produce.
max_cell_daytime.to_csv('max_cell_daytime.csv', index=False)

In [None]:
# Remove the previous export; queries_filtered.csv is written again at the end of this cell.
! rm queries_filtered.csv
class Filter:
    """Tell whether a query was issued from a user's top cell for a daytime.

    :param max_cell_daytime: frame with 'ip_address', 'daytime' and 'cell_id'
        columns, one reference cell per (ip_address, daytime) pair
    """

    def __init__(self, max_cell_daytime):
        self.max_cell_daytime = max_cell_daytime
        # (user, daytime) -> cell_id, built once so that each query is a
        # dictionary lookup instead of a scan over the whole frame.
        # setdefault keeps the first row when a pair appears more than once,
        # matching the previous "take element [0]" behaviour.
        self._top_cells = {}
        rows = zip(max_cell_daytime['ip_address'],
                   max_cell_daytime['daytime'],
                   max_cell_daytime['cell_id'])
        for user, daytime, cell in rows:
            self._top_cells.setdefault((user, daytime), cell)

    def filter_locations(self, user, daytime, cell_id):
        """Return True when cell_id is the reference cell of (user, daytime).

        A (user, daytime) pair with no reference cell returns False instead
        of raising IndexError.
        """
        top_cell = self._top_cells.get((user, daytime))
        if top_cell is None:
            return False
        return bool(cell_id == top_cell)
# Flag every query whose cell equals the reference cell of its (user, daytime).
custom_filter = Filter(max_cell_daytime)
is_top_cell = np.vectorize(custom_filter.filter_locations)
queries_df['filter'] = is_top_cell(
    queries_df['ip_address'],
    queries_df['daytime'],
    queries_df['cell_id'],
)

# Keep the flagged rows only, drop the helper column and order chronologically per user.
flagged_rows = queries_df[queries_df['filter'] == True]
queries_df_filtered = flagged_rows.drop(columns=['filter'])
queries_df_filtered = queries_df_filtered.sort_values(
    by=['ip_address', 'day', 'time', 'daytime'], axis=0)
print(queries_df_filtered)
queries_df_filtered.to_csv('queries_filtered.csv')

Filtering queries, keeping only those issued from the cell_id which
turned out to be the most interesting cell for each user and daytime.

In [None]:
from dataclasses import make_dataclass

# Record types: a single observed query ("pattern") and the user/day it belongs to ("trace").
pattern = make_dataclass("pattern", [("lat", np.float32), ("lon", np.float32), ('time', int), ("pois",str)])
trace = make_dataclass("trace", [("user", str), ("day", int), ("pattern", pattern)])
users = queries_df['ip_address'].drop_duplicates().tolist()
days = list(range(1, 21))
trace_columns = ['lat', 'lon', 'time', 'poi_type_query']

# One trace per query, visited user by user and day by day.
traces = []
for user in users:
    is_user = queries_df['ip_address'] == user
    for day in days:
        serie = queries_df[is_user & (queries_df['day'] == day)][trace_columns]
        # itertuples() yields (index, lat, lon, time, poi_type_query).
        for _, lat, lon, hour, poi in serie.itertuples():
            traces.append(trace(user, day, pattern(lat, lon, hour, poi)))

traces_df = pd.DataFrame(traces)
print(traces_df)
traces_df.to_csv('traces.csv', sep=',', header=True, index=False)

not so useful stuff...

## 3. Infer users' co location using trajectory similarities

### 3.1
Here we try to study, for each user, on a daily basis, their
trajectories and how they differ from day to day. For
computing the similarity, Frechet distance is used.

In [None]:
!pip install similaritymeasures
!rm -r ./daily_trajectories
!mkdir ./daily_trajectories

from similaritymeasures import frechet_dist
import matplotlib.pyplot as plt
plt.ioff()
from sklearn.cluster import DBSCAN

# Distinct users in order of first appearance, and the day numbers to analyse (1..20).
users = queries_df['ip_address'].drop_duplicates().tolist()
days = list(range(1, 21))
# Accumulators for the per-user analysis.
user_trajectories_per_day, user_trajectories_cluster = [], []

def compute_distance_matrix(trajectories):
    """Return the symmetric matrix of pairwise Frechet distances.

    Every trajectory is passed through np.radians before being compared with
    frechet_dist, so the distances are expressed on the radian scale of the
    converted coordinates.

    :param trajectories: sequence of trajectories; callers in this notebook
        pass arrays of shape (n_points, 2) holding lat/lon values
    :return: (n, n) numpy array with a zero diagonal, n = len(trajectories)
    """
    n = len(trajectories)
    dist_m = np.zeros((n, n))
    # Convert each trajectory once rather than on every pair it takes part in.
    in_radians = [np.radians(trajectory) for trajectory in trajectories]
    for i in range(n):
        # The diagonal stays 0 (distance of a trajectory to itself), so only
        # the pairs above it are computed and mirrored.
        for j in range(i + 1, n):
            dist_m[i, j] = dist_m[j, i] = frechet_dist(in_radians[i], in_radians[j])
    return dist_m

def clustering_by_dbscan(distance_matrix, eps=100):
    """
    Cluster items with DBSCAN using a precomputed pairwise distance matrix.

    :param distance_matrix: square matrix of pairwise distances
    :param eps: neighbourhood radius passed to DBSCAN. Originally documented
        as "unit m for Frechet distance. Should be converted from radians".
    :return: the labels_ array of the fitted model
    """
    model = DBSCAN(metric='precomputed', min_samples=1, eps=eps)
    model.fit(distance_matrix)
    return model.labels_

# Radius used to decide that two daily trajectories of the same user belong
# together. compute_distance_matrix works on radian-converted coordinates, so
# the radius is converted from kilometres to radians too. The previous call
# used the default eps=100 on that radian-scale matrix, which put every
# trajectory in one cluster regardless of the data.
daily_eps_km = 0.1  # 100 m
earth_radius_km = 6371.0
daily_eps = daily_eps_km / earth_radius_km

sns.set(font_scale = 2)
users_with_several_clusters = []

for user in users:
    user_queries = queries_df[queries_df['ip_address'] == user]
    # Longest daily trajectory of this user; shorter days are padded to it.
    max_points = int(user_queries.groupby('day').size().max())
    trajectories_per_day = []
    ticks = []
    for day in days:
        points = user_queries.loc[user_queries['day'] == day, ['lat', 'lon']].to_numpy(dtype=float)
        if len(points) == 0:
            continue
        ticks.append(str(day))
        # fill trajectory with last known position
        padding = np.repeat(points[-1:], max_points - len(points), axis=0)
        trajectory = np.vstack([points, padding])
        assert trajectory.shape == (max_points, 2)
        trajectories_per_day.append(trajectory)
    user_trajectories_per_day.append(trajectories_per_day)

    # A user without any query in the analysed days has nothing to compare.
    if not trajectories_per_day:
        continue

    dist_m = compute_distance_matrix(trajectories_per_day)
    fig = plt.figure(figsize=(42,21))
    ax = sns.heatmap(dist_m, linewidths=1,
                     linecolor='white',
                     xticklabels=ticks,
                     yticklabels=ticks)
    plt.savefig(f'./daily_trajectories/{user}-matrix.png')
    plt.close(fig)

    labels = clustering_by_dbscan(dist_m, eps=daily_eps)
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    # Report instead of asserting: the notebook still runs to the end, as it
    # always did, but the outcome now depends on the data.
    if n_clusters_ != 1:
        users_with_several_clusters.append((user, n_clusters_))
        print(f'{user}: daily trajectories form {n_clusters_} clusters')

print(f'{len(users_with_several_clusters)} of {len(users)} users have daily trajectories in more than one cluster')

Results: User trajectories
are almost the same on a daily basis. In order to validate the results,
we used dbscan clustering and checked if the number of clusters
of trajectories per user was 1.
Frechet distance is a good metric.

### 3.2
Here we compare users' trajectories to see
if some users have common patterns. In order to reduce the
number of users compared, we considered one day at a time.

### Adversarial Model
The adversary is the service provider.
#### Passive adversary (Honest but curious).
The adversary is in possession of
the logs of all the queries made by users to the service,
containing queries contents and metadata.
#### Global adversary:
the adversary view is the whole network (i.e all grids
covered by the service).
#### Computational power:
the adversary is bounded by polynomial time complexity.
#### Background knowledge:
The adversary has some background knowledge on a subset of users, for example
she knows the mapping between IP address and names.
She has also access to side information on users, for example she
can observe their social network profiles.
#### Adversarial Goal:
Gain knowledge about daily similarities in users' trajectories, so as
to identify users through co-location information.
For example, if the adversary learns that on day 5 user 1, Bob, and user 2,
unknown, had very similar trajectories, and she furthermore knows Bob's name and surname
(thus having access to a public profile for this user on a social network), if
Bob shares information about co-location (for example being at lunch
with his friend Alice), then the adversary can infer that user 2 is Alice.
(From https://ieeexplore.ieee.org/document/8228621)

In [None]:
def generate_user_id(user):
    """Return the position of `user` in the notebook-level `users` list."""
    position = users.index(user)
    return position

queries_df['user_id'] = np.vectorize(generate_user_id)(queries_df['ip_address'])
print(queries_df)
users_trajectories = []
cells = queries_df['cell_id'].drop_duplicates().tolist()
data = []

# Two trajectories are considered similar when their Frechet distance is below
# 100 m. The distance matrix is on a radian scale, so the radius is converted
# using the mean Earth radius (6371 km; the previous value 6731 was a typo).
km = 0.1 #100 meter difference between trajectories
km_per_radians = 6371.0
similarity_eps = km / km_per_radians

for day in days:
    day_queries = queries_df[queries_df['day'] == day]
    if day_queries.empty:
        # Nobody queried on this day: nothing to compare.
        continue

    # lat/lon points of each user active on this day, in query order.
    points_by_user = {
        user: group[['lat', 'lon']].to_numpy(dtype=float)
        for user, group in day_queries.groupby('ip_address', sort=False)
    }
    #find longest trajectory
    max_points = max(len(points) for points in points_by_user.values())

    # Users kept for this day, position-aligned with trajectories_per_user
    # (and therefore with the DBSCAN labels computed below).
    compared_users = []
    trajectories_per_user = []
    for user in users:
        points = points_by_user.get(user)
        # Only compare users with more than half the points of the longest trajectory.
        if points is None or len(points) <= max_points / 2:
            continue
        #fill trajectory with last known position
        padding = np.repeat(points[-1:], max_points - len(points), axis=0)
        trajectory = np.vstack([points, padding])
        assert trajectory.shape == (max_points, 2)
        compared_users.append(user)
        trajectories_per_user.append(trajectory)

    if len(trajectories_per_user) < 2:
        continue

    dist_m = compute_distance_matrix(trajectories_per_user)
    labels = clustering_by_dbscan(dist_m, eps=similarity_eps)
    for i in range(len(labels)):
        # Runs up to and including the last trajectory; the previous upper
        # bound len(labels) - 1 never compared anything with the last one.
        for j in range(i + 1, len(labels)):
            if labels[i] == labels[j] and labels[i] != -1:
                # Label positions refer to the users kept for this day, not
                # to positions in the full `users` list.
                user_a, user_b = compared_users[i], compared_users[j]
                print(f'{user_a} similar to {user_b} on day {day}')
                data.append( (user_a, user_b, day))
clusterized_users = pd.DataFrame(data, columns=["user_A", "user_B", "day"])
clusterized_users.to_csv('users_trajectories_similarities.csv')

## 4. Infer Users' Top Locations

What we are trying to do here is: for each user, for each daytime group,
we want to cluster their locations. By doing so, we can identify,
for example, their top locations. Their house would be for
example the centroid of the cluster of locations with most points in daytime group = home.

### Adversarial Model
The adversary is the service provider.
#### Passive adversary (Honest but curious).
The adversary is in possession of
the logs of all the queries made by users to the service, containing queries contents
and metadata.
#### Global adversary:
the adversary view is the whole network (i.e all grids
covered by the service).
#### Computational power:
the adversary is bounded by polynomial time complexity.
#### Background knowledge:
If the adversary has some background knowledge on users, for example
she knows the mapping between IP address and names, an identity linkage attack
is possible.
#### Adversarial Goal:
Infer users' top locations (i.e home, workplace,
frequent places visited during leisure time) to later be able
to re-identify users (identity inference attack).

In [None]:
!rm -r ./location_clusters
!mkdir ./location_clusters

daytime = {}
daytime['home'] = ['1.Early', '5.Night']
daytime['work'] = ['2.Morning', '3.Afternoon']
daytime['leisure'] = ['4.Evening']

data = []
for user in users:
    entry = []
    entry.append(user)
    for dt in daytime.keys():
        for moment in daytime[dt]:
            serie = queries_df[(queries_df['ip_address'] == user) & (queries_df['daytime'] == moment)][['lat','lon']]
        x = []
        y = []
        for row in serie.itertuples():
            x.append(row[1])
            y.append(row[2])
        n = len(x)
        if n > 2:
            coords = np.zeros((n,2))
            coords[:,0] = x
            coords[:,1] = y

            #convert to randians
            coords = np.radians(coords)
            km = 0.01 #10 meter accuracy
            km_per_radians = 6731.0
            db = DBSCAN(eps=km/km_per_radians, min_samples=2, algorithm='ball_tree', metric='haversine').fit(coords)
            fig = plt.figure()
            fig.set_size_inches(10,8)
            plt.scatter(x,y,c=db.labels_)
            plt.savefig(f"./location_clusters/{user}_{dt}.png")
            plt.close()

            clusters_noise = np.array(db.labels_)
            clusters = np.array([c for c in clusters_noise if c >= 0])
            count = np.bincount(clusters)
            top_cluster = clusters[np.argmax(count)]
            x_top = []
            y_top = []
            for i in range(0,len(clusters_noise)):
                if clusters_noise[i] == top_cluster:
                    x_top.append(x[i])
                    y_top.append(y[i])

            lat = np.array(x_top).mean()
            lon = np.array(y_top).mean()

            entry.append(lat)
            entry.append(lon)
    data.append(entry)
top_loc = pd.DataFrame(data, columns=
                       ['ip_address', 'home_lat','home_lon','work_lat','work_lon', 'leisure_lan', 'leisure_lon'])
print(top_loc)
top_loc.to_csv('top_locations.csv')