### 490 Demo of Unicity Calculations on GeoLife
**Topics**: Privacy, Unicity

DOI: 10.1038/srep01376

*Importance*: Calculating the number of similar trajectories present in the dataset

In [1]:
import pandas as pd 
import numpy as np 
from lib import preprocess, unique, geoSanitize as geo

In [2]:
## Proof: One User Compared to themselves
# Baseline check: build a dataset that holds a single user, so the uniqueness
# measure can be run on a population of one.
# One User ('000') for a single month (the 2008-10 .. 2008-11 window)
data = geo.filterUserMonthRange('000', '2008-10', '2008-11')
# Plain-text dump of the filtered frame, followed by a blank line
print(data, end='\n\n')
# Only keep 2 decimals of lon/lat location
oneUserMonth = unique.locationPrecision(data, 2)
# Create location ids for each unique location and store them in a new 'loc_id' column
# NOTE(review): the "unique location ids of / total number of records" summary seen in
# the output is presumably printed by one of the unique.* helpers — confirm in lib.unique
oneUserMonth['loc_id'] = unique.generateLocationID(oneUserMonth)

      UID   Latitude   Longitude        Date      Time
0     000  39.984702  116.318417  2008-10-23  02:53:04
1     000  39.984683  116.318450  2008-10-23  02:53:10
2     000  39.984686  116.318417  2008-10-23  02:53:15
3     000  39.984688  116.318385  2008-10-23  02:53:20
4     000  39.984655  116.318263  2008-10-23  02:53:25
...   ...        ...         ...         ...       ...
7740  000  39.993796  116.326082  2008-11-23  10:29:48
7741  000  39.993851  116.325955  2008-11-23  10:29:53
7742  000  39.993951  116.325843  2008-11-23  10:29:58
7743  000  39.994113  116.325732  2008-11-23  10:30:03
7744  000  39.994259  116.325632  2008-11-23  10:30:08

[11372 rows x 5 columns]

42 unique location ids of 
11372 total number of records.


In [3]:
# Therefore: With one in a population, they are completely unique
# Arguments: 5 (presumably the number of points sampled per trace — TODO confirm
# against lib.unique), the full frame, and the same frame grouped by 'UID'.
# As the last expression of the cell, the returned rate is shown as the cell result.
unique.inTheCrowd(5, oneUserMonth, oneUserMonth.groupby('UID'))

---
 Final Uniqueness Rate: 1.0 
---


1.0

In [4]:
## Test: Two Users in a single month
# Two users ('000' and '001') for a single month (the 2008-10 .. 2008-11 window)
# NOTE(review): this uses geo.filterbyMonthRange with a list of user ids, whereas the
# one-user cell uses geo.filterUserMonthRange — confirm both helpers filter identically
data = geo.filterbyMonthRange(['000', '001'], '2008-10', '2008-11')
# Plain-text dump of the filtered frame, followed by a blank line
print(data, end="\n\n")
# Only keep 2 decimals of lon/lat location
twoUsersOneMonth = unique.locationPrecision(data, 2)
# Generate location ids and store them in a new 'loc_id' column
twoUsersOneMonth['loc_id'] = unique.generateLocationID(twoUsersOneMonth)

       UID   Latitude   Longitude        Date      Time
0      000  39.984702  116.318417  2008-10-23  02:53:04
1      000  39.984683  116.318450  2008-10-23  02:53:10
2      000  39.984686  116.318417  2008-10-23  02:53:15
3      000  39.984688  116.318385  2008-10-23  02:53:20
4      000  39.984655  116.318263  2008-10-23  02:53:25
...    ...        ...         ...         ...       ...
49675  001  39.977135  116.327722  2008-11-21  00:46:50
49676  001  39.977153  116.327669  2008-11-21  00:46:55
49677  001  39.977209  116.327667  2008-11-21  00:47:00
49678  001  39.977177  116.327636  2008-11-21  00:47:05
49679  001  39.977161  116.327597  2008-11-21  00:47:10

[84989 rows x 5 columns]

100 unique location ids of 
84989 total number of records.


In [6]:
# Uniqueness rate for the two-user dataset; inTheCrowd also prints the
# "i:<n> rate = <r>" progress lines and the final rate seen in the output.
u = unique.inTheCrowd(5, twoUsersOneMonth, twoUsersOneMonth.groupby('UID'))
# Similarity is reported as the complement of the uniqueness rate, in percent
userSimilarity = 100 - (u*100)
# round() yields a whole-number percentage, so a value such as 0.8 is shown as 1
print(f'{round(userSimilarity)}% similarity')

## Therefore, User 000 and 001 share roughly 1% of their location history
## (final uniqueness rate 0.992 in the recorded output).
# NOTE(review): this comment previously said 4%, which does not match the recorded
# output; if inTheCrowd samples randomly the figure may vary between runs — confirm.

i:43 	 rate = 0.9996
i:75 	 rate = 0.9992
i:100 	 rate = 0.9988
i:293 	 rate = 0.9984
i:416 	 rate = 0.998
i:498 	 rate = 0.9976
i:640 	 rate = 0.9972
i:732 	 rate = 0.9968
i:755 	 rate = 0.9964
i:1136 	 rate = 0.996
i:1431 	 rate = 0.9956
i:1758 	 rate = 0.9952
i:1840 	 rate = 0.9948
i:1959 	 rate = 0.9944
i:2009 	 rate = 0.994
i:2017 	 rate = 0.9936
i:2038 	 rate = 0.9932
i:2104 	 rate = 0.9928
i:2109 	 rate = 0.9924
i:2337 	 rate = 0.992
---
 Final Uniqueness Rate: 0.992 
---
1% similarity


In [7]:
## Test: All Users within a single month
# 182 Users
# NOTE(review): 182 is the author's stated user count; the number of users that
# actually have records in this window is not shown by this cell — confirm.
# No user list is passed here, only the fromDate/toDate keywords.
data = geo.filterbyMonthRange(fromDate='2008-10', toDate='2008-11')
# Plain-text dump of the filtered frame, followed by a blank line
print(data, end="\n\n")
# Only keep 2 decimals of precision for lon/lat location
allUserMonth = unique.locationPrecision(data, 2)
# Generate location ids and store them in a new 'loc_id' column
allUserMonth['loc_id'] = unique.generateLocationID(allUserMonth)

       UID   Latitude   Longitude        Date      Time
0      000  39.984702  116.318417  2008-10-23  02:53:04
1      000  39.984683  116.318450  2008-10-23  02:53:10
2      000  39.984686  116.318417  2008-10-23  02:53:15
3      000  39.984688  116.318385  2008-10-23  02:53:20
4      000  39.984655  116.318263  2008-10-23  02:53:25
...    ...        ...         ...         ...       ...
25982  179  40.007802  116.319362  2008-11-29  08:15:52
25983  179  40.007780  116.319360  2008-11-29  08:15:54
25984  179  40.007756  116.319362  2008-11-29  08:15:56
25985  179  40.007740  116.319361  2008-11-29  08:15:58
25986  179  40.007722  116.319369  2008-11-29  08:16:00

[2946829 rows x 5 columns]

15744 unique location ids of 
2946829 total number of records.


In [8]:
# Uniqueness rate across every user in the month; inTheCrowd also prints the
# "i:<n> rate = <r>" progress lines seen in the output.
u = unique.inTheCrowd(5, allUserMonth, allUserMonth.groupby('UID'))
# Similarity is reported as the complement of the uniqueness rate, in percent
userSimilarity = 100 - (u*100)
# round() yields a whole-number percentage
print(f'{round(userSimilarity)}% similarity')

## Therefore, All users roughly share 31% of their location history
# NOTE(review): verify the 31% figure against the final "% similarity" line of the
# recorded output for this cell; re-check after any re-run.

i:2 	 rate = 0.9996
i:6 	 rate = 0.9992
i:7 	 rate = 0.9988
i:8 	 rate = 0.9984
i:11 	 rate = 0.998
i:21 	 rate = 0.9976
i:22 	 rate = 0.9972
i:27 	 rate = 0.9968
i:30 	 rate = 0.9964
i:35 	 rate = 0.996
i:36 	 rate = 0.9956
i:39 	 rate = 0.9952
i:40 	 rate = 0.9948
i:53 	 rate = 0.9944
i:57 	 rate = 0.994
i:58 	 rate = 0.9936
i:59 	 rate = 0.9932
i:68 	 rate = 0.9928
i:70 	 rate = 0.9924
i:71 	 rate = 0.992
i:77 	 rate = 0.9916
i:80 	 rate = 0.9912
i:81 	 rate = 0.9908
i:85 	 rate = 0.9904
i:87 	 rate = 0.99
i:89 	 rate = 0.9896
i:92 	 rate = 0.9892
i:98 	 rate = 0.9888
i:101 	 rate = 0.9884
i:108 	 rate = 0.988
i:112 	 rate = 0.9876
i:113 	 rate = 0.9872
i:115 	 rate = 0.9868
i:116 	 rate = 0.9864
i:125 	 rate = 0.986
i:138 	 rate = 0.9856
i:144 	 rate = 0.9852
i:145 	 rate = 0.9848
i:153 	 rate = 0.9844
i:154 	 rate = 0.984
i:155 	 rate = 0.9836
i:158 	 rate = 0.9832
i:160 	 rate = 0.9828
i:161 	 rate = 0.9824
i:170 	 rate = 0.982
i:174 	 rate = 0.9816
i:177 	 rate = 0.9812
i:178 	 