In [28]:
import geopandas as gpd
import pandas as pd

# Draw a 1% random sample of all households and save it as the working
# sample used by the rest of the notebook.
gdf = gpd.read_file('data/athens/athens_households_all.shp')
cnt = int(len(gdf) * 0.01)
# Fixed seed: without it every re-run overwrites the shapefile with a
# different sample, so anything previously derived from the old sample
# no longer matches it.
sample = gdf.sample(n=cnt, random_state=42)
sample.to_file('data/athens/athens_sample.shp')

Read data

In [10]:
import geopandas as gpd
from shapely.geometry import Point

# gdf1: the household sample written earlier (the points that get masked).
# gdf2: every household (the pool of candidate locations).
gdf1, gdf2 = (
    gpd.read_file(shp_path)
    for shp_path in (
        'data/athens/athens_sample.shp',
        'data/athens/athens_households_all.shp',
    )
)

Model input

In [2]:
from sklearn.neighbors import BallTree
import pickle

# Number of hits requested per query point. The output files written below
# are tagged with k_neighbors - 1 and downstream cells drop the first hit,
# so 31 corresponds to the "k30" files.
k_neighbors = 31


def get_nearest(src_points, candidates, k_neighbors=10):
    """
    Return the k nearest candidates (Euclidean distance) for every source point.

    Adapted from:
    https://automating-gis-processes.github.io/site/notebooks/L3/nearest-neighbor-faster.html

    Parameters
    ----------
    src_points : sequence of (x, y)
        Points to find neighbours for.
    candidates : sequence of (x, y)
        Points the neighbours are drawn from.
    k_neighbors : int, default 10
        Number of neighbours returned per source point.

    Returns
    -------
    (indices, distances)
        Two arrays with one row per source point; ``indices`` refers to
        positions in ``candidates``.
    """
    # Index the candidates once, then answer all queries against that index.
    ball_tree = BallTree(candidates, leaf_size=15, metric='euclidean')
    nn_dist, nn_idx = ball_tree.query(src_points, k=k_neighbors)

    # Note the order: indices first, distances second.
    return nn_idx, nn_dist

# Coordinates of the sample points (queries) and of all households (candidates).
in_pts = list(zip(gdf1.geometry.x, gdf1.geometry.y))
qry_pts = list(zip(gdf2.geometry.x, gdf2.geometry.y))
X, X_dis = get_nearest(in_pts, qry_pts, k_neighbors)

# Persist neighbour indices and distances. File names carry k_neighbors - 1.
# `with` guarantees the handles are flushed and closed; the previous bare
# open(...) calls left them open until garbage collection.
with open('data/athens/athens_buff_k' + str(k_neighbors-1) + '.pickle', "wb") as fh:
    pickle.dump(X, fh)
with open('data/athens/athens_buff_dis_k' + str(k_neighbors-1) + '.pickle', "wb") as fh:
    pickle.dump(X_dis, fh)

X, X_dis

(array([[38173, 38174, 38175, ..., 38267, 38226, 38259],
        [10161, 10160, 10134, ..., 10036, 10136, 10038],
        [45508, 45507, 45509, ..., 45518, 51693, 51709],
        ...,
        [45264, 45265, 45266, ..., 45383, 53661, 53662],
        [28932, 28930, 28931, ..., 28077, 29015, 28923],
        [52744, 52745, 52746, ..., 52707, 48039, 52501]], dtype=int64),
 array([[0.00000000e+00, 5.30177671e+00, 1.25963889e+01, ...,
         1.17816084e+02, 1.21382277e+02, 1.23457408e+02],
        [0.00000000e+00, 7.60730729e+02, 9.05900725e+02, ...,
         1.20915686e+03, 1.21884277e+03, 1.22533563e+03],
        [0.00000000e+00, 6.51904793e-01, 1.35909445e+00, ...,
         2.25150495e+02, 2.32561223e+02, 2.33339931e+02],
        ...,
        [0.00000000e+00, 4.06265056e+01, 1.21635033e+02, ...,
         9.82553673e+02, 9.85673862e+02, 9.89192531e+02],
        [0.00000000e+00, 2.37940958e+01, 2.64381857e+01, ...,
         1.38149606e+02, 1.42737377e+02, 1.42851579e+02],
        [0.000000

Comparative analysis with localized location swapping

(1) Expected distance displaced

In [3]:
def rp_edd(buffer_dis):
    """Average expected displacement for the ``rp`` method.

    Every row of ``buffer_dis`` contributes half of its largest distance;
    the result is the plain average of those per-row values.

    Parameters
    ----------
    buffer_dis : iterable of 1-D numeric sequences
        One row of neighbour distances per point.

    Returns
    -------
    float
        Mean over rows of ``max(row) / 2``.
    """
    half_radii = [np.max(dists) / 2 for dists in buffer_dis]
    return sum(half_radii) / len(half_radii)

def lw_edd(buffer_dis):
    """Average expected displacement for the ``lw`` method.

    Every row of ``buffer_dis`` contributes the mean of its distances; the
    result is the plain average of those per-row means.

    Parameters
    ----------
    buffer_dis : iterable of 1-D numeric sequences
        One row of neighbour distances per point.

    Returns
    -------
    float
        Mean over rows of ``mean(row)``.
    """
    row_means = [np.mean(dists) for dists in buffer_dis]
    return sum(row_means) / len(row_means)

def gm_edd(prob, buffer_dis):
    """Average expected displacement for the ``gm`` method.

    For each row of ``prob`` the expected distance is the dot product of
    that row with the distance row ``buffer_dis[idx]``, where ``idx`` is the
    row's index label in ``prob``. The result averages those values.

    Parameters
    ----------
    prob : pandas.DataFrame
        One row of weights per point; the index selects the matching row
        of ``buffer_dis``.
    buffer_dis : indexable of 1-D numeric sequences
        Neighbour distances, one row per point.

    Returns
    -------
    float
        Mean of the per-row dot products.
    """
    expected = [
        np.dot(buffer_dis[point_id], weights.tolist())
        for point_id, weights in prob.iterrows()
    ]
    return sum(expected) / len(expected)

In [4]:
import pandas as pd
import pickle
import numpy as np

k_neighbors_all = [10, 20, 30]
eps_all = [0.1, 0.01, 0.001, 0.0001]

# One CSV row per (method, eps, k); eps is left empty for rp and lw.
with open('data/athens/sols/athens_edd_all.csv', 'w') as fw:
    fw.write('method,eps,k,edd\n')
    fw.flush()

    for k_neighbors in k_neighbors_all:
        # Context manager closes the pickle handle; the previous bare
        # open(...) leaked one handle per iteration.
        with open('data/athens/athens_buff_dis_k' + str(k_neighbors) + '.pickle', "rb") as fr:
            buffer_dis = pickle.load(fr)
        # Drop column 0 (the first, nearest hit) so k columns remain.
        buffer_dis = np.delete(buffer_dis, 0, 1)

        # rp
        edd_rp = rp_edd(buffer_dis)
        fw.write('rp,,' + str(k_neighbors) + ',' + str(edd_rp) + '\n')
        print('edd_rp:', edd_rp)

        # lw
        edd_lw = lw_edd(buffer_dis)
        fw.write('lw,,' + str(k_neighbors) + ',' + str(edd_lw) + '\n')
        print('edd_lw:', edd_lw)

        # gm
        for eps in eps_all:
            prob = pd.read_csv('data/athens/sols/athens_prob_eps' + str(eps) + "_k" + str(k_neighbors) + '.csv', header=None, index_col=0)
            edd_gm = gm_edd(prob, buffer_dis)
            fw.write('gm,' + str(eps) + ',' + str(k_neighbors) + ',' + str(edd_gm) + '\n')
            print('edd_gm:', edd_gm)

edd_rp: 158.89296334137825
edd_lw: 199.2766049262771
edd_gm: 56.737655736182866
edd_gm: 85.96661427379287
edd_gm: 144.2798627010939
edd_gm: 190.87555534776905
edd_rp: 256.2277676547883
edd_lw: 314.50318584771634
edd_gm: 50.42225015122305
edd_gm: 98.45900255518676
edd_gm: 206.08671342819423
edd_gm: 266.17520697516204
edd_rp: 327.73370091814206
edd_lw: 407.3932246858109
edd_gm: 43.15888507687288
edd_gm: 106.37908982069176
edd_gm: 261.0029093161786
edd_gm: 300.12092786125265


(2) Average nearest neighbors 

In [5]:
from pointpats import PointPattern
import random
import numpy as np

def origin_ann(gdf):
    """Return the mean nearest-neighbour distance of the points in ``gdf``.

    The x/y coordinates of ``gdf.geometry`` are stacked into an (n, 2) array
    and handed to ``PointPattern``; its ``mean_nnd`` is returned.
    """
    xy = np.stack([gdf.geometry.x, gdf.geometry.y], axis=1)
    return PointPattern(xy).mean_nnd


def displace_point(point, max_dist):
    """Move ``point`` in a random direction by a random distance.

    The direction is uniform on [0, 2*pi) and the distance is uniform on
    [0, max_dist). Returns a new ``Point``; ``point`` is not modified.
    """
    # Draw order matters for reproducibility under a seed: angle, then distance.
    theta = np.random.uniform(0, 2 * np.pi)
    shift = np.random.uniform(0, max_dist)
    return Point(point.x + shift * np.cos(theta),
                 point.y + shift * np.sin(theta))

def rp_ann(gdf1, buffer_dis, ann_origin, T=100):
    """Change in mean nearest-neighbour distance under the ``rp`` method.

    Each point of ``gdf1`` is displaced with ``displace_point`` by at most
    the largest distance in its row of ``buffer_dis``. This is repeated
    ``T`` times and the mean nearest-neighbour distance of each displaced
    pattern is averaged.

    Parameters
    ----------
    gdf1 : GeoDataFrame
        Points to displace.
    buffer_dis : indexable of 1-D numeric sequences
        Neighbour distances; row i is assumed to belong to the i-th row of
        ``gdf1`` (positional order).
    ann_origin : float
        Mean nearest-neighbour distance of the undisplaced points.
    T : int, default 100
        Number of random trials.

    Returns
    -------
    float
        Average displaced ``mean_nnd`` minus ``ann_origin``.
    """
    # Pair points and distance rows by position rather than by index label:
    # buffer_dis is a plain array, so a non-default GeoDataFrame index would
    # otherwise pick the wrong row (or raise).
    originals = list(gdf1.geometry)
    # The displacement radius does not change between trials; compute once.
    radii = [np.max(buffer_dis[pos]) for pos in range(len(originals))]

    ann = []
    for _ in range(T):
        moved = [displace_point(pt, r) for pt, r in zip(originals, radii)]
        # Read coordinates straight from the points; no need to build a
        # GeoDataFrame just to extract x/y again.
        points = np.array([(p.x, p.y) for p in moved])
        ann.append(PointPattern(points).mean_nnd)
    return sum(ann) / len(ann) - ann_origin


def lw_ann(gdf2, buffer, ann_origin, T=100):
    """Change in mean nearest-neighbour distance under the ``lw`` method.

    For every row of ``buffer`` one entry (excluding the first) is chosen
    uniformly at random and used as a positional index into ``gdf2``. This
    is repeated ``T`` times and the mean nearest-neighbour distance of each
    resulting pattern is averaged.

    Parameters
    ----------
    gdf2 : GeoDataFrame
        Candidate locations addressed by position.
    buffer : iterable of 1-D integer sequences
        Neighbour positions per point; the first entry of each row is
        dropped before sampling.
    ann_origin : float
        Mean nearest-neighbour distance of the original points.
    T : int, default 100
        Number of random trials.

    Returns
    -------
    float
        Average masked ``mean_nnd`` minus ``ann_origin``.
    """
    # Both of these are identical for every trial, so build them once
    # instead of T times.
    coords = np.stack([gdf2.geometry.x, gdf2.geometry.y], axis=1)
    candidates = [np.delete(locs, 0) for locs in buffer]

    ann = []
    for _ in range(T):
        masked_locs = [random.choice(locs) for locs in candidates]
        # Positional lookup, equivalent to gdf2.iloc[masked_locs] but without
        # materialising a GeoDataFrame per trial.
        points = coords[masked_locs]
        ann.append(PointPattern(points).mean_nnd)
    return sum(ann) / len(ann) - ann_origin


def gm_ann(gdf2, prob, buffer, ann_origin, T=100):
    """Change in mean nearest-neighbour distance under the ``gm`` method.

    For every row of ``prob`` one entry of ``buffer[idx]`` (excluding the
    first) is drawn with the row's values as weights and used as a
    positional index into ``gdf2``. This is repeated ``T`` times and the
    mean nearest-neighbour distance of each resulting pattern is averaged.

    Parameters
    ----------
    gdf2 : GeoDataFrame
        Candidate locations addressed by position.
    prob : pandas.DataFrame
        Sampling weights; the index label selects the row of ``buffer``.
    buffer : indexable of 1-D integer sequences
        Neighbour positions per point; the first entry of each row is
        dropped before sampling.
    ann_origin : float
        Mean nearest-neighbour distance of the original points.
    T : int, default 100
        Number of random trials.

    Returns
    -------
    float
        Average masked ``mean_nnd`` minus ``ann_origin``.
    """
    # Everything below is the same for each trial; iterating the DataFrame
    # and converting weights once avoids repeating that work T times.
    coords = np.stack([gdf2.geometry.x, gdf2.geometry.y], axis=1)
    draws = [
        (np.delete(buffer[idx], 0), tuple(row.tolist()))
        for idx, row in prob.iterrows()
    ]

    ann = []
    for _ in range(T):
        masked_locs = [
            random.choices(locs, weights=weights, k=1)[0]
            for locs, weights in draws
        ]
        # Positional lookup, equivalent to gdf2.iloc[masked_locs].
        points = coords[masked_locs]
        ann.append(PointPattern(points).mean_nnd)
    return sum(ann) / len(ann) - ann_origin

In [6]:
import pandas as pd
import pickle

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

ann_origin = origin_ann(gdf1)
print('ann_origin:', ann_origin)

k_neighbors_all = [10, 20, 30]
eps_all = [0.1, 0.01, 0.001, 0.0001]
T = 100

# One CSV row per (method, eps, k); eps is left empty for rp and lw.
with open('data/athens/sols/athens_ann_all.csv', 'w') as fw:
    fw.write('method,eps,k,ann\n')
    # The header has four columns, so the origin row needs empty eps AND k
    # fields; with only two commas the value was written into the k column.
    fw.write('origin,,,' + str(ann_origin) + '\n')
    fw.flush()

    for k_neighbors in k_neighbors_all:
        # Context managers close the pickle handles; the previous bare
        # open(...) calls leaked two handles per iteration.
        with open('data/athens/athens_buff_k' + str(k_neighbors) + '.pickle', "rb") as fr:
            buffer = pickle.load(fr)
        with open('data/athens/athens_buff_dis_k' + str(k_neighbors) + '.pickle', "rb") as fr:
            buffer_dis = pickle.load(fr)
        # Drop column 0 (the first, nearest hit) so k columns remain.
        buffer_dis = np.delete(buffer_dis, 0, 1)

        # rp
        ann_rp = rp_ann(gdf1, buffer_dis, ann_origin, T)
        fw.write('rp,,' + str(k_neighbors) + ',' + str(ann_rp) + '\n')
        print('ann_rp:', ann_rp)

        # lw
        ann_lw = lw_ann(gdf2, buffer, ann_origin, T)
        fw.write('lw,,' + str(k_neighbors) + ',' + str(ann_lw) + '\n')
        print('ann_lw:', ann_lw)

        # gm
        for eps in eps_all:
            prob = pd.read_csv('data/athens/sols/athens_prob_eps' + str(eps) + "_k" + str(k_neighbors) + '.csv', header=None, index_col=0)
            ann_gm = gm_ann(gdf2, prob, buffer, ann_origin, T)
            fw.write('gm,' + str(eps) + ',' + str(k_neighbors) + ',' + str(ann_gm) + '\n')
            print('ann_gm:', ann_gm)


ann_origin: 1167.7297504779806
ann_rp: 5.933749469446639
ann_lw: -25.934089358964684
ann_gm: -17.20116407048431
ann_gm: -5.7043514785473235
ann_gm: -26.685245777430055
ann_gm: -30.949900488492176
ann_rp: 12.752462627398245
ann_lw: -46.865773740329814
ann_gm: -81.10288766415465
ann_gm: -8.633459349164468
ann_gm: -54.77013026750319
ann_gm: -66.99885230985478
ann_rp: 24.966414385531607
ann_lw: -52.46828627669379
ann_gm: -163.22992320285186
ann_gm: -11.574822530457595
ann_gm: -67.24107249954977
ann_gm: -84.0296943783851


(3) Cluster detection

In [7]:
from sklearn.cluster import DBSCAN
import numpy as np

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Cluster the sample points with DBSCAN, store the labels next to the
# geometries, and report how many clusters / noise points were found.
# NOTE(review): eps=1000 / min_samples=30 here differ from the
# eps=500 / min_samples=5 used by the later evaluation cells - confirm
# which setting is intended for the saved 'db_origin' labels.
X = np.stack([gdf1.geometry.x, gdf1.geometry.y], axis=1)
db = DBSCAN(eps=1000, min_samples=30).fit(X)
labels = db.labels_

# Work on a copy so gdf1 itself does not gain the extra column.
gdf = gdf1.copy()
gdf['db_origin'] = labels
gdf.to_file('data/athens/athens_sample_db.shp')

# DBSCAN marks noise with label -1; it is not a cluster.
n_clusters_ = len(set(labels) - {-1})
n_noise_ = int((labels == -1).sum())

print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)

Estimated number of clusters: 0
Estimated number of noise points: 569


In [11]:
import random
from sklearn import metrics
from sklearn.cluster import DBSCAN
import numpy as np

# Same definition as the displace_point in the average-nearest-neighbour
# section; repeated here so this section can run on its own.
def displace_point(point, max_dist):
    """Move ``point`` in a random direction by a random distance.

    The direction is uniform on [0, 2*pi) and the distance is uniform on
    [0, max_dist). Returns a new ``Point``; ``point`` is not modified.
    """
    # Draw order matters for reproducibility under a seed: angle, then distance.
    theta = np.random.uniform(0, 2 * np.pi)
    shift = np.random.uniform(0, max_dist)
    return Point(point.x + shift * np.cos(theta),
                 point.y + shift * np.sin(theta))

def rp_db(gdf1, buffer_dis, labels, T=100, eps=500, min_samples=5):
    """Cluster-detection agreement after masking with the ``rp`` method.

    Each point of ``gdf1`` is displaced with ``displace_point`` by at most
    the largest distance in its row of ``buffer_dis``; DBSCAN is run on the
    displaced points and its noise / non-noise assignment is scored against
    ``labels``. Scores are averaged over ``T`` trials.

    Parameters
    ----------
    gdf1 : GeoDataFrame
        Points to displace.
    buffer_dis : indexable of 1-D numeric sequences
        Neighbour distances; row i is assumed to belong to the i-th row of
        ``gdf1`` (positional order).
    labels : sequence of {0, 1}
        Reference assignment (0 = noise, 1 = in a cluster), one per point.
    T : int, default 100
        Number of random trials.
    eps, min_samples : default 500, 5
        DBSCAN parameters; the defaults are the values that were previously
        hard-coded.

    Returns
    -------
    (precision, recall, f1)
        Trial-averaged weighted scores.
    """
    # Pair points and distance rows by position rather than by index label:
    # buffer_dis is a plain array, so a non-default GeoDataFrame index would
    # otherwise pick the wrong row (or raise).
    originals = list(gdf1.geometry)
    # The displacement radius does not change between trials; compute once.
    radii = [np.max(buffer_dis[pos]) for pos in range(len(originals))]

    precision_all, recall_all, f1_score_all = [], [], []
    for _ in range(T):
        moved = [displace_point(pt, r) for pt, r in zip(originals, radii)]
        points = np.array([(p.x, p.y) for p in moved])
        preds = DBSCAN(eps=eps, min_samples=min_samples).fit(points).labels_
        # Collapse cluster ids to a binary "belongs to any cluster" flag.
        preds_binary = [0 if i == -1 else 1 for i in preds]

        precision_all.append(metrics.precision_score(labels, preds_binary, average='weighted'))
        recall_all.append(metrics.recall_score(labels, preds_binary, average='weighted'))
        f1_score_all.append(metrics.f1_score(labels, preds_binary, average='weighted'))
    return sum(precision_all) / len(precision_all), sum(recall_all) / len(recall_all), sum(f1_score_all) / len(f1_score_all)

    
def lw_db(gdf2, buffer, labels, T=100, eps=500, min_samples=5):
    """Cluster-detection agreement after masking with the ``lw`` method.

    For every row of ``buffer`` one entry (excluding the first) is chosen
    uniformly at random and used as a positional index into ``gdf2``;
    DBSCAN is run on the chosen locations and its noise / non-noise
    assignment is scored against ``labels``. Scores are averaged over
    ``T`` trials.

    Parameters
    ----------
    gdf2 : GeoDataFrame
        Candidate locations addressed by position.
    buffer : iterable of 1-D integer sequences
        Neighbour positions per point; the first entry of each row is
        dropped before sampling.
    labels : sequence of {0, 1}
        Reference assignment (0 = noise, 1 = in a cluster), one per row of
        ``buffer``.
    T : int, default 100
        Number of random trials.
    eps, min_samples : default 500, 5
        DBSCAN parameters; the defaults are the values that were previously
        hard-coded.

    Returns
    -------
    (precision, recall, f1)
        Trial-averaged weighted scores.
    """
    # Both of these are identical for every trial, so build them once
    # instead of T times.
    coords = np.stack([gdf2.geometry.x, gdf2.geometry.y], axis=1)
    candidates = [np.delete(locs, 0) for locs in buffer]

    precision_all, recall_all, f1_score_all = [], [], []
    for _ in range(T):
        masked_locs = [random.choice(locs) for locs in candidates]
        # Positional lookup, equivalent to gdf2.iloc[masked_locs].
        points = coords[masked_locs]
        preds = DBSCAN(eps=eps, min_samples=min_samples).fit(points).labels_
        # Collapse cluster ids to a binary "belongs to any cluster" flag.
        preds_binary = [0 if i == -1 else 1 for i in preds]

        precision_all.append(metrics.precision_score(labels, preds_binary, average='weighted'))
        recall_all.append(metrics.recall_score(labels, preds_binary, average='weighted'))
        f1_score_all.append(metrics.f1_score(labels, preds_binary, average='weighted'))
    return sum(precision_all) / len(precision_all), sum(recall_all) / len(recall_all), sum(f1_score_all) / len(f1_score_all)


def gm_db(gdf2, prob, buffer, labels, T=100, eps=500, min_samples=5):
    """Cluster-detection agreement after masking with the ``gm`` method.

    For every row of ``prob`` one entry of ``buffer[idx]`` (excluding the
    first) is drawn with the row's values as weights and used as a
    positional index into ``gdf2``; DBSCAN is run on the drawn locations
    and its noise / non-noise assignment is scored against the reference
    labels of the same rows. Scores are averaged over ``T`` trials.

    Parameters
    ----------
    gdf2 : GeoDataFrame
        Candidate locations addressed by position.
    prob : pandas.DataFrame
        Sampling weights; the index label selects the matching entry of
        ``buffer`` and ``labels``.
    buffer : indexable of 1-D integer sequences
        Neighbour positions per point; the first entry of each row is
        dropped before sampling.
    labels : sequence of {0, 1}
        Reference assignment (0 = noise, 1 = in a cluster) for all points.
    T : int, default 100
        Number of random trials.
    eps, min_samples : default 500, 5
        DBSCAN parameters; the defaults are the values that were previously
        hard-coded.

    Returns
    -------
    (precision, recall, f1)
        Trial-averaged weighted scores.
    """
    # None of this depends on the trial: the reference labels, candidate
    # lists and weights were previously rebuilt in every one of T iterations.
    coords = np.stack([gdf2.geometry.x, gdf2.geometry.y], axis=1)
    labels_all = []
    draws = []
    for idx, row in prob.iterrows():
        labels_all.append(labels[idx])
        draws.append((np.delete(buffer[idx], 0), tuple(row.tolist())))

    precision_all, recall_all, f1_score_all = [], [], []
    for _ in range(T):
        masked_locs = [
            random.choices(locs, weights=weights, k=1)[0]
            for locs, weights in draws
        ]
        # Positional lookup, equivalent to gdf2.iloc[masked_locs].
        points = coords[masked_locs]
        preds = DBSCAN(eps=eps, min_samples=min_samples).fit(points).labels_
        # Collapse cluster ids to a binary "belongs to any cluster" flag.
        preds_binary = [0 if i == -1 else 1 for i in preds]

        precision_all.append(metrics.precision_score(labels_all, preds_binary, average='weighted'))
        recall_all.append(metrics.recall_score(labels_all, preds_binary, average='weighted'))
        f1_score_all.append(metrics.f1_score(labels_all, preds_binary, average='weighted'))
    return sum(precision_all) / len(precision_all), sum(recall_all) / len(recall_all), sum(f1_score_all) / len(f1_score_all)

In [12]:
import pandas as pd
import pickle
from sklearn.cluster import DBSCAN
import numpy as np

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Reference clustering of the unmasked sample, reduced to a binary
# noise (0) / clustered (1) flag per point.
X = np.array(np.stack([gdf1.geometry.x, gdf1.geometry.y], axis=1))
db = DBSCAN(eps=500, min_samples=5).fit(X)
labels = db.labels_
labels_binary = [0 if i == -1 else 1 for i in labels]

k_neighbors_all = [10, 20, 30]
eps_all = [0.1, 0.01, 0.001, 0.0001]
T = 100

# One CSV row per (method, eps, k); eps is left empty for rp and lw.
with open('data/athens/sols/athens_db_all.csv', 'w') as fw:
    fw.write('method,eps,k,precision,recall,f1\n')
    fw.flush()

    for k_neighbors in k_neighbors_all:
        # Context managers close the pickle handles; the previous bare
        # open(...) calls leaked two handles per iteration.
        with open('data/athens/athens_buff_k' + str(k_neighbors) + '.pickle', "rb") as fr:
            buffer = pickle.load(fr)
        with open('data/athens/athens_buff_dis_k' + str(k_neighbors) + '.pickle', "rb") as fr:
            buffer_dis = pickle.load(fr)
        # Drop column 0 (the first, nearest hit) so k columns remain.
        buffer_dis = np.delete(buffer_dis, 0, 1)

        # rp
        db_rp = rp_db(gdf1, buffer_dis, labels_binary, T)
        fw.write('rp,,' + str(k_neighbors) + ',' + str(db_rp[0]) + ',' + str(db_rp[1]) + ',' + str(db_rp[2]) + '\n')
        print('db_rp:', db_rp[0], db_rp[1], db_rp[2])

        # lw
        db_lw = lw_db(gdf2, buffer, labels_binary, T)
        fw.write('lw,,' + str(k_neighbors) + ',' + str(db_lw[0]) + ',' + str(db_lw[1]) + ',' + str(db_lw[2]) + '\n')
        print('db_lw:', db_lw[0], db_lw[1], db_lw[2])

        # gm
        for eps in eps_all:
            prob = pd.read_csv('data/athens/sols/athens_prob_eps' + str(eps) + "_k" + str(k_neighbors) + '.csv', header=None, index_col=0)
            db_gm = gm_db(gdf2, prob, buffer, labels_binary, T)
            fw.write('gm,' + str(eps) + ',' + str(k_neighbors) + ',' + str(db_gm[0]) + ',' + str(db_gm[1]) + ',' + str(db_gm[2]) + '\n')
            print('db_gm:', db_gm[0], db_gm[1], db_gm[2])

db_rp: 0.9858152014547001 0.9856414762741653 0.9856148842220694
db_lw: 0.9812452836080736 0.9808084358523729 0.9808732118606879
db_gm: 0.991335151515118 0.9909010600706719 0.9909861798090509
db_gm: 0.9865288823904906 0.9858699472759234 0.9860204708842955
db_gm: 0.9814811319126174 0.9807381370826007 0.9809141493197495
db_gm: 0.9806271823468543 0.9801054481546572 0.980213665537031
db_rp: 0.9792021785668328 0.9791036906854131 0.9789495913093392
db_lw: 0.9755504809482572 0.9750439367311076 0.9751267498999758
db_gm: 0.9904133256184682 0.9899259259259278 0.9900205099688663
db_gm: 0.9877774539035071 0.9873813708260105 0.987475049273163
db_gm: 0.9808074801803736 0.9798594024604574 0.9800878109906056
db_gm: 0.975977468056443 0.9752372583479791 0.975388281402844
db_rp: 0.9738689106472144 0.9737258347978909 0.9735549741782143
db_lw: 0.9669484769385568 0.9657820738137087 0.966071562287719
db_gm: 0.9893397252366105 0.9887751004016054 0.9888805893616365
db_gm: 0.9818615313819662 0.9813005272407733 0