In [1]:
import geopandas as gpd
import pandas as pd

# Draw a 1% random sample of the household points and save it as its own shapefile.
gdf = gpd.read_file('data/coshocton/coshocton_households_all.shp')
cnt = int(len(gdf) * 0.01)
# A fixed random_state makes the draw repeatable. Without it every re-run of this
# cell overwrites coshocton_sample.shp with a different sample, and the neighbour
# pickles computed from that file in later cells no longer match it.
sample = gdf.sample(n=cnt, random_state=42)
sample.to_file('data/coshocton/coshocton_sample.shp')



Read data

In [10]:
import geopandas as gpd
from shapely.geometry import Point

# gdf1: the sampled household points written by the sampling cell; used below as
# the source points of the nearest-neighbour query.
gdf1 = gpd.read_file('data/coshocton/coshocton_sample.shp')
# gdf2: the full household layer; used below as the candidate points.
gdf2 = gpd.read_file('data/coshocton/coshocton_households_all.shp')

Model input

In [2]:
from sklearn.neighbors import BallTree
import pickle

# Number of neighbours requested per sampled point. The first hit comes back at
# distance 0 (see the output of this cell) and later cells drop that first column,
# so 31 leaves 30 usable neighbours; the pickles written by this cell are named
# with k_neighbors-1.
# NOTE(review): later cells load the k10/k20/k30 pickles, so this cell presumably
# has to be re-run with k_neighbors = 11 and 21 as well - confirm.
k_neighbors = 31
def get_nearest(src_points, candidates, k_neighbors=10):
    """
    Query the k nearest candidate points for every source point.

    Adapted from:
    https://automating-gis-processes.github.io/site/notebooks/L3/nearest-neighbor-faster.html

    Parameters
    ----------
    src_points : coordinates to search from, one entry per point.
    candidates : coordinates the BallTree is built on.
    k_neighbors : number of neighbours returned per source point.

    Returns
    -------
    (indices, distances) : one row per source point; indices are positions in
    `candidates`, distances use the euclidean metric.
    """
    ball_tree = BallTree(candidates, leaf_size=15, metric='euclidean')
    nn_dist, nn_idx = ball_tree.query(src_points, k=k_neighbors)

    # query() hands back (distances, indices); this function returns the reverse order.
    return nn_idx, nn_dist

# Coordinates of the sampled points (queries) and of all households (tree candidates).
in_pts = [(x,y) for x,y in zip(gdf1.geometry.x , gdf1.geometry.y)]
qry_pts =  [(x,y) for x,y in zip(gdf2.geometry.x , gdf2.geometry.y)]
X, X_dis = get_nearest(in_pts, qry_pts, k_neighbors)

# Persist neighbour indices and distances. The files are opened in 'with' blocks so
# the handles are flushed and closed deterministically instead of being left open.
with open('data/coshocton/coshocton_buff_k' + str(k_neighbors-1) + '.pickle', "wb") as fp:
    pickle.dump(X, fp)
with open('data/coshocton/coshocton_buff_dis_k' + str(k_neighbors-1) + '.pickle', "wb") as fp:
    pickle.dump(X_dis, fp)

X, X_dis

(array([[27398, 27074, 27396, ..., 27118, 27078, 27391],
        [27345, 27401, 27346, ..., 16440, 16434, 27357],
        [ 4510,  4509,  4511, ...,  4528,  4067,  4529],
        ...,
        [10653, 10652, 10651, ..., 10597, 10602, 10596],
        [12612, 12610, 12611, ..., 12973, 14516, 12963],
        [33571, 33572, 33570, ..., 33485, 33502, 33490]], dtype=int64),
 array([[   0.        ,   57.91848506,   61.55658089, ...,  155.71473354,
          159.3208305 ,  162.32114786],
        [   0.        ,   38.40469573,   41.86747929, ...,  172.34433127,
          192.69196863,  195.2658188 ],
        [   0.        ,   12.26618047,   34.91110445, ...,  645.4459412 ,
          675.59638383,  684.78698004],
        ...,
        [   0.        ,  130.42637498,  148.0035003 , ...,  818.84687802,
          831.15955179,  842.30943259],
        [   0.        ,   42.38315338,   47.0530484 , ..., 1508.91742983,
         1512.94961062, 1517.99917144],
        [   0.        ,   18.20265908,   43.950

Comparative analysis with location swapping

(1) Expected distance displaced

In [3]:
def rp_edd(buffer_dis):
    """Average, over all rows of buffer_dis, of half the largest distance in the row.

    buffer_dis : iterable of per-point distance rows.
    Returns the mean of max(row) / 2; raises ZeroDivisionError for empty input.
    """
    half_max = [np.max(dists) / 2 for dists in buffer_dis]
    return sum(half_max) / len(half_max)

def lw_edd(buffer_dis):
    """Average, over all rows of buffer_dis, of the mean distance in the row.

    buffer_dis : iterable of per-point distance rows.
    Returns the mean of the row means; raises ZeroDivisionError for empty input.
    """
    row_means = [np.mean(dists) for dists in buffer_dis]
    return sum(row_means) / len(row_means)

def gm_edd(prob, buffer_dis):
    """Average probability-weighted distance over the rows of prob.

    prob : DataFrame whose index values select rows of buffer_dis and whose row
           values are the weights applied to that row's distances.
    buffer_dis : per-point distance rows, indexable by prob's index values.
    Returns the mean of dot(buffer_dis[i], prob row i) over all rows of prob.
    """
    weighted = [
        np.dot(buffer_dis[row_id], weights.tolist())
        for row_id, weights in prob.iterrows()
    ]
    return sum(weighted) / len(weighted)

In [4]:
import pandas as pd
import pickle
import numpy as np

k_neighbors_all = [10, 20, 30]
eps_all = [0.1, 0.01, 0.001, 0.0001]

# One CSV row per (method, eps, k); rp and lw have no eps, so that field stays empty.
with open('data/coshocton/sols/coshocton_edd_all.csv', 'w') as fw:
    fw.write('method,eps,k,edd\n')
    fw.flush()

    for k_neighbors in k_neighbors_all:
        # Load inside a 'with' block so the pickle's file handle is closed right away
        # instead of leaking one open handle per k.
        with open('data/coshocton/coshocton_buff_dis_k' + str(k_neighbors) + '.pickle', "rb") as fp:
            buffer_dis = pickle.load(fp)
        # Drop column 0, the first (nearest) hit of every row.
        buffer_dis = np.delete(buffer_dis, 0, 1)

        # rp
        edd_rp = rp_edd(buffer_dis)
        fw.write('rp,,' + str(k_neighbors) + ',' + str(edd_rp) + '\n')
        print('edd_rp:', edd_rp)

        # lw
        edd_lw = lw_edd(buffer_dis)
        fw.write('lw,,' + str(k_neighbors) + ',' + str(edd_lw) + '\n')
        print('edd_lw:', edd_lw)

        # gm: one probability table per (eps, k) combination
        for eps in eps_all:
            prob = pd.read_csv('data/coshocton/sols/coshocton_prob_eps' + str(eps) + "_k" + str(k_neighbors) + '.csv', header=None, index_col=0)
            edd_gm = gm_edd(prob, buffer_dis)
            fw.write('gm,' + str(eps) + ',' + str(k_neighbors) + ',' + str(edd_gm) + '\n')
            print('edd_gm:', edd_gm)

edd_rp: 206.5233586328766
edd_lw: 263.5486984618922
edd_gm: 66.32649746050855
edd_gm: 93.10535301947102
edd_gm: 179.95640918100807
edd_gm: 248.20422425604116
edd_rp: 314.2902021650227
edd_lw: 401.32246384663665
edd_gm: 58.70512628146511
edd_gm: 103.07682546981354
edd_gm: 259.3767437762736
edd_gm: 327.22946785066245
edd_rp: 387.08124089057895
edd_lw: 504.94912204808287
edd_gm: 49.67586910561741
edd_gm: 110.12731102855918
edd_gm: 325.21043940609826
edd_gm: 373.32254964125315


(2) Average nearest neighbors 

In [5]:
from pointpats import PointPattern
import random
import numpy as np

def origin_ann(gdf):
    """Return PointPattern.mean_nnd for the point geometries of gdf.

    gdf : GeoDataFrame whose geometry exposes .x and .y coordinates.
    """
    coords = np.column_stack((gdf.geometry.x, gdf.geometry.y))
    return PointPattern(coords).mean_nnd


def displace_point(point, max_dist):
    """Move a point by a random offset and return the result as a new Point.

    point : object with .x and .y attributes.
    max_dist : upper bound of the uniform draw for the displacement distance.

    The direction is drawn uniformly from [0, 2*pi) and the distance uniformly
    from [0, max_dist); the angle is drawn before the distance.
    """
    theta = np.random.uniform(0, 2 * np.pi)
    radius = np.random.uniform(0, max_dist)
    return Point(point.x + radius * np.cos(theta),
                 point.y + radius * np.sin(theta))

def rp_ann(gdf1, buffer_dis, ann_origin, T=100):
    """Mean nearest-neighbour distance after random displacement, relative to ann_origin.

    gdf1 : GeoDataFrame of points to displace; its index values select rows of buffer_dis.
    buffer_dis : per-point distance rows; the row maximum is the displacement bound.
    ann_origin : reference value subtracted from the simulated average.
    T : number of simulation runs.

    Returns the average of PointPattern.mean_nnd over the T runs minus ann_origin.
    """
    nnd_runs = []
    for _ in range(T):
        # Every point gets its own displacement bound: the largest distance in its row.
        moved = [
            displace_point(rec['geometry'], np.max(buffer_dis[row_id]))
            for row_id, rec in gdf1.iterrows()
        ]
        moved_gdf = gpd.GeoDataFrame(geometry=moved)
        coords = np.column_stack((moved_gdf.geometry.x, moved_gdf.geometry.y))
        nnd_runs.append(PointPattern(coords).mean_nnd)
    return sum(nnd_runs) / len(nnd_runs) - ann_origin


def lw_ann(gdf2, buffer, ann_origin, T=100):
    """Mean nearest-neighbour distance after uniform neighbour substitution, relative to ann_origin.

    gdf2 : GeoDataFrame the neighbour positions in buffer refer to (via .iloc).
    buffer : rows of neighbour positions; the first entry of each row is excluded.
    ann_origin : reference value subtracted from the simulated average.
    T : number of simulation runs.

    Returns the average of PointPattern.mean_nnd over the T runs minus ann_origin.
    """
    nnd_runs = []
    for _ in range(T):
        # One uniformly chosen neighbour per row, never the row's first entry.
        picks = [random.choice(np.delete(neigh, 0)) for neigh in buffer]
        swapped = gdf2.iloc[picks]
        coords = np.column_stack((swapped.geometry.x, swapped.geometry.y))
        nnd_runs.append(PointPattern(coords).mean_nnd)
    return sum(nnd_runs) / len(nnd_runs) - ann_origin


def gm_ann(gdf2, prob, buffer, ann_origin, T=100):
    """Mean nearest-neighbour distance after weighted neighbour substitution, relative to ann_origin.

    gdf2 : GeoDataFrame the neighbour positions in buffer refer to (via .iloc).
    prob : DataFrame; index values select rows of buffer, row values are the
           selection weights for that row's neighbours (first neighbour excluded).
    buffer : rows of neighbour positions.
    ann_origin : reference value subtracted from the simulated average.
    T : number of simulation runs.

    Returns the average of PointPattern.mean_nnd over the T runs minus ann_origin.
    """
    nnd_runs = []
    for _ in range(T):
        # One weighted draw per row of prob, from that row's neighbours minus the first.
        picks = [
            random.choices(np.delete(buffer[row_id], 0),
                           weights=tuple(weights.tolist()), k=1)[0]
            for row_id, weights in prob.iterrows()
        ]
        swapped = gdf2.iloc[picks]
        coords = np.column_stack((swapped.geometry.x, swapped.geometry.y))
        nnd_runs.append(PointPattern(coords).mean_nnd)
    return sum(nnd_runs) / len(nnd_runs) - ann_origin

In [6]:
import pandas as pd
import pickle

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Baseline: mean nearest-neighbour distance of the unmodified sample.
ann_origin = origin_ann(gdf1)
print('ann_origin:', ann_origin)

k_neighbors_all = [10, 20, 30]
eps_all = [0.1, 0.01, 0.001, 0.0001]
T = 100

# One CSV row per (method, eps, k); fields that do not apply are left empty.
with open('data/coshocton/sols/coshocton_ann_all.csv', 'w') as fw:
    fw.write('method,eps,k,ann\n')
    # The header has four columns, so the baseline row needs three separators;
    # with only two the value would be read as 'k' and 'ann' would be missing.
    fw.write('origin,,,' + str(ann_origin) + '\n')
    fw.flush()
    
    for k_neighbors in k_neighbors_all:
        # Load inside 'with' blocks so the pickle file handles are closed right away.
        with open('data/coshocton/coshocton_buff_k' + str(k_neighbors) + '.pickle', "rb") as fp:
            buffer = pickle.load(fp)
        with open('data/coshocton/coshocton_buff_dis_k' + str(k_neighbors) + '.pickle', "rb") as fp:
            buffer_dis = pickle.load(fp)
        # Drop column 0, the first (nearest) hit of every row.
        buffer_dis = np.delete(buffer_dis, 0, 1)
        
        # rp
        ann_rp = rp_ann(gdf1, buffer_dis, ann_origin, T)
        fw.write('rp,,' + str(k_neighbors) + ',' + str(ann_rp) + '\n')
        print('ann_rp:', ann_rp)

        # lw
        ann_lw = lw_ann(gdf2, buffer, ann_origin, T)
        fw.write('lw,,' + str(k_neighbors) + ',' + str(ann_lw) + '\n')
        print('ann_lw:', ann_lw)
        
        # gm: one probability table per (eps, k) combination
        for eps in eps_all:
            prob = pd.read_csv('data/coshocton/sols/coshocton_prob_eps' + str(eps) + "_k" + str(k_neighbors) + '.csv', header=None, index_col=0)
            ann_gm = gm_ann(gdf2, prob, buffer, ann_origin, T)
            fw.write('gm,' + str(eps) + ',' + str(k_neighbors) + ',' + str(ann_gm) + '\n')
            print('ann_gm:', ann_gm)


ann_origin: 1322.669063757506
ann_rp: 10.832201200338432
ann_lw: -12.117197866118204
ann_gm: -21.81300681706807
ann_gm: -7.530490771404402
ann_gm: -24.073274622052168
ann_gm: -18.861230608462165
ann_rp: 19.9293583511801
ann_lw: -24.527032274247404
ann_gm: -32.94461693889207
ann_gm: -8.415925226970785
ann_gm: -32.662141409559126
ann_gm: -34.202120989113155
ann_rp: 28.674887346257037
ann_lw: -26.03476990543936
ann_gm: -94.0693892811671
ann_gm: -12.749812166500305
ann_gm: -42.78176428727875
ann_gm: -49.1775066790492


(3) Cluster detection

In [7]:
from sklearn.cluster import DBSCAN
import numpy as np

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Cluster the unmodified sample and store the cluster id of every point next to
# its geometry in a new shapefile.
X = np.column_stack((gdf1.geometry.x, gdf1.geometry.y))
db = DBSCAN(eps=1000, min_samples=30).fit(X)
labels = db.labels_
gdf = gdf1.assign(db_origin=labels)
gdf.to_file('data/coshocton/coshocton_sample_db.shp')

# DBSCAN marks noise with the label -1, which must not be counted as a cluster.
n_noise_ = list(labels).count(-1)
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)

Estimated number of clusters: 1
Estimated number of noise points: 311


In [11]:
import random
from sklearn import metrics
from sklearn.cluster import DBSCAN
import numpy as np

def displace_point(point, max_dist):
    """Return a new Point obtained by moving `point` in a random direction.

    point : object with .x and .y attributes.
    max_dist : upper bound of the uniform draw for the displacement distance.

    NOTE(review): this repeats the displace_point definition from the average
    nearest neighbours section; re-running this cell rebinds the same name.
    """
    # Direction in radians, uniform over the full circle.
    angle = np.random.uniform(0, 2 * np.pi)
    # Displacement length, uniform between 0 and max_dist.
    distance = np.random.uniform(0, max_dist)
    new_x = point.x + distance * np.cos(angle)
    new_y = point.y + distance * np.sin(angle)
    return Point(new_x, new_y)

def rp_db(gdf1, buffer_dis, labels, T=100):
    """Score DBSCAN membership of randomly displaced points against reference labels.

    gdf1 : GeoDataFrame of points to displace; its index values select rows of buffer_dis.
    buffer_dis : per-point distance rows; the row maximum is the displacement bound.
    labels : reference 0/1 labels, one per row of gdf1.
    T : number of simulation runs.

    Returns (precision, recall, f1), each the average of the weighted score over T runs.
    """
    scores = {'precision': [], 'recall': [], 'f1': []}
    for _ in range(T):
        moved = [
            displace_point(rec['geometry'], np.max(buffer_dis[row_id]))
            for row_id, rec in gdf1.iterrows()
        ]
        moved_gdf = gpd.GeoDataFrame(geometry=moved)
        coords = np.column_stack((moved_gdf.geometry.x, moved_gdf.geometry.y))
        cluster_ids = DBSCAN(eps=500, min_samples=5).fit(coords).labels_
        # Collapse cluster ids to membership: noise (-1) -> 0, any cluster -> 1.
        in_cluster = [0 if cid == -1 else 1 for cid in cluster_ids]

        scores['precision'].append(metrics.precision_score(labels, in_cluster, average='weighted'))
        scores['recall'].append(metrics.recall_score(labels, in_cluster, average='weighted'))
        scores['f1'].append(metrics.f1_score(labels, in_cluster, average='weighted'))
    return tuple(sum(vals) / len(vals) for vals in scores.values())

    
def lw_db(gdf2, buffer, labels, T=100):
    """Score DBSCAN membership after uniform neighbour substitution against reference labels.

    gdf2 : GeoDataFrame the neighbour positions in buffer refer to (via .iloc).
    buffer : rows of neighbour positions; the first entry of each row is excluded.
    labels : reference 0/1 labels, one per row of buffer.
    T : number of simulation runs.

    Returns (precision, recall, f1), each the average of the weighted score over T runs.
    """
    scores = {'precision': [], 'recall': [], 'f1': []}
    for _ in range(T):
        picks = [random.choice(np.delete(neigh, 0)) for neigh in buffer]
        swapped = gdf2.iloc[picks]
        coords = np.column_stack((swapped.geometry.x, swapped.geometry.y))
        cluster_ids = DBSCAN(eps=500, min_samples=5).fit(coords).labels_
        # Collapse cluster ids to membership: noise (-1) -> 0, any cluster -> 1.
        in_cluster = [0 if cid == -1 else 1 for cid in cluster_ids]

        scores['precision'].append(metrics.precision_score(labels, in_cluster, average='weighted'))
        scores['recall'].append(metrics.recall_score(labels, in_cluster, average='weighted'))
        scores['f1'].append(metrics.f1_score(labels, in_cluster, average='weighted'))
    return tuple(sum(vals) / len(vals) for vals in scores.values())


def gm_db(gdf2, prob, buffer, labels, T=100):
    """Score DBSCAN membership after weighted neighbour substitution against reference labels.

    gdf2 : GeoDataFrame the neighbour positions in buffer refer to (via .iloc).
    prob : DataFrame; index values select rows of buffer and entries of labels,
           row values are the selection weights (first neighbour excluded).
    buffer : rows of neighbour positions.
    labels : reference 0/1 labels, indexable by prob's index values.
    T : number of simulation runs.

    Returns (precision, recall, f1), each the average of the weighted score over T runs.
    """
    scores = {'precision': [], 'recall': [], 'f1': []}
    for _ in range(T):
        # Reference labels restricted to, and ordered like, the rows of prob.
        truth = [labels[row_id] for row_id in prob.index]
        picks = [
            random.choices(np.delete(buffer[row_id], 0),
                           weights=tuple(weights.tolist()), k=1)[0]
            for row_id, weights in prob.iterrows()
        ]
        swapped = gdf2.iloc[picks]
        coords = np.column_stack((swapped.geometry.x, swapped.geometry.y))
        cluster_ids = DBSCAN(eps=500, min_samples=5).fit(coords).labels_
        # Collapse cluster ids to membership: noise (-1) -> 0, any cluster -> 1.
        in_cluster = [0 if cid == -1 else 1 for cid in cluster_ids]

        scores['precision'].append(metrics.precision_score(truth, in_cluster, average='weighted'))
        scores['recall'].append(metrics.recall_score(truth, in_cluster, average='weighted'))
        scores['f1'].append(metrics.f1_score(truth, in_cluster, average='weighted'))
    return tuple(sum(vals) / len(vals) for vals in scores.values())

In [12]:
import pandas as pd
import pickle
from sklearn.cluster import DBSCAN
import numpy as np

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Reference clustering of the unmodified sample, collapsed to membership:
# 0 = DBSCAN noise (-1), 1 = member of any cluster.
X = np.array(np.stack([gdf1.geometry.x, gdf1.geometry.y], axis=1))
db = DBSCAN(eps=500, min_samples=5).fit(X)
labels = db.labels_
labels_binary = [0 if i == -1 else 1 for i in labels]

k_neighbors_all = [10, 20, 30]
eps_all = [0.1, 0.01, 0.001, 0.0001]
T = 100

# One CSV row per (method, eps, k); rp and lw have no eps, so that field stays empty.
with open('data/coshocton/sols/coshocton_db_all.csv', 'w') as fw:
    fw.write('method,eps,k,precision,recall,f1\n')
    fw.flush()
    
    for k_neighbors in k_neighbors_all:
        # Load inside 'with' blocks so the pickle file handles are closed right away.
        with open('data/coshocton/coshocton_buff_k' + str(k_neighbors) + '.pickle', "rb") as fp:
            buffer = pickle.load(fp)
        with open('data/coshocton/coshocton_buff_dis_k' + str(k_neighbors) + '.pickle', "rb") as fp:
            buffer_dis = pickle.load(fp)
        # Drop column 0, the first (nearest) hit of every row.
        buffer_dis = np.delete(buffer_dis, 0, 1)

        # rp
        db_rp = rp_db(gdf1, buffer_dis, labels_binary, T)
        fw.write('rp,,' + str(k_neighbors) + ',' + str(db_rp[0]) + ',' + str(db_rp[1]) + ',' + str(db_rp[2]) + '\n')
        print('db_rp:', db_rp[0], db_rp[1], db_rp[2])
        
        # lw
        db_lw = lw_db(gdf2, buffer, labels_binary, T)
        fw.write('lw,,' + str(k_neighbors) + ',' + str(db_lw[0]) + ',' + str(db_lw[1]) + ',' + str(db_lw[2]) + '\n')
        print('db_lw:', db_lw[0], db_lw[1], db_lw[2])
        
        # gm: one probability table per (eps, k) combination
        for eps in eps_all:
            prob = pd.read_csv('data/coshocton/sols/coshocton_prob_eps' + str(eps) + "_k" + str(k_neighbors) + '.csv', header=None, index_col=0)
            db_gm = gm_db(gdf2, prob, buffer, labels_binary, T)
            fw.write('gm,' + str(eps) + ',' + str(k_neighbors) + ',' + str(db_gm[0]) + ',' + str(db_gm[1]) + ',' + str(db_gm[2]) + '\n')
            print('db_gm:', db_gm[0], db_gm[1], db_gm[2])

db_rp: 0.9884320134056394 0.9882633053221285 0.9882461926088046
db_lw: 0.9880769983015676 0.987787114845938 0.9878167698781184
db_gm: 0.9968143162244039 0.9968000000000008 0.9967927850945298
db_gm: 0.9951565062847998 0.9951260504201671 0.9951069723128857
db_gm: 0.990436924966586 0.9901680672268904 0.9902117475690211
db_gm: 0.9868127199033816 0.9864985994397754 0.9865052599546565
db_rp: 0.9848535797558776 0.9846218487394955 0.9846103390290745
db_lw: 0.9826710310975416 0.9823809523809517 0.9823813154230433
db_gm: 0.9961817525769284 0.9961538461538474 0.9961458749899924
db_gm: 0.9898339596853358 0.9897478991596637 0.9896669623957848
db_gm: 0.9834221956071099 0.9832212885154058 0.9831950190072121
db_gm: 0.9812209441163177 0.9809523809523804 0.9809227823955338
db_rp: 0.9791982135214221 0.9788795518207282 0.978856017281703
db_lw: 0.9766364974822404 0.9760504201680666 0.9760859159001087
db_gm: 0.9953602672245492 0.9953198653198654 0.9953054314944795
db_gm: 0.9864024736598406 0.986218487394957