## Examining duplication Between Organisations

In our data we often recieve multiple data sources per dataset. unfortunately this leads to duplication of geometries and other data points in the datasets. this notebook looks to investigate identifying these duplications between organisations.

In [4]:
from download_data import download_dataset
from data import get_organisation_summary, get_duplicates_between_orgs
from plot import plot_map
import spatialite
import pandas as pd
import geopandas as gpd
import os
import itertools
import shapely.wkt

pd.set_option("display.max_rows", None)


download the sqlite3 file for the data. this will be used to run queries against

In [5]:
# download dataset
# dataset = 'article-4-direction-area'
# collection = 'article-4-direction-collection'
# data_dir = os.path.join('../data/entity_resolution',dataset)
# dataset_path = os.path.join(data_dir,f'{dataset}.sqlite3')

dataset = 'conservation-area'
collection = 'conservation-area-collection'
data_dir = os.path.join('../data/entity_resolution',dataset)
dataset_path = os.path.join(data_dir,f'{dataset}.sqlite3')

In [3]:
# download_dataset(dataset,collection,data_dir)

Get a list of organisations. We make an assumption that organisations don't provide us with duplicates this allows us to cut down on the size of the join and hopefully speed up the query

In [6]:
# Identify organisations in dataset and the number of entities they have
org_results = get_organisation_summary(dataset_path)
org_results

Unnamed: 0,organisation_entity,entity_count
0,100,3
1,109,31
2,113,31
3,115,4
4,126,24
5,129,24
6,132,11
7,143,1
8,145,14
9,150,14


In [7]:
# get organisation list from organisations
orgs=list(org_results['organisation_entity'])

# get list of all all combinations
combinations = list(itertools.combinations(orgs,2))

Now run the query (it can be found in the data.py file) This will help highlight the duplicates in the dataset. be warned this is an extremely long query and we should investigate ways of optimising this. This took atleast an hour to run for conservation area, just comparing one organisation with all the others!

In [23]:
# run comparison either on all organisations or two specifically

#  all orgs
# results = None
# for org1,org2 in combinations:
#     dupes = get_duplicates_between_orgs(dataset_path,org1,org2)
#     if len(dupes) > 0:
#         if results:
#             results.append(dupes,ignore_index = True)
#         else:
#             results = dupes

#  one org with all of the other orgs
results = None
org1 = '16'
for org2 in [org for org in orgs if org != org1]:
    print(f' starting {org1} with {org2}')
    dupes = get_duplicates_between_orgs(dataset_path,org1,org2)
    if len(dupes) > 0:
        if results is not None:
            results = pd.concat([results,dupes])
        else:
            results = dupes

# on two specifically this is more useful given how long the query takes
# results = get_duplicates_between_orgs(dataset_path,'16','109')
# results

 starting 16 with 100
 starting 16 with 109
 starting 16 with 113
 starting 16 with 115
 starting 16 with 126
 starting 16 with 129
 starting 16 with 132
 starting 16 with 143
 starting 16 with 145
 starting 16 with 150
 starting 16 with 156
 starting 16 with 163
 starting 16 with 167
 starting 16 with 169
 starting 16 with 170
 starting 16 with 174
 starting 16 with 175
 starting 16 with 181
 starting 16 with 182
 starting 16 with 188
 starting 16 with 192
 starting 16 with 193
 starting 16 with 198
 starting 16 with 203
 starting 16 with 206
 starting 16 with 216
 starting 16 with 217
 starting 16 with 222
 starting 16 with 226
 starting 16 with 228
 starting 16 with 237
 starting 16 with 249
 starting 16 with 258
 starting 16 with 261
 starting 16 with 266
 starting 16 with 268
 starting 16 with 286
 starting 16 with 294
 starting 16 with 295
 starting 16 with 3
 starting 16 with 308
 starting 16 with 309
 starting 16 with 318
 starting 16 with 319
 starting 16 with 329
 starting 16

Save the results to the csv, they can then be read back in later to avoid re-running the query

In [25]:
results.to_csv(os.path.join(data_dir,f'{dataset}-duplicates.csv'))

In [26]:
results

Unnamed: 0,primary_entity,primary_name,primary_reference,primary_organisation_entity,primary_geometry,secondary_entity,secondary_name,secondary_reference,secondary_organisation_entity,secondary_geometry,pct_overlap
0,44006925,"Church Road, Upper Norwood",1400,16,"MULTIPOLYGON (((-0.084363 51.416730,-0.084636 ...",44008881,,COA00000374,100,"MULTIPOLYGON (((-0.086626 51.408744,-0.086612 ...",98.097595
1,44006929,Harold Road,1401,16,"MULTIPOLYGON (((-0.089014 51.419675,-0.089235 ...",44008883,,COA00000370,100,"MULTIPOLYGON (((-0.091365 51.418106,-0.091405 ...",98.53343
0,44000579,Doncaster - High Street,1053,16,"MULTIPOLYGON (((-1.134828 53.525014,-1.134890 ...",44008374,High Street (Doncaster),42,109,"MULTIPOLYGON (((-1.134828 53.525014,-1.134890 ...",99.992917
1,44000580,Stainton,1077,16,"MULTIPOLYGON (((-1.166334 53.437461,-1.166427 ...",44008680,Stainton,43,109,"MULTIPOLYGON (((-1.166333 53.437461,-1.166427 ...",99.956106
0,44002341,Canalside,5300,16,"MULTIPOLYGON (((-0.304721 51.539858,-0.304743 ...",44008917,"Canalside, northeast part",COA00000474,115,"MULTIPOLYGON (((-0.260506 51.534323,-0.260663 ...",99.622785
1,44002341,Canalside,5300,16,"MULTIPOLYGON (((-0.304721 51.539858,-0.304743 ...",44008918,"Canalside, northwest part",COA00000475,115,"MULTIPOLYGON (((-0.304721 51.539857,-0.304743 ...",99.637682
2,44002341,Canalside,5300,16,"MULTIPOLYGON (((-0.304721 51.539858,-0.304743 ...",44008919,"Canalside, southeast part",COA00000476,115,"MULTIPOLYGON (((-0.339468 51.502396,-0.339820 ...",99.617159
3,44002341,Canalside,5300,16,"MULTIPOLYGON (((-0.304721 51.539858,-0.304743 ...",44008920,"Canalside, southwest part",COA00000477,115,"MULTIPOLYGON (((-0.364279 51.501533,-0.365218 ...",99.584793
4,44002446,Bulls Bridge,3420,16,"MULTIPOLYGON (((-0.404439 51.502683,-0.406193 ...",44008920,"Canalside, southwest part",COA00000477,115,"MULTIPOLYGON (((-0.364279 51.501533,-0.365218 ...",30.607248
0,44001790,Ewell Village,5227,16,"MULTIPOLYGON (((-0.249491 51.345729,-0.249321 ...",44008661,Ewell Village (Seymour Mews),10,129,"MULTIPOLYGON (((-0.245868 51.346868,-0.245872 ...",38.404125


plot duplicates against one another. this was put together very quickly

In [21]:
def plot_map(gdf:gpd.GeoDataFrame):
    if type(gdf) != gpd.GeoDataFrame:
        logging.error('input is not a GeodataFrame')        

    # take the point co-ordinates from the same as above
    base = gdf.explore()

    return base

In [29]:
e1 = results[['primary_entity','primary_geometry']]
e1.columns = ['entity','geometry']
e1 = e1.rename_axis('dup_id').reset_index()
e2 = results[['secondary_entity','secondary_geometry']]
e2.columns = ['entity','geometry']
e2 = e2.rename_axis('dup_id').reset_index()
e = pd.concat([e1,e2])
geometry = [shapely.wkt.loads(g) for g in e.geometry]
geometry
gpd_e = gpd.GeoDataFrame(e['entity'],geometry=geometry)
plot_map(gpd_e[gpd_e.index == 1])

In [61]:
type(gpd_e) == gpd.GeoDataFrame

True