## Examining duplication Between Organisations

In our data we often recieve multiple data sources per dataset. unfortunately this leads to duplication of geometries and other data points in the datasets. this notebook looks to investigate identifying these duplications between organisations.

In [5]:
from download_data import download_dataset
import spatialite
import pandas as pd
import os


In [6]:
# download dataset
dataset = 'conservation-area'
collection = 'conservation-area-collection'
data_dir = os.path.join('../data/entity_resolution',dataset)
dataset_path = os.path.join(data_dir,f'{dataset}.sqlite3')

In [7]:
download_dataset(dataset,collection,data_dir)

In [8]:
sql = """
        SELECT a.entity AS primary_entity,
            a.name,
            a.reference,
            a.organisation_entity,
            a.geometry,
            b.entity AS secondary_entity,
            b.name,
            b.reference,
            b.organisation_entity,
            b.geometry,
            100 *(ST_Area(ST_Intersection(GeomFromText(a.geometry), GeomFromText(b.geometry)))/ MIN(ST_Area(GeomFromText(a.geometry)), ST_Area(GeomFromText(b.geometry)))) AS pct_overlap
        FROM
            (SELECT entity,
                    name,
                    organisation_entity,
                    reference,
                    geometry
            FROM entity
            WHERE ST_IsValid(geometry)) a
        JOIN
            (SELECT entity,
                    name,
                    organisation_entity,
                    reference,
                    geometry
            FROM entity
            WHERE ST_IsValid(geometry)) b 
        ON a.organisation_entity <> b.organisation_entity
        AND ST_Intersects(GeomFromText(a.geometry), GeomFromText(b.geometry))
        WHERE 100 *(ST_Area(ST_Intersection(GeomFromText(a.geometry), GeomFromText(b.geometry)))/ MIN(ST_Area(GeomFromText(a.geometry)), ST_Area(GeomFromText(b.geometry))));
    """

In [None]:
with spatialite.connect(sqlite_dataset_path) as con:
    cursor = con.execute(sql)
    cols = [column[0] for column in cursor.description]
    results = pd.DataFrame.from_records(data=cursor.fetchall(), columns=cols)

In [49]:
results

Unnamed: 0,primary_entity,name,reference,organisation_entity,geometry,secondary_entity,name.1,reference.1,organisation_entity.1,geometry.1,pct_overlap,area_a,"ST_Intersects(a.geometry, b.geometry)"
0,44000378,Daresbury,8671,16,"MULTIPOLYGON (((-2.632257 53.342071,-2.632754 ...",44007961,Daresbury,1,156,"MULTIPOLYGON (((-2.632272 53.342064,-2.632769 ...",99.424511,0.000014,-1
1,44000379,Hale Road,8672,16,"MULTIPOLYGON (((-2.818042 53.336541,-2.817514 ...",44007962,Hale Road,3,156,"MULTIPOLYGON (((-2.818057 53.336533,-2.817529 ...",99.324468,0.000022,-1
2,44000382,Halton Village,8675,16,"MULTIPOLYGON (((-2.693213 53.335338,-2.692786 ...",44007963,Halton,6,156,"MULTIPOLYGON (((-2.693227 53.335331,-2.692800 ...",99.346062,0.000023,-1
3,44000537,Owston,1073,16,"MULTIPOLYGON (((-1.174240 53.594945,-1.174364 ...",44009146,Owston,1,109,"MULTIPOLYGON (((-1.174240 53.594945,-1.174364 ...",99.999889,0.000058,-1
4,44000538,Skellow - Buttercross,1074,16,"MULTIPOLYGON (((-1.198862 53.588508,-1.199013 ...",44008349,Skellow - Buttercross,3,109,"MULTIPOLYGON (((-1.198862 53.588508,-1.199013 ...",100.000000,0.000007,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,44001069,Battersea Park,3815,16,"MULTIPOLYGON (((-0.165668 51.477382,-0.165442 ...",44009103,Battersea Park,COA00001010,376,"MULTIPOLYGON (((-0.149067 51.478260,-0.149066 ...",100.000000,0.000003,-1
96,44001072,Deodar Road,3666,16,"MULTIPOLYGON (((-0.210842 51.466224,-0.211895 ...",44004076,Putney Embankment Cons Area,COA00000867,376,"MULTIPOLYGON (((-0.210842 51.466224,-0.212007 ...",0.000000,0.000008,-1
97,44001072,Deodar Road,3666,16,"MULTIPOLYGON (((-0.210842 51.466224,-0.211895 ...",44008350,Hurlingham,4,169,"MULTIPOLYGON (((-0.191600 51.472589,-0.191931 ...",0.066568,0.000008,-1
98,44001073,Old Devonshire Road,3807,16,"MULTIPOLYGON (((-0.145706 51.446804,-0.145830 ...",44000873,La Retraite,CA36,192,"MULTIPOLYGON (((-0.141921 51.447008,-0.141824 ...",0.010356,0.000003,-1
