In [3]:
import geopandas as gpd
shapefile = gpd.read_file("PA19_refined_179076/CAR19_refined_179076.shp")
df = shapefile.to_crs(6933)

In [4]:
invalids = []
for idx, row in df.iterrows():
    if not row['geometry'].buffer(0).is_valid:
        invalids.append(idx)

In [5]:
invalids

[]

In [6]:
for ID in invalids:
    print(df.loc[ID, 'cod_imovel'])

In [7]:
from shapely.geometry import Polygon
from shapely.ops import unary_union
from tqdm import tqdm
from rtree import index
from matplotlib import pyplot as plt

import pandas as pd
import numpy as np
import math
from ftfy import fix_text
from unidecode import unidecode

In [16]:
info_dict = {ID: {} for ID in set(df['cod_imovel'])}
for _, row in df.iterrows():
    info_dict[row['cod_imovel']]['landholder'] = row['landholder']
    info_dict[row['cod_imovel']]['cpf'] = row['CPF']
    info_dict[row['cod_imovel']]['date'] = row['dat_pr_cad']

In [17]:
def similar_names(cod_1, cod_2):
    
    name_1 = info_dict[cod_1]['landholder']
    name_2 = info_dict[cod_2]['landholder']

    if name_1 == name_2:
        return True
    
    if name_1 == None or name_2 == None:
        return False
    
    if type(name_1) == float or type(name_2) == float:
        return False
    
    name_1 = unidecode(fix_text(name_1)).upper()
    name_2 = unidecode(fix_text(name_2)).upper()
    
    name_1_set = set((name_1.split())[1:])
    name_2_set = set((name_2.split())[1:])
    res_set = name_1_set.intersection(name_2_set)

    for name in res_set:
        if len(name) > 2 and name not in ['LTDA', 'S/A', 'S.A', 'S.A.', 'DOS', 'DAS']:
            return True
    
    return False

In [18]:
IDs = [key for key in list(df['cod_imovel'])]
polygons = [row['geometry'].buffer(0) for _, row in df.iterrows()]

In [19]:
from rtree import index
idx_polys = index.Index()
for pos, cell in enumerate(polygons):
    idx_polys.insert(pos, cell.bounds)

In [20]:
AREA_SIMILARITY = 0.70
OVERLAP_SIMILARITY = 0.50

In [21]:
sus_dups = set({})
multiple_dups = []

for i, poly in enumerate(polygons):

    candidate_indices = [pos for pos in idx_polys.intersection(poly.bounds) if pos != i]
    ID = IDs[i]
    
    duplicate_IDs = []
    for index in candidate_indices:
        sus_dup = polygons[index]
        sus_dup_ID = IDs[index]
        if poly.equals(sus_dup):
            duplicate_IDs.append(sus_dup_ID)
            
            ## add in criterion for both area and overlap
        elif (abs(poly.area - sus_dup.area) < (1-AREA_SIMILARITY)*poly.area and abs(poly.area - sus_dup.area) < (1-AREA_SIMILARITY)*sus_dup.area and
                abs(poly.intersection(sus_dup).area - poly.area) < (1-OVERLAP_SIMILARITY)*poly.area and abs(poly.intersection(sus_dup).area - poly.area) < (1-OVERLAP_SIMILARITY)*sus_dup.area):
            if similar_names(ID, sus_dup_ID) or (info_dict[ID]['cpf'] == info_dict[sus_dup_ID]['cpf']):
                duplicate_IDs.append(sus_dup_ID)
            
    years_dict = {}
    if len(duplicate_IDs) > 0:
        
        # get years of ID and sus_dup_IDs
        years_dict[info_dict[ID]['date']] = ID
        for duplicate_ID in duplicate_IDs:
            years_dict[info_dict[duplicate_ID]['date']] = duplicate_ID
            
        # add all but the newest one to the set
        max_date = max(years_dict.keys())
        for key in years_dict.keys():
            if key != max_date:
                sus_dups.add(years_dict[key])
                
        if len(duplicate_IDs) > 1:
            multiple_dups.append(ID)

In [22]:
out_dict = {'dups': list(sus_dups)}

In [23]:
df_out = pd.DataFrame(out_dict)

In [24]:
df_out.to_excel('CAR19_PA_sus_dups_70_50.xlsx')

In [25]:
df_out

Unnamed: 0,dups
0,PA-1508308-6B3E6BB732104FC3915EEB84A5ACE433
1,PA-1505064-46F9E749D3E14D799F7D3B2FFDD5D10F
2,PA-1505486-7BC1D8F8A5B04CF79E78D472874B7125
3,PA-1505064-C26137A2B63F4A66BEC77659F47E6292
4,PA-1505064-923BFC8113D2475CBCC8B0D111CE3509
...,...
1822,PA-1506005-7EDC641663494963B815D21C41C6E16C
1823,PA-1503606-168DD3F50179425E9804A080CAE96FAA
1824,PA-1507300-1F5605E334B54D0CADD1837B827625C6
1825,PA-1505064-59B4D523D9C344A3974075DBE724E19A
