In [None]:
# Third-party libraries used throughout the notebook.
import pandas as pd
import numpy as np
import geopy.distance
import warnings
# Silence every warning for the rest of the session. NOTE(review): this also hides
# library deprecation and pandas indexing warnings; consider narrowing the filter
# to specific categories while debugging.
warnings.filterwarnings('ignore')

In [None]:
# Load the raw boating locations; this file has not been deduplicated yet.
# The first CSV column is used as the row index.
source_csv = 'Boating-Non-Dedup-19.05.21.csv'
boats_df = pd.read_csv(source_csv, index_col=[0])

# Show (rows, columns) as the cell output.
boats_df.shape

In [None]:
# Preparing data for deduplication

# Pair each row's latitude/longitude into a tuple: used to drop rows with exactly
# identical coordinates here, and passed to the distance function later on.
boats_df['Coord Pair'] = list(zip(boats_df['Latitude'], boats_df['Longitude']))
boats_df = boats_df.drop_duplicates('Coord Pair').reset_index(drop=True)

# Placeholder for the distance (in meters) to the nearest neighbour. It starts at 0
# so the deduplication loop runs at least once, and it is a float because the loop
# later stores fractional distances and NaN in this column.
boats_df['Neighbour Distance'] = 0.0

# Work on a copy so boats_df keeps the pre-deduplication row count for reporting.
dedup_df = boats_df.copy()

In [None]:
filter_distance = 200 # Real distance in meters. Used as a filtering threshold
coord_distance = 0.003 # Half-width (in degrees) of the coarse lat/lon search box, ~300 meters of latitude.
# NOTE(review): a degree of longitude gets shorter away from the equator, so at high
# latitudes this box may be narrower east-west than filter_distance -- confirm that
# this is acceptable for the geographic extent of the data.
print('Number of records BEFORE deduplication:', boats_df.shape[0])

# The column receives float distances and NaN below; make sure it can hold them
# regardless of how it was initialised.
dedup_df['Neighbour Distance'] = dedup_df['Neighbour Distance'].astype(float)

# Repeat full passes while some point still has a recorded neighbour closer than the
# filtering distance. The distance recorded in a pass may refer to a point that was
# removed in that same pass, so a later pass refreshes the values.
while dedup_df['Neighbour Distance'].min() < filter_distance:

    # Iterate over a snapshot of the labels, because rows are removed while looping.
    for idx in list(dedup_df.index):
        if idx not in dedup_df.index:
            continue  # already removed as a near-duplicate of an earlier point

        origin_lat = dedup_df.at[idx, 'Latitude']
        origin_lon = dedup_df.at[idx, 'Longitude']
        origin_coords = dedup_df.at[idx, 'Coord Pair']  # Lat/Lon of the origin
        origin_is_paddle = dedup_df.at[idx, 'Source'] == 'Paddle'

        # Cheap pre-filter: candidates inside a small lat/lon box around the origin
        # (coordinate differences, not real distance). The origin itself is excluded
        # by its index label, so neighbours that merely share the same latitude or
        # the same longitude are still considered.
        in_box = (
            (dedup_df['Longitude'] < origin_lon + coord_distance)
            & (dedup_df['Longitude'] > origin_lon - coord_distance)
            & (dedup_df['Latitude'] < origin_lat + coord_distance)
            & (dedup_df['Latitude'] > origin_lat - coord_distance)
            & (dedup_df.index != idx)
        )
        closest_df = dedup_df[in_box]

        dists = []    # real distances (meters) from the origin to every candidate
        to_drop = []  # labels of candidates treated as duplicates of the origin
        for sub_idx in closest_df.index:
            # Calculating real distance btw locations
            distance = geopy.distance.distance(origin_coords, closest_df.at[sub_idx, 'Coord Pair']).m
            dists.append(distance)

            # Which point to delete: a neighbour that is not from 'Paddle' is always
            # removed; a 'Paddle' neighbour is removed only when the origin is also
            # from 'Paddle'. A 'Paddle' neighbour of a non-'Paddle' origin is kept
            # (the origin is removed later, when that neighbour becomes the origin).
            sub_is_paddle = closest_df.at[sub_idx, 'Source'] == 'Paddle'
            if distance <= filter_distance and (origin_is_paddle or not sub_is_paddle):
                to_drop.append(sub_idx)

        if to_drop:
            dedup_df = dedup_df.drop(index=to_drop)

        # Assigning the distance to the nearest point on the map (NaN when the box is
        # empty). A single .loc[row, column] write is used so the value lands in
        # dedup_df itself rather than in a temporary object.
        dedup_df.loc[idx, 'Neighbour Distance'] = np.min(dists) if dists else np.nan

print('Number of records AFTER deduplication:', dedup_df.shape[0])
print('How many points were closer than {} meters to other points: {}'.format(filter_distance, boats_df.shape[0] - dedup_df.shape[0]))