In [19]:
import os
import pandas as pd

Helper function for reading and formatting the data

In [20]:
def read_and_format(filePath, columns_to_remove, lon_column_name, lat_column_name,
                    species='griffon_vulture'):
    """Read one comma-separated file and bring it into the common layout.

    Parameters
    ----------
    filePath : path of the CSV file to read.
    columns_to_remove : column labels to drop; a missing label raises
        ``KeyError`` (pandas' default for ``DataFrame.drop``).
    lon_column_name : source column that is renamed to ``'longitude'``.
    lat_column_name : source column that is renamed to ``'latitude'``.
    species : constant value written to the new ``'species'`` column.
        Defaults to ``'griffon_vulture'``, the value that used to be
        hard-coded here.

    Returns
    -------
    pandas.DataFrame with ``'species'`` as the first column, followed by the
    remaining source columns in their original order.
    """
    df = pd.read_csv(filePath, delimiter=',')

    # removing unnecessary columns
    df = df.drop(columns_to_remove, axis=1)
    # renaming both coordinate columns in one non-mutating call
    df = df.rename(columns={lon_column_name: 'longitude',
                            lat_column_name: 'latitude'})
    # adding the species column in front; the trailing True is
    # allow_duplicates, so an existing 'species' column does not raise
    df.insert(0, 'species', species, True)

    return df

Helper function for removing coordinates that lie close to each other (of each close pair, only one coordinate is removed; the other one is kept)

In [21]:
import numpy as np
from math import ceil
from scipy.spatial import distance
from haversine import haversine, Unit

# Mean Earth radius in metres (6371.0088 km); intended to match the constant
# used by the `haversine` package so distances stay comparable.
_EARTH_RADIUS_METERS = 6371008.8


def _lat_lon_degrees(df):
    """Return an (n, 2) float array of (latitude, longitude) in degrees."""
    if 'latitude' in df.columns and 'longitude' in df.columns:
        # Select by name so the (lat, lon) order does not depend on the
        # column order of the source file.
        coords = df[['latitude', 'longitude']]
    else:
        # Fallback: positional columns 1 and 2, read as (lat, lon).
        coords = df.iloc[:, 1:3]
    coords = coords.to_numpy(dtype=float)
    if coords.ndim != 2 or coords.shape[1] != 2:
        raise ValueError('expected exactly one latitude and one longitude column')
    return coords


def _pairwise_haversine_meters(coords_rad):
    """Return the square matrix of great-circle distances (metres) between
    all rows of ``coords_rad``, an (m, 2) array of (lat, lon) in radians."""
    lat = coords_rad[:, 0]
    lon = coords_rad[:, 1]
    half_dlat = (lat[:, None] - lat[None, :]) / 2.0
    half_dlon = (lon[:, None] - lon[None, :]) / 2.0
    a = (np.sin(half_dlat) ** 2
         + np.cos(lat)[:, None] * np.cos(lat)[None, :] * np.sin(half_dlon) ** 2)
    # clip guards arcsin against rounding slightly outside [0, 1]
    return 2.0 * _EARTH_RADIUS_METERS * np.arcsin(np.sqrt(np.clip(a, 0.0, 1.0)))


def remove_near_coordinates(df, min_distance=50, count=1000):
    """Drop rows whose coordinates lie closer than ``min_distance`` metres
    to another row of the same chunk.

    The frame is processed in consecutive chunks of ``count`` rows so the
    distance matrix stays small; rows in different chunks are never compared
    with each other. Within a chunk, rows are visited from the fewest to the
    most close neighbours. Each visited row that has not been discarded yet
    discards every earlier row (smaller position) within ``min_distance``.

    Parameters
    ----------
    df : DataFrame with coordinates in degrees. Columns named ``'latitude'``
        and ``'longitude'`` are used when both exist; otherwise the columns
        at positions 1 and 2 are read as (latitude, longitude).
    min_distance : threshold in metres (strictly-less-than comparison).
    count : number of rows per chunk; must be a positive integer.

    Returns
    -------
    A new DataFrame without the discarded rows, original order preserved.
    The row count is printed before and after filtering.

    Raises
    ------
    ValueError if ``count`` is smaller than 1 or the coordinate columns
    cannot be located.
    """
    if count < 1:
        raise ValueError('count must be a positive integer')

    total = len(df.index)
    print(total)

    # Positional mask over the whole frame; True marks a row to discard.
    drop_mask = np.zeros(total, dtype=bool)

    if total:
        coords_rad = np.radians(_lat_lon_degrees(df))

        for lower_bound in range(0, total, count):
            chunk = coords_rad[lower_bound:lower_bound + count]
            near = _pairwise_haversine_meters(chunk) < min_distance
            # neighbours per point (the point itself included)
            n_proxim = near.sum(axis=0)

            # Chunk-local bookkeeping: positions here are relative to
            # lower_bound and must not leak into other chunks.
            removed = np.zeros(len(chunk), dtype=bool)
            for i in np.argsort(n_proxim):
                if removed[i]:
                    continue
                # discard every earlier point that is too close to point i
                removed[:i] |= near[i, :i]

            # translate chunk-local positions to positions in the full frame
            drop_mask[lower_bound:lower_bound + len(chunk)] = removed

    # Positional selection: safe even when index labels are not unique.
    df = df.iloc[np.flatnonzero(~drop_mask)]

    print(len(df.index))
    return df

In [22]:
def clean(df):
    """Return a cleaned copy of ``df``.

    Rows containing NaN values are dropped first, then exact duplicate rows,
    and finally the result is passed through ``remove_near_coordinates``
    with its default settings.
    """
    without_gaps_or_repeats = df.dropna().drop_duplicates()
    return remove_near_coordinates(without_gaps_or_repeats)

Helper function for writing the cleaned data to the final dataset file

In [23]:
def to_csv(df, path='../FinalDatasets/griffon_vulture.csv'):
    """Append ``df`` to the CSV file at ``path``.

    Rows are appended (``mode='a'``) without the index and without a header
    row, so repeated calls keep extending the same file; re-running the
    notebook therefore adds the same rows again unless the file is removed
    first.

    Parameters
    ----------
    df : DataFrame to write.
    path : destination file; defaults to the griffon vulture dataset file
        that used to be hard-coded here.
    """
    df.to_csv(path, mode='a', index=False, header=False)

Main

In [24]:
# One entry per source dataset:
# (directory whose files are read as CSV, columns to drop,
#  name of the longitude column, name of the latitude column)
datasets_info = [('../MovebankDatasets/e-ObsGPRSHimalayanGriffon-Bhutan-MPIAB',
             ['event-id', 'visible', 'timestamp', 'bar:barometric-pressure','data-decoding-software','eobs:activity','eobs:activity-samples','eobs:battery-voltage','eobs:fix-battery-voltage','eobs:horizontal-accuracy-estimate','eobs:key-bin-checksum','eobs:speed-accuracy-estimate','eobs:start-timestamp','eobs:status','eobs:temperature','eobs:type-of-fix','eobs:used-time-to-get-fix','gps:dop','gps:satellite-count','ground-speed','heading','height-above-ellipsoid','import-marked-outlier','mag:magnetic-field-raw-x','mag:magnetic-field-raw-y','mag:magnetic-field-raw-z','orientation:quaternion-raw-w','orientation:quaternion-raw-x','orientation:quaternion-raw-y','orientation:quaternion-raw-z','sensor-type','individual-taxon-canonical-name','tag-local-identifier','individual-local-identifier','study-name','utm-easting','utm-northing','utm-zone','study-timezone','study-local-timestamp'],
             'location-long', 'location-lat')]

for file_path, columns_to_remove, lon_column_name, lat_column_name in datasets_info:
    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the appended output reproducible between runs.
    for file in sorted(os.listdir(file_path)):
        full_path = os.path.join(file_path, file)
        # skip sub-directories (e.g. .ipynb_checkpoints), which read_csv cannot open
        if not os.path.isfile(full_path):
            continue

        df = read_and_format(full_path, columns_to_remove, lon_column_name, lat_column_name)
        df = clean(df)

        to_csv(df)

544726


KeyboardInterrupt: 