Helper method for reading and formatting data

In [24]:
import pandas as pd
from datetime import datetime

def read_and_format(file_path, start_date, end_date, columns_to_remove):
    """Load a CSV file, keep rows inside a time window and tidy its columns.

    Parameters
    ----------
    file_path : path of the CSV file to read (comma separated).
    start_date, end_date : inclusive bounds compared against the parsed
        ``timestamp`` column.
    columns_to_remove : column labels dropped from the result.

    Returns
    -------
    pandas.DataFrame
        The rows whose timestamp lies in ``[start_date, end_date]``, without
        the removed columns, and with ``location-long`` / ``location-lat``
        renamed to ``longitude`` / ``latitude``.
    """
    raw = pd.read_csv(file_path, delimiter=',', low_memory=False)

    # Parse the timestamps into a scratch column that is only used for filtering.
    raw['timestamp_datetime'] = pd.to_datetime(raw['timestamp'])
    inside_window = raw['timestamp_datetime'].between(start_date, end_date)
    selected = raw.loc[inside_window]

    # Drop the caller-specified columns, then the scratch column.
    trimmed = selected.drop(columns_to_remove, axis=1).drop('timestamp_datetime', axis=1)

    # Give the coordinate columns their final names.
    return trimmed.rename(columns={'location-long': 'longitude',
                                   'location-lat': 'latitude'})

Helper method for removing coordinates that are near each other (of two nearby points, only one is removed)

In [25]:
import numpy as np
from math import ceil
from scipy.spatial import distance
from haversine import haversine, Unit

def _haversine_matrix_m(lat_lon_deg):
    """Return the pairwise great-circle distances, in metres, between points.

    ``lat_lon_deg`` is an ``(n, 2)`` array of ``(latitude, longitude)`` pairs
    in degrees; the result is an ``(n, n)`` array.
    """
    earth_radius_m = 6371008.8  # mean Earth radius
    lat_lon = np.radians(np.asarray(lat_lon_deg, dtype=float))
    lat = lat_lon[:, 0]
    lon = lat_lon[:, 1]
    d_lat = lat[:, None] - lat[None, :]
    d_lon = lon[:, None] - lon[None, :]
    a = (np.sin(d_lat / 2.0) ** 2
         + np.cos(lat)[:, None] * np.cos(lat)[None, :] * np.sin(d_lon / 2.0) ** 2)
    # Clip guards against rounding pushing `a` marginally outside [0, 1].
    return 2.0 * earth_radius_m * np.arcsin(np.sqrt(np.clip(a, 0.0, 1.0)))


def remove_near_coordinates(df, min_distance=50, count=1000):
    """Thin out points that lie closer than ``min_distance`` metres to each other.

    The frame is sorted by ``longitude`` and ``latitude`` and processed in
    consecutive chunks of ``count`` rows. Inside a chunk the points are
    visited from the one with the fewest close neighbours to the one with
    the most; every point that is still kept removes the close points that
    precede it in the chunk, so of two nearby points only one is dropped.
    Points in different chunks are never compared with each other.

    Parameters
    ----------
    df : pandas.DataFrame with ``latitude`` and ``longitude`` columns (degrees).
    min_distance : distance threshold in metres.
    count : number of rows per chunk (bounds the size of the distance matrix).

    Returns
    -------
    pandas.DataFrame
        The sorted frame without the removed rows. The row count is printed
        before and after filtering.
    """
    print(len(df.index))

    df = df.sort_values(by=['longitude', 'latitude'])

    # Select the coordinates by name (lat, lon order for the haversine formula)
    # instead of relying on their column positions.
    coords = df[['latitude', 'longitude']].to_numpy(dtype=float)
    keep = np.ones(len(coords), dtype=bool)

    for lower_bound in range(0, len(coords), count):
        chunk = coords[lower_bound:lower_bound + count]
        near = _haversine_matrix_m(chunk) < min_distance
        n_proxim = near.sum(axis=0)

        # Bookkeeping is local to the chunk; it is mapped back to global row
        # positions through the slice assignment below.
        removed = np.zeros(len(chunk), dtype=bool)
        for i in np.argsort(n_proxim, kind='stable'):
            if removed[i]:
                continue
            removed[:i] |= near[i, :i]

        keep[lower_bound:lower_bound + count] = ~removed

    # Positional selection: immune to repeated index labels.
    df = df.iloc[np.flatnonzero(keep)]

    print(len(df.index))
    return df

Helper method for removing outliers

In [26]:
import copy 
import scipy as sp
from scipy.stats import chi2
from sklearn.covariance import MinCovDet

# Robust Mahalanobis distance
def robust_mahalanobis_method(df, significance=0.001):
    """Drop rows whose robust Mahalanobis distance marks them as outliers.

    Location and covariance are estimated with the Minimum Covariance
    Determinant fitted on the data itself. A row is removed when its
    distance exceeds the square root of the chi-squared quantile at
    ``1 - significance`` with as many degrees of freedom as ``df`` has
    columns.

    Parameters
    ----------
    df : pandas.DataFrame containing only numeric columns.
    significance : upper-tail probability used for the cut-off.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the rows flagged as outliers.
    """
    values = df.values.astype(float)

    # Minimum covariance determinant, fitted on the observations themselves.
    # Fitting on samples simulated from the classical mean/covariance would
    # carry the outliers' influence into the "robust" estimate.
    mcd_fit = MinCovDet(random_state=0).fit(values)
    robust_cov = mcd_fit.covariance_      # robust covariance matrix
    robust_mean = mcd_fit.location_       # robust mean
    inv_covmat = sp.linalg.inv(robust_cov)

    # Row-wise quadratic form (x - mu)^T S^-1 (x - mu); computing it per row
    # avoids materialising an n x n matrix just to read its diagonal.
    x_minus_mu = values - robust_mean
    left_term = np.dot(x_minus_mu, inv_covmat)
    md = np.sqrt(np.sum(left_term * x_minus_mu, axis=1))

    # Flag as outlier everything beyond the chi-squared cut-off.
    cutoff = np.sqrt(chi2.ppf(1 - significance, df=df.shape[1]))
    is_outlier = md > cutoff

    # Positional mask: immune to repeated index labels.
    return df[~is_outlier]

In [27]:
def clean(df):
    """Return ``df`` without rows containing NaN values and without duplicate rows."""
    cleaned = df.dropna().drop_duplicates()
    # Further steps, currently disabled:
    #   outlier removal:      cleaned = robust_mahalanobis_method(cleaned)
    #   proximity filtering:  cleaned = remove_near_coordinates(cleaned)
    return cleaned

Helper method for appending cleaned data to the final dataset file

In [28]:
def to_csv(filePath, df):
    """Append the rows of ``df`` to the CSV file at ``filePath``.

    Neither the index nor a header line is written, so repeated calls
    accumulate plain data rows in the same file.
    """
    write_options = {'mode': 'a', 'index': False, 'header': False}
    df.to_csv(filePath, **write_options)

In [29]:
def format_final_dataset(file_path):
    """Deduplicate the final dataset file and prepend a ``species`` column.

    The file at ``file_path`` is read as header-less CSV, duplicate rows are
    dropped, a constant ``species`` column with the value ``griffon_vulture``
    is inserted as the first column, and the file is overwritten with the
    result (again without header or index).
    """
    # The rows were appended without a header line, so the first line is data;
    # reading it as a header would silently discard one record.
    df = pd.read_csv(file_path, delimiter=',', header=None)
    df = df.drop_duplicates()
    # Proximity filtering is disabled here; remove_near_coordinates() sorts by
    # the 'longitude'/'latitude' labels, which a header-less file does not have.
    df.insert(0, 'species', 'griffon_vulture', True)

    # Overwrite in place instead of deleting the file and appending to it.
    df.to_csv(file_path, mode='w', index=False, header=False)

Main

In [None]:
import os

# One (directory, columns_to_remove) pair per Movebank dataset.
# The main loop below lists every file in the directory and passes the column
# list to read_and_format(), which drops those columns from each file.
# 'timestamp' appears in every list: read_and_format() reads it for the date
# filter before dropping it.
datasets_info = [
    ('../MovebankDatasets/e-ObsGPRSHimalayanGriffon-Bhutan-MPIAB', ['event-id', 'visible', 'timestamp', 'bar:barometric-pressure','data-decoding-software','eobs:activity','eobs:activity-samples','eobs:battery-voltage','eobs:fix-battery-voltage','eobs:horizontal-accuracy-estimate','eobs:key-bin-checksum','eobs:speed-accuracy-estimate','eobs:start-timestamp','eobs:status','eobs:temperature','eobs:type-of-fix','eobs:used-time-to-get-fix','gps:dop','gps:satellite-count','ground-speed','heading','height-above-ellipsoid','import-marked-outlier','mag:magnetic-field-raw-x','mag:magnetic-field-raw-y','mag:magnetic-field-raw-z','orientation:quaternion-raw-w','orientation:quaternion-raw-x','orientation:quaternion-raw-y','orientation:quaternion-raw-z','sensor-type','individual-taxon-canonical-name','tag-local-identifier','individual-local-identifier','study-name','utm-easting','utm-northing','utm-zone','study-timezone','study-local-timestamp']),
    ('../MovebankDatasets/EurasianGriffonVultures1HzHUJ(Israel)',['event-id','visible','timestamp','bar:barometric-pressure','data-decoding-software','eobs:activity','eobs:activity-samples','eobs:battery-voltage','eobs:fix-battery-voltage','eobs:horizontal-accuracy-estimate','eobs:key-bin-checksum','eobs:speed-accuracy-estimate','eobs:start-timestamp','eobs:status','eobs:temperature','eobs:type-of-fix','eobs:used-time-to-get-fix','gps:dop','gps:satellite-count','ground-speed','heading','height-above-ellipsoid','height-raw','import-marked-outlier','mag:magnetic-field-raw-x','mag:magnetic-field-raw-y','mag:magnetic-field-raw-z','orientation:quaternion-raw-w','orientation:quaternion-raw-x','orientation:quaternion-raw-y','orientation:quaternion-raw-z','taxon','sensor-type','individual-taxon-canonical-name','tag-local-identifier','individual-local-identifier','study-name','utm-easting','utm-northing','utm-zone','study-timezone','study-local-timestamp']),
    ('../MovebankDatasets/Griffonvulture[fdlmes.gr]',['event-id','visible','timestamp','sensor-type','individual-taxon-canonical-name','tag-local-identifier','individual-local-identifier','study-name','utm-easting','utm-northing','utm-zone','study-timezone','study-local-timestamp']),
    ('../MovebankDatasets/GriffonVultureAlbstadtSalzburg(Gypsi)', ['event-id','visible','timestamp','bar:barometric-pressure','data-decoding-software','eobs:activity','eobs:activity-samples','eobs:battery-voltage','eobs:fix-battery-voltage','eobs:horizontal-accuracy-estimate','eobs:key-bin-checksum','eobs:speed-accuracy-estimate','eobs:start-timestamp','eobs:status','eobs:temperature','eobs:type-of-fix','eobs:used-time-to-get-fix','gps:dop','gps:satellite-count','ground-speed','heading','height-above-ellipsoid','import-marked-outlier','mag:magnetic-field-raw-x','mag:magnetic-field-raw-y','mag:magnetic-field-raw-z','manually-marked-outlier','orientation:quaternion-raw-w','orientation:quaternion-raw-x','orientation:quaternion-raw-y','orientation:quaternion-raw-z','sensor-type','individual-taxon-canonical-name','tag-local-identifier','individual-local-identifier','study-name','utm-easting','utm-northing','utm-zone','study-timezone','study-local-timestamp']),
    ('../MovebankDatasets/High-altitudeflightsofHimalayanvultures(datafromSherubetal.2016)', ['event-id','visible','timestamp','eobs:battery-voltage','eobs:fix-battery-voltage','eobs:horizontal-accuracy-estimate','eobs:speed-accuracy-estimate','eobs:start-timestamp','eobs:status','eobs:temperature','eobs:type-of-fix','eobs:used-time-to-get-fix','ground-speed','heading','height-above-ellipsoid','sensor-type','individual-taxon-canonical-name','tag-local-identifier','individual-local-identifier','study-name','utm-easting','utm-northing','utm-zone','study-timezone','study-local-timestamp']),
    ('../MovebankDatasets/LifeTrackGriffonVultureCroatia', ['event-id','visible','timestamp','bar:barometric-pressure','data-decoding-software','eobs:activity','eobs:activity-samples','eobs:battery-voltage','eobs:fix-battery-voltage','eobs:horizontal-accuracy-estimate','eobs:key-bin-checksum','eobs:speed-accuracy-estimate','eobs:start-timestamp','eobs:status','eobs:temperature','eobs:type-of-fix','eobs:used-time-to-get-fix','gps:dop','gps:satellite-count','ground-speed','heading','height-above-ellipsoid','import-marked-outlier','mag:magnetic-field-raw-x','mag:magnetic-field-raw-y','mag:magnetic-field-raw-z','orientation:quaternion-raw-w','orientation:quaternion-raw-x','orientation:quaternion-raw-y','orientation:quaternion-raw-z','sensor-type','individual-taxon-canonical-name','tag-local-identifier','individual-local-identifier','study-name','utm-easting','utm-northing','utm-zone','study-timezone','study-local-timestamp']),
    ('../MovebankDatasets/Long-rangeadultmovementsof3vulturespecies(datafromSpiegeletal.2015)', ['event-id','visible','timestamp','ground-speed','heading','sensor-type','individual-taxon-canonical-name','tag-local-identifier','individual-local-identifier','study-name','utm-easting','utm-northing','utm-zone','study-timezone','study-local-timestamp']),
    ('../MovebankDatasets/RaptorsNABUMoessingenpublic', ['event-id','visible','timestamp','bar:barometric-pressure','data-decoding-software','eobs:activity','eobs:activity-samples','eobs:battery-voltage','eobs:fix-battery-voltage','eobs:horizontal-accuracy-estimate','eobs:key-bin-checksum','eobs:speed-accuracy-estimate','eobs:start-timestamp','eobs:status','eobs:temperature','eobs:type-of-fix','eobs:used-time-to-get-fix','gps:dop','gps:satellite-count','ground-speed','heading','height-above-ellipsoid','import-marked-outlier','mag:magnetic-field-raw-x','mag:magnetic-field-raw-y','mag:magnetic-field-raw-z','orientation:quaternion-raw-w','orientation:quaternion-raw-x','orientation:quaternion-raw-y','orientation:quaternion-raw-z','sensor-type','individual-taxon-canonical-name','tag-local-identifier','individual-local-identifier','study-name','utm-easting','utm-northing','utm-zone','study-timezone','study-local-timestamp']),
    ('../MovebankDatasets/SoaringflightinEurasiangriffonvultures(HUJ)(datafromHarelandNathan,2018)', ['event-id','visible','timestamp','ground-speed','heading','height-raw','sensor-type','individual-taxon-canonical-name','tag-local-identifier','individual-local-identifier','study-name','utm-easting','utm-northing','utm-zone','study-timezone','study-local-timestamp'])
]
# Destination of the merged, cleaned coordinates.
final_dataset_file_path = '../FinalDatasets/Coordinates/griffon_vulture.csv'

# Inclusive time window applied to every input file.
start_date = datetime(2010, 1, 1, 0, 0)
end_date = datetime(2018, 12, 31, 23, 59)

# to_csv() appends, so start from a clean slate: otherwise re-running this
# cell would add raw rows on top of the output of a previous run.
if os.path.exists(final_dataset_file_path):
    os.remove(final_dataset_file_path)

for dataset_dir, columns_to_remove in datasets_info:
    # Sorted so the row order of the output does not depend on the
    # arbitrary order returned by os.listdir().
    for file_name in sorted(os.listdir(dataset_dir)):
        dataset_file = os.path.join(dataset_dir, file_name)
        if not os.path.isfile(dataset_file):
            # Skip sub-directories; only regular files can be read as CSV.
            continue

        df = read_and_format(dataset_file, start_date, end_date, columns_to_remove)
        df = clean(df)

        to_csv(final_dataset_file_path, df)

format_final_dataset(final_dataset_file_path)