In [36]:
import pandas as pd

In [37]:
df = pd.read_csv('../MovebankDatasets/EurasianGriffonVultures1HzHUJ(Israel)/2565-2565.csv', delimiter=',', low_memory=False)
# df.head()

In [38]:
df = df.drop(['event-id','visible','timestamp','bar:barometric-pressure','data-decoding-software','eobs:activity','eobs:activity-samples','eobs:battery-voltage','eobs:fix-battery-voltage','eobs:horizontal-accuracy-estimate','eobs:key-bin-checksum','eobs:speed-accuracy-estimate','eobs:start-timestamp','eobs:status','eobs:temperature','eobs:type-of-fix','eobs:used-time-to-get-fix','gps:dop','gps:satellite-count','ground-speed','heading','height-above-ellipsoid','height-raw','import-marked-outlier','mag:magnetic-field-raw-x','mag:magnetic-field-raw-y','mag:magnetic-field-raw-z','orientation:quaternion-raw-w','orientation:quaternion-raw-x','orientation:quaternion-raw-y','orientation:quaternion-raw-z','taxon','sensor-type','individual-taxon-canonical-name','tag-local-identifier','individual-local-identifier','study-name','utm-easting','utm-northing','utm-zone','study-timezone','study-local-timestamp'], axis=1)
#df.head()

In [39]:
df.rename(columns={'location-long':'longitude'}, inplace = True)
df.rename(columns={'location-lat':'latitude'}, inplace = True)
df = df.sort_values(by =['longitude', 'latitude'])
#df.head()

In [40]:
#df.insert(0, 'species', 'griffon_vulture', True)
#df.head()

### Removing duplicates

In [41]:
data_groups = df.groupby(df.columns.tolist())
size = data_groups.size().reset_index()
len(size[size[0] > 1])   # NUMBER OF DUPLICATES

846

In [42]:
print(len(df.index))

30970


In [43]:
df = df.drop_duplicates()

In [44]:
print(len(df.index))

12381


### NaN values

In [45]:
df = df.dropna()

In [46]:
print(len(df.index))

12380


### Remove geopoints on distance less than 50

In [47]:
import numpy as np
from math import ceil
from scipy.spatial import distance
from haversine import haversine, Unit

In [48]:
min_distance = 50

print(len(df.index))

count = 1000
iteration_count = ceil(len(df.index) / count)
idx_out = list()

for i in range(iteration_count):
    lower_bound = i * count
    mini_df = df.iloc[lower_bound:lower_bound + count,0:2]
    coords = mini_df.values

    distance_matrix = distance.cdist(coords, coords, lambda u, v: haversine(u, v, Unit.METERS)) # haversine(lat, lon, Unit.Meters)
    n_proxim = np.apply_along_axis(lambda x: np.count_nonzero(x < min_distance), 0, distance_matrix)
    idx = np.argsort(n_proxim).tolist()

    for i in idx:
        for j in range(i+1):
            if i not in idx_out:
                if distance_matrix[i, j] < min_distance:
                    if i != j:
                        idx_out.append(j)

pop_idx = sorted(np.unique(idx_out).tolist(), reverse=True)
df = df.drop(df.index[pop_idx])

print(len(df.index))

12380
11402


### Outliers detection

In [49]:

import plotly.express as px

fig = px.scatter(x=df['latitude'], y=df['longitude'])
fig.show()


Kvantili

In [50]:
def detect_outliers(df_in,cols):
    df = df_in.copy(deep=True)
    df['outlier'] = 0
    for col in cols:
        Q1=df[col].quantile(0.25)
        Q3=df[col].quantile(0.75)
        IQR=Q3-Q1
        Lower_Whisker = Q1-1.5*IQR
        Upper_Whisker = Q3+1.5*IQR
        df.loc[df[col]>Upper_Whisker,'outlier'] = 1
    return df

In [51]:
df_outl = detect_outliers(df,['longitude', 'latitude'])

In [58]:
df_outl = df_outl.loc[df_outl['outlier'] != 1]

fig = px.scatter(x=df_outl['latitude'], y=df_outl['longitude'])
fig.show()

Robust Mahalanobis distance

In [54]:
import copy 
import scipy as sp
from scipy.stats import chi2
from sklearn.covariance import MinCovDet

#Robust Mahalonibis Distance
def robust_mahalanobis_method(df):
    #Minimum covariance determinant
    rng = np.random.RandomState(0)
    real_cov = np.cov(df.values.T)
    X = rng.multivariate_normal(mean=np.mean(df, axis=0), cov=real_cov, size=506)
    cov = MinCovDet(random_state=0).fit(X)
    mcd = cov.covariance_ #robust covariance metric
    robust_mean = cov.location_  #robust mean
    inv_covmat = sp.linalg.inv(mcd) #inverse covariance metric
    
    #Robust M-Distance
    x_minus_mu = df - robust_mean
    left_term = np.dot(x_minus_mu, inv_covmat)
    mahal = np.dot(left_term, x_minus_mu.T)
    md = np.sqrt(mahal.diagonal())
    
    #Flag as outlier
    outlier = []
    C = np.sqrt(chi2.ppf((1-0.001), df=df.shape[1]))
    for index, value in enumerate(md):
        if value > C:
            outlier.append(index)
        else:
            continue
    return outlier, md

In [55]:
outliers, md_rb = robust_mahalanobis_method(df)

In [59]:
df_outl2 = df.drop(index=df.index[outliers])

fig = px.scatter(x=df_outl2['latitude'], y=df_outl2['longitude'])
fig.show()