In [46]:
from collections import defaultdict
from datetime import datetime, date, time, timedelta
import json
import warnings
import pandas as pd
import geopandas as gpd
from shapely import wkt
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib.ticker import MultipleLocator
import matplotlib.dates as mdates

from sklearn.cluster import DBSCAN

sns.set_theme()
plt.style.use('Solarize_Light2')

In [95]:
ALL_IDS = pd.read_csv('idents_expanded.csv')
ALL_IDS.insert(3, 'datetime', pd.to_datetime(ALL_IDS['date']))

standard_times = ALL_IDS['datetime'].apply(lambda x: (x - x.utcoffset()).replace(tzinfo=None))
ALL_IDS['datetime'] = standard_times
ALL_IDS['date'] = standard_times.apply(lambda x: x.date())
ALL_IDS.insert(5, 'time', standard_times.apply(lambda x: x.time()))

# ALL_IDS['coordinates'] = ALL_IDS['coordinates'].apply(lambda test: [float(x) for x in test.strip('[]').split(', ')])
ALL_IDS['places'] = ALL_IDS['places'].apply(lambda test: [int(x) for x in test.strip('[]').split(', ')])
ALL_IDS.sort_values(by='date', inplace=True)
ALL_IDS.head()

Unnamed: 0,observation,identifier,username,datetime,date,time,taxon_id,taxon,rank,rank_level,previous_taxon_id,current,disagreement,category,vision,hidden,latitude,longitude,places
7,5607,357,annetanne,2010-02-19 09:26:16,2010-02-19,09:26:16,51702,Coccinella septempunctata,species,10,,True,,improving,False,False,-122.173187,37.492584,"[1, 14, 1919, 9853, 50422, 51186, 54321, 59613..."
10,5783,308,triplep220,2010-02-28 08:37:33,2010-02-28,08:37:33,48486,Coccinellidae,family,30,,True,,supporting,False,False,-121.786812,37.15065,"[1, 14, 1250, 3060, 9853, 51330, 54321, 59613,..."
18,12381,477,loarie,2011-03-07 07:24:46,2011-03-07,07:24:46,48486,Coccinellidae,family,30,,True,,improving,False,False,-122.581106,37.897059,"[1, 14, 2319, 5603, 9853, 50422, 54321, 59613,..."
33,18740,642,biosam,2011-05-29 18:14:10,2011-05-29,18:14:10,48486,Coccinellidae,family,30,,True,,improving,False,False,-122.811806,38.377495,"[1, 14, 2764, 9853, 50422, 54321, 54678, 59613..."
36,18741,642,biosam,2011-05-29 18:29:36,2011-05-29,18:29:36,48486,Coccinellidae,family,30,,True,,improving,False,False,-122.811806,38.377495,"[1, 14, 2764, 9853, 50422, 54321, 54678, 59613..."


In [97]:
GEO_IDS = gpd.GeoDataFrame(ALL_IDS, geometry=gpd.points_from_xy(ALL_IDS['latitude'], ALL_IDS['longitude']))
GEO_IDS['geometry'] = GEO_IDS['geometry'].set_crs("WGS84")
GEO_IDS.iloc[0]

observation                                                       5607
identifier                                                         357
username                                                     annetanne
datetime                                           2010-02-19 09:26:16
date                                                        2010-02-19
time                                                          09:26:16
taxon_id                                                         51702
taxon                                        Coccinella septempunctata
rank                                                           species
rank_level                                                          10
previous_taxon_id                                                  NaN
current                                                           True
disagreement                                                       NaN
category                                                     improving
vision

In [18]:
TAXA = pd.read_json('observations/taxa.json', orient='index', dtype={'ancestor_ids': tuple})
TAXA = TAXA.drop(['current_synonymous_taxon_ids', 'is_active', 'created_at', 'endemic', 'threatened', 'native', 'introduced'], axis='columns').set_index('id')
TAXA.head()

Unnamed: 0_level_0,ancestor_ids,name,preferred_common_name,rank,rank_level,atlas_id,observations_count,complete_species_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
333790,"[48460, 1, 47120, 372739, 47158, 184884, 47208...",Coccinellinae,Common Lady Beetles,subfamily,27,,531614,
48484,"[48460, 1, 47120, 372739, 47158, 184884, 47208...",Harmonia axyridis,Asian Lady Beetle,species,10,1241.0,196974,
124431,"[48460, 1, 47120, 372739, 47158, 184884, 47208...",Cycloneda sanguinea,Spotless Lady Beetle,species,10,28574.0,13664,
48487,"[48460, 1, 47120, 372739, 47158, 184884, 47208...",Harmonia,Greater Lady Beetles,genus,20,,207464,
354799,"[48460, 1, 47120, 372739, 47158, 184884, 47208...",Henosepilachna vigintioctopunctata,Hadda Beetle,species,10,28708.0,2935,


In [44]:
# the taxon IDs go from highest to lowest, and include self as the last ID

def taxon_dist(x, y):
    dist = 0
    i = -1
    # TODO: fold subspecies/etc into species
    if x['rank_level'] == y['rank_level']:
        print(x['name'], "==", y['name'])
        print(x['ancestor_ids'], "\n", y['ancestor_ids'])
        while x['ancestor_ids'][i] != y['ancestor_ids'][i]:
            i -= 1
            dist += 1
        return dist
    elif x['rank_level'] > y['rank_level']: # larger number = higher in tree
        print(x['name'], ">", y['name'])
        print(x['ancestor_ids'], "\n", y['ancestor_ids'])
        for yanc in reversed(y['ancestor_ids']):
            if yanc in x['ancestor_ids']:
                return dist
            dist += 1
    else: 
        print(x['name'], "<", y['name'])
        print(x['ancestor_ids'], "\n", y['ancestor_ids'])
        for xanc in reversed(x['ancestor_ids']):
            if xanc in y['ancestor_ids']:
                return dist
            dist += 1

taxon_dist(TAXA.loc[124431], TAXA.loc[354096])

Cycloneda sanguinea < Scymninae
[48460, 1, 47120, 372739, 47158, 184884, 47208, 71130, 372852, 471714, 48486, 333790, 333796, 55490, 124431] 
 [48460, 1, 47120, 372739, 47158, 184884, 47208, 71130, 372852, 471714, 48486, 354096]


4

In [None]:

X = np.array([]).T
db = DBSCAN(eps=0.1, min_samples=4).fit(X)