In [1]:
#import packages
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
from requests import get
import math

### Documentation of the api used to fetch altitudes (GET API): https://github.com/Jorl17/open-elevation

In [29]:
#create a training dataset for Florida species over the US
fl_train=train.loc[train['species_present'].isin(species_fl)].copy().drop('Unnamed: 0.1', axis=1)
print(len(fl_train))
fl_train.to_csv('train_USA_FLspecies.csv')

34815


In [10]:
#load train US
train=pd.read_csv('train_USA_FLspecies.csv')
old_fl=pd.read_csv('habitats_fl.csv')
species_fl=list(set(train.loc[train['state']=='Florida']['species_present'].tolist()))
len(species_fl)

36

In [19]:
#reverse geocode method is more precise than the boundaries we used arbitrarily
print(len(train.loc[train['state']=='Florida']))
print(len(fl))

3490
3546


In [2]:
#load test US
test=pd.read_csv('test_USA.csv')

In [3]:
#elevation retrieval function
# script for returning elevation from lat, long, based on open elevation data
# which in turn is based on SRTM
def get_elevation(lat = None, long = None):
    '''
        A custom script designed to retrieve elevation data in meters based on latitude and longitude coordinates.
    '''
    if lat is None or long is None: return None
    
    query = ('https://api.open-elevation.com/api/v1/lookup'
             f'?locations={lat},{long}')
    
    # Making a request with a 20-second timeout limit for sluggish responses
    r = get(query, timeout = 20)

    # Parsing the JSON response only for status codes 200 or 201
    if r.status_code == 200 or r.status_code == 201:
        elevation = pd.json_normalize(r.json(), 'results')['elevation'].values[0]
    else: 
        elevation = None
    return elevation

## Getting elevation data with a GET API loop

In [32]:
#Get a full list of altitudes from the Florida training dataset
ids=train.index.tolist()
altitudes=[]


for i in tqdm(ids,position=0, leave=True):
    lat,lon=train['Lat'][i],train['Lon'][i]
    alt=get_elevation(lat,lon)
    altitudes.append(alt)

100%|██████████| 34815/34815 [1:40:39<00:00,  5.76it/s]   


In [41]:
#retry for empty indicies
empty=[i for i in range(len(altitudes)) if altitudes[i] == None]
print(f'{len(empty)} gaps pending...\n')

for i in tqdm(empty):
    locate=ids[i]
    lat,lon=train['Lat'][locate],train['Lon'][locate]
    alt=get_elevation(lat,lon)
    altitudes[i]=alt
    #if altitudes[i]!=None:
    #    print(f'Succesful replacement at id {i}.')
    #else:
    #    print(f'Failed replacement at id {i}.')

25 gaps pending...



100%|██████████| 25/25 [00:02<00:00,  8.85it/s]


## Save results

In [8]:
#update the dataset
train['Alt']=altitudes

In [9]:
#save the data
train.to_csv('train_USA_FLspecies.csv')

In [21]:
len(altitudes)

21244

In [12]:
altitudes=list(train['Alt'])
empty=[i for i in range(len(altitudes)) if math.isnan(altitudes[i])]

In [13]:
empty

[]

## Adjusting values not returned

In [26]:
#test a set of coordinates
print(f'Coordinates: {test["Lat"][ids[84]]} , {test["Lon"][ids[84]]}.')
print(f'Altitude: {get_elevation(test["Lat"][ids[84]],test["Lon"][ids[84]])}')

Coordinates: 14.617134 , -174.0137.
Altitude: None


In [4]:
#missing coordinates found a day later, reload the dataframe
missing=pd.read_csv("train_USA_FLspecies.csv")
altitudes=list(missing['Alt'])

In [7]:
#retry for empty indicies (nan method)
ids=train.index.tolist()
#empty=[i for i in range(len(altitudes)) if math.isnan(altitudes[i])] #only for the first run
empty=[i for i in range(len(altitudes)) if altitudes[i] == None]
print(f'{len(empty)} gaps pending...\n')

mod=0.00001 #set this to zero if you don't want to slightly shift coordinates to get an approximate result

for i in empty:
    locate=ids[i]
    lat,lon=train['Lat'][locate]-mod,train['Lon'][locate]-2*mod #the minus term is used to slightly modify the coordinates in order to get a response from the api
    alt=get_elevation(lat,lon)
    altitudes[i]=alt
    if altitudes[i]!=None:
        print(f'Succesful replacement at id {i}.')
    else:
        print(f'Failed replacement at id {i}.')

1 gaps pending...

Succesful replacement at id 22848.


## Assign elevation levels

In [2]:
#load test and train
train=pd.read_csv('train_USA_FLspecies.csv')
test=pd.read_csv('test_USA.csv')

In [20]:
tr_alt=list(train['Alt'])
te_alt=list(test['Alt'])
print(min(tr_alt),max(tr_alt))
print(min(te_alt),max(te_alt))
print('#####')
tr_low=[x for x in tr_alt if 0 < x < 25]
te_low=[x for x in te_alt if 0 < x < 25]
print(len(tr_low))
print(len(tr_low))

-82.0 4149.0
-71.0 3947.0
#####
7659
7659


#### Details of chosen elevation levels
0. **Below 0.0:** sub-sea level

1. **Between 0.0 and 25.0:** coastal lowlands and plains Chapter 21-Ecological subregions of the United 
States (capped by max altitude in Western Florida coastal lowlands)

2. **Between 25 and 100:** lowlands (and the rest of the levels: mainly inspired by chapter 33 and the book "The Biology of Alpine Habitats")

3. **Between 100 and 500:** foothill zones 

4. **Between 500 and 1000:** uphill zones 

5. **Between 1000 and 1500:** lower montane 

6. **Between 1500 and 2000:** montane 

7. **Between 2000 and 2500:** upper montane 

8. **Between 2500 and 3000:** lower alpine 

9. **3000 and above:** alpine

In [20]:
#extract altitudes
tr_alt=list(train['Alt'])
te_alt=list(test['Alt'])

In [19]:
#filter and assign levels
def alt_zonation(alt):
    zones=alt.copy()
    for i in range(len(alt)):
        if alt[i] < 0.0:
            zones[i]=0
        elif 0.0 <= alt[i] < 25.0:
            zones[i]=1
        elif 25.0 <= alt[i] < 100.0:
            zones[i]=2
        elif 100.0 <= alt[i] < 500.0:
            zones[i]=3
        elif 500.0 <= alt[i] < 1000.0:
            zones[i]=4
        elif 1000.0 <= alt[i] < 1500.0:
            zones[i]=5
        elif 1500.0 <= alt[i] < 2000.0:
            zones[i]=6
        elif 2000.0 <= alt[i] < 2500.0:
            zones[i]=7
        elif 2500.0 <= alt[i] < 3000.0:
            zones[i]=8
        elif 3000.0 <= alt[i]:
            zones[i]=9
            
    return zones

In [23]:
#add elevation level columns to the data
tr_zone=alt_zonation(tr_alt)
te_zone=alt_zonation(te_alt)
train['Alt_zone']=tr_zone
test['Alt_zone']=te_zone

In [32]:
#check all levels exist in the data
for i in range(10):
    print(i in list(test['Alt_zone']))

True
True
True
True
True
True
True
True
True
True


In [38]:
#save US-wide florida species train data

#train.to_csv('train_USA_FLspecies.csv')
#test.to_csv('test_USA.csv')

### Assign altitude zonation to the full test data


In [2]:
test=pd.read_csv('predictable_coordinates_USA.csv')
full=pd.read_csv('test_USA_full.csv')

In [138]:

#debugging a decimal issue: I don't know why but I have more decimal places in the full set, which means I have to find rounded approximates
#fixed, it seems the biggest innate level of rounding in test_USA in 3 (longitude), so let's try to use that
#rounding errors, simplifying to 2 decimals
#again... 1 decimal!

full_zones=[]

for i in tqdm(range(len(full)),leave=True,position=0):
    lat=round(full['Lat'][i],1)
    lon=round(full['Lon'][i],1)
    test_cords=round(test[['Lat','Lon']],1).to_numpy().tolist()
    ids=[i for i in range(len(test_cords)) if test_cords[i]==[lat,lon]]
    if len(ids) == 0 or len(ids) > 1:
        print(f'Warning! Error encountered when matching coordinates at id {i}.')
        break
    full_zones.append(test['Alt_zone'][ids[0]])

print(len(full_zones)==len(full))

100%|██████████| 141725/141725 [1:28:25<00:00, 26.71it/s]

True





In [3]:
#have the coordinates match
test_cords=test[['Lat','Lon']].to_numpy().tolist()
rounded_test_cords=round(test[['Lat','Lon']],1).to_numpy().tolist()
full_lats=[]
full_lons=[]
for i in tqdm(range(len(full)),leave=True,position=0):
    lat=round(full['Lat'][i],1)
    lon=round(full['Lon'][i],1)
    
    ids=[i for i in range(len(test_cords)) if rounded_test_cords[i]==[lat,lon]]
    
    if len(ids) == 0 or len(ids) > 1:
        print(f'Warning! Error encountered when matching coordinates at id {i}.')
        break
    
    full_lats.append(test_cords[ids[0]][0])
    full_lons.append(test_cords[ids[0]][1])

print(len(full_lats)==len(full_lons) and len(full_lats)==len(full)))

100%|██████████| 141725/141725 [06:04<00:00, 389.30it/s]


TypeError: object of type 'bool' has no len()

In [9]:
print(len(full_lats)==len(full_lons) and len(full_lats)==len(full))
full['Lat']=full_lats
full['Lon']=full_lons

True


In [16]:
full.to_csv('test_USA_full.csv',index=False)

In [137]:
#investigate specific coordinates
lat=round(full['Lat'][422],1)
lon=round(full['Lon'][422],1)
test_cords=round(test[['Lat','Lon']],1).to_numpy().tolist()

ids=[i for i in range(len(test_cords)) if test_cords[i]==[lat,lon]]
print(lat,lon,ids)


38.0 -83.3 [425]


### Minor dataset adjustments and addition of 'presence' column for the test data

In [139]:
full['Alt_zone']=full_zones

In [140]:
full.to_csv('test_USA_full.csv')

In [2]:
full=pd.read_csv('test_USA_full.csv')

In [14]:
full.to_csv('test_USA_full.csv')