In [1]:
pip install pyinaturalist

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
from pyinaturalist import *
import math
import numpy as np

In [3]:
def download_data(place_id, taxon_id=472290, per_page=50, quality_grade="research"):
    '''
    Download every observation matching the query for one place.

    The observations function only returns one page of results per call (to
    avoid exceeding the rate limit in the API Recommended Practices), so the
    first response is used to read the total number of results and the
    remaining pages are then requested one by one and joined into one list.

    Place ids used in this notebook:
    if working with Mexico then: place_id = 6793
    if working with Colombia then: place_id = 7196

    Parameters
    ----------
    place_id : value forwarded to get_observations as ``place_id``.
    taxon_id : taxon to query; defaults to 472290, the value this notebook
        has always used (presumably Triatominae - TODO confirm).
    per_page : number of observations requested per page (default 50).
    quality_grade : quality filter forwarded to get_observations.

    Returns
    -------
    (observations, number_of_observations) where ``observations`` is the list
    of observation dictionaries from all pages and ``number_of_observations``
    is the 'total_results' value reported by the first response.

    Raises
    ------
    ValueError if ``per_page`` is smaller than 1.
    '''
    if per_page < 1:
        raise ValueError("per_page must be at least 1")

    # Build the query once so every page is requested with identical filters.
    query = dict(place_id=place_id, subview='map', taxon_id=taxon_id,
                 per_page=per_page, quality_grade=quality_grade)

    #finding the number of times that we need to loop through
    first_page = get_observations(**query)
    number_of_observations = first_page['total_results'] #int object
    iterations = math.ceil(number_of_observations / per_page)

    # Copy the first page so the response dictionary itself is not modified
    # while the remaining pages are appended.
    observations = list(first_page['results'])

    #creating one list of observations (page 1 is already in the list)
    for page in range(2, iterations + 1):
        next_page = get_observations(page=page, **query)
        observations.extend(next_page['results'])
    print("observation #: " + str(number_of_observations))

    return observations, number_of_observations


In [4]:
#at this point we have a list of dictionaries, each dictionary carrying information about an observation. this function gets a list of the important information from one dictionary
def get_data(data):
    """
    Reduce one observation dictionary to a flat list of the values of interest.

    The values for 'id', 'species_guess' and 'positional_accuracy' are read
    with ``data.get`` (so a missing key gives None). The 'location' entry is
    split into separate trailing entries; the rest of this notebook labels
    those two entries 'latitude' and 'longitude', in that order.

    Parameters
    ----------
    data : dict-like observation; only its ``get`` method is used.

    Returns
    -------
    list : [id, species_guess, positional_accuracy, <location entries...>].
        When the location is missing, None or an empty string, two None
        placeholders are used so that every row keeps the same width.
    """
    columns = ['id', 'species_guess', 'positional_accuracy', 'location']
    *items, location = map(data.get, columns)

    if location is None:
        # No coordinates available: keep the row width with placeholders
        # instead of failing on `list += None`.
        coordinates = [None, None]
    elif isinstance(location, str):
        # NOTE(review): a comma-separated "lat,lon" string is handled in case
        # the location arrives unparsed - confirm against the client output.
        # Extending a list with a string would add one entry per character.
        text = location.strip()
        coordinates = [float(part) for part in text.split(',')] if text else [None, None]
    else:
        # Usual case: location is already a sequence of coordinates.
        coordinates = list(location)

    return items + coordinates


#reshaping the array so that it is like a dataframe
def reshaped_array(observations_reduced, number_of_observations, columns, place_id):
    """
    Turn the per-observation value lists into a 2-D array plus column names.

    Parameters
    ----------
    observations_reduced : sequence of equally long value lists, one per
        observation (as produced by get_data).
    number_of_observations : kept for backward compatibility. The number of
        rows is taken from ``observations_reduced`` itself, because that is
        the data actually being reshaped and the reported total may differ.
    columns : names of the requested columns. The list is not modified.
    place_id : kept for backward compatibility; no longer needed now that
        the row count always comes from the data.

    Returns
    -------
    (columns_final, observations_array) : the column names matching the array
    and the array of shape (rows, values per row). When each row has one value
    more than ``columns`` (the location was split in two), 'location' is
    replaced by trailing 'latitude' and 'longitude' names.

    Raises
    ------
    ValueError if there are no rows, if the row width fits neither layout, or
    if 'location' is not among ``columns`` when it has to be replaced.
    """
    row_count = len(observations_reduced)
    if row_count == 0:
        raise ValueError('error - there are no observations to reshape')

    values_per_row = len(observations_reduced[0])
    if values_per_row == len(columns):
        # Rows already line up with the requested columns.
        columns_final = list(columns)
    elif values_per_row == len(columns) + 1:
        # Work on a copy so the caller's list keeps its 'location' entry.
        columns_final = list(columns)
        columns_final.remove('location')
        columns_final.append('latitude')
        columns_final.append('longitude')
    else:
        raise ValueError('error - check the format of the data in the columns you have chosen')

    # Flatten all rows into one long vector, then cut it back into rows.
    obs_decomposed = np.hstack([*observations_reduced])
    observations_array = obs_decomposed.reshape(row_count, values_per_row)
    return columns_final, observations_array

In [6]:
def main(countries=None, place_ids=None, output_dir=None):
    """
    Download, reshape and export the observations for each configured place.

    For every (country, place id) pair the observations are downloaded,
    reduced to the columns of interest, put in a DataFrame, given a constant
    'subFamily' column and renamed columns to match the gbif data, and written
    to '<output_dir>/inaturalist_observations_<country>_matching_gbif.csv'.

    Parameters
    ----------
    countries : labels used in the output file names; defaults to
        ["Mexico", "Colombia", "Texas"].
    place_ids : place ids to download, in the same order as ``countries``;
        defaults to [6793, 7196, 18].
    output_dir : directory the CSV files are written to; defaults to the
        directory this notebook has always used.

    Raises
    ------
    ValueError if ``countries`` and ``place_ids`` differ in length.
    """
    import os  # local import so this cell does not depend on the import cell changing

    if countries is None:
        countries = ["Mexico", "Colombia", "Texas"]
    if place_ids is None:
        place_ids = [6793, 7196, 18]
    # countries = ["Mexico_and_texas_border"]
    # place_ids = ["any"]
    if output_dir is None:
        output_dir = '/Users/laurengomezcullen/Documents/Cambridge/Fourth/Project/final/data/possible_kissing_bug_data/inaturalist/'
    if len(countries) != len(place_ids):
        raise ValueError("countries and place_ids must have the same length")

    # Pair each label with its id directly instead of looking the label up by
    # the id's position, which picks the wrong label if an id is repeated.
    for country, place_id in zip(countries, place_ids):
        print(place_id)
        observations, number_of_observations = download_data(place_id)
        print("observations vector length: " + str(len(observations)))

        if not observations:
            # Nothing to reshape or export; carry on with the remaining places.
            print("no observations returned for place " + str(place_id) + " - skipping")
            continue

        #select the columns of interest, unhashtag the first line to see the options
        #print(np.array(list(observations[0].keys())))
        columns = ['id', 'species_guess', 'positional_accuracy', 'location']

        #get the reduced data: one flat list of values per observation
        observations_reduced = [get_data(observation) for observation in observations]
        print("observations_reduced_length: " + str(len(observations_reduced)))

        #reshape the array
        columns_final, observations_array = reshaped_array(observations_reduced, number_of_observations, columns, place_id)

        #creating the dataframe
        observations_df = pd.DataFrame(observations_array, columns = columns_final)

        #adapt to match the gbif data
        observations_df['subFamily'] = 'Triatominae'
        new_columns = {'positional_accuracy' : 'coordinateUncertaintyInMeters', 'species_guess': 'scientificName'}
        observations_matching = observations_df.rename(columns = new_columns)

        #write to a csv
        output_path = os.path.join(output_dir, 'inaturalist_observations_' + country + '_matching_gbif.csv')
        observations_matching.to_csv(output_path, index = False )

In [7]:
# Run the whole pipeline: download, reshape and write one CSV per configured place.
main()

6793
observation #: 387
observations vector length: 387
observations_reduced_length: 387
7196
observation #: 31
observations vector length: 31
observations_reduced_length: 31
18
observation #: 657
observations vector length: 657
observations_reduced_length: 657


In [11]:
#concatenate the texas and mexico ones
inaturalist_dir = '/Users/laurengomezcullen/Documents/Cambridge/Fourth/Project/final/data/possible_kissing_bug_data/inaturalist/'
df_mexico = pd.read_csv(inaturalist_dir + 'inaturalist_observations_Mexico_matching_gbif.csv')
df_texas = pd.read_csv(inaturalist_dir + 'inaturalist_observations_Texas_matching_gbif.csv')

# ignore_index gives the combined frame one continuous index instead of
# repeating the row labels of the two inputs.
df_texas_and_mexico = pd.concat([df_mexico, df_texas], ignore_index=True)

# index = False keeps this file in the same layout as the per-country files,
# which are also written without an index column.
df_texas_and_mexico.to_csv(inaturalist_dir + 'inaturalist_observations_Mexico_and_texas_matching_gbif.csv', index = False)

# Row count of the combined data as a quick sanity check (cell output).
len(df_texas_and_mexico)

1044


In [9]:
# #Add-on for the Texas-and-border data, which I clipped in QGIS from the combined Mexico and Texas data written above. The clipping could instead be done in this notebook using the PyQGIS package.

# observations_df = pd.read_csv('/Users/laurengomezcullen/Documents/Cambridge/Fourth/Project/final/data/possible_kissing_bug_data/inaturalist/inaturalist_observations_mexico_border_and_texas.csv')
# #adapt to match the gbif data
# observations_df['subFamily'] = 'Triatominae'
# new_columns = {'positional_accuracy' : 'coordinateUncertaintyInMeters', 'species_guess': 'scientificName'}
# observations_matching = observations_df.rename(columns = new_columns)

# #write to a csv
# observations_matching.to_csv('/Users/laurengomezcullen/Documents/Cambridge/Fourth/Project/final/data/possible_kissing_bug_data/inaturalist/inaturalist_observations_mexico_border_and_texas_matching_gbif.csv', index = False )
