# Compare geocoding results with curated geopositions
- Uses sample site descriptions from the BioTIME database and the mordecai geoparser
- Calculates the geodesic distance between the geocoded position and the curated latitude/longitude coordinates

Ensure that the Python helper scripts are reloaded dynamically

In [1]:
# Load IPython's autoreload extension.
%load_ext autoreload
# Mode 2: reload imported modules before each cell runs, so edits to the local
# helper scripts take effect without restarting the kernel.
%autoreload 2

Set up mordecai's geoparser

In [2]:
from mordecai import Geoparser
# Single geoparser instance shared by all geocoding cells in this notebook.
# NOTE(review): instantiation loads TensorFlow-backed models (see the log output
# of this cell) and presumably needs mordecai's Geonames backend to be
# reachable — confirm the environment setup before running.
geo = Geoparser()

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.


Load examples from BioTIME via a Python helper

In [3]:
import biotime_examples

Show all examples

In [4]:
# The keys of the sample-site mapping are the study names ("<authors> - <year>").
biotime_examples.biotime_sample_sites.keys()

dict_keys(['Anderson et al. - 2011', 'Farah et al. - 2014', 'Holmes et al. - 1986', 'Rocha et al. - 2017', 'Thomsen et al. - 2016', 'Woods - 2009'])

Prepare a text for geocoding

In [5]:
# Look up the free-text site description of one study by its study name.
text = biotime_examples.biotime_sample_sites['Anderson et al. - 2011']
# Bare expression: displayed as the cell output.
text

'This historical data set consists of 44 permanent 1-m 2 quadrats located on northern mixed prairie in Miles City, Montana, USA.'

Geocode the entire text and show the resulting list of locations


In [6]:
# geoparse() yields one dict per recognised place name: the matched word, its
# character spans in the text, the predicted country with a confidence, and the
# matched entry under 'geo' (lat/lon as strings, geonameid, feature codes).
geocoding_res = geo.geoparse(text)
geocoding_res

[{'word': 'Miles City',
  'spans': [{'start': 102, 'end': 112}],
  'country_predicted': 'USA',
  'country_conf': 0.999931,
  'geo': {'admin1': 'Montana',
   'lat': '46.40834',
   'lon': '-105.84056',
   'country_code3': 'USA',
   'geonameid': '5666176',
   'place_name': 'Miles City',
   'feature_class': 'P',
   'feature_code': 'PPLA2'}},
 {'word': 'Montana',
  'spans': [{'start': 114, 'end': 121}],
  'country_predicted': 'USA',
  'country_conf': 0.9991698,
  'geo': {'admin1': 'Montana',
   'lat': '47.00025',
   'lon': '-109.75102',
   'country_code3': 'USA',
   'geonameid': '5667009',
   'place_name': 'Montana',
   'feature_class': 'A',
   'feature_code': 'ADM1'}},
 {'word': 'USA',
  'spans': [{'start': 123, 'end': 126}],
  'country_predicted': 'USA',
  'country_conf': 0.999931,
  'geo': {'admin1': 'California',
   'lat': '34.00474',
   'lon': '-117.33588',
   'country_code3': 'USA',
   'geonameid': '7195491',
   'place_name': 'Sgi-Usa Riverside Community Center',
   'feature_class': '

Output the actual (curated) geolocation of the study from BioTIME.

In [7]:
# Curated reference position of the study, returned as a (lat, lon) tuple.
actual_location = biotime_examples.geocoordinates_from_study('Anderson et al. - 2011')
actual_location

(46.316667, -105.8)

Infer the country of the sampling site

In [8]:
# Country inference only: for each detected place name this returns the
# predicted country, its confidence and the features behind the prediction.
geo.infer_country(text)

[{'text': 'Miles City',
  'label': 'United States',
  'word': 'Miles City',
  'spans': [{'start': 102, 'end': 112}],
  'features': {'maj_vote': 'USA',
   'word_vec': 'USA',
   'first_back': 'USA',
   'most_alt': 'USA',
   'most_pop': 'USA',
   'ct_mention': 'USA',
   'ctm_count1': 2,
   'ct_mention2': '',
   'ctm_count2': 0,
   'wv_confid': 17.08477783203125,
   'class_mention': 'P',
   'code_mention': ''},
  'country_predicted': 'USA',
  'country_conf': 0.999931,
  'all_countries': array(['USA'], dtype='<U3'),
  'all_confidence': array([0.999931], dtype=float32)},
 {'text': 'Montana',
  'label': 'United States',
  'word': 'Montana',
  'spans': [{'start': 114, 'end': 121}],
  'features': {'maj_vote': 'USA',
   'word_vec': 'USA',
   'first_back': 'ESP',
   'most_alt': 'USA',
   'most_pop': 'USA',
   'ct_mention': 'USA',
   'ctm_count1': 2,
   'ct_mention2': '',
   'ctm_count2': 0,
   'wv_confid': 46.95623016357422,
   'class_mention': '',
   'code_mention': ''},
  'country_predicted': '

Geocoding for another study

In [9]:
# Site description of a second study; printed (rather than displayed) so the
# long text appears before the geoparse result in the cell output.
example_rocha = biotime_examples.biotime_sample_sites["Rocha et al. - 2017"]
print(example_rocha)
# List of place-name matches found in the description, shown as the cell output.
computed_geolocation_rocha = geo.geoparse(example_rocha)
computed_geolocation_rocha

Fieldwork was conducted at the BDFFP, located *80 km north of Manaus (2°30 0 S, 60°W), Brazil (see Fig. S1 in the Online Supplementary Material). The area is classified as tropical moist forest, and is characterized by a mosaic of terra firme rainforest, secondary regrowth, and primary forest fragments. Annual rainfall varies from 1900 to 3500 mm, with a dry season between June and October . The forest fragments were isolated from CF by distances of 80-650 m in the early 1980s, and are categorized into size classes of 1, 10 and 100 ha. Each fragment was re-isolated on 3-4 occasions prior to this study, most recently between 1999 and 2001 (Laurance et al. 2011). The matrix is composed of tall secondary forest dominated mainly by Vismia spp. and Cecropia spp.


[{'word': 'Manaus',
  'spans': [{'start': 62, 'end': 68}],
  'country_predicted': 'BRA',
  'country_conf': 0.993103,
  'geo': {'admin1': 'Amazonas',
   'lat': '-3.04361',
   'lon': '-60.01282',
   'country_code3': 'BRA',
   'geonameid': '6319328',
   'place_name': 'Manaus',
   'feature_class': 'A',
   'feature_code': 'ADM2'}},
 {'word': 'Brazil',
  'spans': [{'start': 87, 'end': 93}],
  'country_predicted': 'BRA',
  'country_conf': 0.9998105,
  'geo': {'admin1': 'NA',
   'lat': '-10',
   'lon': '-55',
   'country_code3': 'BRA',
   'geonameid': '3469034',
   'place_name': 'Federative Republic of Brazil',
   'feature_class': 'A',
   'feature_code': 'PCLI'}}]

Extract the GPS location from the geocoding result

In [10]:
from utils import extract_geolocation
# Reduce the list of geoparse matches to a single (lat, lon) pair of floats.
# NOTE(review): for this study the result equals the coordinates of the first
# match ('Manaus'); the actual selection rule lives in utils — confirm there.
computed_location = extract_geolocation(computed_geolocation_rocha)
computed_location

(-3.04361, -60.01282)

Extract the GPS coordinates from the BioTIME data


In [11]:
# Curated (lat, lon) reference position for the Rocha et al. study.
actual_location_rocha = biotime_examples.geocoordinates_from_study("Rocha et al. - 2017")
actual_location_rocha

(-2.386381, -59.918769)

Calculate the geodesic distance (in km) between the extracted and the actual geolocation

In [12]:
import utils
# Distance between the geocoded and the curated position. Both arguments are
# (lat, lon) tuples; the evaluation loop in this notebook labels the value as km.
utils.geodesic_distance(computed_location, actual_location_rocha)

73.42287288501343

Compute the distance between the geocoded and the actual location for all examples


In [13]:
# Evaluate the geocoder on every sample study: geoparse the site description,
# reduce the matches to one coordinate pair and report how far that pair lies
# from the curated BioTIME position.
for study_name, study_text in biotime_examples.biotime_sample_sites.items():
    print(f"Study: {study_name}")
    print(f"Text: {study_text}")

    # Raw geoparse matches for this description.
    geolocation_result = geo.geoparse(study_text)
    print(f"geo object: {geolocation_result}")

    # Single coordinate pair derived from the matches.
    extracted_location = utils.extract_geolocation(geolocation_result)
    print(f"extracted geolocation: {extracted_location}")

    # Unpack first so that anything other than a two-element result fails loudly.
    actual_lat, actual_lon = biotime_examples.geocoordinates_from_study(study_name)
    actual_location = (actual_lat, actual_lon)
    distance_km = utils.geodesic_distance(extracted_location, actual_location)
    print(f"distance in km: {distance_km}")
    print()

Study: Anderson et al. - 2011
Text: This historical data set consists of 44 permanent 1-m 2 quadrats located on northern mixed prairie in Miles City, Montana, USA.
geo object: [{'word': 'Miles City', 'spans': [{'start': 102, 'end': 112}], 'country_predicted': 'USA', 'country_conf': 0.999931, 'geo': {'admin1': 'Montana', 'lat': '46.40834', 'lon': '-105.84056', 'country_code3': 'USA', 'geonameid': '5666176', 'place_name': 'Miles City', 'feature_class': 'P', 'feature_code': 'PPLA2'}}, {'word': 'Montana', 'spans': [{'start': 114, 'end': 121}], 'country_predicted': 'USA', 'country_conf': 0.9991698, 'geo': {'admin1': 'Montana', 'lat': '47.00025', 'lon': '-109.75102', 'country_code3': 'USA', 'geonameid': '5667009', 'place_name': 'Montana', 'feature_class': 'A', 'feature_code': 'ADM1'}}, {'word': 'USA', 'spans': [{'start': 123, 'end': 126}], 'country_predicted': 'USA', 'country_conf': 0.999931, 'geo': {'admin1': 'California', 'lat': '34.00474', 'lon': '-117.33588', 'country_code3': 'USA', 'geo