# Parse additional xeno-canto (xc) metadata from the downloaded page.html files
- e.g. remarks, background, elevation
- Final code ported to datasets.xc.downloaded_page_metadata

In [None]:
from notebooks import *

In [None]:
import requests_html
from potoo.util import strip_startswith

In [None]:
# Sample downloaded page.html file to run the parsing cells below against.
#   - The commented-out path is an alternate sample to switch to
#   - NOTE(review): `xc` is not imported explicitly in this notebook; presumably it comes from
#     `from notebooks import *` -- confirm
page_path = xc.data_dir / 'SPTO/126661/page.html'
# page_path = xc.data_dir / 'SPTO/413790/page.html'

In [None]:
# Load the saved page into a requests_html.HTML document so the cells below can query it with
# CSS selectors.
#   - Decode explicitly as UTF-8 rather than the platform's locale default, so non-ASCII text
#     parses the same on every machine
#     (NOTE(review): assumes the saved pages are UTF-8 -- confirm)
#   - Pass url as a str: requests_html declares url as a str and url-parses it for link
#     resolution, which fails on a path object
with open(page_path, encoding='utf-8') as f:
    page = requests_html.HTML(
        url=str(page_path),
        html=f.read(),
    )

In [None]:
# Accumulator for the parsed fields; '_raw' keeps the unparsed source values next to them
data = {'_raw': {}}

In [None]:
# Parse: xc_id, com_name, sci_name
#   - Source: the og:title meta tag, e.g. 'XC126661 Spotted Towhee (Pipilo maculatus)'
#   - All three fields are None when the tag/content is missing or the title doesn't have the
#     expected 'XC<id> <common name> (<scientific name>)' shape
title = page.find('meta[property="og:title"]', first=True)
if title:
    title = title.attrs.get('content')
data['_raw']['title'] = title
# parse.parse returns None when the string doesn't match, so guard before reading .named
title_match = parse.parse('XC{xc_id} {com_name} ({sci_name})', title) if title else None
if title_match is None:
    data['xc_id'] = None
    data['com_name'] = None
    data['sci_name'] = None
else:
    data.update(title_match.named)
    data['xc_id'] = int(data['xc_id'])
# TODO assert data['xc_id'] == the_input_xc_id_we_loaded_the_page_file_for

In [None]:
# Parse: remarks, bird_seen, playback_used
#   - Source: the og:description meta tag, which holds the free-text remarks plus
#     'bird-seen:<value>' / 'playback-used:<value>' lines
#   - Ref: https://www.xeno-canto.org/upload/1/2
#   - Examples:
#       - '' [https://www.xeno-canto.org/420291]
#       - '\n\nbird-seen:no\n\nplayback-used:no' [https://www.xeno-canto.org/413790]
#       - 'About 20ft away in sagebrush steppe.\n\nbird-seen:yes\n\nplayback-used:no' [https://www.xeno-canto.org/418018]
description = page.find('meta[property="og:description"]', first=True)
if description:
    description = description.attrs.get('content')
data['_raw']['description'] = description
if description:
    lines = description.split('\n')
    keys = ['bird-seen', 'playback-used']
    # Tagged fields: the value is taken from the first line that starts with '<key>:'
    for k in keys:
        prefix = '%s:' % k
        field = k.replace('-', '_')
        data[field] = or_else(None, lambda: first(
            parse.parse(prefix + '{}', line)[0]
            for line in lines
            if line.startswith(prefix)
        ))
    # Remarks: every line that isn't one of the tagged lines, rejoined and trimmed
    tag_prefixes = tuple('%s:' % key for key in keys)
    data['remarks'] = '\n'.join(
        line for line in lines if not line.startswith(tag_prefixes)
    ).strip()
else:
    # No description tag/content: every derived field is None
    data.update(remarks=None, bird_seen=None, playback_used=None)

In [None]:
# Parse: all key-value pairs from #recording-data
#   - (Thanks XC for structuring this so well!)
#   - One entry per table row: first cell -> key (lowercased, spaces -> underscores),
#     second cell -> value, any further cells are ignored
recording_data = dict(
    (label.lower().replace(' ', '_'), value)
    for label, value, *_extra in (
        [td.text for td in tr.find('td')]
        for tr in page.find('#recording-data .key-value tr')
    )
)
# Keep the untouched strings under '_raw', then expose them as top-level fields
data['_raw']['recording_data'] = recording_data
data.update(recording_data)

In [None]:
# Clean up fields
#   - background: one entry per line of the raw cell, stripped, with the 'none' placeholder and
#     blank lines dropped; None when the page had no background field (instead of a KeyError)
#   - latitude, longitude, elevation: converted to numbers via or_else(None, ...)
#     (NOTE(review): or_else presumably returns the default when the thunk raises -- confirm)
background = data.get('background')
if background is None:
    data['background'] = None
else:
    data['background'] = [
        name
        for name in (part.strip() for part in background.split('\n'))
        if name and name != 'none'
    ]
data['latitude'] = or_else(None, lambda: float(data['latitude']))
data['longitude'] = or_else(None, lambda: float(data['longitude']))
data['elevation'] = or_else(None, lambda: parse.parse('{:g} m', data['elevation'])[0])
# data['sampling_rate'] = or_else(None, lambda: parse.parse('{:g} (Hz)', data['sampling_rate'])[0])
# data['bitrate_of_mp3'] = or_else(None, lambda: parse.parse('{:g} (bps)', data['bitrate_of_mp3'])[0])
# data['channels'] = or_else(None, lambda: parse.parse('{:g} (bps)', data['channels'])[0])

In [None]:
# Display the parsed result (as a shallow copy of data)
dict(data)

{
  '_raw': {
    'title': 'XC126661 Spotted Towhee (Pipilo maculatus)',
    'description':
      '2nd recording of this individual, which is No 1; an interior song type from an east slope Sierra "coastal" '
      'location (Olancha Pass Trailhead)\n\nbird-seen:yes\n\nplayback-used:no',
    'recording_data': {
      'recordist': 'Richard E. Webster',
      'date': '2000-06-17',
      'time': '05:00',
      'latitude': '36.2176',
      'longitude': '-118.04',
      'location': 'Sage Flats Road, Inyo, California',
      'country': 'United States',
      'elevation': '1800 m',
      'background': 'none',
      'length': '0-3(s)',
      'sampling_rate': '44100 (Hz)',
      'bitrate_of_mp3': '96000 (bps)',
      'channels': '1 (mono)',
      'type': 'song',
      'volume': 'decreasing',
      'speed': 'accelerating',
      'pitch': 'increasing',
      'number_of_notes': '>20',
      'variable': 'no'
    }
  },
  'background': [],
  'bird_seen': 'yes',
  'bitrate_of_mp3': '96000 (bps)',
  'c