forked from astropy/astroquery
-
Notifications
You must be signed in to change notification settings - Fork 2
/
build_species_table.py
78 lines (59 loc) · 2.1 KB
/
build_species_table.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# Licensed under a 3-clause BSD style license - see LICENSE.rst
"""
Support module for splatalogue.  Requires bs4 (with the html5lib parser), and
is therefore not intended for users / not part of the core package.
:author: Adam Ginsburg <adam.g.ginsburg@gmail.com>
"""
import json
import os
import requests
from astropy.config import paths
from . import conf
def data_path(filename: str) -> str:
    """
    Build the path at which a data file is stored.

    The path points into the ``data`` directory located next to this module,
    i.e. inside the astroquery source tree rather than the astropy cache
    directory, because the existence of the file is a prerequisite for
    performing queries.  Nothing is created on disk by this function.

    Parameters
    ----------
    filename : str
        Name of the file (generally should be splat-species.json)

    Returns
    -------
    str
        Full path to ``filename`` inside the module's ``data`` directory
    """
    # Anchor on this module's own location so the result does not depend on
    # the current working directory.
    return os.path.join(os.path.dirname(__file__), 'data', filename)
def get_json_species_ids(outfile='splat-species.json', base_url=conf.base_url):
    """
    Scrape the NRAO Splatalogue species selector form and store it as JSON.

    The page ``{base_url}/b.php`` is downloaded and parsed with BeautifulSoup
    (``html5lib`` parser).  Every child of the ``speciesselectbox``
    ``<select>`` element that carries a ``class`` attribute is recorded as
    ``species[first_class][value] = text``.  The resulting mapping is written
    to ``data_path(outfile)`` and also returned as a string.

    Parameters
    ----------
    outfile : str, optional
        Name of the output JSON, by default 'splat-species.json'
    base_url : str, optional
        Root URL of the Splatalogue service; ``/b.php`` is appended to it.
        By default ``conf.base_url``.

    Returns
    -------
    str
        Formatted string representation of the JSON object (identical to
        the content written to ``outfile``)

    Raises
    ------
    requests.HTTPError
        If the server answers with an HTTP error status.
    IndexError
        If the page contains no ``<select id="speciesselectbox">`` element.
    """
    # bs4 is only needed for this scraping step, so it is imported on demand.
    import bs4

    result = requests.get(f'{base_url}/b.php')
    # Fail here on an HTTP error instead of trying to scrape an error page.
    result.raise_for_status()
    page = bs4.BeautifulSoup(result.content, 'html5lib')

    # The ID needs to be checked periodically if Splatalogue is updated
    select_boxes = page.find_all('select', attrs={'id': 'speciesselectbox'})
    if not select_boxes:
        raise IndexError(
            "no <select id='speciesselectbox'> element found at "
            f"{base_url}/b.php; the Splatalogue page layout may have changed"
        )
    sid = select_boxes[0]

    # Group the entries by their first CSS class in a single pass.  Creating
    # the per-class dicts on first sight keeps the key order equal to the
    # document order, so the generated file is reproducible between runs.
    species = {}
    for kid in sid.children:
        # Text nodes between the tags have no ``attrs``; skip them along
        # with any element that carries no ``class`` attribute.
        if hasattr(kid, 'attrs') and 'class' in kid.attrs:
            species.setdefault(kid['class'][0], {})[kid['value']] = kid.text

    # Serialize once so the file content and the return value cannot differ.
    serialized = json.dumps(species)
    with open(data_path(outfile), 'w') as f:
        f.write(serialized)

    return serialized
# When executed as the main module, rebuild the species JSON using the
# default output file name and base URL.
if __name__ == "__main__":
    get_json_species_ids()