In [4]:
# Download HITRAN24 data from web #

from pathlib import Path
import zipfile
from archnemesis.database.utils import fetch
from archnemesis.database.readers.hitran.isotopologues import download_hitran_isotope_data


# Share link from which the zipped HITRAN24 archive is downloaded.
HITRAN_DOWNLOAD_URL = "https://saco.csic.es/s/9y7LW4craTJARWR/download?path=%2F&files=HITRAN24&downloadStartSecret=anything_can_go_here"

# Local layout: everything lives under `test_data_dir`.
test_data_dir = Path("./test_data")
hitran_zipfile_path = test_data_dir / "HITRAN24.zip"   # downloaded archive
hitran_archive_extract_path = test_data_dir            # directory the archive is unpacked into
hitran_archive_path = test_data_dir / "HITRAN24"       # directory later handed to `hapi.db_begin`
ans_database_file = test_data_dir / "hitran24.h5"      # AnsDatabase HDF5 file built by this notebook

test_data_dir.mkdir(parents=True, exist_ok=True)

# Only download the archive when there is no local copy yet.
# NOTE(review): `chunk_size` is presumably in bytes (10 MiB) -- confirm against `fetch.file`.
if not hitran_zipfile_path.exists():
	fetch.file(HITRAN_DOWNLOAD_URL, to_fpath = hitran_zipfile_path, chunk_size=10*1024*1024)

# Only unpack when the archive directory is missing or empty, so re-running the
# cell does not re-extract the whole archive every time.
# NOTE(review): assumes the zip unpacks into `test_data/HITRAN24` -- if it does
# not, the directory stays empty and extraction simply runs on every execution.
if not (hitran_archive_path.is_dir() and any(hitran_archive_path.iterdir())):
	# The context manager closes the archive handle even if extraction fails.
	with zipfile.ZipFile(hitran_zipfile_path) as hitran_zipfile:
		hitran_zipfile.extractall(hitran_archive_extract_path)


# Fetch the HITRAN isotopologue data into `test_data_dir`
# (it is loaded from `test_data_dir / "isotope_data.py"` further down this cell).
download_hitran_isotope_data(test_data_dir)

# Load the isotope data we just downloaded by importing the file as a module.
# `importlib.util` is a submodule and is NOT guaranteed to be available after a
# plain `import importlib`, so it is imported explicitly.
import importlib.util
import sys

MODULE_PATH = test_data_dir / "isotope_data.py"
MODULE_NAME = "hitran_isotope_data"
spec = importlib.util.spec_from_file_location(MODULE_NAME, MODULE_PATH)
if spec is None or spec.loader is None:
	# `spec_from_file_location` returns None when no loader can handle the path.
	raise ImportError(f"Cannot create an import spec for {MODULE_PATH}")
hitran_isotope_data = importlib.util.module_from_spec(spec)
# Register the module before executing it so that code inside it can resolve
# its own module by name while it is being executed.
sys.modules[spec.name] = hitran_isotope_data
spec.loader.exec_module(hitran_isotope_data)

# Lookup of isotope records; later indexed with `(molecule id, local isotope id)`
# tuples, and each record exposes `.abundance` and `.global_id`.
hitran_isotopes = hitran_isotope_data.HitranIsotope.dict()



INFO :: file_in_chunks :: fetch.py-45 :: url='https://hitran.org/docs/iso-meta/'
INFO :: file_in_chunks :: fetch.py-100 :: Fetching chunk 0. Chunk is 83.0009765625 Kb. Fetched 0.0 Kb so far...
INFO :: file_in_chunks :: fetch.py-107 :: Fetch complete, downloaded 83.0009765625 Kb in total over 1 chunks.


MOL <h4>1: H<sub>2</sub>O</h4>
h4.contents=['1: H', <sub>2</sub>, 'O']
html_content_or_tag=['H', <sub>2</sub>, 'O']
DEBUG : type(item)=<class 'str'> item='H'
DEBUG : type(item)=<class 'bs4.element.Tag'> item=<sub>2</sub>
DEBUG : type(item)=<class 'bs4.element.NavigableString'> item='O'
## ISO ##
tag.text='global ID'
k='global id' tag.text.lower()[:len(k)]='global id'
tag.text='local ID'
k='local id' tag.text.lower()[:len(k)]='local id'
tag.text='Formula'
k='formula' tag.text.lower()[:len(k)]='formula'
tag.text='AFGL code'
k='afgl code' tag.text.lower()[:len(k)]='afgl code'
tag.text='Abundance'
k='abundance' tag.text.lower()[:len(k)]='abundance'
tag.text='Molar Mass /g·mol-1'
k='molar mass' tag.text.lower()[:len(k)]='molar mass'
tag.text='Q(296\xa0K)'
k='q(296' tag.text.lower()[:len(k)]='q(296'
tag.text='Q (full range)'
k='q (full' tag.text.lower()[:len(k)]='q (full'
tag.text='gi'
k='gi' tag.text.lower()[:len(k)]='gi'
{0: 'global_id', 1: 'iso_id', 2: 'isotopologue', 3: 'AFGL', 4: 'terre

In [5]:
# Read data from HITRAN archive #

# NOTE: Reading the data with HAPI uses up a lot of memory, I nearly max-out my 16 GB of RAM and 16 GB swap.

import hapi
import numpy as np
import h5py

from archnemesis.database.datatypes.hitran.gas_descriptor import HitranGasDescriptor
from archnemesis.database.datatypes.gas_descriptor import RadtranGasDescriptor



# NOTE: if we have already written an HDF5 file, use that instead as it is faster and uses less memory

FORCE_READ_FROM_DATABASE = False

def get_isotope_abundances(mol_id, ISO):
	"""
	Collect the third field of every `ISO` record belonging to one molecule.

	## ARGUMENTS ##
		mol_id
			Molecule ID to select.
		ISO
			Mapping keyed by `(molecule id, isotope id)` pairs whose values are
			indexable records. Presumably HAPI's `hapi.ISO` table, where index 2
			of a record is the isotopic abundance -- confirm against HAPI.

	## RETURNS ##
		list
			`record[2]` for each matching isotope, ordered by ascending isotope
			ID. Empty when the molecule has no entries.
	"""
	matching_iso_ids = [iso_id for (mol, iso_id) in ISO if mol == mol_id]
	matching_iso_ids.sort()

	abundances = []
	for iso_id in matching_iso_ids:
		record = ISO[(mol_id, iso_id)]
		abundances.append(record[2])
	return abundances

# Populate the line-data arrays used by the rest of the notebook
# (`mol_id_radtran`, `local_iso_id_radtran`, `nu`, `sw`, `a`, `elower`,
# `gamma_self`, `gamma_air`, `n_air`, `delta_air`), either from the raw HITRAN
# archive via HAPI (slow, memory hungry) or from a previously written HDF5 file.
if FORCE_READ_FROM_DATABASE or (not ans_database_file.exists()):
	table_name = 'hitran24'

	#Initialise the database
	hapi.db_begin(str(hitran_archive_path))

	#Reading all the data
	# NOTE: `gp` and `gpp` are fetched but not used anywhere in this cell.
	(
		mol_id,
		local_iso_id,
		nu,
		sw,
		a,
		gamma_air,
		gamma_self,
		elower,
		n_air,
		delta_air,
		gp,
		gpp
	) = hapi.getColumns(
		table_name,
		[
			'molec_id',
			'local_iso_id',
			'nu',
			'sw',
			'a',
			'gamma_air',
			'gamma_self',
			'elower',
			'n_air',
			'delta_air',
			'gp',
			'gpp'
		]
	)

	#Identifying unique species
	molec_id_uniq = np.unique(mol_id)
	print(f'{molec_id_uniq=}')

	rt_gas_descs = []
	rt_gas_desc = None
	# `mask` is allocated once and overwritten in place for every isotopologue.
	mask = np.ones_like(mol_id, dtype=bool)
	# -1 marks lines that have not (yet) been assigned a radtran ID.
	local_iso_id_radtran = -1*np.ones_like(mol_id, dtype=int)
	mol_id_radtran= -1*np.ones_like(mol_id, dtype=int)

	# Adjust HITRAN data
	for mol_hitran in molec_id_uniq:
		print(f'Adjusting HITRAN data for {mol_hitran=}')

		# Selection of this molecule's lines, computed once per molecule
		# rather than once per isotopologue.
		mol_mask = (mol_id == mol_hitran)

		#Identifying the isotopes in the database
		iso_id_uniq = np.unique(local_iso_id[mol_mask])

		for iso in iso_id_uniq:
			mask[...] = (mol_mask & (local_iso_id == iso))
			print(f'{iso=} {np.count_nonzero(mask)=}')

			#Removing the isotopic abundance effect in the line strengths
			hitran_isotope = hitran_isotopes[(mol_hitran, iso)]
			sw[mask] = sw[mask] / hitran_isotope.abundance

			# Translate the HITRAN (molecule, isotope) pair into its radtran equivalent
			rt_gas_desc = HitranGasDescriptor(mol_hitran, iso, hitran_isotope.global_id).to_radtran()
			print(f'{rt_gas_desc=}')

			rt_gas_descs.append(rt_gas_desc)

			local_iso_id_radtran[mask] = rt_gas_desc.iso_id
			mol_id_radtran[mask] = rt_gas_desc.gas_id


	print(f'{len(rt_gas_descs)=}')
	print(f'{mol_id_radtran.shape=}')
	# Number of lines that were never assigned a radtran molecule ID
	print(f'{np.count_nonzero(mol_id_radtran<0)=}')

else:
	print(f'Loading from {ans_database_file}')

	# Datasets read from every `<mol_name>/<iso_id>` group, and from its
	# `broadeners/AIR` sub-group, respectively.
	line_column_names = ('mol_id', 'local_iso_id', 'nu', 'sw', 'a', 'elower', 'gamma_self')
	air_column_names = ('gamma_amb', 'n_amb', 'delta_amb')

	# One list of per-isotopologue arrays per column; concatenated below.
	line_chunks = {name: [] for name in line_column_names}
	air_chunks = {name: [] for name in air_column_names}

	with h5py.File(ans_database_file, 'r') as f:
		ld_grp = f['sources/HITRAN24/line_data']
		for mol_name, mol_grp in ld_grp.items():
			for iso_name, iso_grp in mol_grp.items():
				for name in line_column_names:
					# Indexing with `()` reads the whole dataset into memory
					line_chunks[name].append(iso_grp[name][()])

				if ('broadeners' in iso_grp) and ('AIR' in iso_grp['broadeners']):
					amb_grp = iso_grp['broadeners']['AIR']
					for name in air_column_names:
						air_chunks[name].append(amb_grp[name][()])
				else:
					# Pad with NaN so the air-broadening columns keep the same
					# length as `nu`; skipping the isotopologue would silently
					# shift every subsequent line's broadening data.
					print(f'WARNING: no AIR broadener data for {mol_name}/{iso_name}, filling with NaN')
					n_lines_shape = line_chunks['nu'][-1].shape
					for name in air_column_names:
						air_chunks[name].append(np.full(n_lines_shape, np.nan))

	if not line_chunks['nu']:
		raise ValueError(f'No line data found in "sources/HITRAN24/line_data" of {ans_database_file}')

	mol_id_radtran = np.concatenate(line_chunks['mol_id'])
	local_iso_id_radtran = np.concatenate(line_chunks['local_iso_id'])
	nu = np.concatenate(line_chunks['nu'])
	sw = np.concatenate(line_chunks['sw'])
	a = np.concatenate(line_chunks['a'])
	elower = np.concatenate(line_chunks['elower'])
	gamma_self = np.concatenate(line_chunks['gamma_self'])
	gamma_air = np.concatenate(air_chunks['gamma_amb'])
	n_air = np.concatenate(air_chunks['n_amb'])
	delta_air = np.concatenate(air_chunks['delta_amb'])

	# Release the per-isotopologue pieces; keeping them would hold a second
	# copy of all the line data in memory.
	del line_chunks, air_chunks








Loading from test_data/hitran24.h5


## ARCHNEMESIS HDF5 "AnsDatabase" File Format ##

As HDF5 files are very adaptable, the AnsDatabase format is just some conventions that certain groups should follow. There are three top-level groups `line_data`, `partition_function`, and `sources` that follow the specifications below.

HDF5 files support "Virtual Datasets": datasets that are built from other datasets in the same file or in different files. These are used to enable the `line_data` and `partition_function` groups to refer to data within the `sources` group and even to data in separate files. If a dataset in the top-level `line_data` or `partition_function` groups is not virtual, then it will not be overwritten when updating virtual datasets.

NOTE: the HDF5 extension in VSCode does not deal with virtual datasets that reference separate files correctly. However, the example `ex_03_ans_hdf5_file_external_sources_example.ipynb` uses an AnsDatabase file that only contains virtual datasets that reference external files and it works fine.

NOTE: Groups with a variable name are denoted as `<variable_name>`.

### The `line_data` Group ###

Must contain sub-groups and datasets in the following structure, note that ordering is not important:

* Group: `<mol_name>`
  * Group: `<iso_id>`
    * Dataset: `mol_id`
    * Dataset: `local_iso_id`
    * Dataset: `nu`
    * Dataset: `sw`
    * Dataset: `a`
    * Dataset: `elower`
    * Dataset: `gamma_self`
    * Dataset: `n_self`
    * Group: `broadeners`
      * Group: `<ambient_gas>`
        * Dataset: `gamma_amb`
        * Dataset: `n_amb`
        * Dataset: `delta_amb`

All datasets must have the same number of entries in the first dimension, and currently all datasets are 1-dimensional. Values are associated by index, (i.e., `nu[0]` is the wavenumber corresponding to the line strength in `sw[0]`). It is the job of the person who creates the file to ensure that there is no missing data, and all data entries line up correctly.

#### item descriptions ####

* `<mol_name>` - String of molecule name, one group for each molecule
* `<iso_id>` - String of radtran isotopologue ID number, one group for each isotopologue of a molecule
* `mol_id` - Radtran molecule ID number
* `local_iso_id` - Radtran isotopologue ID number (relative to molecule ID number)
* `nu` - Wavenumber of line
* `sw` - Spectral line intensity at T=296 Kelvin
* `a` - Einstein A-coefficient
* `elower` - Lower state energy of transition
* `gamma_self` - Self-broadened HWHM at T=296 Kelvin
* `n_self` - Temperature exponent for `gamma_self`
* `broadeners` - Literal string "broadeners" that contains sub-groups for each ambient broadening gas
* `<ambient_gas>` - String of ambient gas, one group for each broadening gas for the isotopologue.
* `gamma_amb` - Ambient gas broadened HWHM at T=296 kelvin
* `n_amb` - Temperature exponent for `gamma_amb`
* `delta_amb` - Pressure shift induced by ambient gas, referred to p=1 atm


### The `partition_function` Group ###

Must contain sub-groups and datasets in the following structure, note that ordering is not important:

* Group: `<mol_name>`
  * Group: `<iso_id>`
    * Group: `pf_data_<pf_data_index>`
      * Dataset: `domain`
      * Dataset: `partition_function_type`
      * **Type Dependent Datasets Explained Below**


The possible `partition_function_type` values are `TabulatedPFData` and `PolynomialPFData`. The structure of each type is:

* `...`
  * `...`
    * Group: `pf_data_<pf_data_index>`
      * Dataset: `domain`
      * `partition_function_type` = `TabulatedPFData`
      * Dataset: `q`
      * Dataset: `temp`

* `...`
  * `...`
    * Group: `pf_data_<pf_data_index>`
      * Dataset: `domain`
      * `partition_function_type` = `PolynomialPFData`
      * Dataset: `coeffs`

#### item descriptions ####

* `<mol_name>` - String of molecule name, one group for each molecule
* `<iso_id>` - String of radtran isotopologue ID number, one group for each isotopologue of a molecule
* `pf_data_<pf_data_index>` - String that starts with "pf_data_" and ends with an integer `<pf_data_index>` that exists to separate different partition function datasets for each isotopologue. A given partition function dataset may overlap with other partition function datasets.
* `domain` - Two floats that denote the temperature range (in Kelvin) that the partition function dataset is valid for, a `TabulatedPFData` dataset will have a domain equal to its range of temperatures, a `PolynomialPFData` dataset will have a domain of (0, inf) by default.
* `partition_function_type` - A string that denotes what other datasets are in the group. Has possible values `TabulatedPFData` and `PolynomialPFData`

* `TabulatedPFData` - Partition function is described as a table of temperature vs partition function value.
  * `q` - Partition function value
  * `temp` - Temperature in Kelvin

* `PolynomialPFData` - Partition function is described by a polynomial with temperature (in Kelvin) as the variable.
  * `coeffs` - The coefficients (in order of increasing powers) of the polynomial.


### The `sources` Group ###

Contains sub-groups for each source of data that are combined together to create the top-level `line_data` and `partition_function` groups. The structure is:

* Group: `<source_name>`
  * Group or Dataset: `line_data`
  * Group or Dataset: `partition_function`

The `line_data` and/or `partition_function` items can either be **Groups** or **Datasets**. 

If they are **Groups** they should follow the same format as the top-level group of the same name (they can even contain virtual datasets, but nesting virtual datasets too much can be confusing). 

If they are **Datasets** they always refer to a group in an external file (which should have the same format as the top-level group of the same name), the dataset can either be a **scalar string**, or a **1-dimensional array** containing 2 strings. If a **scalar string**, the string is the name of an external file and the group referred to has the same name as the dataset. If a **1-dimensional array**, the first string is the name of an external file and the second string is the name of the group within that file.

All referred to groups should have the same structure as the top-level group of the same name as the dataset.

#### item descriptions ####

* `<source_name>` - Name of the source that the data within this group comes from. Use attributes to add more information as needed (e.g. references etc.)
* `line_data` - Either a group that contains line data from the named source, or a dataset that points to a group in an external file that contains line data from the named source.
* `partition_function` - Either a group that contains partition function data from the named source, or a dataset that points to a group in an external file that contains partition function data from the named source.

In [6]:
# Add HITRAN24 data to HDF5 file #

from archnemesis.enums import AmbientGas

from archnemesis.database.data_holders.line_broadener_holder import LineBroadenerHolder
from archnemesis.database.data_holders.line_data_holder import LineDataHolder
from archnemesis.database.data_holders.partition_function_data_holder import PartitionFunctionDataHolder
from archnemesis.database.datatypes.pf_data.tabulated_pf_data import TabulatedPFData
from archnemesis.database.datatypes.pf_data.polynomial_pf_data import PolynomialPFData

from archnemesis.database.filetypes.ans_line_data_file import AnsLineDataFile
from archnemesis.database.filetypes.ans_partition_fn_data_file import AnsPartitionFunctionDataFile


# When True, each tabulated partition function is converted to a polynomial
# (via `TabulatedPFData.as_poly`) before being stored.
USE_POLYNOMIAL_PARTITION_DATA = False

# Radtran molecule / isotopologue IDs for which partition function data was added
mol_list = []
iso_list = []

# (mol_id, iso_id, reason) for every isotopologue that had to be skipped
skipped_isotopologues = []

# Each row is one distinct (radtran molecule ID, radtran isotopologue ID) pair
unique_mol_iso_ids = np.unique(np.stack((mol_id_radtran, local_iso_id_radtran),axis=1), axis=0)

# Create variable to hold partition function data
pfdh = PartitionFunctionDataHolder(
	'HITRAN24',
	'Data in this group is taken from the HITRAN24 database',
)

# Loop over isotopologues and add their partition function data to the PartitionFunctionDataHolder instance
for mol_id_rt, iso_id_rt in unique_mol_iso_ids:
	gas_desc = RadtranGasDescriptor(mol_id_rt, iso_id_rt)
	try:
		ht_gas_desc = HitranGasDescriptor.from_radtran(gas_desc)
	except KeyError:
		skipped_isotopologues.append(
			(int(mol_id_rt), int(iso_id_rt), 'HitranGasDescriptor.from_radtran raised KeyError')
		)
		continue

	# The HAPI TIPS tables are keyed by HITRAN (molecule ID, local isotope ID)
	tips_key = (ht_gas_desc.gas_id, ht_gas_desc.iso_id)
	try:
		temp = hapi.TIPS_2021_ISOT_HASH[tips_key]
		q = hapi.TIPS_2021_ISOQ_HASH[tips_key]
	except KeyError:
		skipped_isotopologues.append(
			(int(mol_id_rt), int(iso_id_rt), f'no entry for {tips_key} in the hapi TIPS_2021 tables')
		)
		continue

	mol_list.append(mol_id_rt)
	iso_list.append(iso_id_rt)

	pf_data = TabulatedPFData(temp, q)
	if USE_POLYNOMIAL_PARTITION_DATA:
		pf_data = PolynomialPFData(*pf_data.as_poly())

	pfdh.add(mol_id_rt, iso_id_rt, pf_data)

# Report what was left out instead of dropping it silently
if skipped_isotopologues:
	print(f'WARNING: no partition function data added for {len(skipped_isotopologues)} isotopologue(s):')
	for skipped_mol_id, skipped_iso_id, reason in skipped_isotopologues:
		print(f'  radtran mol_id={skipped_mol_id} iso_id={skipped_iso_id}: {reason}')


# Get an HDF5 file to save the partition function data in
hitran_pf_data_file = AnsPartitionFunctionDataFile(ans_database_file)

# Add the data in `pfdh` to the HDF5 file, this makes virtual datasets in the `partition_function` top-level group that refer to the source
hitran_pf_data_file.set_source(pfdh)


# Gather all HITRAN24 line data into a single holder object.

# All-zero array shaped like `gamma_self`; passed in the slot after `gamma_self`.
# NOTE(review): presumably the `n_self` column (it follows `gamma_self` in the
# file-format description) -- confirm against `LineDataHolder`.
n_self_zeros = np.zeros_like(gamma_self)

# Air is the only ambient broadening gas supplied here.
air_broadener = LineBroadenerHolder(
	AmbientGas.AIR.name,
	gamma_air,
	n_air,
	delta_air,
)

hitran_line_data_holder = LineDataHolder(
	"HITRAN24",
	'Data in this group is taken from the HITRAN24 database',
	mol_id_radtran,
	local_iso_id_radtran,
	nu,
	sw,
	a,
	elower,
	gamma_self,
	n_self_zeros,
	broadeners = [air_broadener],
)

# Get an HDF5 file to save the line data in
hitran_line_data_file = AnsLineDataFile(ans_database_file)

# Register the holder as a source in the HDF5 file; this makes virtual datasets
# in the `line_data` top-level group that refer to the source
hitran_line_data_file.set_source(hitran_line_data_holder)



INFO :: validate_partition_function_group :: ans_partition_fn_data_file.py-148 :: Validation for "/partition_function" in "test_data/hitran24.h5" succeeded


DEBUG : u_ids.shape=(2, 148)


INFO :: validate_line_data_group :: ans_line_data_file.py-121 :: Validation for "/sources/HITRAN24/line_data" in "test_data/hitran24.h5" succeeded
INFO :: validate_line_data_group :: ans_line_data_file.py-121 :: Validation for "/line_data" in "test_data/hitran24.h5" succeeded
