In [18]:
import json
import logging
import typing
import urllib.parse
import httpx
import xarray

# Default iSamples Central endpoint; IsbClient falls back to this base URL
# when no server is passed to its constructor.
ISB_SERVER = "https://central.isample.xyz/isamples_central/"
# Value sent in the User-Agent header of every request.
USER_AGENT = "Python/3.11 isamples.examples"
# Per-request timeout, in seconds.
TIMEOUT = 10

# Root logger shared by the notebook; basicConfig asks for INFO and above.
L = logging.getLogger()
logging.basicConfig(level=logging.INFO)

class IsbClient:
    """A small read-only client for the iSamples Solr ``thing`` endpoints.

    Every public method issues one GET request relative to ``isb_server`` and
    unpacks the JSON response. The instance owns an ``httpx.Client``; call
    ``close()`` or use the instance as a context manager to release it.
    """

    def __init__(self, isb_server: typing.Optional[str] = None):
        """Create a client.

        Args:
            isb_server: Base URL of the service. ``ISB_SERVER`` is used when
                this is ``None``.
        """
        base = ISB_SERVER if isb_server is None else isb_server
        # Keep exactly one trailing slash so urljoin() appends the request
        # path instead of replacing the last segment of the base URL.
        self.isb_server = base.strip(" /") + "/"
        self.session = httpx.Client()

    def close(self) -> None:
        """Close the underlying HTTP session."""
        self.session.close()

    def __enter__(self) -> "IsbClient":
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        self.close()

    def _request(self, path: str, params=None) -> typing.Any:
        """GET ``path`` relative to the server and return the decoded JSON body.

        Args:
            path: Request path relative to ``self.isb_server``.
            params: Optional query parameters passed through to httpx.

        Raises:
            httpx.HTTPStatusError: if the server answers with a 4xx/5xx status.
        """
        headers = {
            "Accept": "application/json",
            "User-Agent": USER_AGENT
        }
        url = urllib.parse.urljoin(self.isb_server, path)
        response = self.session.get(url, params=params, headers=headers, timeout=TIMEOUT)
        L.info("url = %s", response.url)
        # Fail loudly on error responses; otherwise an error payload would be
        # silently interpreted by callers as "no records" / "no facets".
        response.raise_for_status()
        return response.json()

    def field_names(self) -> typing.List[str]:
        """Return a list of field names available in the Solr endpoint.
        """
        response = self._request("thing/select/info")
        return list(response.get("schema", {}).get("fields", {}))

    def record_count(self, q: str) -> int:
        """Number of records matching query q.

        Returns -1 when the response carries no ``numFound`` value.
        """
        params = httpx.QueryParams(rows=0, q=q)
        response = self._request("thing/select", params)
        return response.get("response", {}).get("numFound", -1)

    def facets(self, q: str, fields: typing.List[str]) -> typing.Dict[str, typing.Dict[str, int]]:
        """Get facet values and counts for the records matching query q and specified fields.

        Response is a dict of dicts:
        {
            field_name: {
                facet_value: count,
                ...
            },
            ...
        }

        A requested field that is absent from the response maps to an empty dict.
        """
        params = httpx.QueryParams(rows=0, q=q, facet="true")
        params = params.add("facet.mincount", 0)
        for field in fields:
            params = params.add("facet.field", field)
        response = self._request("thing/select", params)
        facet_fields = response.get("facet_counts", {}).get("facet_fields", {})
        res = {}
        for field in fields:
            # The response lists each field as a flat sequence
            # [value, count, value, count, ...]; pair the entries up.
            flat = facet_fields.get(field, [])
            res[field] = dict(zip(flat[0::2], flat[1::2]))
        return res


    def pivot(self, q: str, dimensions: typing.List[str]) -> xarray.DataArray:
        """Return an n-dimensional xarray of counts for specified fields.

        Facet values are normalized (stripped and lower-cased) before being
        used as coordinate labels, so raw values that differ only in case or
        surrounding whitespace are accumulated into the same cell.

        Args:
            q: Query selecting the records to count.
            dimensions: Field names to pivot on, outermost first; at least two.

        Returns:
            A DataArray with one dimension per entry of ``dimensions``; cells
            that the response does not mention hold 0.

        Raises:
            ValueError: if fewer than two dimensions are given.
        """

        def _normalize_facet(v) -> str:
            # Values may be non-strings (or missing); coerce before cleaning
            # so a numeric/boolean facet value does not raise AttributeError.
            if v is None:
                return ""
            return str(v).strip().lower()

        def _get_coordinates(data, coordinates):
            """Collect the distinct normalized values seen for each field.
            """
            for entry in data:
                f = entry.get("field")
                if f is not None:
                    v = _normalize_facet(entry.get("value"))
                    if v not in coordinates[f]:
                        coordinates[f].append(v)
                _get_coordinates(entry.get("pivot", []), coordinates)

        def _set_values(values, data, coord):
            """Accumulate the leaf counts of the pivot tree into ``values``.
            """
            for entry in data:
                cell = dict(coord)
                cell[entry.get("field")] = _normalize_facet(entry.get("value"))
                children = entry.get("pivot", None)
                if children is not None:
                    _set_values(values, children, cell)
                elif len(cell) == len(dimensions):
                    # Add rather than assign: several raw values can normalize
                    # to the same coordinate label.
                    values.loc[cell] = values.loc[cell] + entry.get("count")
                # Otherwise the entry stops above the deepest dimension.
                # Writing with a partial coordinate would add its count to
                # every cell of that slice, so such entries are skipped.

        if len(dimensions) < 2:
            raise ValueError("At least two dimensions required for pivot.")
        fkey = ",".join(dimensions)
        params = httpx.QueryParams(rows=0, q=q)
        params = params.add("facet", "true")
        params = params.add("facet.mincount", 0)
        params = params.add("facet.pivot", fkey)
        response = self._request("thing/select", params)
        data = response.get("facet_counts", {}).get("facet_pivot", {}).get(fkey, [])
        coordinates = {k: [] for k in dimensions}
        _get_coordinates(data, coordinates)
        # A scalar fill value lets xarray size the zero array from the
        # coordinates, which also yields a well-formed empty array when the
        # query matches nothing.
        xd = xarray.DataArray(0, coords=coordinates, dims=dimensions)
        _set_values(xd, data, {})
        return xd


# Client bound to the default server (ISB_SERVER).
cli = IsbClient()
# "*:*" is the Solr match-all query, so this displays the total record count.
cli.record_count("*:*")

INFO:httpx:HTTP Request: GET https://central.isample.xyz/isamples_central/thing/select?rows=0&q=%2A%3A%2A "HTTP/1.1 200 OK"
INFO:root:url = https://central.isample.xyz/isamples_central/thing/select?rows=0&q=%2A%3A%2A


6347967

In [19]:
# Facet value counts over all records for three categorical fields.
fields = ["source", "hasMaterialCategory", "hasContextCategory"]
facets = cli.facets("*:*", fields)
# Pretty-print the {field: {value: count}} mapping returned by facets().
print(json.dumps(facets, indent=2))

INFO:httpx:HTTP Request: GET https://central.isample.xyz/isamples_central/thing/select?rows=0&q=%2A%3A%2A&facet=true&facet.mincount=0&facet.field=source&facet.field=hasMaterialCategory&facet.field=hasContextCategory "HTTP/1.1 200 OK"
INFO:root:url = https://central.isample.xyz/isamples_central/thing/select?rows=0&q=%2A%3A%2A&facet=true&facet.mincount=0&facet.field=source&facet.field=hasMaterialCategory&facet.field=hasContextCategory


{
  "source": {
    "SESAR": 4688386,
    "OPENCONTEXT": 853229,
    "GEOME": 554320,
    "SMITHSONIAN": 213411
  },
  "hasMaterialCategory": {
    "Natural Solid Material": 2233939,
    "Organic material": 1108614,
    "Rock": 913127,
    " rock": 838805,
    " sediment": 838805,
    "Mixed soil": 838805,
    "Biogenic non organic material": 484858,
    "Material": 462472,
    "Mineral": 391088,
    "Biogenic non-organic material": 346242,
    "Anthropogenic metal": 184888,
    "Natural solid material": 182909,
    "Not Provided": 181260,
    "Anthropogenic material": 177576,
    "Sediment": 94084,
    "Soil": 37153,
    "Liquid water": 25777,
    "Gaseous material": 1225,
    "Particulate": 124,
    "Non-aqueous liquid material": 46,
    "Ice": 8
  },
  "hasContextCategory": {
    "Not Provided": 3984022,
    "Site of past human activities": 853229,
    "Earth interior": 665766,
    "Animalia": 391453,
    "Subaerial surface environment": 108123,
    "Marine water body": 56520,
    "

In [20]:
# Get counts of values grouped by three dimensions: source, hasMaterialCategory, and hasContextCategory
dimensions = ["source", "hasMaterialCategory", "hasContextCategory"]
xd = cli.pivot("*:*", dimensions)
# pivot() strips and lower-cases facet values, so coordinate labels are lower-case.
print(xd.loc["geome", "organic material", "bacteria"].sum())

INFO:httpx:HTTP Request: GET https://central.isample.xyz/isamples_central/thing/select?rows=0&q=%2A%3A%2A&facet=true&facet.mincount=0&facet.pivot=source%2ChasMaterialCategory%2ChasContextCategory "HTTP/1.1 200 OK"
INFO:root:url = https://central.isample.xyz/isamples_central/thing/select?rows=0&q=%2A%3A%2A&facet=true&facet.mincount=0&facet.pivot=source%2ChasMaterialCategory%2ChasContextCategory


<xarray.DataArray ()>
array(4)
Coordinates:
    source               <U11 'geome'
    hasMaterialCategory  <U29 'organic material'
    hasContextCategory   <U29 'bacteria'


In [47]:
# Sum over hasContextCategory, selected by dimension name rather than by
# position, so the result stays correct if `dimensions` is reordered.
df = xd.sum(dim="hasContextCategory").to_pandas()
# Display transposed: one row per material category, one column per source.
display(df.T)


source,sesar,opencontext,geome,smithsonian
hasMaterialCategory,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
natural solid material,2233939,182909,0,0
rock,1752520,272,0,0
sediment,932889,0,0,0
mixed soil,838805,0,0,0
material,462472,0,0,0
mineral,390797,291,0,0
biogenic non-organic material,346242,0,0,0
organic material,281834,59049,405855,213411
not provided,47173,134087,0,0
soil,37153,0,0,0


In [21]:
# Only the first two dimensions are indexed, so this sums the counts for
# source "sesar" / material "rock" across every hasContextCategory value.
print(xd.loc["sesar", "rock"].sum())

<xarray.DataArray ()>
array(1752520)
Coordinates:
    source               <U11 'sesar'
    hasMaterialCategory  <U29 'rock'


In [22]:
# List the field names reported by the Solr endpoint, one per line.
for name in cli.field_names():
    print(name)

INFO:httpx:HTTP Request: GET https://central.isample.xyz/isamples_central/thing/select/info "HTTP/1.1 200 OK"
INFO:root:url = https://central.isample.xyz/isamples_central/thing/select/info


_nest_parent_
_nest_path_
_root_
_text_
_version_
authorizedBy
compliesWith
curation_accessContraints
curation_description
curation_description_text
curation_label
curation_location
curation_responsibility
description
description_text
hasContextCategory
hasContextCategoryConfidence
hasMaterialCategory
hasMaterialCategoryConfidence
hasSpecimenCategory
hasSpecimenCategoryConfidence
id
indexUpdatedTime
informalClassification
isb_core_id
keywords
label
producedBy_description
producedBy_description_text
producedBy_hasFeatureOfInterest
producedBy_isb_core_id
producedBy_label
producedBy_responsibility
producedBy_resultTime
producedBy_resultTimeRange
producedBy_samplingSite_description
producedBy_samplingSite_description_text
producedBy_samplingSite_label
producedBy_samplingSite_location_bb
producedBy_samplingSite_location_bb__maxX
producedBy_samplingSite_location_bb__maxY
producedBy_samplingSite_location_bb__minX
producedBy_samplingSite_location_bb__minY
producedBy_samplingSite_location_bb__x