In [1]:
import json
import logging
import typing
import urllib.parse
import httpx
import xarray

from urllib.parse import quote

import pandas as pd
from pandas import DataFrame, Series
import numpy as np


from collections import Counter
from isbclient import IsbClient


# The overall iSamples API

* https://central.isample.xyz/isamples_central/ui is the Swagger UI
* https://central.isample.xyz/isamples_central/openapi.json is the OpenAPI (Swagger) specification file


In [2]:

OPENAPI_URL = 'https://central.isample.xyz/isamples_central/openapi.json'
r = httpx.get(OPENAPI_URL)
r.json()['paths'].keys()

INFO:httpx:HTTP Request: GET https://central.isample.xyz/isamples_central/openapi.json "HTTP/1.1 200 OK"


dict_keys(['/metrics', '/metrics/', '/thing', '/thing/', '/thing/types', '/thing/select', '/thing/reliquery', '/thing/stream', '/thing/select/info', '/h3_counts/', '/things', '/thing/{identifier}', '/resolve/{identifier}', '/stac_item/{identifier}', '/stac_collection/{filename}', '/things_geojson_heatmap', '/things_leaflet_heatmap', '/related', '/related/'])

# /thing/select: Solr-based select interface

In [3]:
# focus on /thing/select endpoint
r = httpx.get(OPENAPI_URL)
r.json()['paths']['/thing/select']['get']

INFO:httpx:HTTP Request: GET https://central.isample.xyz/isamples_central/openapi.json "HTTP/1.1 200 OK"


{'summary': 'Get Solr Select',
 'description': 'Send select request to the Solr isb_core_records collection.\n\nSee https://solr.apache.org/guide/8_11/common-query-parameters.html',
 'operationId': 'get_solr_select_thing_select_get',
 'responses': {'200': {'description': 'Successful Response',
   'content': {'application/json': {'schema': {'title': 'Response Get Solr Select Thing Select Get'}}}}}}

In [4]:
# Search fields used in https://central.isample.xyz/isamples_central/ui.
# Each entry maps a human-readable label to the Solr field name it stands for
# (the keys appear to be the UI's display labels).

MAJOR_FIELDS = {
    'All text fields': 'searchText',
    'Collection Date': 'producedBy_resultTimeRange',
    'Context': 'hasContextCategory',
    'Identifier': 'id',
    'Keywords': 'keywords',
    'Label': 'label',
    'Material': 'hasMaterialCategory',
    'ProducedBy ResultTime': 'producedBy_resultTime',
    'ProducedBy SamplingSite PlaceName': 'producedBy_samplingSite_placeName',
    'Registrant': 'registrant',
    'Source': 'source',
    'Source Updated Time': 'sourceUpdatedTime',
    'Spatial Query': 'producedBy_samplingSite_location_rpt',
    'Specimen': 'hasSpecimenCategory',
}

# Default field list to return (Solr `fl` parameter).
# Used as the default for the `fl` argument of IsbClient2.default_search and
# passed as `fl=` to solr.search() in the pysolr cells of this notebook.
# The order matches the space-separated `fl` strings written out by hand in
# the example `params` dictionaries.

FL_DEFAULT = ('searchText',
 'authorizedBy',
 'producedBy_resultTimeRange',
 'hasContextCategory',
 # NOTE(review): spelled "Contraints" (no "s"); the same spelling appears in
 # the hand-written query strings in this notebook, so it presumably matches
 # the Solr schema -- confirm before "fixing" it.
 'curation_accessContraints',
 'curation_description_text',
 'curation_label',
 'curation_location',
 'curation_responsibility',
 'description_text',
 'id',
 'informalClassification',
 'keywords',
 'label',
 'hasMaterialCategory',
 'producedBy_description_text',
 'producedBy_hasFeatureOfInterest',
 'producedBy_label',
 'producedBy_responsibility',
 'producedBy_resultTime',
 'producedBy_samplingSite_description_text',
 'producedBy_samplingSite_label',
 'producedBy_samplingSite_location_elevationInMeters',
 'producedBy_samplingSite_location_latitude',
 'producedBy_samplingSite_location_longitude',
 'producedBy_samplingSite_placeName',
 'registrant',
 'samplingPurpose',
 'source',
 'sourceUpdatedTime',
 'producedBy_samplingSite_location_rpt',
 'hasSpecimenCategory')

# Fields for which facet counts are requested by default; this is the default
# value of the `facet_field` argument of IsbClient2.default_search.
FACET_FIELDS_DEFAULT = ('authorizedBy', 'hasContextCategory', 'hasMaterialCategory', 'registrant', 'source', 'hasSpecimenCategory')

# https://solr.apache.org/guide/8_11/faceting.html#range-faceting

# Range-facet request parameters for the collection-date field, written in the
# per-field form `f.<field>.facet.range.<option>`: a gap of '+1YEARS' between
# the fixed start (1800-01-01) and end (2023-01-01) timestamps.
# Intended to be splatted into default_search as extra keyword arguments
# (see `cli.default_search(..., **FACET_RANGE_FIELDS_DEFAULT)`).
# NOTE(review): the end date is hard-coded; records dated after it are
# presumably not bucketed -- consider Solr date math such as NOW/YEAR+1YEAR.
FACET_RANGE_FIELDS_DEFAULT = {
    'facet.range': 'producedBy_resultTimeRange',
    'f.producedBy_resultTimeRange.facet.range.gap': '+1YEARS',
    'f.producedBy_resultTimeRange.facet.range.start': '1800-01-01T00:00:00Z',
    'f.producedBy_resultTimeRange.facet.range.end': '2023-01-01T00:00:00Z',
}


In [5]:
from datetime import datetime

def format_date_for_solr(date_str):
    """Normalise a date/time string to the form Solr expects.

    Solr date fields require a full UTC timestamp such as
    ``2020-01-01T00:00:00Z``.  The input may be a plain ``YYYY-MM-DD`` date or
    an ISO 8601 date-time, with or without a UTC offset or trailing ``Z``.

    :param date_str: date or date-time string in ISO 8601 form
    :return: the same instant formatted as ``YYYY-MM-DDTHH:MM:SS[.ffffff]Z``
    :raises ValueError: if ``date_str`` is not a valid ISO 8601 date/date-time
    """
    from datetime import timezone

    text = date_str
    # datetime.fromisoformat() only understands a trailing 'Z' from Python
    # 3.11 on, so spell it as an explicit UTC offset first.
    if text.endswith(('Z', 'z')):
        text = text[:-1] + '+00:00'

    # Accepts both 'YYYY-MM-DD' (midnight is assumed) and full timestamps.
    parsed = datetime.fromisoformat(text)

    # Offset-aware values are converted to UTC; naive values are taken to be
    # UTC already, which is what the trailing 'Z' asserts.
    if parsed.tzinfo is not None:
        parsed = parsed.astimezone(timezone.utc).replace(tzinfo=None)

    return parsed.isoformat() + 'Z'

def create_date_range_query(start_str, end_str):
    """Build a Solr range expression of the form ``[start TO end]``.

    A falsy bound (``None`` or an empty string) is rendered as ``*`` so the
    range is open on that side; any other bound is passed through
    ``format_date_for_solr`` first.

    :param start_str: lower bound as a date string, or blank for open-ended
    :param end_str: upper bound as a date string, or blank for open-ended
    :return: the range expression as a string
    """
    lower = '*' if not start_str else format_date_for_solr(start_str)
    upper = '*' if not end_str else format_date_for_solr(end_str)
    return '[{} TO {}]'.format(lower, upper)

def filter_null_values(d):
    """Return a new dict holding the entries of ``d`` whose value is not ``None``.

    Only ``None`` is dropped; other falsy values (``0``, ``''``, ``[]``) are
    kept.  The input mapping is left untouched and key order is preserved.
    """
    kept = {}
    for key, value in d.items():
        if value is None:
            continue
        kept[key] = value
    return kept

# Source collections that can be used in the `source` filter.
ISAMPLES_SOURCES = ['SESAR',
    'OPENCONTEXT',
    'GEOME',
    'SMITHSONIAN',
]

# Example of a complete parameter set for the thing/select endpoint.
# This module-level dict is illustrative: IsbClient2.default_search builds its
# own local `params` and does not read this one.
# List values ('fq', 'facet.field') stand for repeated query parameters.
# NOTE(review): 'start' is '20' with 'rows' '20', i.e. the second page of
# results -- confirm that skipping the first page is intended.
params = {
    'q': '*:*',
    'fl': 'searchText authorizedBy producedBy_resultTimeRange hasContextCategory curation_accessContraints curation_description_text curation_label curation_location curation_responsibility description_text id informalClassification keywords label hasMaterialCategory producedBy_description_text producedBy_hasFeatureOfInterest producedBy_label producedBy_responsibility producedBy_resultTime producedBy_samplingSite_description_text producedBy_samplingSite_label producedBy_samplingSite_location_elevationInMeters producedBy_samplingSite_location_latitude producedBy_samplingSite_location_longitude producedBy_samplingSite_placeName registrant samplingPurpose source sourceUpdatedTime producedBy_samplingSite_location_rpt hasSpecimenCategory',
    'fq': ['producedBy_resultTimeRange:[1800 TO 2023]', 'source:(OPENCONTEXT)', '-relation_target:*'],
    'facet.field': ['authorizedBy', 'hasContextCategory', 'hasMaterialCategory', 'registrant', 'source', 'hasSpecimenCategory'],
    'facet.range': 'producedBy_resultTimeRange',
    'facet.range.gap': '+1YEARS',
    'facet.range.start': '1800-01-01T00:00:00Z',
    'facet.range.end': '2023-01-01T00:00:00Z',
    'f.registrant.facet.sort': 'count',
    'f.source.facet.sort': 'index',
    'rows': '20',
    'facet.limit': '-1',
    'facet.sort': 'index',
    'start': '20',
    'facet': 'on',
    'wt': 'json'
}

class IsbClient2(IsbClient):
    """``IsbClient`` plus a convenience wrapper around the ``thing/select`` endpoint."""

    def default_search(self, q='*:*',
                       fl=FL_DEFAULT,
                       start=0, rows=20, collection_date_start=1800, collection_date_end='NOW', source=None,
                       facet_field=FACET_FIELDS_DEFAULT,
                       **kwargs):
        """Run a faceted select query with the notebook's default filters.

        Three filter queries are built: a collection-date range on
        ``producedBy_resultTimeRange``, an optional ``source`` restriction,
        and ``-relation_target:*``.

        :param q: main Solr query string
        :param fl: field(s) to return for each record
        :param start: offset of the first record to return
        :param rows: number of records to return
        :param collection_date_start: lower bound of the collection-date range
        :param collection_date_end: upper bound of the collection-date range
        :param source: one source name, or an iterable of names drawn from
            ['SESAR', 'OPENCONTEXT', 'GEOME', 'SMITHSONIAN']; ``None`` or an
            empty iterable applies no source filter
        :param facet_field: field(s) to compute facet counts for
        :param kwargs: extra request parameters; merged last, so they override
            the defaults built here (e.g. ``**FACET_RANGE_FIELDS_DEFAULT``)
        :return: whatever ``self._request`` returns for ``thing/select``
        """
        # A bare string would otherwise be iterated character by character.
        if isinstance(source, str):
            source = (source,)

        source_clause = None
        if source is not None:
            quoted = [f'"{name}"' for name in source]
            if quoted:
                # Solr only treats upper-case OR as an operator, and the
                # parentheses keep every alternative scoped to the `source`
                # field: source:("SESAR" OR "OPENCONTEXT").
                source_clause = '(' + ' OR '.join(quoted) + ')'

        filter_conditions = {
            'producedBy_resultTimeRange': f'[{collection_date_start} TO {collection_date_end}]',  # Range query
            'source': source_clause,  # Boolean logic; dropped below when None
            '-relation_target': '*',
        }

        # Convert to a list of "field:value" fq strings, skipping unset filters.
        fq = [f'{field}:{value}' for field, value in filter_null_values(filter_conditions).items()]

        request_params = {
            'q': q,
            'fl': fl,
            'start': start,
            'rows': rows,
            'fq': fq,
            'facet': 'on',
            'facet.field': facet_field,
        }

        # Caller-supplied parameters win over the defaults above.
        request_params.update(kwargs)

        return self._request("thing/select", params=request_params)



In [6]:
# can we plugin pysolr here?
import pysolr

def my_select(self, params, handler=None):
    """
    Send a select request, using GET for short queries and POST for long ones.

    :param params: request parameters; updated in place ("wt" is defaulted to
        "json", and "qt" is set when ``self.use_qt_param`` is true)
    :param handler: defaults to self.search_handler (fallback to 'select')
    :return: the result of ``self._send_request``
    """
    # JSON docs unless the caller picked another writer type.
    if "wt" not in params:
        params["wt"] = "json"

    requested = handler or self.search_handler
    endpoint = "select"
    if requested and self.use_qt_param:
        # Handler is conveyed via the qt parameter; the path stays "select".
        params["qt"] = requested
    elif requested:
        endpoint = requested

    query_string = pysolr.safe_urlencode(params, True)

    if len(query_string) >= 1024:
        # Very long queries are submitted as a form-encoded POST body.
        form_headers = {
            "Content-type": "application/x-www-form-urlencoded; charset=utf-8"
        }
        return self._send_request(
            "post", "%s" % endpoint, body=query_string, headers=form_headers
        )

    # Typical case: parameters travel in the query string.
    return self._send_request("get", "%s?%s" % (endpoint, query_string))

# Monkey-patch: replace the private `_select` method on the pysolr.Solr class,
# so every Solr instance created afterwards sends its select requests through
# my_select.
# NOTE(review): presumably needed so the request path fits the iSamples
# `thing/select` route -- confirm against the installed pysolr version.
pysolr.Solr._select = my_select


In [7]:

solr = pysolr.Solr('https://central.isample.xyz/isamples_central/thing')


In [8]:
from itertools import islice

for (i, doc) in enumerate(islice(solr.search(q='*:*', fl=FL_DEFAULT, sort='id ASC',cursorMark='*'), 30)):
    print(i, doc)

INFO:pysolr:Finished 'https://central.isample.xyz/isamples_central/thing/select?q=%2A%3A%2A&fl=searchText&fl=authorizedBy&fl=producedBy_resultTimeRange&fl=hasContextCategory&fl=curation_accessContraints&fl=curation_description_text&fl=curation_label&fl=curation_location&fl=curation_responsibility&fl=description_text&fl=id&fl=informalClassification&fl=keywords&fl=label&fl=hasMaterialCategory&fl=producedBy_description_text&fl=producedBy_hasFeatureOfInterest&fl=producedBy_label&fl=producedBy_responsibility&fl=producedBy_resultTime&fl=producedBy_samplingSite_description_text&fl=producedBy_samplingSite_label&fl=producedBy_samplingSite_location_elevationInMeters&fl=producedBy_samplingSite_location_latitude&fl=producedBy_samplingSite_location_longitude&fl=producedBy_samplingSite_placeName&fl=registrant&fl=samplingPurpose&fl=source&fl=sourceUpdatedTime&fl=producedBy_samplingSite_location_rpt&fl=hasSpecimenCategory&sort=id+ASC&cursorMark=%2A&wt=json' (get) with body '' in 2.843 seconds, with st

0 {'id': 'IGSN:000000001', 'sourceUpdatedTime': '2009-10-09T04:03:11Z', 'label': 'india', 'searchText': ['india', 'Terrestrial Section', 'Not Provided', 'Bheemashankar Kodge', 'Bheemashankar Kodge,,Sample Owner', 'Delta', 'SESAR'], 'hasContextCategory': ['Earth interior'], 'hasMaterialCategory': ['Mineral'], 'hasSpecimenCategory': ['Not Provided'], 'keywords': ['Terrestrial Section'], 'informalClassification': ['Not Provided'], 'registrant': ['Bheemashankar Kodge'], 'producedBy_responsibility': ['Bheemashankar Kodge,,Sample Owner'], 'producedBy_hasFeatureOfInterest': 'Delta', 'producedBy_resultTime': '2009-10-09T04:03:11Z', 'producedBy_resultTimeRange': '2009-10-09T04:03:11Z', 'producedBy_samplingSite_location_elevationInMeters': 300.0, 'producedBy_samplingSite_location_rpt': 'POINT (76 18)', 'producedBy_samplingSite_location_latitude': 18.0, 'producedBy_samplingSite_location_longitude': 76.0, 'source': 'SESAR'}
1 {'id': 'IGSN:001000001', 'sourceUpdatedTime': '2012-08-20T11:35:26Z', 'l

INFO:pysolr:Finished 'https://central.isample.xyz/isamples_central/thing/select?q=%2A%3A%2A&fl=searchText&fl=authorizedBy&fl=producedBy_resultTimeRange&fl=hasContextCategory&fl=curation_accessContraints&fl=curation_description_text&fl=curation_label&fl=curation_location&fl=curation_responsibility&fl=description_text&fl=id&fl=informalClassification&fl=keywords&fl=label&fl=hasMaterialCategory&fl=producedBy_description_text&fl=producedBy_hasFeatureOfInterest&fl=producedBy_label&fl=producedBy_responsibility&fl=producedBy_resultTime&fl=producedBy_samplingSite_description_text&fl=producedBy_samplingSite_label&fl=producedBy_samplingSite_location_elevationInMeters&fl=producedBy_samplingSite_location_latitude&fl=producedBy_samplingSite_location_longitude&fl=producedBy_samplingSite_placeName&fl=registrant&fl=samplingPurpose&fl=source&fl=sourceUpdatedTime&fl=producedBy_samplingSite_location_rpt&fl=hasSpecimenCategory&sort=id+ASC&cursorMark=AoEuSUdTTjowMDEwMDAwMDk%3D&wt=json' (get) with body '' in

10 {'id': 'IGSN:001000010', 'sourceUpdatedTime': '2012-08-20T11:35:27Z', 'label': 'VM29-164PC-605-608cm', 'searchText': ['VM29-164PC-605-608cm', 'Core Piece', 'Not Provided', 'Nichole Anest', 'Coring>PistonCorer', 'Coring>PistonCorer', 'Nichole Anest,,Sample Owner', 'SESAR'], 'hasContextCategory': ['Not Provided'], 'hasMaterialCategory': ['Sediment'], 'hasSpecimenCategory': ['Other solid object'], 'keywords': ['Core Piece'], 'informalClassification': ['Not Provided'], 'registrant': ['Nichole Anest'], 'producedBy_label': 'Coring>PistonCorer', 'producedBy_description_text': 'Coring>PistonCorer', 'producedBy_responsibility': ['Nichole Anest,,Sample Owner'], 'producedBy_resultTime': '2012-08-01T00:00:00Z', 'producedBy_resultTimeRange': '2012-08-01T00:00:00Z', 'source': 'SESAR'}
11 {'id': 'IGSN:001000011', 'sourceUpdatedTime': '2012-08-20T11:35:27Z', 'label': 'VM29-164PC-608-611cm', 'searchText': ['VM29-164PC-608-611cm', 'Core Piece', 'Not Provided', 'Nichole Anest', 'Coring>PistonCorer', '

INFO:pysolr:Finished 'https://central.isample.xyz/isamples_central/thing/select?q=%2A%3A%2A&fl=searchText&fl=authorizedBy&fl=producedBy_resultTimeRange&fl=hasContextCategory&fl=curation_accessContraints&fl=curation_description_text&fl=curation_label&fl=curation_location&fl=curation_responsibility&fl=description_text&fl=id&fl=informalClassification&fl=keywords&fl=label&fl=hasMaterialCategory&fl=producedBy_description_text&fl=producedBy_hasFeatureOfInterest&fl=producedBy_label&fl=producedBy_responsibility&fl=producedBy_resultTime&fl=producedBy_samplingSite_description_text&fl=producedBy_samplingSite_label&fl=producedBy_samplingSite_location_elevationInMeters&fl=producedBy_samplingSite_location_latitude&fl=producedBy_samplingSite_location_longitude&fl=producedBy_samplingSite_placeName&fl=registrant&fl=samplingPurpose&fl=source&fl=sourceUpdatedTime&fl=producedBy_samplingSite_location_rpt&fl=hasSpecimenCategory&sort=id+ASC&cursorMark=AoEuSUdTTjowMDEwMDAwMTk%3D&wt=json' (get) with body '' in

20 {'id': 'IGSN:00100001K', 'sourceUpdatedTime': '2012-08-20T11:35:27Z', 'label': 'VM29-164PC-665-668cm', 'searchText': ['VM29-164PC-665-668cm', 'Core Piece', 'Not Provided', 'Nichole Anest', 'Coring>PistonCorer', 'Coring>PistonCorer', 'Nichole Anest,,Sample Owner', 'SESAR'], 'hasContextCategory': ['Not Provided'], 'hasMaterialCategory': ['Sediment'], 'hasSpecimenCategory': ['Other solid object'], 'keywords': ['Core Piece'], 'informalClassification': ['Not Provided'], 'registrant': ['Nichole Anest'], 'producedBy_label': 'Coring>PistonCorer', 'producedBy_description_text': 'Coring>PistonCorer', 'producedBy_responsibility': ['Nichole Anest,,Sample Owner'], 'producedBy_resultTime': '2012-08-01T00:00:00Z', 'producedBy_resultTimeRange': '2012-08-01T00:00:00Z', 'source': 'SESAR'}
21 {'id': 'IGSN:00100001L', 'sourceUpdatedTime': '2012-08-20T11:35:27Z', 'label': 'VM29-164PC-668-671cm', 'searchText': ['VM29-164PC-668-671cm', 'Core Piece', 'Not Provided', 'Nichole Anest', 'Coring>PistonCorer', '

In [None]:
import pandas as pd
from pandas import DataFrame, Series

df = DataFrame(islice(solr.search(q='*:*', fl=FL_DEFAULT, sort='id ASC',cursorMark='*'), 30))
df.head()

In [None]:

cli = IsbClient2()
cli.record_count("*:*")


In [None]:
r = cli.default_search(source=('SESAR', 'OPENCONTEXT'), **FACET_RANGE_FIELDS_DEFAULT)

In [None]:
# keys: dict_keys(['facet_queries', 'facet_fields', 'facet_ranges', 'facet_intervals', 'facet_heatmaps'])
r['facet_counts']['facet_ranges'].keys()

In [None]:
# 'responseHeader', 'index', 'schema', 'info'
r = cli._request("thing/select/info")
r.keys()

In [None]:
r['schema']['fields'].keys()

In [None]:
# timeout internal server error
if False:
    r = cli._request("thing/types")

In [None]:
# types and classnames for all the fields on the system
Counter([(x['type'], r['schema']['types'][x['type']]['className']) for x in r['schema']['fields'].values()])

In [None]:
# Overview of the schema information in `r`, the "thing/select/info" response
# fetched in an earlier cell. Only the last expression of a cell is rendered
# automatically, so intermediate values are shown with display().

# e.g, I for Indexed, T for Tokenized, S for Stored, etc.
display(r['info']['key'])

# ['fields', 'dynamicFields', 'uniqueKeyField', 'similarity', 'types']
display(r['schema'].keys())

# get the fields -- 78 of them
print ("number of fields", len(r['schema']['fields'].keys()))

field_names = cli.field_names()
print("number of field names (another way to access)", len(field_names))

# (field name, Solr type, implementing class) for each field in MAJOR_FIELDS
print ("types for the major fields")
[(k,v['type'], r['schema']['types'][v['type']]['className'] ) for (k,v) in r['schema']['fields'].items() if k in MAJOR_FIELDS.values()]

In [None]:
from urllib.parse import urlparse, parse_qs

# A select URL with its parameters already percent-encoded in the query string.
url = 'https://central.isample.xyz/isamples_central/thing/select?q=*:*&fl=searchText%20authorizedBy%20producedBy_resultTimeRange%20hasContextCategory%20curation_accessContraints%20curation_description_text%20curation_label%20curation_location%20curation_responsibility%20description_text%20id%20informalClassification%20keywords%20label%20hasMaterialCategory%20producedBy_description_text%20producedBy_hasFeatureOfInterest%20producedBy_label%20producedBy_responsibility%20producedBy_resultTime%20producedBy_samplingSite_description_text%20producedBy_samplingSite_label%20producedBy_samplingSite_location_elevationInMeters%20producedBy_samplingSite_location_latitude%20producedBy_samplingSite_location_longitude%20producedBy_samplingSite_placeName%20registrant%20samplingPurpose%20source%20sourceUpdatedTime%20producedBy_samplingSite_location_rpt%20hasSpecimenCategory&fq=producedBy_resultTimeRange%3A%5B1800%20TO%202023%5D&fq=source%3A(%22OPENCONTEXT%22%20OR%20%22SESAR%22)&fq=-relation_target%3A*&facet.field=authorizedBy&facet.field=hasContextCategory&facet.field=hasMaterialCategory&facet.field=registrant&facet.field=source&facet.field=hasSpecimenCategory&facet.range=producedBy_resultTimeRange&facet.range.gap=%2B1YEARS&facet.range.start=1800-01-01T00:00:00Z&facet.range.end=2023-01-01T00:00:00Z&f.registrant.facet.sort=count&f.source.facet.sort=index&rows=20&facet.limit=-1&facet.sort=index&&start=0&facet=on&wt=json'

# Split the URL, then decode its query string into {name: [value, ...]}.
# Repeated parameters such as fq and facet.field keep all of their values.
parsed_url = urlparse(url)
query_params = parse_qs(parsed_url.query)

# Show every parameter together with its full list of values.
for key in query_params:
    values = query_params[key]
    print(f"{key}: {values}")

# Collapse to one value per parameter by keeping only the first occurrence.
single_value_params = {}
for key, values in query_params.items():
    single_value_params[key] = values[0]
print(single_value_params)


In [None]:
# simplest query -- default
# Ten records from OPENCONTEXT or SESAR, with facet counts for the main
# category fields. Solr boolean operators must be upper case ("OR"); a
# lower-case "or" is parsed as an ordinary search term.

cli._request("thing/select", params={'q': '*:*', 'start':0, 'rows': 10, 
        'fq': ['producedBy_resultTimeRange:[1800 TO 2023]', 'source:(OPENCONTEXT OR SESAR)', '-relation_target:*'],
        'facet.field': ['authorizedBy', 'hasContextCategory', 'hasMaterialCategory', 'registrant', 'source', 'hasSpecimenCategory'],
        'facet': 'on',
        })



Let's break down these parameters, which are used for querying a Solr search engine. Solr is an open-source search platform that provides a wide range of capabilities for text search and faceted search, among other features.

q: This parameter specifies the query. Here, *:* is a wildcard query, meaning it matches all documents in the Solr index.


[fl](https://solr.apache.org/guide/8_11/common-query-parameters.html#fl-field-list-parameter): This stands for "field list". It specifies which fields are returned for each document in the search results. This query lists many fields (searchText, authorizedBy, producedBy_resultTimeRange, etc.); only those fields are returned for each matching document.

fq: This is the "filter query". It filters the results returned by the main query (q) without influencing the score. Here, there are three filters applied:

> - `producedBy_resultTimeRange:[1800 TO 2023]` restricts the results to documents whose collection date falls between the years 1800 and 2023.
> - `source:(OPENCONTEXT)` keeps only documents whose `source` field matches "OPENCONTEXT".
> - `-relation_target:*` excludes documents in which the `relation_target` field has any value.

facet.field: Faceting is used to aggregate data based on a field. This parameter specifies the fields for which you want to see facet counts. Here, facets are requested on fields such as authorizedBy and hasContextCategory.


facet.range, facet.range.gap, facet.range.start, and facet.range.end: These parameters are used for range faceting. You are faceting on the producedBy_resultTimeRange field, starting from "1800-01-01T00:00:00Z" to "2023-01-01T00:00:00Z", with a gap of "+1YEARS". This means it will provide counts for each year in this range.

f.registrant.facet.sort and f.source.facet.sort: These are sorting instructions for the facets. The registrant facet is sorted by count, and the source facet is sorted by index.

rows: This specifies the number of documents to return. In your query, it's set to 20.

facet.limit: This limits the number of facet values returned for each facet field. -1 means no limit.

facet.sort: It dictates how to sort the facet fields. Here, it's sorted by index.

start: This is the offset in the complete result set for pagination. It tells Solr where to start in the list of results (useful for paging through results).

facet: When set to 'on', it enables faceting.

wt: This stands for "writer type" and specifies the output format. Here, 'json' indicates that the response should be in JSON format.

In [None]:
import httpx


# Query thing/select directly with httpx, without going through IsbClient.
url = "https://central.isample.xyz/isamples_central/thing/select"
params = {
    'q': '*:*',
    # Space-separated field list, built from the shared default rather than
    # repeated by hand.
    'fl': ' '.join(FL_DEFAULT),
    'fq': ['producedBy_resultTimeRange:[1800 TO 2023]', 'source:(OPENCONTEXT)', '-relation_target:*'],
    # A list value is sent as one repeated query parameter per element.
    'facet.field': list(FACET_FIELDS_DEFAULT),
    'facet.range': 'producedBy_resultTimeRange',
    'facet.range.gap': '+1YEARS',
    'facet.range.start': '1800-01-01T00:00:00Z',
    'facet.range.end': '2023-01-01T00:00:00Z',
    'f.registrant.facet.sort': 'count',
    'f.source.facet.sort': 'index',
    'rows': '20',
    'facet.limit': '-1',
    'facet.sort': 'index',
    # With rows=20 this offset requests the second page of results.
    'start': '20',
    'facet': 'on',
    'wt': 'json'
}
headers = {
    'Accept': 'application/json',
    'User-Agent': 'raymondyee.net'
}

# keys in response: 'responseHeader', 'response', 'facet_counts'
response = httpx.get(url, params=params, headers=headers)
# Fail here with a clear HTTP error instead of a confusing KeyError or JSON
# decoding error in the cells that read `response` afterwards.
response.raise_for_status()


In [None]:
# get back parameters that went into the query and some basic metadata
response.json()['responseHeader']

In [None]:
# 'numFound', 'start', 'numFoundExact', 'docs'
response.json()['response'].keys()

(response.json()['response']['numFound'], response.json()['response']['numFoundExact'])


In [None]:
response.json()['response']['docs'][0].keys()

# plotting the collection dates

In [None]:
import httpx

# Range-facet the collection date (producedBy_resultTimeRange) in one-year
# buckets, from 200 years before the start of the current year up to the
# start of next year. `%2B` is the URL-encoded '+' of the Solr date math.
url = 'https://central.isample.xyz/isamples_central/thing/select?q=*:*&facet=true&facet.range=producedBy_resultTimeRange&facet.range.start=NOW/YEAR-200YEARS&facet.range.end=NOW/YEAR%2B1YEAR&facet.range.gap=%2B1YEAR'


headers = {
    'accept': 'application/json'
}

response = httpx.get(url, headers=headers)
# Surface HTTP failures here rather than in the cells that parse `response`.
response.raise_for_status()

print(response.json())


In [None]:
response.json()['facet_counts']['facet_ranges']['producedBy_resultTimeRange']['counts']

In [None]:

k = response.json()['facet_counts']['facet_ranges']['producedBy_resultTimeRange']['counts']
dict(zip(k[::2], k[1::2]))



In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Solr returns range-facet counts as one flat list that alternates
# [bucket_start, count, bucket_start, count, ...]; pair the entries up.
k = response.json()['facet_counts']['facet_ranges']['producedBy_resultTimeRange']['counts']
data = dict(zip(k[::2], k[1::2]))


# Convert the dictionary to a DataFrame
df = pd.DataFrame(list(data.items()), columns=['Date', 'Count'])

# Convert the 'Date' column to datetime
df['Date'] = pd.to_datetime(df['Date'])

# Extract the year from the date
df['Year'] = df['Date'].dt.year

# Total number of records per year. Summing the facet counts is what we want
# to plot; value_counts() on 'Year' would only count how many buckets fall in
# each year (always 1 with a one-year gap), giving a flat line.
year_counts = df.groupby('Year')['Count'].sum().sort_index()

# Plot the counts vs year
year_counts.plot(kind='line')
plt.xlabel('Year')
plt.ylabel('Count')
plt.title('Count vs Year')
plt.show()


In [None]:
k = response.json()['facet_counts']['facet_ranges']['producedBy_resultTimeRange']['counts']
data = dict(zip(k[::2], k[1::2]))

df = pd.DataFrame(list(data.items()), columns=['Date', 'Count'])
df.plot()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Assuming df is your DataFrame
df['Date'] = pd.to_datetime(df['Date'])

# deal with log scale
df = df.loc[df['Count'] != 0]

# df['Count'] = df['Count'].replace(0, np.nan)
# df['Count'] = df['Count'].fillna(0.1)

plt.figure(figsize=(10,6))
plt.scatter(df['Date'], df['Count'], color='green', alpha=0.5, s=10)
plt.yscale('log')

plt.xlabel('Date')
plt.ylabel('Count')
plt.title('Count over Date')
plt.show()

In [None]:
%%bash

curl -X 'GET' \
  'https://central.isample.xyz/isamples_central/thing/select?facet=true&facet.mincount=0&facet.field=source' \
  -H 'accept: application/json'

In [None]:
field_names = cli.field_names()


In [None]:
len(field_names)

In [None]:
fields = ["source", "hasMaterialCategory", "hasContextCategory"]
facets = cli.facets("*:*", fields)
print(json.dumps(facets, indent=2))

In [None]:
# Get counts of values grouped by three dimensions: source, hasMaterialCategory, and hasContextCategory
dimensions = ["source", "hasMaterialCategory", "hasContextCategory"]
xd = cli.pivot("*:*", dimensions)
# Total for one (source, material, context) combination.
# NOTE(review): assumes the pivot result is label-indexed in the order of
# `dimensions` and that these lower-case labels exist -- confirm.
print(xd.loc["geome", "organic material", "bacteria"].sum())

In [None]:
# Get counts of values grouped by three dimensions: source, hasMaterialCategory, and hasContextCategory
# NOTE(review): this cell repeats the preceding cell verbatim and re-runs the
# same pivot query -- consider removing one of the two.
dimensions = ["source", "hasMaterialCategory", "hasContextCategory"]
xd = cli.pivot("*:*", dimensions)
print(xd.loc["geome", "organic material", "bacteria"].sum())

In [None]:
# Sum by axis 2 (hasContextCategory) and print
df = xd.sum(axis=2).to_pandas()
# display transposed
display(df.T)


In [None]:
print(xd.loc["sesar", "rock"].sum())

In [None]:
# Field names in solr
for name in cli.field_names():
    print(name)