In [1]:
import json
import logging
import typing
import urllib.parse
import httpx
import xarray

from urllib.parse import quote

import pandas as pd
from pandas import DataFrame, Series
import numpy as np

import matplotlib.pyplot as plt

from collections import Counter
from isbclient import IsbClient

from itertools import islice


In [2]:
# can we plugin pysolr here?
import pysolr

def my_select(self, params, handler=None):
    """
    :param params:
    :param handler: defaults to self.search_handler (fallback to 'select')
    :return:
    """
    # Returns json docs unless otherwise specified
    params.setdefault("wt", "json")
    custom_handler = handler or self.search_handler
    handler = "select"
    if custom_handler:
        if self.use_qt_param:
            params["qt"] = custom_handler
        else:
            handler = custom_handler

    params_encoded = pysolr.safe_urlencode(params, True)

    # put no effective limit on the size of the query
    if len(params_encoded) < 100000:
        # Typical case.
        path = "%s?%s" % (handler, params_encoded)
        return self._send_request("get", path)
    else:
        # Handles very long queries by submitting as a POST.
        path = "%s" % handler
        headers = {
            "Content-type": "application/x-www-form-urlencoded; charset=utf-8"
        }
        return self._send_request(
            "post", path, body=params_encoded, headers=headers
        )

pysolr.Solr._select = my_select


# The overall iSamples API

*  https://central.isample.xyz/isamples_central/ui is the swagger UI
* https://central.isample.xyz/isamples_central/openapi.json is the swagger file


In [3]:

OPENAPI_URL = 'https://central.isample.xyz/isamples_central/openapi.json'
r = httpx.get(OPENAPI_URL)
r.json()['paths'].keys()

INFO:httpx:HTTP Request: GET https://central.isample.xyz/isamples_central/openapi.json "HTTP/1.1 200 OK"


dict_keys(['/metrics', '/metrics/', '/thing', '/thing/', '/thing/types', '/thing/select', '/thing/reliquery', '/thing/stream', '/thing/select/info', '/h3_counts/', '/things', '/thing/{identifier}', '/resolve/{identifier}', '/stac_item/{identifier}', '/stac_collection/{filename}', '/things_geojson_heatmap', '/things_leaflet_heatmap', '/related', '/related/'])

# /thing/select: Solr-based select interface

In [4]:
# focus on /thing/select endpoint
r = httpx.get(OPENAPI_URL)
r.json()['paths']['/thing/select']['get']

INFO:httpx:HTTP Request: GET https://central.isample.xyz/isamples_central/openapi.json "HTTP/1.1 200 OK"


{'summary': 'Get Solr Select',
 'description': 'Send select request to the Solr isb_core_records collection.\n\nSee https://solr.apache.org/guide/8_11/common-query-parameters.html',
 'operationId': 'get_solr_select_thing_select_get',
 'responses': {'200': {'description': 'Successful Response',
   'content': {'application/json': {'schema': {'title': 'Response Get Solr Select Thing Select Get'}}}}}}

In [5]:
# fields used in https://central.isample.xyz/isamples_central/ui

MAJOR_FIELDS = dict([('All text fields', 'searchText'),
 ('Collection Date', 'producedBy_resultTimeRange'),
 ('Context', 'hasContextCategory'),
 ('Identifier', 'id'),
 ('Keywords', 'keywords'),
 ('Label', 'label'),
 ('Material', 'hasMaterialCategory'),
 ('ProducedBy ResultTime',  'producedBy_resultTime'),
 ('ProducedBy SamplingSite PlaceName', 'producedBy_samplingSite_placeName'),
 ('Registrant', 'registrant'),
 ('Source', 'source'),
 ('Source Updated Time', 'sourceUpdatedTime'),
 ('Spatial Query', 'producedBy_samplingSite_location_rpt'),
 ('Specimen', 'hasSpecimenCategory')])

# default field list to return

FL_DEFAULT = ('searchText',
 'authorizedBy',
 'producedBy_resultTimeRange',
 'hasContextCategory',
 'curation_accessContraints',
 'curation_description_text',
 'curation_label',
 'curation_location',
 'curation_responsibility',
 'description_text',
 'id',
 'informalClassification',
 'keywords',
 'label',
 'hasMaterialCategory',
 'producedBy_description_text',
 'producedBy_hasFeatureOfInterest',
 'producedBy_label',
 'producedBy_responsibility',
 'producedBy_resultTime',
 'producedBy_samplingSite_description_text',
 'producedBy_samplingSite_label',
 'producedBy_samplingSite_location_elevationInMeters',
 'producedBy_samplingSite_location_latitude',
 'producedBy_samplingSite_location_longitude',
 'producedBy_samplingSite_placeName',
 'registrant',
 'samplingPurpose',
 'source',
 'sourceUpdatedTime',
 'producedBy_samplingSite_location_rpt',
 'hasSpecimenCategory')

FACET_FIELDS_DEFAULT = ('authorizedBy', 'hasContextCategory', 'hasMaterialCategory', 'registrant', 'source', 'hasSpecimenCategory')

# https://solr.apache.org/guide/8_11/faceting.html#range-faceting

FACET_RANGE_FIELDS_DEFAULT = {
    'facet.range': 'producedBy_resultTimeRange',
    'f.producedBy_resultTimeRange.facet.range.gap': '+1YEARS',
    'f.producedBy_resultTimeRange.facet.range.start': '1800-01-01T00:00:00Z',
    'f.producedBy_resultTimeRange.facet.range.end': '2023-01-01T00:00:00Z',
}


In [6]:
from datetime import datetime

def format_date_for_solr(date_str):
    # Assuming the input is in a format like 'YYYY-MM-DD' or already in ISO 8601
    # Modify this part if your input format is different
    try:
        # If the date is already in ISO 8601 format, return as is
        datetime.fromisoformat(date_str)
        return date_str
    except ValueError:
        # Convert from 'YYYY-MM-DD' to ISO 8601
        return datetime.strptime(date_str, '%Y-%m-%d').isoformat() + 'Z'

def create_date_range_query(start_str, end_str):
    # If start_str or end_str is blank, use '*' for open-ended range
    start_date = format_date_for_solr(start_str) if start_str else '*'
    end_date = format_date_for_solr(end_str) if end_str else '*'
    return f'[{start_date} TO {end_date}]'

def filter_null_values(d):
    return {k:v for k,v in d.items() if v is not None}

ISAMPLES_SOURCES = ['SESAR',
    'OPENCONTEXT',
    'GEOME',
    'SMITHSONIAN',
]

params = {
    'q': '*:*',
    'fl': 'searchText authorizedBy producedBy_resultTimeRange hasContextCategory curation_accessContraints curation_description_text curation_label curation_location curation_responsibility description_text id informalClassification keywords label hasMaterialCategory producedBy_description_text producedBy_hasFeatureOfInterest producedBy_label producedBy_responsibility producedBy_resultTime producedBy_samplingSite_description_text producedBy_samplingSite_label producedBy_samplingSite_location_elevationInMeters producedBy_samplingSite_location_latitude producedBy_samplingSite_location_longitude producedBy_samplingSite_placeName registrant samplingPurpose source sourceUpdatedTime producedBy_samplingSite_location_rpt hasSpecimenCategory',
    'fq': ['producedBy_resultTimeRange:[1800 TO 2023]', 'source:(OPENCONTEXT)', '-relation_target:*'],
    'facet.field': ['authorizedBy', 'hasContextCategory', 'hasMaterialCategory', 'registrant', 'source', 'hasSpecimenCategory'],
    'facet.range': 'producedBy_resultTimeRange',
    'facet.range.gap': '+1YEARS',
    'facet.range.start': '1800-01-01T00:00:00Z',
    'facet.range.end': '2023-01-01T00:00:00Z',
    'f.registrant.facet.sort': 'count',
    'f.source.facet.sort': 'index',
    'rows': '20',
    'facet.limit': '-1',
    'facet.sort': 'index',
    'start': '20',
    'facet': 'on',
    'wt': 'json'
}

class IsbClient2(IsbClient):
    def __init__(self, url='https://central.isample.xyz/isamples_central/thing'):
        super().__init__()
        self.url = url
        self.solr = pysolr.Solr(self.url, always_commit=True)

    def _fq_from_kwargs(self, collection_date_start=1800, collection_date_end='NOW', source=None, **kwargs):
        # build fq
        # 'field1': quote('value with spaces and special characters like &'),

        # source is a tuple drawing from ['SESAR', 'OPENCONTEXT', 'GEOME', 'SMITHSONIAN']
        if source is not None:
            source = " or ".join([f'"{s}"' for s in source])

        filter_conditions = {
            
            'producedBy_resultTimeRange': f'[{collection_date_start} TO {collection_date_end}]',  # Range query
            'source': source,  # Boolean logic
            '-relation_target':'*'
        }

        filter_conditions.update(kwargs)

        # Convert to list of fq strings
        fq = [f'{field}:{value}' for field, value in filter_null_values(filter_conditions).items()]

        # fq = ['producedBy_resultTimeRange:[1800 TO 2023]', 'source:(OPENCONTEXT or SESAR)', '-relation_target:*']
        return fq

    def default_search(self, q='*:*',
                       fl = FL_DEFAULT,
                       fq = None,
                       start=0, rows=20, 
                       facet_field = FACET_FIELDS_DEFAULT,
                       sort =  'id ASC',
                       **kwargs):
        
        if fq is None:
            fq = self._fq_from_kwargs()
    
        params={
            'q': q, 
            'fl': fl,
            'start':start, 
            'rows': rows, 
            'fq': fq,
            'facet': 'on',
            'facet.field': facet_field,
            'cursorMark': '*',
            'sort': sort,
        }

        # update params with kwargs
        params.update(kwargs)
            
        # sort = 'id ASC'
        # yield from self.solr.search(q='*:*', fl=FL_DEFAULT, sort='id ASC',cursorMark='*')
        print(params)

        results = self.solr.search(**params)
        return results



widgetized forms to formulate query

display number of hits
display facets

map
dataframe


In [7]:
cli = IsbClient2()
fq = cli._fq_from_kwargs(source=('OPENCONTEXT',))
query = cli.default_search(fq=fq, fl=FL_DEFAULT, rows=100, **FACET_RANGE_FIELDS_DEFAULT)
# print number of hits
print (len(query))
results = islice(query, 300)

df = DataFrame(results)
df.head()

{'q': '*:*', 'fl': ('searchText', 'authorizedBy', 'producedBy_resultTimeRange', 'hasContextCategory', 'curation_accessContraints', 'curation_description_text', 'curation_label', 'curation_location', 'curation_responsibility', 'description_text', 'id', 'informalClassification', 'keywords', 'label', 'hasMaterialCategory', 'producedBy_description_text', 'producedBy_hasFeatureOfInterest', 'producedBy_label', 'producedBy_responsibility', 'producedBy_resultTime', 'producedBy_samplingSite_description_text', 'producedBy_samplingSite_label', 'producedBy_samplingSite_location_elevationInMeters', 'producedBy_samplingSite_location_latitude', 'producedBy_samplingSite_location_longitude', 'producedBy_samplingSite_placeName', 'registrant', 'samplingPurpose', 'source', 'sourceUpdatedTime', 'producedBy_samplingSite_location_rpt', 'hasSpecimenCategory'), 'start': 0, 'rows': 100, 'fq': ['producedBy_resultTimeRange:[1800 TO NOW]', 'source:"OPENCONTEXT"', '-relation_target:*'], 'facet': 'on', 'facet.field'

INFO:pysolr:Finished 'https://central.isample.xyz/isamples_central/thing/select?q=%2A%3A%2A&fl=searchText&fl=authorizedBy&fl=producedBy_resultTimeRange&fl=hasContextCategory&fl=curation_accessContraints&fl=curation_description_text&fl=curation_label&fl=curation_location&fl=curation_responsibility&fl=description_text&fl=id&fl=informalClassification&fl=keywords&fl=label&fl=hasMaterialCategory&fl=producedBy_description_text&fl=producedBy_hasFeatureOfInterest&fl=producedBy_label&fl=producedBy_responsibility&fl=producedBy_resultTime&fl=producedBy_samplingSite_description_text&fl=producedBy_samplingSite_label&fl=producedBy_samplingSite_location_elevationInMeters&fl=producedBy_samplingSite_location_latitude&fl=producedBy_samplingSite_location_longitude&fl=producedBy_samplingSite_placeName&fl=registrant&fl=samplingPurpose&fl=source&fl=sourceUpdatedTime&fl=producedBy_samplingSite_location_rpt&fl=hasSpecimenCategory&start=0&rows=100&fq=producedBy_resultTimeRange%3A%5B1800+TO+NOW%5D&fq=source%3A%

853229


INFO:pysolr:Finished 'https://central.isample.xyz/isamples_central/thing/select?q=%2A%3A%2A&fl=searchText&fl=authorizedBy&fl=producedBy_resultTimeRange&fl=hasContextCategory&fl=curation_accessContraints&fl=curation_description_text&fl=curation_label&fl=curation_location&fl=curation_responsibility&fl=description_text&fl=id&fl=informalClassification&fl=keywords&fl=label&fl=hasMaterialCategory&fl=producedBy_description_text&fl=producedBy_hasFeatureOfInterest&fl=producedBy_label&fl=producedBy_responsibility&fl=producedBy_resultTime&fl=producedBy_samplingSite_description_text&fl=producedBy_samplingSite_label&fl=producedBy_samplingSite_location_elevationInMeters&fl=producedBy_samplingSite_location_latitude&fl=producedBy_samplingSite_location_longitude&fl=producedBy_samplingSite_placeName&fl=registrant&fl=samplingPurpose&fl=source&fl=sourceUpdatedTime&fl=producedBy_samplingSite_location_rpt&fl=hasSpecimenCategory&start=0&rows=100&fq=producedBy_resultTimeRange%3A%5B1800+TO+NOW%5D&fq=source%3A%

Unnamed: 0,id,sourceUpdatedTime,label,searchText,description_text,hasContextCategory,hasMaterialCategory,hasSpecimenCategory,keywords,producedBy_label,...,producedBy_responsibility,producedBy_resultTime,producedBy_resultTimeRange,producedBy_samplingSite_label,producedBy_samplingSite_placeName,producedBy_samplingSite_location_rpt,producedBy_samplingSite_location_latitude,producedBy_samplingSite_location_longitude,source,informalClassification
0,ark:/28722/k2000024f,2023-07-16T20:17:56Z,VdM20060209,"[VdM20060209, early bce/ce: -535.0 | late bce/...",early bce/ce: -535.0 | late bce/ce: -50.0 | up...,[Site of past human activities],"[Anthropogenic metal, Natural solid material, ...",[Physical specimen],"[Europe, Italy, Vescovado di Murlo, Upper Vesc...",Murlo,...,[creator: Anthony Tuck],2012-12-28T00:00:00Z,2012-12-28T00:00:00Z,Europe/Italy/Vescovado di Murlo/Upper Vescovad...,"[Europe, Italy, Vescovado di Murlo, Upper Vesc...",POINT (11.391122443138563 43.171122385167024),43.171124,11.391123,OPENCONTEXT,
1,ark:/28722/k2000025x,2018-07-09T09:50:02Z,PC 19680385,"[PC 19680385, early bce/ce: -700.0 | late bce/...",early bce/ce: -700.0 | late bce/ce: -535.0 | u...,[Site of past human activities],[Anthropogenic material],[Artifact],"[Italy, Poggio Civitate, Civitate A, Civitate ...",Murlo,...,[creator: Anthony Tuck],2012-12-28T00:00:00Z,2012-12-28T00:00:00Z,Italy/Poggio Civitate/Civitate A/Civitate A 2I...,"[Italy, Poggio Civitate, Civitate A, Civitate ...",POINT (11.400837596717457 43.15319356129963),43.153194,11.400838,OPENCONTEXT,
2,ark:/28722/k2000027w,2020-03-28T01:27:10Z,Bone Ref# 3008,"[Bone Ref# 3008, early bce/ce: -6700.0 | late ...",early bce/ce: -6700.0 | late bce/ce: -6000.0 |...,[Site of past human activities],[Biogenic non organic material],"[Artifact, Organism product, Organism part]","[Turkey, Pınarbaşı, Site B, Stratum 6, Context...",Pınarbaşı 1994: Animal Bones,...,[creator: Denise Carruthers],2013-03-04T00:00:00Z,2013-03-04T00:00:00Z,Turkey/Pınarbaşı/Site B/Stratum 6/Context BCF,"[Turkey, Pınarbaşı, Site B, Stratum 6, Context...",POINT (33.018551 37.49432),37.49432,33.01855,OPENCONTEXT,
3,ark:/28722/k2000028c,2020-03-28T01:25:22Z,Bone Ref# 2237,"[Bone Ref# 2237, early bce/ce: -6700.0 | late ...",early bce/ce: -6700.0 | late bce/ce: -6000.0 |...,[Site of past human activities],[Biogenic non organic material],"[Artifact, Organism product, Organism part]","[Turkey, Pınarbaşı, Site B, Stratum 6, Context...",Pınarbaşı 1994: Animal Bones,...,[creator: Denise Carruthers],2013-03-04T00:00:00Z,2013-03-04T00:00:00Z,Turkey/Pınarbaşı/Site B/Stratum 6/Context BBJ,"[Turkey, Pınarbaşı, Site B, Stratum 6, Context...",POINT (33.018551 37.49432),37.49432,33.01855,OPENCONTEXT,
4,ark:/28722/k2000029v,2020-03-28T01:22:29Z,Bone Ref# 991,"[Bone Ref# 991, early bce/ce: -6700.0 | late b...",early bce/ce: -6700.0 | late bce/ce: -6000.0 |...,[Site of past human activities],[Biogenic non organic material],"[Artifact, Organism product, Organism part]","[Turkey, Pınarbaşı, Site B, Stratum 5, Stratum...",Pınarbaşı 1994: Animal Bones,...,[creator: Denise Carruthers],2013-03-04T00:00:00Z,2013-03-04T00:00:00Z,Turkey/Pınarbaşı/Site B/Stratum 5/Stratum 6/Co...,"[Turkey, Pınarbaşı, Site B, Stratum 5, Stratum...",POINT (33.018551 37.49432),37.49432,33.01855,OPENCONTEXT,[Sheep or goat]


In [8]:

cli = IsbClient2()
cli.record_count("*:*")


INFO:httpx:HTTP Request: GET https://central.isample.xyz/isamples_central/thing/select?rows=0&q=%2A%3A%2A "HTTP/1.1 200 OK"
INFO:root:url = https://central.isample.xyz/isamples_central/thing/select?rows=0&q=%2A%3A%2A


6347972

In [9]:
assert set(query.raw_response.keys()) == set(['responseHeader', 'response', 'nextCursorMark', 'facet_counts'])

In [10]:
# dict_keys(['facet_queries', 'facet_fields', 'facet_ranges', 'facet_intervals', 'facet_heatmaps'])
query.raw_response['facet_counts'].keys()

query.raw_response['facet_counts']['facet_fields'].keys()

dict_keys(['authorizedBy', 'hasContextCategory', 'hasMaterialCategory', 'registrant', 'source', 'hasSpecimenCategory'])

In [11]:
query.raw_response['facet_counts']['facet_fields']['source']

['OPENCONTEXT', 853229, 'GEOME', 0, 'SESAR', 0, 'SMITHSONIAN', 0]

In [12]:
from ipytree import Tree, Node
from ipyleaflet import Map, Marker
from ipywidgets import HBox, link, Layout

m = Map(center=[47.51, 4.04], zoom=4, layout=Layout(height='400px'))
tree = Tree()
tree.layout.width = '40%'
box = HBox([tree, m])

markers_node = Node('Markers')
tree.add_node(markers_node)

layers_node = Node('Layers', icon='map')
tree.add_node(layers_node)

cities = [
    {'name': 'London', 'location': [51.5074, 0.1278]},
    {'name': 'Paris', 'location': [48.8566, 2.3522]},
    {'name': 'Barcelona', 'location': [41.31, 2.109]}
]

for city in cities:
    marker = Marker(location=city.get('location'))
    node = Node(city.get('name'), icon='map-marker')

    link((marker, 'visible'), (node, 'selected'))

    m.add_layer(marker)
    markers_node.add_node(node)

box

HBox(children=(Tree(layout=Layout(width='40%'), nodes=(Node(name='Markers', nodes=(Node(icon='map-marker', nam…

In [13]:
# query.raw_response.keys() --> dict_keys(['responseHeader', 'response', 'nextCursorMark', 'facet_counts'])
query.raw_response['facet_counts']['facet_ranges'].keys()

dict_keys(['producedBy_resultTimeRange'])

In [15]:
# keys: dict_keys(['facet_queries', 'facet_fields', 'facet_ranges', 'facet_intervals', 'facet_heatmaps'])
query.raw_response['facet_counts']['facet_ranges'].keys()

dict_keys(['producedBy_resultTimeRange'])

In [16]:
# 'responseHeader', 'index', 'schema', 'info'
r = cli._request("thing/select/info")
r.keys()

INFO:httpx:HTTP Request: GET https://central.isample.xyz/isamples_central/thing/select/info "HTTP/1.1 200 OK"
INFO:root:url = https://central.isample.xyz/isamples_central/thing/select/info


dict_keys(['responseHeader', 'index', 'schema', 'info'])

In [17]:
r['schema']['fields'].keys()

dict_keys(['_nest_parent_', '_nest_path_', '_root_', '_text_', '_version_', 'authorizedBy', 'compliesWith', 'curation_accessContraints', 'curation_description', 'curation_description_text', 'curation_label', 'curation_location', 'curation_responsibility', 'description', 'description_text', 'hasContextCategory', 'hasContextCategoryConfidence', 'hasMaterialCategory', 'hasMaterialCategoryConfidence', 'hasSpecimenCategory', 'hasSpecimenCategoryConfidence', 'id', 'indexUpdatedTime', 'informalClassification', 'isb_core_id', 'keywords', 'label', 'producedBy_description', 'producedBy_description_text', 'producedBy_hasFeatureOfInterest', 'producedBy_isb_core_id', 'producedBy_label', 'producedBy_responsibility', 'producedBy_resultTime', 'producedBy_resultTimeRange', 'producedBy_samplingSite_description', 'producedBy_samplingSite_description_text', 'producedBy_samplingSite_label', 'producedBy_samplingSite_location_bb', 'producedBy_samplingSite_location_bb__maxX', 'producedBy_samplingSite_location

In [18]:
# timeout internal server error
if False:
    r = cli._request("thing/types")

In [19]:
# types and classnames for all the fields on the system
Counter([(x['type'], r['schema']['types'][x['type']]['className']) for x in r['schema']['fields'].values()])

Counter({('string', 'org.apache.solr.schema.StrField'): 49,
         ('pfloat', 'org.apache.solr.schema.FloatPointField'): 8,
         ('text_en', 'org.apache.solr.schema.TextField'): 5,
         ('pdouble', 'org.apache.solr.schema.DoublePointField'): 4,
         ('pdate', 'org.apache.solr.schema.DatePointField'): 3,
         ('text_general', 'org.apache.solr.schema.TextField'): 2,
         ('_nest_path_', 'org.apache.solr.schema.NestPathField'): 1,
         ('plong', 'org.apache.solr.schema.LongPointField'): 1,
         ('date_range', 'org.apache.solr.schema.DateRangeField'): 1,
         ('bbox', 'org.apache.solr.schema.BBoxField'): 1,
         ('boolean', 'org.apache.solr.schema.BoolField'): 1,
         ('location', 'org.apache.solr.schema.LatLonPointSpatialField'): 1,
         ('location_rpt',
          'org.apache.solr.schema.SpatialRecursivePrefixTreeFieldType'): 1})

In [20]:
# e.g, I for Indexed, T for Tokenized, S for Stored, etc.
r['info']['key']

# ['fields', 'dynamicFields', 'uniqueKeyField', 'similarity', 'types']
r['schema'].keys()

# get the fields -- 78 of them
print ("number of fields", len(r['schema']['fields'].keys()))

field_names = cli.field_names()
print("number of field names (another way to access)", len(field_names))

print ("types for the major fields")
[(k,v['type'], r['schema']['types'][v['type']]['className'] ) for (k,v) in r['schema']['fields'].items() if k in MAJOR_FIELDS.values()]

INFO:httpx:HTTP Request: GET https://central.isample.xyz/isamples_central/thing/select/info "HTTP/1.1 200 OK"
INFO:root:url = https://central.isample.xyz/isamples_central/thing/select/info


number of fields 78
number of field names (another way to access) 78
types for the major fields


[('hasContextCategory', 'string', 'org.apache.solr.schema.StrField'),
 ('hasMaterialCategory', 'string', 'org.apache.solr.schema.StrField'),
 ('hasSpecimenCategory', 'string', 'org.apache.solr.schema.StrField'),
 ('id', 'string', 'org.apache.solr.schema.StrField'),
 ('keywords', 'string', 'org.apache.solr.schema.StrField'),
 ('label', 'string', 'org.apache.solr.schema.StrField'),
 ('producedBy_resultTime', 'pdate', 'org.apache.solr.schema.DatePointField'),
 ('producedBy_resultTimeRange',
  'date_range',
  'org.apache.solr.schema.DateRangeField'),
 ('producedBy_samplingSite_location_rpt',
  'location_rpt',
  'org.apache.solr.schema.SpatialRecursivePrefixTreeFieldType'),
 ('producedBy_samplingSite_placeName',
  'string',
  'org.apache.solr.schema.StrField'),
 ('registrant', 'string', 'org.apache.solr.schema.StrField'),
 ('searchText', 'text_en', 'org.apache.solr.schema.TextField'),
 ('source', 'string', 'org.apache.solr.schema.StrField'),
 ('sourceUpdatedTime', 'pdate', 'org.apache.solr.

In [21]:
from urllib.parse import urlparse, parse_qs

url = 'https://central.isample.xyz/isamples_central/thing/select?q=*:*&fl=searchText%20authorizedBy%20producedBy_resultTimeRange%20hasContextCategory%20curation_accessContraints%20curation_description_text%20curation_label%20curation_location%20curation_responsibility%20description_text%20id%20informalClassification%20keywords%20label%20hasMaterialCategory%20producedBy_description_text%20producedBy_hasFeatureOfInterest%20producedBy_label%20producedBy_responsibility%20producedBy_resultTime%20producedBy_samplingSite_description_text%20producedBy_samplingSite_label%20producedBy_samplingSite_location_elevationInMeters%20producedBy_samplingSite_location_latitude%20producedBy_samplingSite_location_longitude%20producedBy_samplingSite_placeName%20registrant%20samplingPurpose%20source%20sourceUpdatedTime%20producedBy_samplingSite_location_rpt%20hasSpecimenCategory&fq=producedBy_resultTimeRange%3A%5B1800%20TO%202023%5D&fq=source%3A(%22OPENCONTEXT%22%20OR%20%22SESAR%22)&fq=-relation_target%3A*&facet.field=authorizedBy&facet.field=hasContextCategory&facet.field=hasMaterialCategory&facet.field=registrant&facet.field=source&facet.field=hasSpecimenCategory&facet.range=producedBy_resultTimeRange&facet.range.gap=%2B1YEARS&facet.range.start=1800-01-01T00:00:00Z&facet.range.end=2023-01-01T00:00:00Z&f.registrant.facet.sort=count&f.source.facet.sort=index&rows=20&facet.limit=-1&facet.sort=index&&start=0&facet=on&wt=json'

parsed_url = urlparse(url)
query_params = parse_qs(parsed_url.query)

# The result is a dictionary where each key is associated with a list of values.
# You can iterate over this dictionary to process your parameters as needed.
for key, values in query_params.items():
    print(f"{key}: {values}")

# If you need each key to have a single value (taking the first value if multiple are present),
# you can do the following:
single_value_params = {key: values[0] for key, values in query_params.items()}
print(single_value_params)


q: ['*:*']
fl: ['searchText authorizedBy producedBy_resultTimeRange hasContextCategory curation_accessContraints curation_description_text curation_label curation_location curation_responsibility description_text id informalClassification keywords label hasMaterialCategory producedBy_description_text producedBy_hasFeatureOfInterest producedBy_label producedBy_responsibility producedBy_resultTime producedBy_samplingSite_description_text producedBy_samplingSite_label producedBy_samplingSite_location_elevationInMeters producedBy_samplingSite_location_latitude producedBy_samplingSite_location_longitude producedBy_samplingSite_placeName registrant samplingPurpose source sourceUpdatedTime producedBy_samplingSite_location_rpt hasSpecimenCategory']
fq: ['producedBy_resultTimeRange:[1800 TO 2023]', 'source:("OPENCONTEXT" OR "SESAR")', '-relation_target:*']
facet.field: ['authorizedBy', 'hasContextCategory', 'hasMaterialCategory', 'registrant', 'source', 'hasSpecimenCategory']
facet.range: ['pro

In [22]:
# simplest query -- default

cli._request("thing/select", params={'q': '*:*', 'start':0, 'rows': 10, 
        'fq': ['producedBy_resultTimeRange:[1800 TO 2023]', 'source:(OPENCONTEXT or SESAR)', '-relation_target:*'],
        'facet.field': ['authorizedBy', 'hasContextCategory', 'hasMaterialCategory', 'registrant', 'source', 'hasSpecimenCategory'],
        'facet': 'on',
        })

INFO:httpx:HTTP Request: GET https://central.isample.xyz/isamples_central/thing/select?q=%2A%3A%2A&start=0&rows=10&fq=producedBy_resultTimeRange%3A%5B1800%20TO%202023%5D&fq=source%3A%28OPENCONTEXT%20or%20SESAR%29&fq=-relation_target%3A%2A&facet.field=authorizedBy&facet.field=hasContextCategory&facet.field=hasMaterialCategory&facet.field=registrant&facet.field=source&facet.field=hasSpecimenCategory&facet=on "HTTP/1.1 200 OK"
INFO:root:url = https://central.isample.xyz/isamples_central/thing/select?q=%2A%3A%2A&start=0&rows=10&fq=producedBy_resultTimeRange%3A%5B1800%20TO%202023%5D&fq=source%3A%28OPENCONTEXT%20or%20SESAR%29&fq=-relation_target%3A%2A&facet.field=authorizedBy&facet.field=hasContextCategory&facet.field=hasMaterialCategory&facet.field=registrant&facet.field=source&facet.field=hasSpecimenCategory&facet=on


{'responseHeader': {'zkConnected': True,
  'status': 0,
  'QTime': 2244,
  'params': {'q': '*:*',
   'facet.field': ['authorizedBy',
    'hasContextCategory',
    'hasMaterialCategory',
    'registrant',
    'source',
    'hasSpecimenCategory'],
   'fl': 'id',
   'start': '0',
   'fq': ['producedBy_resultTimeRange:[1800 TO 2023]',
    'source:(OPENCONTEXT or SESAR)',
    '-relation_target:*'],
   'rows': '10',
   'facet': 'on',
   'wt': 'json'}},
 'response': {'numFound': 5540537,
  'start': 0,
  'numFoundExact': True,
  'docs': [{'id': 'IGSN:IESER000J'},
   {'id': 'IGSN:IESER000K'},
   {'id': 'IGSN:IESER000L'},
   {'id': 'IGSN:IELL10002'},
   {'id': 'IGSN:IENWU0PBP'},
   {'id': 'IGSN:IENWU0SDP'},
   {'id': 'IGSN:IESER0009'},
   {'id': 'IGSN:IESER0008'},
   {'id': 'IGSN:IESER0006'},
   {'id': 'IGSN:IESER000B'}]},
 'facet_counts': {'facet_queries': {},
  'facet_fields': {'authorizedBy': [],
   'hasContextCategory': ['Not Provided',
    3982952,
    'Site of past human activities',
    8



Let's break down these parameters, which are used for querying a Solr search engine. Solr is an open-source search platform that provides a wide range of capabilities for text search and faceted search, among other features.

q: This parameter specifies the query. Here, *:* is a wildcard query, meaning it matches all documents in the Solr index.


[fl](https://solr.apache.org/guide/8_11/common-query-parameters.html#fl-field-list-parameter): This stands for "field list". It specifies the fields to return in the result. In your query, a long list of fields like searchText, authorizedBy, producedBy_resultTimeRange, etc., are included. Only these fields will be returned for each document in the search results.

fq: This is the "filter query". It filters the results returned by the main query (q) without influencing the score. Here, there are three filters applied:

> producedBy_resultTimeRange:[1800 TO 2023] filters documents to those produced between the years 1800 and 2023.
source:(OPENCONTEXT) filters documents where the source field matches "OPENCONTEXT".
-relation_target:* excludes documents where the relation_target field exists.
facet.field: Faceting is used to aggregate data based on a field. This parameter specifies the fields for which you want to see facet counts. Facets on fields like authorizedBy, hasContextCategory, etc., are requested.


facet.range, facet.range.gap, facet.range.start, and facet.range.end: These parameters are used for range faceting. You are faceting on the producedBy_resultTimeRange field, starting from "1800-01-01T00:00:00Z" to "2023-01-01T00:00:00Z", with a gap of "+1YEARS". This means it will provide counts for each year in this range.

f.registrant.facet.sort and f.source.facet.sort: These are sorting instructions for the facets. The registrant facet is sorted by count, and the source facet is sorted by index.

rows: This specifies the number of documents to return. In your query, it's set to 20.

facet.limit: This limits the number of facet values returned for each facet field. -1 means no limit.

facet.sort: It dictates how to sort the facet fields. Here, it's sorted by index.

start: This is the offset in the complete result set for pagination. It tells Solr where to start in the list of results (useful for paging through results).

facet: When set to 'on', it enables faceting.

wt: This stands for "writer type" and specifies the output format. Here, 'json' indicates that the response should be in JSON format.

In [23]:
import httpx


url = "https://central.isample.xyz/isamples_central/thing/select"
params = {
    'q': '*:*',
    'fl': 'searchText authorizedBy producedBy_resultTimeRange hasContextCategory curation_accessContraints curation_description_text curation_label curation_location curation_responsibility description_text id informalClassification keywords label hasMaterialCategory producedBy_description_text producedBy_hasFeatureOfInterest producedBy_label producedBy_responsibility producedBy_resultTime producedBy_samplingSite_description_text producedBy_samplingSite_label producedBy_samplingSite_location_elevationInMeters producedBy_samplingSite_location_latitude producedBy_samplingSite_location_longitude producedBy_samplingSite_placeName registrant samplingPurpose source sourceUpdatedTime producedBy_samplingSite_location_rpt hasSpecimenCategory',
    'fq': ['producedBy_resultTimeRange:[1800 TO 2023]', 'source:(OPENCONTEXT)', '-relation_target:*'],
    'facet.field': ['authorizedBy', 'hasContextCategory', 'hasMaterialCategory', 'registrant', 'source', 'hasSpecimenCategory'],
    'facet.range': 'producedBy_resultTimeRange',
    'facet.range.gap': '+1YEARS',
    'facet.range.start': '1800-01-01T00:00:00Z',
    'facet.range.end': '2023-01-01T00:00:00Z',
    'f.registrant.facet.sort': 'count',
    'f.source.facet.sort': 'index',
    'rows': '20',
    'facet.limit': '-1',
    'facet.sort': 'index',
    'start': '20',
    'facet': 'on',
    'wt': 'json'
}
headers = {
    'Accept': 'application/json',
    'User-Agent': 'raymondyee.net'
}

# keys in response: 'responseHeader', 'response', 'facet_counts'
response = httpx.get(url, params=params, headers=headers)


INFO:httpx:HTTP Request: GET https://central.isample.xyz/isamples_central/thing/select?q=%2A%3A%2A&fl=searchText%20authorizedBy%20producedBy_resultTimeRange%20hasContextCategory%20curation_accessContraints%20curation_description_text%20curation_label%20curation_location%20curation_responsibility%20description_text%20id%20informalClassification%20keywords%20label%20hasMaterialCategory%20producedBy_description_text%20producedBy_hasFeatureOfInterest%20producedBy_label%20producedBy_responsibility%20producedBy_resultTime%20producedBy_samplingSite_description_text%20producedBy_samplingSite_label%20producedBy_samplingSite_location_elevationInMeters%20producedBy_samplingSite_location_latitude%20producedBy_samplingSite_location_longitude%20producedBy_samplingSite_placeName%20registrant%20samplingPurpose%20source%20sourceUpdatedTime%20producedBy_samplingSite_location_rpt%20hasSpecimenCategory&fq=producedBy_resultTimeRange%3A%5B1800%20TO%202023%5D&fq=source%3A%28OPENCONTEXT%29&fq=-relation_target

In [24]:
# get back parameters that went into the query and some basic metadata
response.json()['responseHeader']

{'zkConnected': True,
 'status': 0,
 'QTime': 383,
 'params': {'facet.range': 'producedBy_resultTimeRange',
  'facet.field': ['authorizedBy',
   'hasContextCategory',
   'hasMaterialCategory',
   'registrant',
   'source',
   'hasSpecimenCategory'],
  'facet.range.gap': '+1YEARS',
  'fl': 'searchText authorizedBy producedBy_resultTimeRange hasContextCategory curation_accessContraints curation_description_text curation_label curation_location curation_responsibility description_text id informalClassification keywords label hasMaterialCategory producedBy_description_text producedBy_hasFeatureOfInterest producedBy_label producedBy_responsibility producedBy_resultTime producedBy_samplingSite_description_text producedBy_samplingSite_label producedBy_samplingSite_location_elevationInMeters producedBy_samplingSite_location_latitude producedBy_samplingSite_location_longitude producedBy_samplingSite_placeName registrant samplingPurpose source sourceUpdatedTime producedBy_samplingSite_location_r

In [25]:
# 'numFound', 'start', 'numFoundExact', 'docs'
response.json()['response'].keys()

(response.json()['response']['numFound'], response.json()['response']['numFoundExact'])


(853229, True)

In [26]:
response.json()['response']['docs'][0].keys()

dict_keys(['id', 'sourceUpdatedTime', 'label', 'searchText', 'description_text', 'hasContextCategory', 'hasMaterialCategory', 'hasSpecimenCategory', 'keywords', 'producedBy_label', 'producedBy_description_text', 'producedBy_responsibility', 'producedBy_resultTime', 'producedBy_resultTimeRange', 'producedBy_samplingSite_label', 'producedBy_samplingSite_placeName', 'producedBy_samplingSite_location_rpt', 'producedBy_samplingSite_location_latitude', 'producedBy_samplingSite_location_longitude', 'source'])

# plotting the collection dates

In [31]:
import httpx

url = 'https://central.isample.xyz/isamples_central/thing/select/info'
url = 'https://central.isample.xyz/isamples_central/thing/select?q=*:*&facet=true&facet.range=producedBy_resultTimeRange&facet.range.start=NOW/YEAR-200YEARS&facet.range.end=NOW/YEAR%2B1YEAR&facet.range.gap=%2B1YEAR'


headers = {
    'accept': 'application/json'
}

response = httpx.get(url, headers=headers)

print(response.json())


INFO:httpx:HTTP Request: GET https://central.isample.xyz/isamples_central/thing/select?q=*:*&facet=true&facet.range=producedBy_resultTimeRange&facet.range.start=NOW%2FYEAR-200YEARS&facet.range.end=NOW%2FYEAR%252B1YEAR&facet.range.gap=%252B1YEAR "HTTP/1.1 200 OK"


{'responseHeader': {'zkConnected': True, 'status': 400, 'QTime': 1, 'params': {'facet.range': 'producedBy_resultTimeRange', 'q': '*:*', 'facet.range.gap': '%2B1YEAR', 'fl': 'id', 'start': '0', 'rows': '10', 'facet': 'true', 'wt': 'json', 'facet.range.start': 'NOW/YEAR-200YEARS', 'facet.range.end': 'NOW/YEAR%2B1YEAR'}}, 'error': {'metadata': ['error-class', 'org.apache.solr.common.SolrException', 'root-error-class', 'java.text.ParseException'], 'msg': "Can't parse value NOW/YEAR%2B1YEAR for field: producedBy_resultTimeRange", 'code': 400}}


In [32]:
response.json()['facet_counts']['facet_ranges']['producedBy_resultTimeRange']['counts']

KeyError: 'facet_counts'

In [29]:

k = response.json()['facet_counts']['facet_ranges']['producedBy_resultTimeRange']['counts']
dict(zip(k[::2], k[1::2]))



KeyError: 'facet_counts'

In [30]:
import pandas as pd
import matplotlib.pyplot as plt

# Assuming data is your response.json()['facet_counts']['facet_ranges']['producedBy_resultTimeRange']['counts']
k = response.json()['facet_counts']['facet_ranges']['producedBy_resultTimeRange']['counts']
data = dict(zip(k[::2], k[1::2]))


# Convert the dictionary to a DataFrame
df = pd.DataFrame(list(data.items()), columns=['Date', 'Count'])

# Convert the 'Date' column to datetime
df['Date'] = pd.to_datetime(df['Date'])

# Extract the year from the date
df['Year'] = df['Date'].dt.year

# Count the occurrences of each year
year_counts = df['Year'].value_counts().sort_index()

# Plot the counts vs year
year_counts.plot(kind='line')
plt.xlabel('Year')
plt.ylabel('Count')
plt.title('Count vs Year')
plt.show()


KeyError: 'facet_counts'

In [None]:
k = response.json()['facet_counts']['facet_ranges']['producedBy_resultTimeRange']['counts']
data = dict(zip(k[::2], k[1::2]))

df = pd.DataFrame(list(data.items()), columns=['Date', 'Count'])
df.plot()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Assuming df is your DataFrame
df['Date'] = pd.to_datetime(df['Date'])

# deal with log scale
df = df.loc[df['Count'] != 0]

# df['Count'] = df['Count'].replace(0, np.nan)
# df['Count'] = df['Count'].fillna(0.1)

plt.figure(figsize=(10,6))
plt.scatter(df['Date'], df['Count'], color='green', alpha=0.5, s=10)
plt.yscale('log')

plt.xlabel('Date')
plt.ylabel('Count')
plt.title('Count over Date')
plt.show()

In [None]:
%%bash

curl -X 'GET' \
  'https://central.isample.xyz/isamples_central/thing/select?facet=true&facet.mincount=0&facet.field=source' \
  -H 'accept: application/json'

In [None]:
field_names = cli.field_names()


In [None]:
len(field_names)

In [None]:
fields = ["source", "hasMaterialCategory", "hasContextCategory"]
facets = cli.facets("*:*", fields)
print(json.dumps(facets, indent=2))

In [None]:
# Get counts of values grouping by three dimsions: source, hasMaterialCategory, and hasContextCategory
dimensions = ["source", "hasMaterialCategory", "hasContextCategory"]
xd = cli.pivot("*:*", dimensions)
print(xd.loc["geome", "organic material", "bacteria"].sum())

In [None]:
# Get counts of values grouping by three dimsions: source, hasMaterialCategory, and hasContextCategory
dimensions = ["source", "hasMaterialCategory", "hasContextCategory"]
xd = cli.pivot("*:*", dimensions)
print(xd.loc["geome", "organic material", "bacteria"].sum())

In [None]:
# Sum by axis 2 (hasContextCategory) and print
df = xd.sum(axis=2).to_pandas()
# display transposed
display(df.T)


In [None]:
print(xd.loc["sesar", "rock"].sum())

In [None]:
# Field names in solr
for name in cli.field_names():
    print(name)