In [1]:
import json
import logging
import typing
import urllib.parse
import httpx
import xarray
import pysolr

from urllib.parse import quote

import pandas as pd
from pandas import DataFrame, Series
import numpy as np

import matplotlib.pyplot as plt

from collections import Counter
from isbclient import IsbClient, MAJOR_FIELDS, FL_DEFAULT, FACET_FIELDS_DEFAULT, FACET_RANGE_FIELDS_DEFAULT, ISAMPLES_SOURCES
from isbclient import format_date_for_solr, create_date_range_query, filter_null_values
from isbclient import monkey_patch_select, SWITCH_TO_POST


from itertools import islice

logging.getLogger().setLevel(logging.INFO)

# monkeypatch pysolr?
monkey_patch_select(active=True)
SWITCH_TO_POST = 10000


# The overall iSamples API

*  https://central.isample.xyz/isamples_central/ui is the swagger UI
* https://central.isample.xyz/isamples_central/openapi.json is the swagger file


In [2]:

OPENAPI_URL = 'https://central.isample.xyz/isamples_central/openapi.json'
r = httpx.get(OPENAPI_URL)
r.json()['paths'].keys()

INFO:httpx:HTTP Request: GET https://central.isample.xyz/isamples_central/openapi.json "HTTP/1.1 200 OK"


dict_keys(['/metrics', '/metrics/', '/thing', '/thing/', '/thing/types', '/thing/select/', '/thing/select', '/thing/reliquery', '/thing/stream', '/thing/select/info', '/h3_counts/', '/things', '/thing/{identifier}', '/resolve/{identifier}', '/stac_item/{identifier}', '/stac_collection/{filename}', '/things_geojson_heatmap', '/things_leaflet_heatmap', '/related', '/related/'])

# /thing/select: Solr-based select interface

In [3]:
# focus on /thing/select endpoint
r = httpx.get(OPENAPI_URL)
r.json()['paths']['/thing/select']['get']

INFO:httpx:HTTP Request: GET https://central.isample.xyz/isamples_central/openapi.json "HTTP/1.1 200 OK"


{'summary': 'Get Solr Select',
 'operationId': 'get_solr_select_thing_select_get',
 'responses': {'200': {'description': 'Successful Response',
   'content': {'application/json': {'schema': {'title': 'Response Get Solr Select Thing Select Get'}}}}}}

In [4]:
class IsbClient2(IsbClient):
    def __init__(self, url='https://central.isample.xyz/isamples_central/thing'):
        super().__init__()
        self.url = url
        self.solr = pysolr.Solr(self.url, always_commit=True)

    def _fq_from_kwargs(self, collection_date_start=1800, collection_date_end='NOW', source=None, **kwargs):
        """ 
        builds fq from a set of defaults and kwargs
        TO DO: incorporate kwargs into fq -- kwargs are essentially ignored right now
        # https://github.com/django-haystack/pysolr/issues/58
        Also, we need to be able to handle multiple values for a single key and clarify what this function should return -- a tuple or a list (and not dict)
        """
        # build fq
        # 'field1': quote('value with spaces and special characters like &'),

        # source is a tuple drawing from ['SESAR', 'OPENCONTEXT', 'GEOME', 'SMITHSONIAN']
        if source is not None:
            source = " or ".join([f'"{s}"' for s in source])

        filter_conditions = {
            
            'producedBy_resultTimeRange': f'[{collection_date_start} TO {collection_date_end}]',  # Range query
            'source': source,  # Boolean logic
            '-relation_target':'*'
        }

        filter_conditions.update(kwargs)

        # Convert to list of fq strings
        fq = [f'{field}:{value}' for field, value in filter_null_values(filter_conditions).items()]

        # fq = ['producedBy_resultTimeRange:[1800 TO 2023]', 'source:(OPENCONTEXT or SESAR)', '-relation_target:*']
        return fq

    def default_search_params(self, q='*:*',
                       fl = FL_DEFAULT,
                       fq = None,
                       start=0, rows=20, 
                       facet_field = FACET_FIELDS_DEFAULT,
                       sort =  'id ASC',
                       **kwargs):
        
        if fq is None:
            fq = self._fq_from_kwargs()
    
        params={
            'q': q, 
            'fl': fl,
            'start':start, 
            'rows': rows, 
            'fq': fq,
            'facet': 'on',
            'facet.field': facet_field,
            'cursorMark': '*',
            'sort': sort,
        }

        # update params with kwargs
        params.update(kwargs)
        return params
                
    def search(self, params=None, **kwargs):
        if params is None:
            params = self.default_search_params(**kwargs)

        # give an option to pick how to do the search
        if kwargs.get('thingselect', False):
            return self._request("thing/select", params)
        else:
            return self.solr.search(**params)



widgetized forms to formulate query

display number of hits
display facets

map
dataframe


In [5]:
cli = IsbClient2()
fq = cli._fq_from_kwargs(source=('OPENCONTEXT',))
params = cli.default_search_params(fq=fq, fl=FL_DEFAULT, rows=100, **FACET_RANGE_FIELDS_DEFAULT)

# use the /thing/select endpoint directly

query = cli.search(params=params, thingselect=True)
# print number of hits
print (len(query))
results = islice(query, 300)

df = DataFrame(results)
df.head()

INFO:httpx:HTTP Request: GET https://central.isample.xyz/isamples_central/thing/select?q=%2A%3A%2A&fl=searchText&fl=authorizedBy&fl=producedBy_resultTimeRange&fl=hasContextCategory&fl=curation_accessContraints&fl=curation_description_text&fl=curation_label&fl=curation_location&fl=curation_responsibility&fl=description_text&fl=id&fl=informalClassification&fl=keywords&fl=label&fl=hasMaterialCategory&fl=producedBy_description_text&fl=producedBy_hasFeatureOfInterest&fl=producedBy_label&fl=producedBy_responsibility&fl=producedBy_resultTime&fl=producedBy_samplingSite_description_text&fl=producedBy_samplingSite_label&fl=producedBy_samplingSite_location_elevationInMeters&fl=producedBy_samplingSite_location_latitude&fl=producedBy_samplingSite_location_longitude&fl=producedBy_samplingSite_placeName&fl=registrant&fl=samplingPurpose&fl=source&fl=sourceUpdatedTime&fl=producedBy_samplingSite_location_rpt&fl=hasSpecimenCategory&start=0&rows=100&fq=producedBy_resultTimeRange%3A%5B1800%20TO%20NOW%5D&fq

4


Unnamed: 0,0
0,responseHeader
1,response
2,nextCursorMark
3,facet_counts


In [6]:
cli = IsbClient2()
fq = cli._fq_from_kwargs(source=('OPENCONTEXT',))
params = cli.default_search_params(fq=fq, fl=FL_DEFAULT, rows=100, **FACET_RANGE_FIELDS_DEFAULT)

# use pysolr to get the results
query = cli.search(params=params)
# print number of hits
print (len(query))
results = islice(query, 300)

df = DataFrame(results)
df.head()

INFO:pysolr:Finished 'https://central.isample.xyz/isamples_central/thing/select?q=%2A%3A%2A&fl=searchText&fl=authorizedBy&fl=producedBy_resultTimeRange&fl=hasContextCategory&fl=curation_accessContraints&fl=curation_description_text&fl=curation_label&fl=curation_location&fl=curation_responsibility&fl=description_text&fl=id&fl=informalClassification&fl=keywords&fl=label&fl=hasMaterialCategory&fl=producedBy_description_text&fl=producedBy_hasFeatureOfInterest&fl=producedBy_label&fl=producedBy_responsibility&fl=producedBy_resultTime&fl=producedBy_samplingSite_description_text&fl=producedBy_samplingSite_label&fl=producedBy_samplingSite_location_elevationInMeters&fl=producedBy_samplingSite_location_latitude&fl=producedBy_samplingSite_location_longitude&fl=producedBy_samplingSite_placeName&fl=registrant&fl=samplingPurpose&fl=source&fl=sourceUpdatedTime&fl=producedBy_samplingSite_location_rpt&fl=hasSpecimenCategory&start=0&rows=100&fq=producedBy_resultTimeRange%3A%5B1800+TO+NOW%5D&fq=source%3A%

882128


INFO:pysolr:Finished 'https://central.isample.xyz/isamples_central/thing/select?q=%2A%3A%2A&fl=searchText&fl=authorizedBy&fl=producedBy_resultTimeRange&fl=hasContextCategory&fl=curation_accessContraints&fl=curation_description_text&fl=curation_label&fl=curation_location&fl=curation_responsibility&fl=description_text&fl=id&fl=informalClassification&fl=keywords&fl=label&fl=hasMaterialCategory&fl=producedBy_description_text&fl=producedBy_hasFeatureOfInterest&fl=producedBy_label&fl=producedBy_responsibility&fl=producedBy_resultTime&fl=producedBy_samplingSite_description_text&fl=producedBy_samplingSite_label&fl=producedBy_samplingSite_location_elevationInMeters&fl=producedBy_samplingSite_location_latitude&fl=producedBy_samplingSite_location_longitude&fl=producedBy_samplingSite_placeName&fl=registrant&fl=samplingPurpose&fl=source&fl=sourceUpdatedTime&fl=producedBy_samplingSite_location_rpt&fl=hasSpecimenCategory&start=0&rows=100&fq=producedBy_resultTimeRange%3A%5B1800+TO+NOW%5D&fq=source%3A%

Unnamed: 0,id,sourceUpdatedTime,label,searchText,description_text,hasContextCategory,hasMaterialCategory,hasSpecimenCategory,keywords,registrant,...,producedBy_resultTime,producedBy_resultTimeRange,producedBy_samplingSite_description_text,producedBy_samplingSite_label,producedBy_samplingSite_location_rpt,producedBy_samplingSite_location_latitude,producedBy_samplingSite_location_longitude,source,producedBy_label,producedBy_samplingSite_placeName
0,ark:/28722/k2000024f,2023-10-07T07:53:03Z,Object VdM20060209,"[Object VdM20060209, 'early bce/ce': -535.0 | ...",'early bce/ce': -535.0 | 'late bce/ce': -50.0 ...,[Site of past human activities],"[mat:rock, mat:anthropogenicmetal, mat:biogeni...",[physicalspecimen],"[Architecture, Human settlements, Subsistence ...",[],...,2012-12-28T00:00:00Z,2012-12-28T00:00:00Z,https://opencontext.org/subjects/167674e7-1eda...,Vescovado di Murlo,POINT (11.391122443138563 43.171122385167024),43.171124,11.391123,OPENCONTEXT,,
1,ark:/28722/k2000025x,2023-10-07T06:55:40Z,Architectural Element PC 19680385,"[Architectural Element PC 19680385, 'early bce...",'early bce/ce': -700.0 | 'late bce/ce': -535.0...,[Site of past human activities],[anyanthropogenicmaterial],[artifact],"[Architecture, Human settlements, Subsistence ...",[],...,2012-12-28T00:00:00Z,2012-12-28T00:00:00Z,https://opencontext.org/subjects/871b9ef8-bc68...,Poggio Civitate,POINT (11.400837596717457 43.15319356129963),43.153194,11.400838,OPENCONTEXT,,
2,ark:/28722/k2000027w,2023-10-04T06:00:39Z,Animal Bone Bone Ref# 3008,"[Animal Bone Bone Ref# 3008, 'early bce/ce': -...",'early bce/ce': -6700.0 | 'late bce/ce': -6000...,[Site of past human activities],[biogenicnonorganicmaterial],"[ornament, container, architectural element]","[Agriculture, Animal remains (Archaeology), Ar...",[],...,2013-03-04T00:00:00Z,2013-03-04T00:00:00Z,https://opencontext.org/subjects/2767a2d2-a050...,Pınarbaşı,POINT (33.018551 37.49432),37.49432,33.01855,OPENCONTEXT,,
3,ark:/28722/k2000028c,2023-10-04T05:58:40Z,Animal Bone Bone Ref# 2237,"[Animal Bone Bone Ref# 2237, 'early bce/ce': -...",'early bce/ce': -6700.0 | 'late bce/ce': -6000...,[Site of past human activities],[biogenicnonorganicmaterial],"[container, ornament, architectural element]","[Agriculture, Animal remains (Archaeology), Ar...",[],...,2013-03-04T00:00:00Z,2013-03-04T00:00:00Z,https://opencontext.org/subjects/2767a2d2-a050...,Pınarbaşı,POINT (33.018551 37.49432),37.49432,33.01855,OPENCONTEXT,,
4,ark:/28722/k2000029v,2023-10-04T05:55:15Z,Animal Bone Bone Ref# 991,"[Animal Bone Bone Ref# 991, 'early bce/ce': -6...",'early bce/ce': -6700.0 | 'late bce/ce': -6000...,[Site of past human activities],[biogenicnonorganicmaterial],"[container, ornament, architectural element]","[Agriculture, Animal remains (Archaeology), Ar...",[],...,2013-03-04T00:00:00Z,2013-03-04T00:00:00Z,https://opencontext.org/subjects/2767a2d2-a050...,Pınarbaşı,POINT (33.018551 37.49432),37.49432,33.01855,OPENCONTEXT,,


In [7]:
params

{'q': '*:*',
 'fl': ('searchText',
  'authorizedBy',
  'producedBy_resultTimeRange',
  'hasContextCategory',
  'curation_accessContraints',
  'curation_description_text',
  'curation_label',
  'curation_location',
  'curation_responsibility',
  'description_text',
  'id',
  'informalClassification',
  'keywords',
  'label',
  'hasMaterialCategory',
  'producedBy_description_text',
  'producedBy_hasFeatureOfInterest',
  'producedBy_label',
  'producedBy_responsibility',
  'producedBy_resultTime',
  'producedBy_samplingSite_description_text',
  'producedBy_samplingSite_label',
  'producedBy_samplingSite_location_elevationInMeters',
  'producedBy_samplingSite_location_latitude',
  'producedBy_samplingSite_location_longitude',
  'producedBy_samplingSite_placeName',
  'registrant',
  'samplingPurpose',
  'source',
  'sourceUpdatedTime',
  'producedBy_samplingSite_location_rpt',
  'hasSpecimenCategory'),
 'start': 0,
 'rows': 100,
 'fq': ['producedBy_resultTimeRange:[1800 TO NOW]',
  'source

In [8]:
# write out the call to iSamples using httpx to compare get vs post

import httpx
ISB_SERVER = "https://central.isample.xyz/isamples_central/"

r = httpx.request('GET', f'{ISB_SERVER}/thing/select', params=params)
r.json()['response']['numFound']

INFO:httpx:HTTP Request: GET https://central.isample.xyz/isamples_central//thing/select?q=%2A%3A%2A&fl=searchText&fl=authorizedBy&fl=producedBy_resultTimeRange&fl=hasContextCategory&fl=curation_accessContraints&fl=curation_description_text&fl=curation_label&fl=curation_location&fl=curation_responsibility&fl=description_text&fl=id&fl=informalClassification&fl=keywords&fl=label&fl=hasMaterialCategory&fl=producedBy_description_text&fl=producedBy_hasFeatureOfInterest&fl=producedBy_label&fl=producedBy_responsibility&fl=producedBy_resultTime&fl=producedBy_samplingSite_description_text&fl=producedBy_samplingSite_label&fl=producedBy_samplingSite_location_elevationInMeters&fl=producedBy_samplingSite_location_latitude&fl=producedBy_samplingSite_location_longitude&fl=producedBy_samplingSite_placeName&fl=registrant&fl=samplingPurpose&fl=source&fl=sourceUpdatedTime&fl=producedBy_samplingSite_location_rpt&fl=hasSpecimenCategory&start=0&rows=100&fq=producedBy_resultTimeRange%3A%5B1800%20TO%20NOW%5D&f

882128

In [9]:
# make a post request version

from urllib.parse import urlencode

headers = {
    "Content-type": "application/x-www-form-urlencoded; charset=utf-8"
}

params_encoded = urlencode(params)
r = httpx.post(f'{ISB_SERVER}/thing/select', data=params_encoded, headers=headers)
r

INFO:httpx:HTTP Request: POST https://central.isample.xyz/isamples_central//thing/select "HTTP/1.1 200 OK"


<Response [200 OK]>

In [10]:
r.json()

{'responseHeader': {'zkConnected': True,
  'status': 0,
  'QTime': 0,
  'params': {'q': '*:*',
   'fl': 'id',
   'start': '0',
   'rows': '10',
   'wt': 'json'}},
 'response': {'numFound': 6387537,
  'start': 0,
  'numFoundExact': True,
  'docs': [{'id': 'IGSN:IESER000J'},
   {'id': 'IGSN:IESER000K'},
   {'id': 'IGSN:IESER000L'},
   {'id': 'IGSN:IELL10002'},
   {'id': 'IGSN:IENWU0PBP'},
   {'id': 'IGSN:IENWU0SDP'},
   {'id': 'IGSN:IESER0009'},
   {'id': 'IGSN:IESER0008'},
   {'id': 'IGSN:IESER0006'},
   {'id': 'IGSN:IESER000B'}]}}

In [None]:
assert set(query.raw_response.keys()) == set(['responseHeader', 'response', 'nextCursorMark', 'facet_counts'])

In [None]:
# dict_keys(['facet_queries', 'facet_fields', 'facet_ranges', 'facet_intervals', 'facet_heatmaps'])
query.raw_response['facet_counts'].keys()

query.raw_response['facet_counts']['facet_fields'].keys()

In [None]:
query.raw_response['facet_counts']['facet_fields']['source']

In [None]:
from ipytree import Tree, Node
from ipyleaflet import Map, Marker
from ipywidgets import HBox, link, Layout

m = Map(center=[47.51, 4.04], zoom=4, layout=Layout(height='400px'))
tree = Tree()
tree.layout.width = '40%'
box = HBox([tree, m])

markers_node = Node('Markers')
tree.add_node(markers_node)

layers_node = Node('Layers', icon='map')
tree.add_node(layers_node)

cities = [
    {'name': 'London', 'location': [51.5074, 0.1278]},
    {'name': 'Paris', 'location': [48.8566, 2.3522]},
    {'name': 'Barcelona', 'location': [41.31, 2.109]}
]

for city in cities:
    marker = Marker(location=city.get('location'))
    node = Node(city.get('name'), icon='map-marker')

    link((marker, 'visible'), (node, 'selected'))

    m.add_layer(marker)
    markers_node.add_node(node)

box

In [None]:
# query.raw_response.keys() --> dict_keys(['responseHeader', 'response', 'nextCursorMark', 'facet_counts'])
query.raw_response['facet_counts']['facet_ranges'].keys()

In [None]:
# keys: dict_keys(['facet_queries', 'facet_fields', 'facet_ranges', 'facet_intervals', 'facet_heatmaps'])
query.raw_response['facet_counts']['facet_ranges'].keys()

In [None]:
# 'responseHeader', 'index', 'schema', 'info'
r = cli._request("thing/select/info")
r.keys()

In [None]:
r['schema']['fields'].keys()

In [None]:
# timeout internal server error
if False:
    r = cli._request("thing/types")

In [None]:
# types and classnames for all the fields on the system
Counter([(x['type'], r['schema']['types'][x['type']]['className']) for x in r['schema']['fields'].values()])

In [None]:
# e.g, I for Indexed, T for Tokenized, S for Stored, etc.
r['info']['key']

# ['fields', 'dynamicFields', 'uniqueKeyField', 'similarity', 'types']
r['schema'].keys()

# get the fields -- 78 of them
print ("number of fields", len(r['schema']['fields'].keys()))

field_names = cli.field_names()
print("number of field names (another way to access)", len(field_names))

print ("types for the major fields")
[(k,v['type'], r['schema']['types'][v['type']]['className'] ) for (k,v) in r['schema']['fields'].items() if k in MAJOR_FIELDS.values()]

In [None]:
from urllib.parse import urlparse, parse_qs

url = 'https://central.isample.xyz/isamples_central/thing/select?q=*:*&fl=searchText%20authorizedBy%20producedBy_resultTimeRange%20hasContextCategory%20curation_accessContraints%20curation_description_text%20curation_label%20curation_location%20curation_responsibility%20description_text%20id%20informalClassification%20keywords%20label%20hasMaterialCategory%20producedBy_description_text%20producedBy_hasFeatureOfInterest%20producedBy_label%20producedBy_responsibility%20producedBy_resultTime%20producedBy_samplingSite_description_text%20producedBy_samplingSite_label%20producedBy_samplingSite_location_elevationInMeters%20producedBy_samplingSite_location_latitude%20producedBy_samplingSite_location_longitude%20producedBy_samplingSite_placeName%20registrant%20samplingPurpose%20source%20sourceUpdatedTime%20producedBy_samplingSite_location_rpt%20hasSpecimenCategory&fq=producedBy_resultTimeRange%3A%5B1800%20TO%202023%5D&fq=source%3A(%22OPENCONTEXT%22%20OR%20%22SESAR%22)&fq=-relation_target%3A*&facet.field=authorizedBy&facet.field=hasContextCategory&facet.field=hasMaterialCategory&facet.field=registrant&facet.field=source&facet.field=hasSpecimenCategory&facet.range=producedBy_resultTimeRange&facet.range.gap=%2B1YEARS&facet.range.start=1800-01-01T00:00:00Z&facet.range.end=2023-01-01T00:00:00Z&f.registrant.facet.sort=count&f.source.facet.sort=index&rows=20&facet.limit=-1&facet.sort=index&&start=0&facet=on&wt=json'

parsed_url = urlparse(url)
query_params = parse_qs(parsed_url.query)

# The result is a dictionary where each key is associated with a list of values.
# You can iterate over this dictionary to process your parameters as needed.
for key, values in query_params.items():
    print(f"{key}: {values}")

# If you need each key to have a single value (taking the first value if multiple are present),
# you can do the following:
single_value_params = {key: values[0] for key, values in query_params.items()}
print(single_value_params)


In [None]:
# simplest query -- default

cli._request("thing/select", params={'q': '*:*', 'start':0, 'rows': 10, 
        'fq': ['producedBy_resultTimeRange:[1800 TO 2023]', 'source:(OPENCONTEXT or SESAR)', '-relation_target:*'],
        'facet.field': ['authorizedBy', 'hasContextCategory', 'hasMaterialCategory', 'registrant', 'source', 'hasSpecimenCategory'],
        'facet': 'on',
        })



Let's break down these parameters, which are used for querying a Solr search engine. Solr is an open-source search platform that provides a wide range of capabilities for text search and faceted search, among other features.

q: This parameter specifies the query. Here, *:* is a wildcard query, meaning it matches all documents in the Solr index.


[fl](https://solr.apache.org/guide/8_11/common-query-parameters.html#fl-field-list-parameter): This stands for "field list". It specifies the fields to return in the result. In your query, a long list of fields like searchText, authorizedBy, producedBy_resultTimeRange, etc., are included. Only these fields will be returned for each document in the search results.

fq: This is the "filter query". It filters the results returned by the main query (q) without influencing the score. Here, there are three filters applied:

> producedBy_resultTimeRange:[1800 TO 2023] filters documents to those produced between the years 1800 and 2023.
source:(OPENCONTEXT) filters documents where the source field matches "OPENCONTEXT".
-relation_target:* excludes documents where the relation_target field exists.
facet.field: Faceting is used to aggregate data based on a field. This parameter specifies the fields for which you want to see facet counts. Facets on fields like authorizedBy, hasContextCategory, etc., are requested.


facet.range, facet.range.gap, facet.range.start, and facet.range.end: These parameters are used for range faceting. You are faceting on the producedBy_resultTimeRange field, starting from "1800-01-01T00:00:00Z" to "2023-01-01T00:00:00Z", with a gap of "+1YEARS". This means it will provide counts for each year in this range.

f.registrant.facet.sort and f.source.facet.sort: These are sorting instructions for the facets. The registrant facet is sorted by count, and the source facet is sorted by index.

rows: This specifies the number of documents to return. In your query, it's set to 20.

facet.limit: This limits the number of facet values returned for each facet field. -1 means no limit.

facet.sort: It dictates how to sort the facet fields. Here, it's sorted by index.

start: This is the offset in the complete result set for pagination. It tells Solr where to start in the list of results (useful for paging through results).

facet: When set to 'on', it enables faceting.

wt: This stands for "writer type" and specifies the output format. Here, 'json' indicates that the response should be in JSON format.

In [None]:
import httpx


url = "https://central.isample.xyz/isamples_central/thing/select"
params = {
    'q': '*:*',
    'fl': 'searchText authorizedBy producedBy_resultTimeRange hasContextCategory curation_accessContraints curation_description_text curation_label curation_location curation_responsibility description_text id informalClassification keywords label hasMaterialCategory producedBy_description_text producedBy_hasFeatureOfInterest producedBy_label producedBy_responsibility producedBy_resultTime producedBy_samplingSite_description_text producedBy_samplingSite_label producedBy_samplingSite_location_elevationInMeters producedBy_samplingSite_location_latitude producedBy_samplingSite_location_longitude producedBy_samplingSite_placeName registrant samplingPurpose source sourceUpdatedTime producedBy_samplingSite_location_rpt hasSpecimenCategory',
    'fq': ['producedBy_resultTimeRange:[1800 TO 2023]', 'source:(OPENCONTEXT)', '-relation_target:*'],
    'facet.field': ['authorizedBy', 'hasContextCategory', 'hasMaterialCategory', 'registrant', 'source', 'hasSpecimenCategory'],
    'facet.range': 'producedBy_resultTimeRange',
    'facet.range.gap': '+1YEARS',
    'facet.range.start': '1800-01-01T00:00:00Z',
    'facet.range.end': '2023-01-01T00:00:00Z',
    'f.registrant.facet.sort': 'count',
    'f.source.facet.sort': 'index',
    'rows': '20',
    'facet.limit': '-1',
    'facet.sort': 'index',
    'start': '20',
    'facet': 'on',
    'wt': 'json'
}
headers = {
    'Accept': 'application/json',
    'User-Agent': 'raymondyee.net'
}

# keys in response: 'responseHeader', 'response', 'facet_counts'
response = httpx.get(url, params=params, headers=headers)


In [None]:
# get back parameters that went into the query and some basic metadata
response.json()['responseHeader']

In [None]:
# 'numFound', 'start', 'numFoundExact', 'docs'
response.json()['response'].keys()

(response.json()['response']['numFound'], response.json()['response']['numFoundExact'])


In [None]:
response.json()['response']['docs'][0].keys()

# plotting the collection dates

In [None]:
import httpx

cli = IsbClient2()
fq = cli._fq_from_kwargs(source=('OPENCONTEXT',))
params = cli.default_search_params(fq=fq, fl=FL_DEFAULT, rows=100, **FACET_RANGE_FIELDS_DEFAULT)


url = 'https://central.isample.xyz/isamples_central/thing/select/info'
url = 'https://central.isample.xyz/isamples_central/thing/select?q=*:*&facet=true&facet.range=producedBy_resultTimeRange&facet.range.start=NOW/YEAR-200YEARS&facet.range.end=NOW/YEAR&facet.range.gap=YEAR'


headers = {
    'accept': 'application/json'
}

response = httpx.get('https://central.isample.xyz/isamples_central/thing/select', headers=headers, params=params)

print(response.json())


In [None]:
response.json()['facet_counts']['facet_ranges']['producedBy_resultTimeRange']['counts']

In [None]:

k = response.json()['facet_counts']['facet_ranges']['producedBy_resultTimeRange']['counts']
dict(zip(k[::2], k[1::2]))



In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Assuming data is your response.json()['facet_counts']['facet_ranges']['producedBy_resultTimeRange']['counts']
k = response.json()['facet_counts']['facet_ranges']['producedBy_resultTimeRange']['counts']
data = dict(zip(k[::2], k[1::2]))


# Convert the dictionary to a DataFrame
df = pd.DataFrame(list(data.items()), columns=['Date', 'Count'])

# Convert the 'Date' column to datetime
df['Date'] = pd.to_datetime(df['Date'])

# Extract the year from the date
df['Year'] = df['Date'].dt.year

# Count the occurrences of each year
year_counts = df['Year'].value_counts().sort_index()

# Plot the counts vs year
year_counts.plot(kind='line')
plt.xlabel('Year')
plt.ylabel('Count')
plt.title('Count vs Year')
plt.show()


In [None]:
k = response.json()['facet_counts']['facet_ranges']['producedBy_resultTimeRange']['counts']
data = dict(zip(k[::2], k[1::2]))

df = pd.DataFrame(list(data.items()), columns=['Date', 'Count'])
df.plot()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Assuming df is your DataFrame
df['Date'] = pd.to_datetime(df['Date'])

# deal with log scale
df = df.loc[df['Count'] != 0]

# df['Count'] = df['Count'].replace(0, np.nan)
# df['Count'] = df['Count'].fillna(0.1)

plt.figure(figsize=(10,6))
plt.scatter(df['Date'], df['Count'], color='green', alpha=0.5, s=10)
plt.yscale('log')

plt.xlabel('Date')
plt.ylabel('Count')
plt.title('Count over Date')
plt.show()

In [None]:
%%bash

curl -X 'GET' \
  'https://central.isample.xyz/isamples_central/thing/select?facet=true&facet.mincount=0&facet.field=source' \
  -H 'accept: application/json'

In [None]:
field_names = cli.field_names()


In [None]:
len(field_names)

In [None]:
fields = ["source", "hasMaterialCategory", "hasContextCategory"]
facets = cli.facets("*:*", fields)
print(json.dumps(facets, indent=2))

In [None]:
# Get counts of values grouping by three dimsions: source, hasMaterialCategory, and hasContextCategory
dimensions = ["source", "hasMaterialCategory", "hasContextCategory"]
xd = cli.pivot("*:*", dimensions)
print(xd.loc["geome", "organic material", "bacteria"].sum())

In [None]:
# Get counts of values grouping by three dimsions: source, hasMaterialCategory, and hasContextCategory
dimensions = ["source", "hasMaterialCategory", "hasContextCategory"]
xd = cli.pivot("*:*", dimensions)
print(xd.loc["geome", "organic material", "bacteria"].sum())

In [None]:
# Sum by axis 2 (hasContextCategory) and print
df = xd.sum(axis=2).to_pandas()
# display transposed
display(df.T)


In [None]:
print(xd.loc["sesar", "rock"].sum())

In [None]:
# Field names in solr
for name in cli.field_names():
    print(name)