# Counting Registered IGSNs

This page counts the number of IGSN registrations by year and by OAI-PMH set.

First, find the set names as reported by the OAI-PMH service. 

In [12]:
import pprint
import igsn_lib.oai

# Endpoint of the OAI-PMH service queried throughout this notebook.
baseurl = "https://doidb.wdc-terra.org/igsnoaip/oai"

# Ask the service for its sets; per-set counts are not requested here
# (get_counts=False).
svc = igsn_lib.oai.getSickle(baseurl)
set_list = igsn_lib.oai.listSets(svc, get_counts=False)

# Keep only the setSpec identifier of each entry and show them sorted.
set_specs = [entry['setSpec'] for entry in set_list]
pprint.pprint(sorted(set_specs))

['ANDS',
 'ANDS.AUSCOPE',
 'ANDS.AUSCOPE.REFQUALITY',
 'ANDS.REFQUALITY',
 'CNRS',
 'CNRS.CNRS',
 'CNRS.CNRS.REFQUALITY',
 'CNRS.REFQUALITY',
 'CSIRO',
 'CSIRO.CSIRO',
 'CSIRO.CSIRO.REFQUALITY',
 'CSIRO.REFQUALITY',
 'GEOAUS',
 'GEOAUS.AU',
 'GEOAUS.AU.REFQUALITY',
 'GEOAUS.REFQUALITY',
 'GFZ',
 'GFZ.GFZ',
 'GFZ.GFZ.REFQUALITY',
 'GFZ.REFQUALITY',
 'IEDA',
 'IEDA.REFQUALITY',
 'IEDA.SESAR',
 'IEDA.SESAR.REFQUALITY',
 'IFREMER',
 'IFREMER.IGSN',
 'IFREMER.IGSN.REFQUALITY',
 'IFREMER.REFQUALITY',
 'KIGAM',
 'KIGAM.DC',
 'KIGAM.DC.REFQUALITY',
 'KIGAM.REFQUALITY',
 'MARUM',
 'MARUM.HB',
 'MARUM.HB.REFQUALITY',
 'MARUM.REFQUALITY',
 'REFQUALITY',
 'UKI',
 'UKI.REFQUALITY',
 'UKI.RZ',
 'UKI.RZ.REFQUALITY']


Keep only the top-level name of each set (the part before the first `.`) for grouping the counts:

In [13]:
# The top-level name is the text before the first '.' of a setSpec;
# building a set collapses the repeated prefixes.
base_names = {spec.partition('.')[0] for spec in set_specs}
pprint.pprint(base_names)

{'ANDS',
 'CNRS',
 'CSIRO',
 'GEOAUS',
 'GFZ',
 'IEDA',
 'IFREMER',
 'KIGAM',
 'MARUM',
 'REFQUALITY',
 'UKI'}


Get the number of matching records for each combination of year and top-level set name:

In [14]:
import time
import concurrent.futures
import pandas as pd
import numpy as np

def loadCount(service, bname, year):
    """Count the records of one set that fall within one calendar year.

    Args:
        service: OAI-PMH client handle, passed straight through to
            ``igsn_lib.oai.recordCount``.
        bname: Set name, forwarded as ``set_spec``.
        year: Calendar year (integer) to count.

    Returns:
        A ``(bname, year, count)`` tuple, where ``count`` is whatever
        ``igsn_lib.oai.recordCount`` returns for that set and date window.
    """
    dfrom = f"{year}-01-01T00:00:00Z"
    # OAI-PMH defines both `from` and `until` as inclusive bounds, so the
    # window must end on the last second of the year. Ending it at midnight of
    # the following 1 January would let a record stamped exactly then be
    # counted in two consecutive years.
    # NOTE(review): assumes recordCount forwards tfrom/tuntil unchanged as the
    # OAI-PMH from/until arguments - confirm against igsn_lib.
    duntil = f"{year}-12-31T23:59:59Z"
    count = igsn_lib.oai.recordCount(service, set_spec=bname, tfrom=dfrom, tuntil=duntil)
    return (bname, year, count)

# Years to tabulate (2012 through 2020) and the labels of the result columns.
years = list(range(2012, 2021))
columns = ['Registrant', 'Year', 'Count', ]
data = []
tstart = time.time()
with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
    # Queue one count request per (set name, year) pair.
    futures = [
        executor.submit(loadCount, svc, name, yr)
        for name in base_names
        for yr in years
    ]
    # Collect the rows in whatever order the requests finish.
    data.extend(
        done.result() for done in concurrent.futures.as_completed(futures)
    )
df = pd.DataFrame.from_records(data, columns=columns)
print(f"Took {time.time()-tstart:0.3} seconds")

Took 8.08 seconds


Now pivot the counts into a table with one row per registrant (set name) and one column per year, plus row and column totals:

In [15]:
# Reshape the long records into one row per registrant, one column per year.
p = df.pivot(index='Registrant', columns='Year', values='Count')
# Add the per-registrant sum across all years as a final column ...
p = p.assign(Total=p.sum(axis=1))
# ... then the per-column sums (including the grand total) as a final row.
p.loc['Total'] = p.sum()
p

Year,2012,2013,2014,2015,2016,2017,2018,2019,2020,Total
Registrant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ANDS,0,0,0,0,0,0,3,3110,89,3202
CNRS,0,0,0,0,0,0,0,0,886,886
CSIRO,0,0,0,535,91,31400,610,602,10,33248
GEOAUS,0,0,0,1788859,295283,98237,56448,127423,1244356,3610606
GFZ,0,0,0,0,0,7582,493,1834,472,10381
IEDA,0,71023,2,3998296,36951,167195,51270,54289,150834,4529860
IFREMER,0,0,0,0,0,0,4254,11218,3152,18624
KIGAM,0,0,0,0,0,0,738,641,18,1397
MARUM,0,0,0,0,1015,97729,18434,19409,0,136587
REFQUALITY,0,0,0,0,0,0,0,0,0,0
