In [131]:
import os
#os.environ['IVOA_REGISTRY']="http://vao.stsci.edu/RegTAP/TapService.aspx"

import pyvo as vo
import warnings
# There are a number of relatively unimportant warnings that show up, so for now, suppress them:
warnings.filterwarnings("ignore", module="astropy.nddata.blocks.*")
warnings.filterwarnings("ignore", module="pyvo.utils.xml.*")
warnings.filterwarnings("ignore", module="urllib3.connectionpool.*")
import re
from astropy.table import Table
from astropy.io import fits
import astropy.coordinates as coord
coords = coord.SkyCoord.from_name("m51")

import requests
session=requests.Session()
session.verify=False

#  Set these debuglevels to 1 to see traffic details
import logging
log = logging.getLogger('urllib3')
from http.client import HTTPConnection
HTTPConnection.debuglevel = 0
from http.client import HTTPSConnection
HTTPSConnection.debuglevel = 0

import sys
sys.tracebacklimit = 0
#T Dower said:  " ForRegTAP, the preferred URL is now 
#  https://mast.stsci.edu/vo-tap/api/v0.1/registry. 
#  OAI-PMH is still on the old system."
navo_new_regtap = vo.dal.TAPService('https://mast.stsci.edu/vo-tap/api/v0.1/registry')
navo_old_regtap = vo.dal.TAPService('https://vao.stsci.edu/RegTAP/TapService.aspx')
gavo_regtap = vo.dal.TAPService('https://dc.zah.uni-heidelberg.de/__system__/tap/run')
euvo_regtap = vo.dal.TAPService('https://registry.euro-vo.org/regtap/tap')

from pyvo import registry
from astropy.coordinates import SkyCoord
from astropy import units as u

## Messing around with SIAv2 


Ours has a URL that ends '/sia?table=foobar&'.  And that breaks in the SIA2Service, which grabs its capabilities and then changes the URL to match, which is just /sia, so it fails.  

### GAVO's:  

In [40]:
for r in vo.regsearch(keywords=['gavo','siap version 2']):
    if 'gavo' in r.ivoid:  
        print(r.res_title)
        gavosia=r

GAVO Data Center SIAP Version 2 Service


In [48]:
gavosia.interfaces

[Interface(type='sia2', description='', url='http://dc.g-vo.org/__system__/siap2/sitewide/siap2.xml?'),
 Interface(type='vosi#availability', description='', url='http://dc.g-vo.org/__system__/siap2/sitewide/availability'),
 Interface(type='vosi#capabilities', description='', url='http://dc.g-vo.org/__system__/siap2/sitewide/capabilities'),
 Interface(type='vosi#tables', description='', url='http://dc.g-vo.org/__system__/siap2/sitewide/tableMetadata'),
 Interface(type='tap#aux', description='', url='http://dc.g-vo.org/tap'),
 Interface(type='datalink#links-1.1', description='', url='http://dc.g-vo.org/__system__/obscore/dl/dlmeta')]

In [44]:
gavosia.service

SIA2Service(baseurl : 'http://dc.g-vo.org/__system__/siap2/sitewide/siap2.xml?', description : '')

This shows that GAVO uses the ObsCore data model (well, that's what SIAv2 is) but the access_urls are to the specific product.

In [55]:
results.to_table()['access_url']

0
https://www.plate-archive.org/files/DR3/scans/HAM-GR/GRTr063_y.fits
https://www.plate-archive.org/files/DR3/scans/HAM-GR/GRTr049_y.fits
https://www.plate-archive.org/files/DR3/scans/HAM-GR/GRTr037_y.fits
https://www.plate-archive.org/files/DR3/scans/HAM-GR/GRTr126_y.fits
https://www.plate-archive.org/files/DR3/scans/Tartu/plaat1682.fits
https://www.plate-archive.org/files/DR3/scans/HAM-LA/LA02437_y.fits
https://www.plate-archive.org/files/DR3/scans/Tartu/plaat1580.fits
https://www.plate-archive.org/files/DR3/scans/HAM-GR/GRk0359a_y.fits
https://www.plate-archive.org/files/DR3/scans/Tartu/plaat1604.fits
https://www.plate-archive.org/files/DR3/scans/HAM-LA/LA05110_y.fits


### CDS's

In [13]:
cdscone = cdsr.get_service(service_type="conesearch")
cdscone

SCSService(baseurl : 'https://vizier.cds.unistra.fr/viz-bin/conesearch/V/155/gdr2ap?', description : 'Cone search capability for table V/155/gdr2ap (records of GDR2AP)')

In [18]:
cdscone.capability_description

'Cone search capability for table V/155/gdr2ap (records of GDR2AP)'

In [20]:
cdscone.baseurl

'https://vizier.cds.unistra.fr/viz-bin/conesearch/V/155/gdr2ap?'

In [59]:
for r in vo.regsearch(servicetype='image'):
    if 'cds' in r.ivoid:
        print(r.ivoid)
        cdsr=r
        cdsi=r.get_service(service_type='image')

ivo://cds.vizier/siap


In [34]:
cdsi.capability_description

SIAService(baseurl : 'https://cdsarc.cds.unistra.fr/saadavizier/siaservice?collection=[vizier]&', description : 'None')

In [60]:
cdsr.interfaces

[Interface(type='sia', description=None, url='https://cdsarc.cds.unistra.fr/saadavizier/siaservice?collection=[vizier]&')]

So this is NOT SIA2.  Doesn't help me.

# Registry Spring Cleaning notebook

Following up on Markus' [Confessions of a Registry Janitor](https://blog.g-vo.org/registry-a-janitor-speaks-out.html), I propose some regular checks of the metadata.  We already have checks of the validity of services, for instance, in the Operations group weather reports.  This would be compolementary.

## Check 1:  spot check numbers between different registries

What's the best way to get the current registries?  Testing one of them seems circular.  But the [RofR](https://rofr.ivoa.net) is still pointing to the old NAVO RegTAP.  OTOH, there's a bug in the new one.  

In [3]:
result = registry.search(datamodel="regtap").to_table()
print(result['ivoid','access_urls'])

            ivoid              ...
------------------------------ ...
        ivo://aip.gavo.org/tap ...
ivo://archive.stsci.edu/regtap ...
      ivo://esavo/registry/tap ...
         ivo://org.gavo.dc/tap ...
                ivo://purx/tap ...


In [4]:
def compare( query ):
    # Currently 
    #navo_regtap = vo.dal.TAPService('https://vao.stsci.edu/RegTAP/TapService.aspx')
    #navo_new_regtap = vo.dal.TAPService('https://mast.stsci.edu/vo-tap/api/v0.1/registry')
    navo_regtap = vo.dal.TAPService('https://mast.stsci.edu/vo-tap/api/v0.1/registry')
    gavo_regtap = vo.dal.TAPService('https://dc.zah.uni-heidelberg.de/__system__/tap/run')
    euvo_regtap = vo.dal.TAPService('https://registry.euro-vo.org/regtap/tap')

    for name,regtap in [('NAVO',navo_regtap),('GAVO',gavo_regtap),("EUVO",euvo_regtap)]:
        try:
            sias = regtap.search(query)
            print(f"{name} RegTAP finds {len(sias)}")
            strings=sias.to_table()['ivoid'] + ' "'+ sias.to_table()['cap_description']+'"'
        except Exception as e:
            print(f"{name} RegTAP gives error: {e}")       


So hardcode them for now

In [5]:
#  This syntax isn't processed correctly by NAVO's.  Use Markus' preferred.  
#compare("select count(*) as cnt from rr.capability where standard_id like '%sia%'")
compare("select * from rr.capability where standard_id like 'ivo://ivoa.net/std/sia%'")

NAVO RegTAP finds 477
GAVO RegTAP finds 477
EUVO RegTAP finds 474


In [6]:
compare("select * from rr.capability where standard_id like '%hips%'")

NAVO RegTAP finds 26
GAVO RegTAP finds 576
EUVO RegTAP finds 575


In [7]:
compare("select * from rr.capability where standard_id like '%cone%' and ivoid not like '%vizier%'")

NAVO RegTAP finds 1987
GAVO RegTAP finds 1957
EUVO RegTAP finds 1990


In [8]:
compare("select * from rr.capability where standard_id like '%cone%'  and ivoid like '%vizier%'")

NAVO RegTAP finds 24426


  warn("Partial result set. Potential causes MAXREC, async storage space, etc.",


GAVO RegTAP finds 20000
EUVO RegTAP finds 29909


### Check 1.1 look at the differences

In [9]:
query = "select * from rr.capability where standard_id like 'ivo://ivoa.net/std/sia%'"
def diffs(A, B, C, query):
    #  A is a tuple of ('name',service)
    resultsA = A[1].search(query).to_table()
    resultsB = B[1].search(query).to_table()
    resultsC = C[1].search(query).to_table()

    idsA=[r['ivoid']+'---'+r['cap_description'] for r in resultsA ]
    idsB=[r['ivoid']+'---'+r['cap_description'] for r in resultsB ]
    idsC=[r['ivoid']+'---'+r['cap_description'] for r in resultsC ]

    diffs1 = list( set(idsA) - set(idsB) )
    print(f"Found {len(diffs1)} in {A[0]} not in {B[0]}")
    diffs2 = list( set(idsA) - set(idsC) )
    print(f"Found {len(diffs2)} in {A[0]} not in {C[0]}")
    diffs3 = list( set(idsB) - set(idsC) )
    print(f"Found {len(diffs3)} in {B[0]} not in {C[0]}")

    diffs4 = list( set(idsB) - set(idsA) )
    print(f"Found {len(diffs4)} in {B[0]} not in {A[0]}")
    diffs5 = list( set(idsC) - set(idsA) )
    print(f"Found {len(diffs5)} in {C[0]} not in {A[0]}")
    diffs6 = list( set(idsC) - set(idsB) )
    print(f"Found {len(diffs6)} in {C[0]} not in {B[0]}")

    return [diffs1,diffs2,diffs3,diffs4,diffs5,diffs6]


In [10]:
difflists = diffs( ('NAVO',navo_old_regtap), ('GAVO',gavo_regtap), ('EUVO',euvo_regtap), 
      "select * from rr.capability where standard_id like 'ivo://ivoa.net/std/sia%'" )

Found 7 in NAVO not in GAVO
Found 12 in NAVO not in EUVO
Found 7 in GAVO not in EUVO
Found 7 in GAVO not in NAVO
Found 9 in EUVO not in NAVO
Found 4 in EUVO not in GAVO


In [11]:
print("Services in GAVO not in EUVO")
print("\n".join(difflists[2]))
print("Services in EUVO not in GAVO")
print("\n".join(difflists[5]))

Services in GAVO not in EUVO
ivo://jvo/isas/darts/bsgc---SIA interface for this resource
ivo://jvo/isas/darts/halca/halca_vsop_survey_program_data---SIAv2 service for HALCA AGN survey
ivo://vopdc.obspm/gepi/vopsat---
ivo://vopdc.obspm/gepi/vopsat/esor---
ivo://jvo/subaru/sxds/v1.0---
ivo://vopdc.obspm/gepi/vopsat/srcj---
ivo://jvo/nobeyama---NRO Legacy project : FUGIN, COMING, Star-Formation
Services in EUVO not in GAVO
ivo://uk.ac.cam.ast/int-wfs/images/siap-atlas---
ivo://jacobsuni/__system__/siap2/sitewide---
ivo://irsa.ipac/herschel/images/z0mgs_dust---
ivo://uk.ac.cam.ast/iphas/images/siap---


The hard part is then looking at those and understanding why.  What other information would we want to look at?

## Check 2:  UCDs 

- Check 2a:  are the UCDs valid according to astropy.io.votable.ucd.check_ucd

In [278]:
from astropy.io.votable.ucd import check_ucd
query="""
  select distinct ucd, count(*) as cnt
  from rr.table_column 
  group by ucd 
  order by cnt desc
  """
result = gavo_regtap.search(query)

all_ucds = result.to_table()
invalid_ucds = []
for i,u in enumerate(all_ucds['ucd'].data):
    if not check_ucd(u):
        invalid_ucds.append((u,all_ucds['cnt'][i]))
print(f"Found {len(invalid_ucds)} invalid UCDs")
print(f"  The top 10 bad UCD values by number of instances are")
x=[print(f"{c[0]:25}: {c[1]}") for c in invalid_ucds[0:10] ]

Found 79 invalid UCDs
  The top 10 bad UCD values by number of instances are
                         : 256616
??                       : 30342
vox:image_filesize       : 138
????                     : 70
vox:image_mjdateobs      : 59
image?                   : 49
vox:bandpass_id          : 47
vox:bandpass_hilimit     : 39
vox:bandpass_lolimit     : 39
vox:bandpass_refvalue    : 39


In [295]:
for u in invalid_ucds:
    if 'meta.id' in u[0]:
        print(f"found {u[0]}")

found meta.id:meta.main
found meta.id,meta.main
found meta.id;
found meta.id;meta.main;


In [294]:
type(u)

tuple

In [269]:
invalid_ucds_cv = []
for i,u in enumerate(all_ucds['ucd'].data):
    if not check_ucd(u,check_controlled_vocabulary=True):
        invalid_ucds_cv.append((u,all_ucds['cnt'][i]))
print(f"Found {len(invalid_ucds_cv)} that are not valid under UCD1+ controlled vocabulary")
print(f"  The top 10 bad UCD values by number of instances are")
[print(f"{c[0]:25}: {c[1]}") for c in invalid_ucds_cv[0:10] ]

Found 932 that are not valid under UCD1+ controlled vocabulary
  The top 10 bad UCD values by number of instances are
                         : 256616
??                       : 30342
error                    : 13825
code_misc                : 8538
phot_mag                 : 6291
fit_param                : 4505
obs.field                : 4331
number                   : 3090
id_number                : 2694
phot_intensity_adu       : 2512


[None, None, None, None, None, None, None, None, None, None]

## Messing around with UCDs at different publishers

Getting the publishers with the most resources in the Registry.  Let's check those.  

In [265]:
publishers = gavo_regtap.search("""
    select distinct role_ivoid, count(*) as cnt , role_name
    from rr.res_role 
    where base_role = 'publisher' 
    group by role_ivoid, role_name
    order by cnt desc
    """).to_table()[0:10]
publishers

role_ivoid,cnt,role_name
object,int32,object
ivo://cds,26923,CDS
ivo://nasa.heasarc/asd,1091,NASA/GSFC HEASARC
ivo://irsa.ipac/irsa,563,NASA/IPAC Infrared Science Archive
,244,The GAVO DC team
,217,Planetary Data System
ivo://wfau.roe.ac.uk,124,"WFAU, Institute for Astronomy, University of Edinburgh"
ivo://archive.stsci.edu/stsci-arc,96,Space Telescope Science Institute Archive
ivo://svo.cab,71,SVO CAB
ivo://noirlab.edu,65,NSF NOIRLab Astro Data Lab Team
,57,Paris Astronomical Data Centre


GAVO doesn't have an IVOID for the publisher like the above?  Makes it harder.  Hm.

In [290]:
import pandas as pd
warnings.filterwarnings("ignore", message=".*This pattern is interpreted as a regular expression.*")
for i,p in enumerate(publishers['role_name'].data):
    #  Look at all the metadata from this publisher
    print(f"\nlooking at publisher {p}")

    query = f"""
    select ucd, count(*) as cnt from ( rr.res_role natural join rr.table_column )
    where role_name = '{p}'
    group by ucd 
    """
    ucd_cnt_all = gavo_regtap.search(query)
    if len(ucd_cnt_all) != 0:  
        print(f"    publisher {p} publishes {len(ucd_cnt_all)} distinct UCDs")
    else: 
        print(f"    publisher {p} publishes no column metadata (?)")
        continue #  ?
    df = pd.DataFrame(data={"ucd":ucd_cnt_all.to_table()['ucd'].data.data,"cnt":ucd_cnt_all.to_table()['cnt'].data.data})
    for c in invalid_ucds:  #  invalid_ucds or invalid_ucds_cv (this is huge)
        #  c is a tuple of the string and the count
        if c[0]=='':  
            matches = df['ucd'].astype(str).str.len() == 0
        elif '?' in c[0]:
            matches = df['ucd'].str.contains("?",regex=False)
        else:
            matches = df['ucd'] == c[0]
        ucd_cnt = df[matches]['cnt'].sum() # should only be one 
        if ucd_cnt == 0:  
            continue
        print(f"    UCD '{c[0]}' used {ucd_cnt} times")
        


looking at publisher CDS
    publisher CDS publishes 6868 distinct UCDs
    UCD '' used 59470 times

looking at publisher NASA/GSFC HEASARC
    publisher NASA/GSFC HEASARC publishes 2451 distinct UCDs
    UCD '' used 8596 times

looking at publisher NASA/IPAC Infrared Science Archive
    publisher NASA/IPAC Infrared Science Archive publishes no column metadata (?)

looking at publisher The GAVO DC team
    publisher The GAVO DC team publishes 830 distinct UCDs
    UCD '' used 1373 times
    UCD 'vox:image_filesize' used 44 times
    UCD 'vox:image_mjdateobs' used 2 times

looking at publisher Planetary Data System
    publisher Planetary Data System publishes 41 distinct UCDs
    UCD '' used 2604 times

looking at publisher WFAU, Institute for Astronomy, University of Edinburgh
    publisher WFAU, Institute for Astronomy, University of Edinburgh publishes 855 distinct UCDs
    UCD '' used 302314 times
    UCD '??' used 60549 times
    UCD '????' used 60549 times
    UCD 'image?' used 

In [None]:
query="""
  select distinct ucd, count(*) as cnt
  from rr.table_column 
  group by ucd 
  order by cnt desc
  """
result = gavo_regtap.search(query)

all_ucds = result.to_table()


In [163]:
colons = []
for i,u in enumerate(all_ucds['ucd'].data):
    if ":" in u:
        colons.append((u,all_ucds['cnt'][i]))
print(f"Found {len(colons)} invalid UCDs with a ':' character ")
#[ print(f"{c[0]:25}: {c[1]}") for c in colons]
print("\n".join([f"{c[0]:25}: {c[1]}" for c in colons]))

Found 29 invalid UCDs with a ':' character 
vox:image_filesize       : 138
vox:image_mjdateobs      : 59
vox:bandpass_id          : 47
vox:bandpass_hilimit     : 39
vox:bandpass_lolimit     : 39
vox:bandpass_refvalue    : 39
vox:bandpass_unit        : 39
vox:image_naxes          : 39
vox:image_naxis          : 39
vox:image_pixflags       : 39
vox:image_scale          : 39
vox:stc_coordequinox     : 39
vox:stc_coordrefframe    : 39
vox:wcs_cdmatrix         : 39
vox:wcs_coordprojection  : 39
vox:wcs_coordrefpixel    : 39
vox:wcs_coordrefvalue    : 39
vox:image_title          : 38
meta.id:meta.main        : 12
phys.magabs:em.opt.b     : 1
sia:dataid.collection    : 1
vox:image_accessreference: 1
vox:image_format         : 1
vox:obs_end_time         : 1
vox:obs_start_time       : 1
vox:spectrum_axes        : 1
vox:spectrum_dimeq       : 1
vox:spectrum_scaleq      : 1
vox:spectrum_units       : 1


In [164]:
culprits = []
for i,u in enumerate(all_ucds['ucd'].data):
    if not check_ucd(u,check_controlled_vocabulary=True):
        culprits.append((u,all_ucds['cnt'][i]))
print(f"Found {len(culprits)} that are not valid under UCD1+ controlled vocabulary")
print(f"  The top 10 bad UCD values by number of instances are")
[print(f"{c[0]:25}: {c[1]}") for c in culprits[0:10] ]

Found 932 that are not valid under UCD1+ controlled vocabulary
  The top 10 bad UCD values by number of instances are
                         : 256616
??                       : 30342
error                    : 13825
code_misc                : 8538
phot_mag                 : 6291
fit_param                : 4505
obs.field                : 4328
number                   : 3090
id_number                : 2694
phot_intensity_adu       : 2512


[None, None, None, None, None, None, None, None, None, None]

True

### Check 3:  authors

Have
* Last F.
* Last F., Last2 F.
* Last, F.
* F. Last, Last2. F.

At least where there are commas they are used to separate two authors, rather than "Last, F" or something.

In [None]:
names = gavo_regtap.search("select distinct role_name, count(*) as cnt from rr.res_role where base_role = 'creator' group by role_name").to_table()
names

### Check 4:  subjects and the UAT

In [173]:
subjects = gavo_regtap.search("select res_subject, count(*) as cnt from rr.res_subject group by res_subject order by cnt desc").to_table()
subjects

res_subject,cnt
object,int32
visible-astronomy,7256
galaxies,4353
infrared-photometry,4328
spectroscopy,4289
photometry,3833
radial-velocity,2854
surveys,2758
redshifted,2664
variable-stars,2033
...,...


In [174]:
import urllib.request, json 
with urllib.request.urlopen("https://raw.githubusercontent.com/astrothesaurus/UAT/master/UAT.json") as url:
    uat = json.load(url)

In [175]:
#  Generator that goes through the nested JSON and looks for a key anywhere down in it
def item_generator(json_input, lookup_key):
    if isinstance(json_input, dict):
        for k, v in json_input.items():
            if k == lookup_key:
                yield v
            else:
                yield from item_generator(v, lookup_key)
    elif isinstance(json_input, list):
        for item in json_input:
            yield from item_generator(item, lookup_key)

In [176]:
uat_name_list = [x.lower() for x in item_generator(uat,'name')]
print(f"Found {len(uat_name_list)} names in the UAT")
print(uat_name_list[0:10])

Found 4335 names in the UAT
['astrophysical processes', 'astrophysical magnetism', 'cosmic magnetic fields theory', 'emerging flux tubes', 'magnetic fields', 'geomagnetic fields', 'magnetic anomalies', 'primordial magnetic fields', 'gravitation', 'relativity']


In [182]:
invalid_subjects = []
correct_subjects = []
for i,s in enumerate(subjects['res_subject'].data):
    if s.lower() in uat_name_list:
        correct_subjects.append((s,subjects['cnt'][i]))
    else:
        invalid_subjects.append((s,subjects['cnt'][i]))
print(f"Found {len(invalid_subjects)} Registry res_subject entries \
that are not in the UAT and {len(correct_subjects)} that are.")
print(f"  The top 10 bad subject values by number of instances are")
x=[print(f"{c[0]}: {c[1]}") for c in invalid_subjects[0:10] ]

Found 1014 Registry res_subject entries that are not in the UAT and 240 that are.
  The top 10 bad subject values by number of instances are
visible-astronomy: 7256
infrared-photometry: 4328
radial-velocity: 2854
variable-stars: 2033
Wide-band photometry: 1937
multiple-stars: 1868
x-ray-sources: 1720
open-star-clusters: 1677
chemical-abundances: 1611
infrared-sources: 1432


In [184]:
import re
result = [u for u in uat_name_list if re.search("^star.*",u)]
print(f"Found {len(result)} matches to 'star' such as")
print(result[0:10])

Found 17 matches to 'star' such as
['star-planet interactions', 'starburst galaxies', 'starburst galaxies', 'starburst galaxies', 'star atlases', 'star counts', 'star counts', 'star lore', 'starspots', 'starspots']


In [189]:
matches

0       False
1       False
2       False
3       False
4       False
        ...  
1249    False
1250    False
1251    False
1252    False
1253     True
Name: res_subject, Length: 1254, dtype: bool

In [199]:
publishers['publisher'].data

masked_array(data=['ivo://cds', '', 'ivo://nasa.heasarc/asd',
                   'ivo://irsa.ipac/irsa', 'ivo://wfau.roe.ac.uk',
                   'ivo://archive.stsci.edu/stsci-arc', 'ivo://svo.cab',
                   'ivo://noirlab.edu', 'ivo://cadc.nrc.ca/org',
                   'ivo://china-vo'],
             mask=[False, False, False, False, False, False, False, False,
                   False, False],
       fill_value='?',
            dtype=object)

In [200]:

##  Too many in CDS Vizier.  
for i,p in enumerate(publishers['publisher'].data):
    if 'cds' in p or p=='':  
        continue
    #  Look at all the metadata from this publisher
    print(f"\nlooking at publisher {p}")
    query = """
    select res_subject, count(*) as cnt from rr.res_subject 
    group by res_subject order by cnt desc
    """
    cnt_all = gavo_regtap.search(query)
    if len(cnt_all) != 0:  
        print(f"    publisher {p} publishes {len(cnt_all)} distinct subjects")
    else: 
        print(f"    publisher {p} publishes no subject metadata (?)")
        continue #  ?
    df = pd.DataFrame(cnt_all)
    for c,r in invalid_subjects:  
        if c=='':  
            matches = df['res_subject'].astype(str).str.len() == 0
        elif '?' in c:
            matches = df['res_subject'].str.contains(rf'[{r"?"}]', regex=True)
        else:
            matches = df['res_subject'].str.contains(c, case=False, na=False)
        foo = df[matches]['cnt'].sum() # should only be one 
        if cnt == 0:  continue
        print(f"    Subject '{c}' used {cnt} times")
    break


looking at publisher ivo://nasa.heasarc/asd
    publisher ivo://nasa.heasarc/asd publishes 1254 distinct subjects
    Subject 'visible-astronomy' used 7256 times
    Subject 'infrared-photometry' used 4328 times
    Subject 'radial-velocity' used 2854 times
    Subject 'variable-stars' used 2369 times
    Subject 'Wide-band photometry' used 1937 times
    Subject 'multiple-stars' used 1868 times
    Subject 'x-ray-sources' used 1720 times
    Subject 'open-star-clusters' used 1677 times
    Subject 'chemical-abundances' used 1611 times
    Subject 'infrared-sources' used 1432 times
    Subject 'galaxy-clusters' used 1414 times
    Subject 'active-galactic-nuclei' used 1396 times
    Subject 'radio-sources' used 1349 times
    Subject 'interstellar-medium' used 1141 times
    Subject 'globular-star-clusters' used 1138 times
    Subject 'line-intensities' used 1126 times
    Subject 'proper-motions' used 1085 times
    Subject 'milky-way-galaxy' used 988 times
    Subject 'infrared-astr

### Check 5:  concepts

In [None]:
reg_uat_concept_list = gavo_regtap.search("select distinct uat_concept from rr.subject_uat").to_table()["uat_concept"].data
print(f"There are {len(reg_uat_concept_list)} distinct uat_concept values in the registry's subject_uat table")

In [None]:
bad=[]
for c in reg_uat_concept_list:
    # lower case and replace - with space
    if c.lower().replace("-"," ") not in uat_name_list:
        bad.append(c)
print(f"There are {len(bad)} concepts not found in the UAT such as:")
print(bad[0:10])

## Testing subjects at HEASARC

In [None]:
hea_subjects = gavo_regtap.search("select top 10 res_subject, count(*) as cnt from rr.res_subject where ivoid ilike '%nasa.heasarc%' group by res_subject order by cnt desc").to_table()
hea_subjects

In [None]:
hea_subjects = gavo_regtap.search("select top 10 res_subject, count(*) as cnt from rr.res_subject where ivoid ilike '%nasa.heasarc%' group by res_subject order by cnt desc").to_table()
culprits = []
correct = []
for i,s in enumerate(hea_subjects['res_subject'].data):
    if s.lower() in uat_name_list:
        correct.append((s,hea_subjects['cnt'][i]))
    else:
        culprits.append((s,subjects['cnt'][i]))
print(f"For HEASARC:  Found {len(culprits)} Registry res_subject entries that are not in the UAT and {len(correct)} that are.")
print(f"  The top 10 bad subject values by number of instances are")
[print(f"{c[0]}: {c[1]}") for c in culprits[0:10] ]

## Checks from Markus' article:

GAVO apparently puts "0/0-11" if there is no coverage information.  Hard to count how many therefore did not have any info or genuinely cover the full sky.

In [None]:
compare("select count(*) as cnt from rr.stc_spatial where cast(coverage as VARCHAR) ilike '0/0-11'")

In [None]:
gavo_regtap.search("select top 5 * from rr.res_role")

In [None]:
publishers = gavo_regtap.search("""
select distinct role_ivoid as publisher, count(*) as cnt 
from rr.res_role 
where base_role = 'publisher' 
group by role_ivoid
order by cnt desc
""").to_table()[0:10]
publishers

In [None]:
culprits = []
for i,u in enumerate(all_ucds['ucd'].data):
    if not check_ucd(u):
        culprits.append((u,all_ucds['cnt'][i]))
print(f"Found {len(culprits)} invalid UCDs")
print(f"  The top 10 bad UCD values by number of instances are")
x=[print(f"{c[0]:25}: {c[1]}") for c in culprits[0:10] ]

In [None]:
def count_bad_ucds( publisher ):
    for i,u in enumerate(all_ucds['ucd'].data):
        if not check_ucd(u):
            culprits.append((u,all_ucds['cnt'][i]))
print(f"Found {len(culprits)} invalid UCDs")
print(f"  The top 10 bad UCD values by number of instances are")
x=[print(f"{c[0]:25}: {c[1]}") for c in culprits[0:10] ]

## To be expanded.  Now what to do with this?  

* Report cross-checks between registries to their admins.  
* Compile a report of issues as above and advertise at IVOA Interop's Registry (or Ops?) session.  
* Compile a report of issues found for each publisher and email them yearly to request updates.  


## Scratch 

In [None]:
from astropy.io.votable.ucd import check_ucd
query="""
  select distinct ucd, count(*) as cnt
  from rr.table_column 
  group by ucd 
  order by cnt desc
  """
result = gavo_regtap.search(query)
all_ucds = result.to_table()
pubs_with_bad_ucds = {}
for i,u in enumerate(culprits['ucd'].data):
    #  For each of the bad UCDs, increment a count for each publisher

In [None]:
result = gavo_regtap.search("select count distinct(role_ivoid) from rr.res_role")
for r in result['role_

In [None]:
sias = gavo_regtap.search("select top 5 * from rr.capability where standard_id like '%sia%'")
sias.to_table()

In [None]:
galex_sia = navo_old_regtap.search("select * from rr.capability where ivoid = 'ivo://archive.stsci.edu/sia/galex' ")
galex_sia.to_table()

In [None]:
querystart = """
    select * from rr.capability 
    where ivoid = 'ivo://archive.stsci.edu/sia/galex' 
    and standard_id ilike 
    """
## Works
#galex_sia = gavo_regtap.search(f"{querystart} '%/std/sia'")
## Does not work
galex_sia = gavo_regtap.search(f"{querystart} '%std/sia'")
galex_sia.to_table()

So the NAVO RegTAP ilike doesn't appreciate slashes?  

In [None]:
compare("select count(*) as cnt from rr.capability where standard_id like '%sia'")

In [106]:
results = vo.regsearch(keywords=['heasarc'],servicetype='tap')
heatap = results[0]

send: b'POST /tap/sync HTTP/1.1\r\nHost: reg.g-vo.org\r\nUser-Agent: pyVO/1.6.2 Python/3.11.12 (Darwin)\r\nAccept-Encoding: gzip, deflate, br, zstd\r\nAccept: */*\r\nConnection: keep-alive\r\nContent-Length: 1606\r\nContent-Type: application/x-www-form-urlencoded\r\n\r\n'
send: b'REQUEST=doQuery&LANG=ADQL&QUERY=SELECT%0Aivoid%2C+res_type%2C+short_name%2C+res_title%2C+content_level%2C+res_description%2C+reference_url%2C+creator_seq%2C+created%2C+updated%2C+rights%2C+content_type%2C+source_format%2C+source_value%2C+region_of_regard%2C+waveband%2C+%0A++ivo_string_agg%28COALESCE%28access_url%2C+%27%27%29%2C+%27%3A%3A%3Apy+VO+sep%3A%3A%3A%27%29+AS+access_urls%2C+%0A++ivo_string_agg%28COALESCE%28standard_id%2C+%27%27%29%2C+%27%3A%3A%3Apy+VO+sep%3A%3A%3A%27%29+AS+standard_ids%2C+%0A++ivo_string_agg%28COALESCE%28intf_type%2C+%27%27%29%2C+%27%3A%3A%3Apy+VO+sep%3A%3A%3A%27%29+AS+intf_types%2C+%0A++ivo_string_agg%28COALESCE%28intf_role%2C+%27%27%29%2C+%27%3A%3A%3Apy+VO+sep%3A%3A%3A%27%29+AS+intf_

In [107]:
heatap.service.get_tap_capability()

send: b'GET /xamin/vo/tap/capabilities HTTP/1.1\r\nHost: heasarc.gsfc.nasa.gov\r\nUser-Agent: pyVO/1.6.2 Python/3.11.12 (Darwin)\r\nAccept-Encoding: gzip, deflate, br, zstd\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'
reply: 'HTTP/1.1 200 OK\r\n'
header: Date: Thu, 15 May 2025 20:14:37 GMT
header: Server: WebServer/1.0
header: X-Frame-Options: sameorigin
header: Content-Type: text/xml;charset=UTF-8
header: Content-Length: 2879
header: Content-Security-Policy: frame-ancestors 'self' cms.nasa.gov www.nasa.gov beta.science.nasa.gov smd-cms.nasa.gov spaceplace.nasa.gov science.nasa.gov;
header: Keep-Alive: timeout=15, max=100
header: Connection: Keep-Alive
header: Strict-Transport-Security: max-age=31536000; includeSubDomains
header: Content-Security-Policy: upgrade-insecure-requests


<Capability standardID=ivo://ivoa.net/std/TAP>... 0 validationLevels, 1 interfaces ...</Capability>

In [116]:
#results = vo.regsearch(keywords=['heasarc'],servicetype='image')
xrissia = results[-1]
xrissia.service.__dir__()

['_baseurl',
 '_capability_description',
 '_session',
 '_description',
 '__module__',
 '__doc__',
 '__init__',
 '_get_metadata',
 'description',
 'params',
 'columns',
 'search',
 'create_query',
 'describe',
 'baseurl',
 'capability_description',
 '__repr__',
 '__dict__',
 '__weakref__',
 '__new__',
 '__hash__',
 '__str__',
 '__getattribute__',
 '__setattr__',
 '__delattr__',
 '__lt__',
 '__le__',
 '__eq__',
 '__ne__',
 '__gt__',
 '__ge__',
 '__reduce_ex__',
 '__reduce__',
 '__getstate__',
 '__subclasshook__',
 '__init_subclass__',
 '__format__',
 '__sizeof__',
 '__dir__',
 '__class__']