# Populate findrefs

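This notebook backfills `findrefs` `Reference` records for the collections of the 2024-07-01-LTS census release and links each one to its registered `Collection`.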

In [1]:
# Point the lamin CLI at the `laminlabs/cellxgene` instance; the cell output
# confirms the connection.
!lamin load laminlabs/cellxgene

[92m→[0m connected lamindb: laminlabs/cellxgene


In [None]:
import lamindb as ln
import findrefs as fr
import cellxgene_lamin as cxg

# Start tracking this notebook run in LaminDB.
# NOTE(review): the string argument is presumably this notebook's transform uid — confirm.
ln.track("qDPEGjwgZcQB0000")

[92m→[0m connected lamindb: laminlabs/cellxgene




In [10]:
# Fetch the collection metadata records through the cellxgene_lamin helper.
# The backfill loop further down reads each record as a dict with keys such as
# 'name', 'collection_url', 'doi', 'description', 'collection_id',
# 'published_at' and 'publisher_metadata'.
# NOTE(review): presumably this queries the CELLxGENE Discover API — confirm.
census_collections = cxg.dev.get_collections_from_cxg()

In [None]:
# Inspect the first record to see which fields are available.
# NOTE(review): the saved output starts with `[{`, i.e. a list, whereas the
# backfill loop treats every element as a dict (`.get(...)`). The output may be
# stale — re-run on a fresh kernel to confirm the actual shape.
census_collections[0]

[{'collection_id': 'db70986c-7d91-49fe-a399-a4730be394ac',
  'collection_url': 'https://cellxgene.cziscience.com/collections/db70986c-7d91-49fe-a399-a4730be394ac',
  'collection_version_id': '261b9ea2-7c86-4fb8-a561-f23a2b06420d',
  'consortia': [],
  'contact_email': 'korbinian.traeuble@helmholtz-munich.de',
  'contact_name': 'Korbinian Träuble',
  'created_at': '2024-10-05T11:27:07+00:00',
  'curator_name': 'James Chaffer',
  'datasets': [{'assay': [{'label': "10x 3' v2",
      'ontology_term_id': 'EFO:0009899'},
     {'label': "10x 3' v3", 'ontology_term_id': 'EFO:0009922'}],
    'dataset_id': '72955cdb-bd92-4135-aa52-21f33f9640db',
    'dataset_version_id': '999a6b92-46ca-498e-b1ee-5fc43b6988ef',
    'disease': [{'label': 'atherosclerosis',
      'ontology_term_id': 'MONDO:0005311'}],
    'organism': [{'label': 'Homo sapiens',
      'ontology_term_id': 'NCBITaxon:9606'}],
    'suspension_type': ['cell'],
    'tissue': [{'label': 'carotid artery segment',
      'ontology_term_id': '

In [None]:
from datetime import datetime, timezone


def _parse_published_at(collection):
    """Return the publication datetime of a collection record, or None.

    The top-level ``published_at`` (an ISO-8601 string) is preferred. If it is
    missing or cannot be parsed, ``publisher_metadata['published_at']`` is used
    instead; that value is treated as a Unix epoch timestamp and interpreted
    as UTC so the result does not depend on the machine's local timezone.
    """
    raw_iso = collection.get('published_at')
    if raw_iso:
        try:
            # `fromisoformat` rejects a trailing "Z" before Python 3.11.
            return datetime.fromisoformat(raw_iso.replace('Z', '+00:00'))
        except (ValueError, TypeError, AttributeError):
            # Unparseable or non-string value: fall through to the publisher metadata.
            pass

    publisher_metadata = collection.get('publisher_metadata') or {}
    raw_epoch = publisher_metadata.get('published_at')
    if raw_epoch:
        try:
            return datetime.fromtimestamp(raw_epoch, tz=timezone.utc)
        except (ValueError, TypeError, OverflowError, OSError):
            # Best effort: an invalid timestamp leaves the date unset.
            pass

    return None


def _format_authors(collection):
    """Return the author display names of a collection record, or None.

    Each author becomes "<given> <family>". Entries without those keys fall
    back to a ``name`` key (NOTE(review): presumably used for consortium
    authors — confirm against the API schema). Entries that yield no name at
    all are dropped instead of being stored as empty strings.
    """
    publisher_metadata = collection.get('publisher_metadata') or {}
    names = []
    for author in publisher_metadata.get('authors') or []:
        # `or ''` also covers keys that are present but set to None, which
        # would otherwise be rendered as the literal string "None".
        full_name = f"{author.get('given') or ''} {author.get('family') or ''}".strip()
        if not full_name:
            full_name = (author.get('name') or '').strip()
        if full_name:
            names.append(full_name)
    return names or None


# Create one Reference per collection record and attach it to the matching
# registered Collection, if there is one.
for census_collection in census_collections:
    if not census_collection:
        continue

    reference = fr.Reference(
        name=census_collection.get('name'),
        url=census_collection.get('collection_url'),
        doi=census_collection.get('doi'),
        description=census_collection.get('description'),
        authors=_format_authors(census_collection),
        published_at=_parse_published_at(census_collection),
    ).save()

    # Registered collections are looked up by the collection id stored in
    # their `reference` field; only the latest version is linked.
    collection_id = census_collection.get('collection_id')
    registered_collection = ln.Collection.filter(reference=collection_id, is_latest=True).one_or_none()
    if registered_collection:
        # `set()` expects an iterable of records; a bare record raises TypeError.
        registered_collection.references.set([reference])

In [None]:
# Close the run that `ln.track(...)` opened at the top of the notebook.
# NOTE(review): presumably this also saves the notebook/run report to the instance — confirm.
ln.finish()