<a href="https://colab.research.google.com/github/hubmapconsortium/ccf-linkml/blob/main/ASCT%2BB_Normalization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install rdflib



In [2]:
import requests

def request_get(url, headers=None):
    """
    Performs a get request that provides a (somewhat) useful error message.

    Args:
        url: Address to fetch.
        headers: Optional mapping of HTTP request headers.

    Returns:
        The response object returned by ``requests.get``. The HTTP status is
        not inspected here.

    Raises:
        requests.exceptions.RequestException: When the request cannot be
            completed. The original exception (same subclass, same
            attributes) is re-raised with a hint about the URL added to its
            message.
    """
    try:
        # `headers` must be passed by keyword: the second positional parameter
        # of requests.get is `params`, which would turn the headers into a
        # query string instead of sending them as headers.
        response = requests.get(url, headers=headers)
    except requests.exceptions.RequestException as exc:
        # Re-raise the very same exception object so existing handlers for
        # specific subclasses keep matching; only the message is enriched.
        exc.args = (
            f"Couldn't retrieve the data, check your URL ({url}): {exc}",
        )
        raise
    else:
        return response

def get_json(url):
    """Returns request in JSON (dict) format.

    Args:
        url: Address of the JSON resource to fetch.

    Returns:
        The decoded JSON body of the response.

    Raises:
        Whatever ``raise_for_status()`` raises for a 4xx/5xx response, and
        whatever ``request_get`` or ``.json()`` raise on transport or decoding
        failures.
    """
    headers = {"Content-Type": "application/json; charset=utf-8"}
    response = request_get(url, headers)
    # Fail on HTTP error statuses here, rather than letting an error page
    # surface later as a JSON decoding error or a missing-key error.
    response.raise_for_status()
    return response.json()



In [3]:
class TableDataResultParser:
  """
  Accessors for the TableDataResult dictionary from the ASCT+B API endpoint.

  The class keeps no state: every getter is a direct key lookup on the
  dictionary it receives, so an absent key raises ``KeyError``.
  """

  def get_data(self, o):
    """Return the value stored under ``"data"``."""
    return o["data"]

  def get_metadata(self, o):
    """Return the value stored under ``"metadata"``."""
    return o["metadata"]

  def get_row_number(self, o):
    """Return the value stored under ``"rowNumber"``."""
    return o["rowNumber"]

  def get_anatomical_structures(self, o):
    """Return the value stored under ``"anatomical_structures"``."""
    return o["anatomical_structures"]

  def get_cell_types(self, o):
    """Return the value stored under ``"cell_types"``."""
    return o["cell_types"]

  def get_biomarkers(self, o, type=None):
    """
    Return the biomarker entries of ``o``.

    A truthy ``type`` selects the key ``biomarkers_<type>``; otherwise the
    plain ``"biomarkers"`` key is read.
    """
    return o[f"biomarkers_{type}" if type else "biomarkers"]

  def get_references(self, o):
    """Return the value stored under ``"references"``."""
    return o["references"]

  def get_id(self, o):
    """Return the value stored under ``"id"``."""
    return o["id"]

  def get_label(self, o):
    """Return the value stored under ``"rdfs_label"``."""
    return o["rdfs_label"]

  def get_name(self, o):
    """Return the value stored under ``"name"``."""
    return o["name"]

  def get_doi(self, o):
    """Return the value stored under ``"doi"``."""
    return o["doi"]

  def get_data_doi(self, o):
    """Return the value stored under ``"data_doi"``."""
    return o["data_doi"]

In [4]:
import re

from rdflib.graph import Graph, Namespace, URIRef, Literal, BNode, Collection
from rdflib.namespace import OWL, RDF, RDFS, XSD, DCTERMS

class AsctbDataTransformer:
  """
  Represents the algorithm to transform ASCT+B data to RDF.

  Triples are accumulated in ``self.graph`` and ``to_rdf`` returns that graph
  serialized as Turtle. The graph is created once in ``__init__`` and never
  reset, so calling ``to_rdf`` repeatedly on one instance keeps adding to the
  same graph.
  """

  CCF = Namespace("http://purl.org/ccf/")

  # Prefix patterns, compiled once for the class instead of on every call.
  _FMA_PATTERN = re.compile("FMA:", re.IGNORECASE)
  _UBERON_PATTERN = re.compile("UBERON:", re.IGNORECASE)
  _PCL_PATTERN = re.compile("PCL:", re.IGNORECASE)
  _CL_PATTERN = re.compile("CL:", re.IGNORECASE)
  _LMHA_PATTERN = re.compile("LMHA:", re.IGNORECASE)
  _HGNC_PATTERN = re.compile("HGNC:", re.IGNORECASE)
  _ASCTB_TEMP_PATTERN = re.compile("ASCTB-TEMP:", re.IGNORECASE)
  _DOI_PATTERN = re.compile("DOI:\\s*", re.IGNORECASE)

  # (detection pattern, name of the method that expands a matching id).
  # Order matters: "PCL:" has to be tried before "CL:" because every "PCL:"
  # id also contains "CL:".
  _ID_EXPANDERS = (
      (_FMA_PATTERN, "expand_fma_id"),
      (_UBERON_PATTERN, "expand_uberon_id"),
      (_PCL_PATTERN, "expand_pcl_id"),
      (_CL_PATTERN, "expand_cl_id"),
      (_LMHA_PATTERN, "expand_lmha_id"),
      (_HGNC_PATTERN, "expand_hgnc_id"),
      (_ASCTB_TEMP_PATTERN, "expand_asctb_temp_id"),
      (_DOI_PATTERN, "expand_doi"),
  )

  def __init__(self, parser):
    """Create a transformer with an empty graph that reads input via ``parser``."""
    self.graph = Graph()
    self.parser = parser

  @staticmethod
  def new(parser=None):
    """
    Build a transformer.

    ``parser`` defaults to a fresh ``TableDataResultParser``. The default is
    created per call rather than once at definition time, so it is never
    shared between transformers.
    """
    if parser is None:
      parser = TableDataResultParser()
    return AsctbDataTransformer(parser)

  def handle_concept_id(self, obj):
    """
    Return the expanded id of ``obj`` as a ``URIRef``, or ``None`` when the
    parser yields a falsy id.
    """
    concept_id = self.parser.get_id(obj)
    if not concept_id:
      return None
    return URIRef(self.expand_id(concept_id))

  def to_rdf(self, json):
    """
    Transforms ASCT+B data from format JSON to RDF.

    One ``ccf:AsctbTable`` node is created from the metadata's data DOI and
    one blank ``ccf:AsctbRecord`` node per data row. Every record gets one
    RDF list per kind of content (anatomical structures, cell types, the
    biomarker groups and references).

    Returns the accumulated graph serialized as Turtle.
    """
    self.graph.bind("ccf", self.CCF)

    data = self.parser.get_data(json)
    metadata = self.parser.get_metadata(json)

    table = URIRef(self.parser.get_data_doi(metadata))
    self.graph.add((table, RDF.type, self.CCF.AsctbTable))

    # The specs do not depend on the row, so build them once.
    list_specs = self._record_list_specs()

    for row in data:
      record = BNode()
      self.graph.add((record, RDF.type, self.CCF.AsctbRecord))
      self.graph.add((table, self.CCF.contains, record))

      for predicate, get_items, get_value in list_specs:
        self._add_record_list(record, predicate, get_items(row), get_value)

    return self.graph.serialize(format="turtle")

  def _record_list_specs(self):
    """
    Describe the lists attached to each record.

    Each entry is ``(predicate, row -> items, item -> id string)``, in the
    order the lists are added to the graph.
    """
    ccf = self.CCF
    parser = self.parser

    def biomarkers_of(kind):
      # Bind `kind` now so each getter reads its own biomarker group.
      return lambda row: parser.get_biomarkers(row, kind)

    return (
        (ccf.contains_anatomical_structures,
         parser.get_anatomical_structures, parser.get_id),
        (ccf.contains_cell_types, parser.get_cell_types, parser.get_id),
        (ccf.contains_biomarkers, parser.get_biomarkers, parser.get_id),
        (ccf.contains_gene_biomarkers, biomarkers_of("gene"), parser.get_id),
        (ccf.contains_protein_biomarkers,
         biomarkers_of("protein"), parser.get_id),
        (ccf.contains_lipids_biomarkers,
         biomarkers_of("lipids"), parser.get_id),
        (ccf.contains_meta_biomarkers, biomarkers_of("meta"), parser.get_id),
        (ccf.contains_prot_biomarkers, biomarkers_of("prot"), parser.get_id),
        (ccf.contains_references, parser.get_references, parser.get_doi),
    )

  def _add_record_list(self, record, predicate, items, get_value):
    """
    Attach to ``record``, via ``predicate``, an RDF list built from ``items``.

    Falsy entries (``None``, empty dicts) are skipped for every list, not
    only for some of them, so one empty entry cannot abort the whole run.
    """
    values = [get_value(item) for item in items if item]
    self.graph.add((record, predicate, self.create_rdf_list(values)))

  def create_rdf_list(self, id_list):
    """
    Create an RDF collection in the graph from the non-empty ids in
    ``id_list`` (each expanded to a ``URIRef``) and return its head node.
    """
    list_node = BNode()
    list_members = [
        URIRef(self.expand_id(raw_id)) for raw_id in id_list if raw_id]
    Collection(self.graph, list_node, list_members)
    return list_node

  def expand_id(self, str):
    """
    Expand a prefixed identifier (e.g. ``UBERON:0002113``) to an IRI string.

    Detection uses the same case-insensitive patterns as the substitution,
    so ``uberon:...`` is expanded exactly like ``UBERON:...``. Identifiers
    with no known prefix are placed under the ASCTB-TEMP base.
    """
    for pattern, method_name in self._ID_EXPANDERS:
      if pattern.search(str):
        # Resolve by name so subclasses can override one expander.
        return getattr(self, method_name)(str)
    # NOTE(review): this base uses https:// while the CCF namespace above
    # uses http:// - confirm which form consumers expect.
    return f"https://purl.org/ccf/ASCTB-TEMP_{str}"

  def expand_fma_id(self, str):
    """Replace every ``FMA:`` (any case) with the FMA IRI base."""
    return self._FMA_PATTERN.sub(
        "http://purl.org/sig/ont/fma/fma", str)

  def expand_uberon_id(self, str):
    """Replace every ``UBERON:`` (any case) with the OBO UBERON IRI base."""
    return self._UBERON_PATTERN.sub(
        "http://purl.obolibrary.org/obo/UBERON_", str)

  def expand_pcl_id(self, str):
    """Replace every ``PCL:`` (any case) with the OBO PCL IRI base."""
    return self._PCL_PATTERN.sub(
        "http://purl.obolibrary.org/obo/PCL_", str)

  def expand_cl_id(self, str):
    """Replace every ``CL:`` (any case) with the OBO CL IRI base."""
    return self._CL_PATTERN.sub(
        "http://purl.obolibrary.org/obo/CL_", str)

  def expand_lmha_id(self, str):
    """Replace every ``LMHA:`` (any case) with the OBO LMHA IRI base."""
    return self._LMHA_PATTERN.sub(
        "http://purl.obolibrary.org/obo/LMHA_", str)

  def expand_hgnc_id(self, str):
    """Replace every ``HGNC:`` (any case) with the identifiers.org HGNC base."""
    return self._HGNC_PATTERN.sub(
        "http://identifiers.org/hgnc/", str)

  def expand_asctb_temp_id(self, str):
    """Replace every ``ASCTB-TEMP:`` (any case) with the ASCTB-TEMP IRI base."""
    return self._ASCTB_TEMP_PATTERN.sub(
        "https://purl.org/ccf/ASCTB-TEMP_", str)

  def expand_doi(self, str):
    """
    Replace every ``DOI:`` (any case, plus following whitespace) with the
    doi.org base.
    """
    # NOTE(review): this emits http://doi.org/ while the table IRI in the
    # sample output uses https://doi.org/ - confirm the intended scheme.
    return self._DOI_PATTERN.sub("http://doi.org/", str)

In [5]:
# Request configuration: these four values are joined into the API URL below.
# NOTE(review): SHEETID / GID look like a Google Sheets document id and tab id
# - confirm against the ASCT+B API documentation.
BASE_URL = "https://asctb-api.herokuapp.com"
VERSION = "v2"
SHEETID = "1NMfu1bEGNFcTYTFT-jCao_lSbFD8n0ti630iIpRj-hw" # Kidney_v1.2 DRAFT
GID = "949267305"

In [6]:
# Request URL layout: <base url>/<api version>/<sheet id>/<gid>.
url = f"{BASE_URL}/{VERSION}/{SHEETID}/{GID}"

In [7]:
# Performs the network request. Despite its name, `response` holds the decoded
# JSON payload returned by get_json, not the HTTP response object.
response = get_json(url)

In [8]:
# Convert the fetched table to RDF with the default parser and show the
# resulting Turtle document.
transformer = AsctbDataTransformer.new()
output = transformer.to_rdf(response)
print(output)

@prefix ccf: <http://purl.org/ccf/> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .

<https://doi.org/10.48539/HBM248.CBJV.556> a ccf:AsctbTable ;
    ccf:contains [ a ccf:AsctbRecord ;
            ccf:contains_anatomical_structures ( <http://purl.obolibrary.org/obo/UBERON_0002113> <http://purl.obolibrary.org/obo/UBERON_0005215> <http://purl.obolibrary.org/obo/UBERON_0003918> ) ;
            ccf:contains_biomarkers ( <http://identifiers.org/hgnc/11553> <http://identifiers.org/hgnc/130> <http://identifiers.org/hgnc/7569> <http://identifiers.org/hgnc/8033> <http://identifiers.org/hgnc/234> <http://identifiers.org/hgnc/26295> ) ;
            ccf:contains_cell_types ( <http://purl.obolibrary.org/obo/CL_1001318> ) ;
            ccf:contains_gene_biomarkers ( <http://identifiers.org/hgnc/11553> <http://identifiers.org/hgnc/130> <http://identifiers.org/hgnc/7569> <http://identifiers.org/hgnc/8033> <http://identifiers.org/hgnc/234> <http://identifiers.org/hgnc/26295> ) ;
       