In [0]:
!pip install cerberus
!pip install pyyaml
!pip install rdflib
!pip install urllib

In [0]:
# Validation schema for a data card, kept as YAML text so it can be parsed with
# `yaml.safe_load` and handed to a Cerberus `Validator`.
# Each top-level key is one card section declared as a dict; every field inside
# a section is declared with both `empty: True` and `nullable: True`.
# NOTE(review): `overview.data_modality` is typed `integer` although its inline
# comment describes a modality — confirm the intended type.
# NOTE(review): the inline comment on `experience.unintended_use_cases` reads
# "unintended users" — presumably "unintended use cases" was meant.
schema_content = '''
overview:
  type: dict
  schema:
    article_link: # an official weblink for the dataset
      type: string
      empty: True
      nullable: True
    dataset_name: # name of the dataset to be utilized for referring this data card
      type: string
      empty: True
      nullable: True
    data_modality: # modality of the dataset
      type: integer
      empty: True
      nullable: True
    domain: # domains with which the dataset relates to
      type: list
      schema:
        type: string
      empty: True
      nullable: True
    file_format: # format in which it exists in `article_links`
      type: string
      empty: True
      nullable: True
    file_name: # name of the dataset file in `article_links`
      type: string
      empty: True
      nullable: True
    file_repository_link: # direct link to the dataset file
      type: string
      empty: True
      nullable: True
    keywords: # keywords associated with the dataset
      type: list
      schema:
        type: string
      empty: True
      nullable: True
    languages: # languages associated with the dataset
      type: list
      schema:
        type: string
      empty: True
      nullable: True
    last_update_date:  # date when the dataset was last updated
      type: date
      empty: True
      nullable: True
    model_cards: # models which use the dataset for training or evaluation
      type: list
      schema:
        type: string
      empty: True
      nullable: True
    release_date: # date when the dataset was first released
      type: date
      empty: True
      nullable: True
    responsible_parties: # parties/persons/entities responsible for creation/maintenance of the dataset
      type: list
      schema:
        type: string
      empty: True
      nullable: True
experience:
  type: dict
  schema:
    additional_use_cases: # unconventional use cases
      type: list
      schema:
        type: string
      empty: True
      nullable: True
    intended_users: # intended users of the dataset
      type: list
      schema:
        type: string
      empty: True
      nullable: True
    intended_use_cases: # intended use cases of the dataset
      type: list
      schema:
        type: string
      empty: True
      nullable: True
    prerequisites:
      type: list
      schema:
        type: string
      empty: True
      nullable: True
    prohibited_use_cases: # use cases for which the dataset should not be used for
      type: list
      schema:
        type: string
      empty: True
      nullable: True
    unintended_use_cases: # unintended users of the dataset
      type: list
      schema:
        type: string
      empty: True
      nullable: True
    pipeline_info:
      type: string
      empty: True
      nullable: True
structure:
  type: dict
  schema:
    columns: # a dictionary with column names as keys and a dictionary with description and datatype as values
      type: dict
      empty: True
      nullable: True
    data_splits:
      type: string
      empty: True
      nullable: True
    duplicate_data:
      type: string
      empty: True
      nullable: True
    instance_connections:
      type: string
      empty: True
      nullable: True
    instance_count: # overview: data_modality?
      type: integer
      empty: True
      nullable: True
    instance_patterns:
      type: dict
      empty: True
      nullable: True
    missing_data:
      type: string
      empty: True
      nullable: True
    sensitive_content:
      type: string
      empty: True
      nullable: True
creation:
  type: dict
  schema:
    annotator_demographics:
      type: list
      schema:
        type: string
      empty: True
      nullable: True
    annotation_guidelines:
      type: list
      schema:
        type: string
      empty: True
      nullable: True
    collection_date: # date when data collection started
      type: date
      empty: True
      nullable: True
    collection_method: # method used to collect the data
      type: string
      empty: True
      nullable: True
    collectors: # parties responsible for collection of the data
      type: list
      schema:
        type: string
      empty: True
      nullable: True
    data_source: # overview: article_link? overview: file_repository_link?
      type: string
      empty: True
      nullable: True
    motivation: # motivation behind generating the dataset
      type: string
      empty: True
      nullable: True
    synthetic_feature: # generated synthetic features in the dataset as keys, description as values
      type: dict
      empty: True
      nullable: True
transformations:
  type: dict
  schema:
    cleaning: # cleaning methods used
      type: list
      schema:
        type: string
      empty: True
      nullable: True
    labelling:
      type: list
      schema:
        type: string
      empty: True
      nullable: True
    parsing: # parsing methods used
      type: list
      schema:
        type: string
      empty: True
      nullable: True
    processing:
      type: list
      schema:
        type: string
      empty: True
      nullable: True
    rating:
      type: integer
      empty: True
      nullable: True
    raw_data_available:
      type: boolean
      empty: True
      nullable: True
    sampling_filtering: # filtering methods used
      type: list
      schema:
        type: string
      empty: True
      nullable: True
    validation: # validation methods used
      type: list
      schema:
        type: string
      empty: True
      nullable: True
examples:
  type: dict
  schema:
    code_examples:
      type: list
      schema:
        type: string
      empty: True
      nullable: True
    demos:
      type: list
      schema:
        type: string
      empty: True
      nullable: True
    errors: # instances which are error-prone and should be removed
      type: list
      schema:
        type: string
      empty: True
      nullable: True
    further_documentation:
      type: list
      schema:
        type: string
      empty: True
      nullable: True
    null_feature_instance: # what a null instance looks like
      type: list
      schema:
        type: string
      empty: True
      nullable: True
    outlier_instance: # outlier types which should be removed
      type: list
      schema:
        type: string
      empty: True
      nullable: True
    publication_use_repo:
      type: list
      schema:
        type: string
      empty: True
      nullable: True
    project_use_repo:
      type: string
      empty: True
      nullable: True
    typical_instance:
      type: list
      schema:
        type: string
      empty: True
      nullable: True
considerations:
  type: dict
  schema:
    ethical_reviews:
      type: list
      schema:
        type: string
      empty: True
      nullable: True
    licenses: # licenses
      type: list
      schema:
        type: string
      empty: True
      nullable: True
    limitations: # any limitations associated with the dataset
      type: list
      schema:
        type: string
      empty: True
      nullable: True
    policies:
      type: list
      schema:
        type: string
      empty: True
      nullable: True
    potential_skew_biases: # potential skew biases in the dataset
      type: list
      schema:
        type: string
      empty: True
      nullable: True
    restrictions:
      type: list
      schema:
        type: string
      empty: True
      nullable: True
    risks: # any risks/warning to be careful about before using the dataset
      type: list
      schema:
        type: string
      empty: True
      nullable: True
    skew_biases: # known skew biases in the dataset
      type: list
      schema:
        type: string
      empty: True
      nullable: True
    trade_offs:
      type: list
      schema:
        type: string
      empty: True
      nullable: True
additional_info:
  type: dict
  schema:
    background_info:
      type: string
      empty: True
      nullable: True
    citations:
      type: list
      schema:
        type: string
      empty: True
      nullable: True
    curators:
      type: list
      schema:
        type: string
      empty: True
      nullable: True
    funding: # parties/entities that provide funding to extend/maintain the dataset
      type: list
      schema:
        type: string
      empty: True
      nullable: True
    maintenance:
      type: list
      schema:
        type: string
      empty: True
      nullable: True
    references:
      type: list
      schema:
        type: string
      empty: True
      nullable: True
    retention:
      type: list
      schema:
        type: string
      empty: True
      nullable: True
    updates: # last update performed
      type: string
      empty: True
      nullable: True
    versions: # latest versions being readily maintained
      type: list
      schema:
        type: string
      empty: True
      nullable: True
    wipeout:
      type: list
      schema:
        type: string
      empty: True
      nullable: True
'''

In [0]:
# Example data card (for the BookCorpus dataset) as YAML text, using the same
# section and field names as `schema_content`. Keys written without a value
# are loaded by YAML as null.
card_content = '''
overview:
  article_link: "https://doi.org/10.48550/arXiv.1506.06724"
  dataset_name: "BookCorpus"
  data_modality:
  domain: ["NLP", "MLM"]
  file_format: ".txt"
  file_name:
  file_repository_link: "https://yknzhu.wixsite.com/mbweb"
  keywords: ["NLP", "text", "books"]
  languages: ["en"]
  last_update_date:
  model_cards:
  release_date:
  responsible_parties: ["Yukun Zhu", "Ryan Kiros", "Richard Zemel", "Ruslan Salakhutdinov", "Raquel Urtasun", "Antonio Torralba", "Sanja Fidler"]
experience:
  additional_use_cases:
  intended_users:
  intended_use_cases: ["sentence embedding", "train neural networks to generate descriptive explanations for visual content"]
  prerequisites:
  prohibited_use_cases:
  unintended_use_cases:
  pipeline_info:
structure:
  columns:
  data_splits:
  duplicate_data: "out of 11038 books, only 7185 are unique; crosslisted across directories with 5 potential exceptions"
  instance_connections:
  instance_count:
  instance_patterns:
  missing_data: "98 empty files, 655 truncated files (less than 20000 words) including 291 shorter than 10000 words"
  sensitive_content:
creation:
  annotator_demographics:
  annotation_guidelines:
  collection_date:
  collection_method:
  collectors:
  data_source: "https://www.smashwords.com/"
  motivation:
  synthetic_feature:
transformations:
  cleaning:
  labelling:
  parsing:
  processing:
  rating:
  raw_data_available:
  sampling_filtering:
  validation:
examples:
  code_examples:
  demos:
  errors:
  further_documentation:
  null_feature_instance:
  outlier_instance:
  publication_use_repo:
  project_use_repo: "https://huggingface.co/models?dataset=dataset:bookcorpus"
  typical_instance:
considerations:
  ethical_reviews:
  licenses:
  limitations:
  policies:
  potential_skew_biases:
  restrictions:
  risks: ["copyright violations in content", "duplicated books", "genre skew", "potentially problematic content", "lopsided author contribution"]
  skew_biases: [ "genre skew, romance overrepresented", "noise introduced by presence of preamble, postscript, and copyright notes in some books" ]
  trade_offs:
additional_info:
  background_info:
  citations: [
      "@InProceedings{Zhu_2015_ICCV,
                   title = {Aligning Books and Movies: Towards Story-Like Visual Explanations by Watching Movies and Reading Books},
                   author = {Zhu, Yukun and Kiros, Ryan and Zemel, Rich and Salakhutdinov, Ruslan and Urtasun, Raquel and Torralba, Antonio and Fidler, Sanja},
                   booktitle = {The IEEE International Conference on Computer Vision (ICCV)},
                   month = {December},
                   year = {2015}
                 }",
  ]
  curators:
  funding: ["Google", "Samsung", "NSERC", "CIFAR", "ONR"]
  maintenance:
  references:
  retention:
  updates:
  versions:
  wipeout:
'''

In [0]:
# from cerberus import Validator
# import yaml

# schema = yaml.safe_load(schema_content)
# card = yaml.safe_load(card_content)

# v = Validator(schema)
# v.validate(card)
# v.errors
# v.document

In [0]:
import yaml

from cerberus import Validator
from rdflib import BNode, Graph, Literal, Namespace, URIRef
from rdflib.namespace import DC, FOAF, OWL, RDF, RDFS
from urllib.parse import quote


class Card:
  """A data, dataspace or model card that is validated against a schema and
  can be turned into an RDF graph.

  The schema and the card are each supplied either as a YAML string
  (`*_content`) or as a path to a YAML file (`*_loc`); when both are given the
  file location takes precedence. Construction loads both documents and
  validates the card against the schema.

  Attributes:
    card_type: one of `data`, `dataspace`, `model`.
    schema_yaml: the parsed schema.
    card_yaml: the parsed card.
    document: the card document returned by the validator.
    g: the graph built by `make_graph()` (only set after that call).
  """

  # Supported card types mapped to the name of the RDFS class that represents them.
  _CARD_CLASSES = {
    'data': 'DataCard',
    'dataspace': 'DataspaceCard',
    'model': 'ModelCard',
  }

  # Base URI under which every class and property of the graph is minted.
  _NAMESPACE_URI = 'https://dbpedia.org/'

  def __init__(self, card_type: str = 'data', schema_content: str = None, card_content: str = None, schema_loc: str = None, card_loc: str = None):
    """Store the inputs, then load and validate the card (see `setup`).

    Args:
      card_type: kind of card; must be `data`, `dataspace` or `model`.
      schema_content: schema as a YAML string.
      card_content: card as a YAML string.
      schema_loc: path of a YAML schema file (wins over `schema_content`).
      card_loc: path of a YAML card file (wins over `card_content`).

    Raises:
      ValueError: if `card_type` is unknown, if the schema or the card is
        missing, or if the card does not validate against the schema.
    """
    if card_type not in self._CARD_CLASSES:
      raise ValueError('card_type should be one of [`data`, `dataspace`, `model`]')
    self.card_type = card_type

    self.schema_content = schema_content
    self.schema_loc = schema_loc

    self.card_content = card_content
    self.card_loc = card_loc

    self.setup()


  def _read_yaml(self, loc):
    """Parse the YAML file at `loc` and return the resulting object.

    Read and parse failures are reported on stdout and then re-raised, so a
    caller never continues with a half-loaded document.
    """
    try:
      # open() sits inside the try so that a missing/unreadable file is reported too
      with open(loc, 'r') as infile:
        parsed = yaml.safe_load(infile)
    except IOError as exc:
      print('could not load yaml from %s' % loc, exc)
      raise
    except yaml.YAMLError as exc:
      print('could not parse yaml from %s' % loc, exc)
      raise

    print('yaml loaded from %s' % loc)
    return parsed


  def _load_yaml(self, what, loc, content):
    """Return the parsed YAML for one input, preferring the file `loc` over the string `content`.

    `what` names the input (`schema` or `card`) and is used only in the error message.
    """
    if loc:
      return self._read_yaml(loc)

    if content:
      try:
        return yaml.safe_load(content)
      except yaml.YAMLError as exc:
        print('could not parse yaml from provided file string', exc)
        raise

    raise ValueError('no %s provided: pass either %s_content or %s_loc' % (what, what, what))


  def _validate_schema(self, document, schema):
    """Validate `document` against `schema` and return the validator's document.

    Raises:
      ValueError: if validation fails; the validator's errors are printed first.
    """
    v = Validator(schema)
    if not v.validate(document):
      print(v.errors)
      raise ValueError('Validation failed.')
    return v.document


  def setup(self):
    """Load the schema and the card, then validate the card against the schema.

    Sets `schema_yaml`, `card_yaml` and `document`.
    """
    self.schema_yaml = self._load_yaml('schema', self.schema_loc, self.schema_content)
    self.card_yaml = self._load_yaml('card', self.card_loc, self.card_content)
    self.document = self._validate_schema(self.card_yaml, self.schema_yaml)


  def _initialize_graph(self):
    """Create a graph holding the parent `Card` class and the class for this card type."""
    DB = Namespace(self._NAMESPACE_URI)

    g = Graph()
    g.bind('dbpedia', DB)

    # add parent level Card class
    g.add((DB['Card'], RDF.type, RDFS.Class))
    g.add((DB['Card'], RDFS.comment, Literal('Parent-level class for a Card.')))
    g.add((DB['Card'], RDFS.label, Literal('Card')))

    # add the class of the concrete card type as a subclass of Card
    class_name = self._CARD_CLASSES[self.card_type]
    g.add((DB[class_name], RDF.type, RDFS.Class))
    g.add((DB[class_name], RDFS.comment, Literal('Class for a %s.' % class_name)))
    g.add((DB[class_name], RDFS.label, Literal(class_name)))
    g.add((DB[class_name], RDFS.subClassOf, DB['Card']))

    return g


  def _add_property(self, node, label):
    """Declare `node` as an ObjectProperty labelled with its own key name."""
    self.g.add((node, RDF.type, OWL.ObjectProperty))
    self.g.add((node, RDFS.label, Literal(label)))


  def make_graph(self):
    """Build the graph for the validated card and store it in `self.g`.

    Every top-level section becomes an ObjectProperty whose domain is the
    class of this card type; every field inside a section becomes an
    ObjectProperty as well.
    """
    DB = Namespace(self._NAMESPACE_URI)
    card_class = DB[self._CARD_CLASSES[self.card_type]]

    self.g = self._initialize_graph()
    for sub, obj in self.document.items():
      self._add_property(DB[sub], sub)
      # all top subjects have the class of this card type as their domain
      self.g.add((DB[sub], RDFS.domain, card_class))

      # a section without a mapping of fields has no inner subjects to add
      if not isinstance(obj, dict):
        continue

      for inner_sub in obj:
        self._add_property(DB[inner_sub], inner_sub)

In [0]:
# Build the sample card from the in-notebook YAML strings; constructing it
# also validates the card against the schema.
sample_data_card = Card(
  card_type='data',
  schema_content=schema_content,
  card_content=card_content,
)

In [0]:
# Build the RDF graph for the card; the result is stored on `sample_data_card.g`.
sample_data_card.make_graph()

In [0]:
# Serialize the graph using the 'ttl' format and show the result.
card_graph_ttl = sample_data_card.g.serialize(format='ttl')
print(card_graph_ttl)