In [1]:
from linkml_runtime.utils.schemaview import SchemaView

# Parse the LinkML schema file; the resulting SchemaView is reused by later cells as `sv`.
sv = SchemaView("name-schema.yaml")

# Keep only the class definitions that are not flagged as abstract.
concrete_classes = list(
    filter(lambda class_def: not class_def.abstract, sv.all_classes().values())
)
# For this schema the printed names are Name and Identifier.
print([class_def.name for class_def in concrete_classes])


['Name', 'Identifier']


In [2]:
from pyspark.sql.types import *

def map_range_to_spark_type(slot_range: str, multivalued: bool = False):
    """Translate a slot range name into a Spark SQL data type.

    Args:
        slot_range: Range name to look up (for example ``"string"`` or ``"integer"``).
        multivalued: When truthy, the looked-up type is wrapped in an ``ArrayType``.

    Returns:
        The Spark type for ``slot_range``. Range names missing from the lookup
        table fall back to ``StringType``.
    """
    spark_type_by_range = {
        "string": StringType(),
        "float": FloatType(),
        "integer": IntegerType(),
        "boolean": BooleanType(),
        "UUID": StringType(),
        "uriorcurie": StringType()
    }
    element_type = spark_type_by_range.get(slot_range, StringType())
    if multivalued:
        return ArrayType(element_type)
    return element_type


In [3]:
import os

from pyspark.sql import SparkSession

# Get or create the Spark session, configured with the Delta package,
# the Delta SQL extension and the Delta catalog.
spark = (
    SparkSession.builder
    .appName("DeltaTableFromLinkML")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)

def build_schema_and_create_table(class_name: str, output_path: str):
    """Create an empty Delta table whose columns mirror the slots of a schema class.

    Uses the notebook-level ``sv`` (SchemaView) to read the class definition
    and the notebook-level ``spark`` session to write the table.

    Args:
        class_name: Name of the class to look up in the loaded schema.
        output_path: Directory under which the table is written. The table
            folder is named after ``class_name`` in lower case.

    Returns:
        The path the Delta table was written to.

    Raises:
        ValueError: If ``class_name`` is not a class in the loaded schema.
    """
    # Fail fast with a clear message when the class is unknown.
    if sv.get_class(class_name) is None:
        raise ValueError(f"Class {class_name!r} not found in the loaded schema")

    fields = []
    for slot_name in sv.class_slots(class_name):
        # The induced slot carries the range/required/multivalued values as they apply to this class.
        slot = sv.induced_slot(slot_name, class_name)
        dtype = map_range_to_spark_type(slot.range or "string", bool(slot.multivalued))
        # Slots not marked required become nullable columns.
        nullable = not slot.required
        fields.append(StructField(slot_name, dtype, nullable))

    schema = StructType(fields)

    # Write a zero-row DataFrame so the table exists with the derived schema.
    # Mode "overwrite" replaces whatever is already stored at the target path.
    df = spark.createDataFrame([], schema)
    delta_path = os.path.join(output_path, class_name.lower())
    df.write.format("delta").mode("overwrite").save(delta_path)

    print(f"Delta table for {class_name} created at {delta_path}")
    return delta_path


:: loading settings :: url = jar:file:/opt/spark-3.4.1-bin-hadoop3/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-8bb7bdf9-294b-4f07-8dbc-54ff916fc402;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.4.0 in central
	found io.delta#delta-storage;2.4.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
:: resolution report :: resolve 276ms :: artifacts dl 7ms
	:: modules in use:
	io.delta#delta-core_2.12;2.4.0 from central in [default]
	io.delta#delta-storage;2.4.0 from central in [default]
	org.antlr#antlr4-runtime;4.9.3 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   3   |   0   |   0   |   0  

In [4]:
base_path = "/tmp/delta"  # root directory for the tables; change if needed
# Create the Name table first, then the Identifier table, keeping both paths
# in their own variables for the cells that write to them.
name_path, identifier_path = [
    build_schema_and_create_table(table_class, base_path)
    for table_class in ("Name", "Identifier")
]


25/05/20 15:46:58 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

Delta table for Name created at /tmp/delta/name




Delta table for Identifier created at /tmp/delta/identifier


                                                                                

In [5]:
base_path = "/tmp/delta"
created_table_paths = {}

# Create one Delta table per concrete class and record where it was written,
# keyed by class name. Abstract classes are skipped.
for class_def in sv.all_classes().values():
    if class_def.abstract:
        continue
    created_table_paths[class_def.name] = build_schema_and_create_table(
        class_def.name, base_path
    )

                                                                                

Delta table for Name created at /tmp/delta/name




Delta table for Identifier created at /tmp/delta/identifier


                                                                                

In [7]:
# Sample rows for the Name table; tuple order follows name_schema below.
name_data = [
    ("Protein A", "uuid-1", "Heat-inducible transcription repressor HrcA", "NCBI"),
    ("Protein B", "uuid-2", "Uncharacterized protein 002R", "TrEMBL"),
]
# Every column is a string; only name and entity_id are declared non-nullable.
name_schema = StructType([
    StructField(column, StringType(), nullable)
    for column, nullable in (
        ("name", False),
        ("entity_id", False),
        ("description", True),
        ("source", True),
    )
])
name_df = spark.createDataFrame(name_data, schema=name_schema)
# Append the sample rows to the table stored at name_path.
(
    name_df.write
    .format("delta")
    .mode("append")
    .save(name_path)
)

# Sample rows for the Identifier table; tuple order follows identifier_schema below.
identifier_data = [
    ("uuid-1", "UniProt:Q8KCD6", "Protein A ID", "UniProt"),
    ("uuid-2", "EC:5.2.3.14", "Protein B ID", "NCBI"),
]
# Every column is a string; only entity_id and identifier are declared non-nullable.
identifier_schema = StructType([
    StructField(column, StringType(), nullable)
    for column, nullable in (
        ("entity_id", False),
        ("identifier", False),
        ("description", True),
        ("source", True),
    )
])
identifier_df = spark.createDataFrame(identifier_data, schema=identifier_schema)
# Append the sample rows to the table stored at identifier_path.
(
    identifier_df.write
    .format("delta")
    .mode("append")
    .save(identifier_path)
)


                                                                                

In [8]:
def show_table(path) -> None:
    """Print the schema and the rows of the Delta table stored at ``path``.

    Args:
        path: Location of the Delta table to load with the notebook-level
            ``spark`` session.
    """
    print(f"\n📄 Schema for table at {path}:")
    table_df = spark.read.format("delta").load(path)
    table_df.printSchema()
    print("📊 Data:")
    # truncate=False keeps long cell values fully visible in the output.
    table_df.show(truncate=False)

# Print schema and rows for each table path recorded in created_table_paths.
for table_path in created_table_paths.values():
    show_table(table_path)



📄 Schema for table at /tmp/delta/name:
root
 |-- description: string (nullable = true)
 |-- entity_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- source: string (nullable = true)

📊 Data:


                                                                                

+-------------------------------------------+---------+---------+------+
|description                                |entity_id|name     |source|
+-------------------------------------------+---------+---------+------+
|Heat-inducible transcription repressor HrcA|uuid-1   |Protein A|NCBI  |
|Uncharacterized protein 002R               |uuid-2   |Protein B|TrEMBL|
+-------------------------------------------+---------+---------+------+


📄 Schema for table at /tmp/delta/identifier:
root
 |-- description: string (nullable = true)
 |-- entity_id: string (nullable = true)
 |-- identifier: string (nullable = true)
 |-- source: string (nullable = true)

📊 Data:


                                                                                

+------------+---------+--------------+-------+
|description |entity_id|identifier    |source |
+------------+---------+--------------+-------+
|Protein A ID|uuid-1   |UniProt:Q8KCD6|UniProt|
|Protein B ID|uuid-2   |EC:5.2.3.14   |NCBI   |
+------------+---------+--------------+-------+

