In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, FloatType
from linkml_runtime.utils.schemaview import SchemaView
from delta import configure_spark_with_delta_pip

# ---------- Configuration ----------
# LinkML schema file loaded with SchemaView; every class it defines gets a table.
schema_file = "cluster-schema.yaml"
# Root directory; each class is written to <output_base_path>/<class_name>.
output_base_path = "/tmp/delta"

# ---------- Range-to-Type Mapper ----------
def map_range_to_spark_type(slot_range: str, multivalued: bool = False):
    """Return the Spark data type used for a LinkML slot range.

    Args:
        slot_range: Name of the slot's range as declared in the LinkML schema.
        multivalued: Accepted so callers can pass the slot's flag; it does not
            influence the result.

    Returns:
        ``FloatType()`` for ``"float"``; ``StringType()`` for ``"string"``,
        ``"UUID"`` and every range that is not listed.
    """
    # NOTE(review): multivalued slots still map to a scalar column. Presumably they
    # should become an array type -- confirm before changing, because that alters
    # the schema of tables that were already written.
    spark_type_factories = {
        "string": StringType,
        "float": FloatType,
        "UUID": StringType,
    }
    make_type = spark_type_factories.get(slot_range, StringType)
    return make_type()

# ---------- Schema Generator ----------
def get_spark_schema_for_class(sv, class_name: str):
    """Build the Spark ``StructType`` describing one class of a LinkML schema.

    Args:
        sv: Schema view that provides ``class_induced_slots``.
        class_name: Name of the class whose induced slots become columns.

    Returns:
        A ``StructType`` with one ``StructField`` per induced slot, in the order
        the schema view returns them.
    """
    columns = []
    for induced_slot in sv.class_induced_slots(class_name):
        column_type = map_range_to_spark_type(induced_slot.range, induced_slot.multivalued)
        # A column is nullable unless the slot is marked as required.
        is_nullable = not induced_slot.required
        columns.append(StructField(induced_slot.name, column_type, is_nullable))
    return StructType(columns)

# ---------- Initialize Spark ----------
# Register Delta Lake's SQL extension and catalog implementation on the builder.
builder = (
    SparkSession.builder
    .appName("LinkML to Delta")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)
# configure_spark_with_delta_pip returns the builder with the Delta package added
# as a dependency; getOrCreate() then starts the session (or reuses a running one).
spark = configure_spark_with_delta_pip(builder).getOrCreate()

# ---------- Load Schema ----------
sv = SchemaView(schema_file)
all_classes = sv.all_classes()

# ---------- Iterate and Create Delta Tables ----------
# Writes one empty Delta table per LinkML class to <output_base_path>/<class_name>.
for class_name in all_classes:
    schema = get_spark_schema_for_class(sv, class_name)

    print(f"Creating Delta table for: {class_name}")

    # No rows: the table only carries the column layout derived from the schema.
    df = spark.createDataFrame([], schema=schema)

    # mode("overwrite") replaces the data; overwriteSchema additionally lets the
    # stored column layout be replaced, so re-running after the LinkML schema has
    # changed does not fail with a Delta schema-mismatch error.
    delta_path = f"{output_base_path}/{class_name}"
    (
        df.write.format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .save(delta_path)
    )

    print(f"✔️  Delta table created at: {delta_path}")


:: loading settings :: url = jar:file:/opt/spark-3.4.1-bin-hadoop3/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-8efc6112-f271-4faf-8be2-1311a3220a00;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.4.0 in central
	found io.delta#delta-storage;2.4.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
:: resolution report :: resolve 261ms :: artifacts dl 12ms
	:: modules in use:
	io.delta#delta-core_2.12;2.4.0 from central in [default]
	io.delta#delta-storage;2.4.0 from central in [default]
	org.antlr#antlr4-runtime;4.9.3 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   3   |   0   |   0   |   0 

Creating Delta table for: Cluster


25/05/20 15:06:08 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

✔️  Delta table created at: /tmp/delta/Cluster
Creating Delta table for: Cluster_X_Protein




✔️  Delta table created at: /tmp/delta/Cluster_X_Protein


                                                                                

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, FloatType
from linkml_runtime.utils.schemaview import SchemaView
from delta import configure_spark_with_delta_pip

# ---------- Configuration ----------
# LinkML schema file loaded with SchemaView; every class it defines gets a table.
schema_file = "cluster-schema.yaml"
# Root directory; each class is written to <output_base_path>/<class_name>.
output_base_path = "/tmp/delta"

# ---------- Map LinkML types to Spark types ----------
def map_range_to_spark_type(slot_range: str, multivalued: bool = False):
    """Return the Spark data type used for a LinkML slot range.

    Args:
        slot_range: Name of the slot's range as declared in the LinkML schema.
        multivalued: Accepted so callers can pass the slot's flag; it does not
            influence the result.

    Returns:
        ``FloatType()`` for ``"float"``; ``StringType()`` for ``"string"``,
        ``"UUID"`` and every range that is not listed.
    """
    # NOTE(review): multivalued slots still map to a scalar column. Presumably they
    # should become an array type -- confirm before changing, because the dummy-row
    # values and already-written tables would have to change with it.
    spark_type_factories = {
        "string": StringType,
        "float": FloatType,
        "UUID": StringType,
    }
    make_type = spark_type_factories.get(slot_range, StringType)
    return make_type()

# ---------- Get Spark schema from LinkML class ----------
def get_spark_schema_for_class(sv, class_name: str):
    """Build the Spark schema for one LinkML class and return it with its slots.

    Args:
        sv: Schema view that provides ``class_induced_slots``.
        class_name: Name of the class whose induced slots become columns.

    Returns:
        A ``(schema, slots)`` pair: the ``StructType`` with one ``StructField`` per
        induced slot, and the induced slots themselves in the same order.
    """
    induced_slots = sv.class_induced_slots(class_name)
    columns = []
    for induced_slot in induced_slots:
        column_type = map_range_to_spark_type(induced_slot.range, induced_slot.multivalued)
        # A column is nullable unless the slot is marked as required.
        is_nullable = not induced_slot.required
        columns.append(StructField(induced_slot.name, column_type, is_nullable))
    return StructType(columns), induced_slots

# ---------- Create dummy data row ----------
def generate_dummy_row(slots):
    """Produce one placeholder row with a value for each slot, in slot order.

    Args:
        slots: Iterable of objects exposing ``name`` and ``range`` attributes.

    Returns:
        A tuple holding ``0.5`` for ``"float"`` ranges, ``"<name>_001"`` for
        ``"UUID"`` ranges and ``"dummy_<name>"`` for everything else.
    """
    def placeholder_for(slot):
        # The range alone decides which kind of placeholder a slot receives.
        if slot.range == "float":
            return 0.5
        if slot.range == "UUID":
            return f"{slot.name}_001"
        return f"dummy_{slot.name}"

    return tuple(placeholder_for(slot) for slot in slots)

# ---------- Initialize Spark ----------
# Register Delta Lake's SQL extension and catalog implementation on the builder.
builder = (
    SparkSession.builder
    .appName("LinkML Delta Tables")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)
# If a session is already running (e.g. an earlier cell created one), getOrCreate()
# returns it and Spark warns that only runtime SQL configurations take effect.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

# ---------- Load LinkML Schema ----------
sv = SchemaView(schema_file)
all_classes = sv.all_classes()

# ---------- Process Each Class ----------
# For every LinkML class: print its Spark schema, write a one-row Delta table to
# <output_base_path>/<class_name>, then read the table back and display it.
for class_name in all_classes:
    print(f"\n▶️ Processing class: {class_name}")
    schema, slots = get_spark_schema_for_class(sv, class_name)

    # Show inferred schema
    print("🔍 Schema:")
    for field in schema.fields:
        print(f"  - {field.name}: {field.dataType}")

    # Create DataFrame with dummy row
    dummy_row = generate_dummy_row(slots)
    df = spark.createDataFrame([dummy_row], schema=schema)

    # Save to Delta. mode("overwrite") replaces the data; overwriteSchema also lets
    # the stored column layout be replaced, so re-running after the LinkML schema
    # has changed does not fail with a Delta schema-mismatch error.
    delta_path = f"{output_base_path}/{class_name}"
    (
        df.write.format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .save(delta_path)
    )
    print(f"✅ Delta table created at {delta_path}")

    # Read and show table content
    print("📄 Table Content:")
    df_read = spark.read.format("delta").load(delta_path)
    df_read.show()


25/05/20 15:16:52 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.



▶️ Processing class: Cluster
🔍 Schema:
  - cluster_id: StringType()
  - description: StringType()
  - protocol_id: StringType()


                                                                                

✅ Delta table created at /tmp/delta/Cluster
📄 Table Content:


                                                                                

+--------------+-----------------+-----------------+
|    cluster_id|      description|      protocol_id|
+--------------+-----------------+-----------------+
|cluster_id_001|dummy_description|dummy_protocol_id|
+--------------+-----------------+-----------------+


▶️ Processing class: Cluster_X_Protein
🔍 Schema:
  - cluster_id: StringType()
  - protein_id: StringType()
  - score: FloatType()


                                                                                

✅ Delta table created at /tmp/delta/Cluster_X_Protein
📄 Table Content:




+--------------+--------------+-----+
|    cluster_id|    protein_id|score|
+--------------+--------------+-----+
|cluster_id_001|protein_id_001|  0.5|
+--------------+--------------+-----+



                                                                                