In [2]:
import os

from delta import configure_spark_with_delta_pip
from linkml_runtime.utils.schemaview import SchemaView
from pyspark.sql import SparkSession
from pyspark.sql.types import ArrayType, FloatType, StringType, StructField, StructType

In [3]:


def map_range_to_spark_type(slot_range: str, multivalued: bool = False):
    """Map a LinkML slot range to a Spark SQL data type.

    Args:
        slot_range: Name of the LinkML range (e.g. ``"string"``, ``"float"``).
            Any range without an explicit entry in ``type_map`` (including
            ``None``) falls back to ``StringType``. Extend ``type_map`` as needed.
        multivalued: Whether the slot holds a list of values. When truthy, the
            element type is wrapped in ``ArrayType``.

    Returns:
        The mapped Spark type for single-valued slots, or an ``ArrayType`` of
        that type for multivalued slots.
    """
    type_map = {
        "string": StringType(),
        "float": FloatType(),
        "UUID": StringType(),  # Spark doesn't have a UUIDType, use StringType
    }
    element_type = type_map.get(slot_range, StringType())
    # Truthiness check so an unset flag (None) is treated as single-valued.
    if multivalued:
        return ArrayType(element_type)
    return element_type

def get_spark_schema_for_class(sv, class_name: str):
    """Build a Spark ``StructType`` from the induced slots of a LinkML class.

    Every slot returned by ``sv.class_induced_slots(class_name)`` becomes one
    ``StructField`` named after the slot. The field type comes from
    ``map_range_to_spark_type`` and the field is nullable unless the slot is
    marked as required.

    Args:
        sv: Schema view object exposing ``class_induced_slots``.
        class_name: Name of the class whose slots define the columns.

    Returns:
        A ``StructType`` with one field per induced slot, in the order the
        slots are returned.
    """
    struct_fields = []
    for induced_slot in sv.class_induced_slots(class_name):
        data_type = map_range_to_spark_type(induced_slot.range, induced_slot.multivalued)
        # An unset `required` (falsy) yields a nullable column.
        nullable = not induced_slot.required
        struct_fields.append(StructField(induced_slot.name, data_type, nullable))
    return StructType(struct_fields)

# Parse the LinkML schema that drives the table definitions below
sv = SchemaView("cluster-schema.yaml")

# Build a Spark session with the Delta SQL extension and catalog enabled
# NOTE(review): the app name mentions "EcommerceSchema" although the schema
# loaded above is cluster-schema.yaml - possibly a leftover; confirm.
builder = (
    SparkSession.builder
    .appName("EcommerceSchema to Delta")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
)
spark = configure_spark_with_delta_pip(builder).getOrCreate()

# ---------------------
# `Cluster` table: schema derived from the LinkML class, one sample row,
# written as Delta (any existing table at the path is overwritten)
cluster_schema = get_spark_schema_for_class(sv, "Cluster")
cluster_data = [
    ("c001", "Cluster generated using protocol X", "prot001"),
]
df_cluster = spark.createDataFrame(cluster_data, schema=cluster_schema)
(
    df_cluster.write
    .format("delta")
    .mode("overwrite")
    .save("/tmp/delta/Cluster")
)

# ---------------------
# `Cluster_X_Protein` table: same recipe, two sample rows
cxp_schema = get_spark_schema_for_class(sv, "Cluster_X_Protein")
cxp_data = [
    ("c001", "p001", 0.98),
    ("c001", "p002", 0.75),
]
df_cxp = spark.createDataFrame(cxp_data, schema=cxp_schema)
(
    df_cxp.write
    .format("delta")
    .mode("overwrite")
    .save("/tmp/delta/Cluster_X_Protein")
)


:: loading settings :: url = jar:file:/opt/spark-3.4.1-bin-hadoop3/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-abe2678b-90a5-419e-bf8f-ab47db38d9a1;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.4.0 in central
	found io.delta#delta-storage;2.4.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
:: resolution report :: resolve 328ms :: artifacts dl 7ms
	:: modules in use:
	io.delta#delta-core_2.12;2.4.0 from central in [default]
	io.delta#delta-storage;2.4.0 from central in [default]
	org.antlr#antlr4-runtime;4.9.3 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   3   |   0   |   0   |   0  

In [4]:
# Read the Cluster Delta table back from disk to verify the write.
# Note: this rebinds `df_cluster`, replacing the in-memory frame created earlier.
df_cluster = spark.read.format("delta").load("/tmp/delta/Cluster")
print("=== Cluster Schema ===")
df_cluster.printSchema()

print("=== Cluster Data ===")
# truncate=False so long column values (e.g. the description) are shown in full
df_cluster.show(truncate=False)

# Read the Cluster_X_Protein Delta table back from disk to verify the write.
# Note: this rebinds `df_cxp`, replacing the in-memory frame created earlier.
df_cxp = spark.read.format("delta").load("/tmp/delta/Cluster_X_Protein")
print("=== Cluster_X_Protein Schema ===")
df_cxp.printSchema()

print("=== Cluster_X_Protein Data ===")
df_cxp.show(truncate=False)


=== Cluster Schema ===
root
 |-- cluster_id: string (nullable = true)
 |-- description: string (nullable = true)
 |-- protocol_id: string (nullable = true)

=== Cluster Data ===


                                                                                

+----------+----------------------------------+-----------+
|cluster_id|description                       |protocol_id|
+----------+----------------------------------+-----------+
|c001      |Cluster generated using protocol X|prot001    |
+----------+----------------------------------+-----------+

=== Cluster_X_Protein Schema ===
root
 |-- cluster_id: string (nullable = true)
 |-- protein_id: string (nullable = true)
 |-- score: float (nullable = true)

=== Cluster_X_Protein Data ===


                                                                                

+----------+----------+-----+
|cluster_id|protein_id|score|
+----------+----------+-----+
|c001      |p001      |0.98 |
|c001      |p002      |0.75 |
+----------+----------+-----+

