In [10]:
from delta import configure_spark_with_delta_pip
from delta.tables import DeltaTable
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col

In [5]:
# Directory that holds the connector jars shipped with this workspace.
JARS_DIR = "/opt/workspace/utils/jars"

# Jar files handed to Spark through `spark.jars`: the Kafka source and its
# client/token-provider/pooling dependencies, followed by the Delta Lake jars.
JAR_FILES = [
    "spark-sql-kafka-0-10_2.12-3.4.0.jar",
    "kafka-clients-3.3.2.jar",
    "spark-token-provider-kafka-0-10_2.12-3.4.0.jar",
    "commons-pool2-2.11.1.jar",
    "delta-core_2.12-2.4.0.jar",
    "delta-storage-2.4.0.jar",
]

# `spark.jars` expects a comma-separated list. Build it without embedded
# newlines or surrounding whitespace so the value does not depend on Spark
# trimming each entry.
jars = ",".join(f"{JARS_DIR}/{jar_file}" for jar_file in JAR_FILES)

# Session-level settings layered on top of the base builder: the Delta Lake
# SQL extension, the Delta catalog implementation, and the extra jars.
session_conf = {
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    "spark.jars": jars,
}

# Target the standalone master and name the application.
builder = SparkSession.builder.master("spark://spark:7077").appName("cleaned")
for conf_key, conf_value in session_conf.items():
    builder = builder.config(conf_key, conf_value)

# Let the delta package finish configuring the builder, then create the
# session (or reuse one that is already running).
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [17]:
# Load the Delta table stored under the raw data path into a DataFrame.
delta_reader = spark.read.format('delta')
df = delta_reader.load('/opt/workspace/data/raw/test/')

In [38]:
# DDL-style schema string used with `from_json` to parse the JSON text held
# in the `value` column. Only the `payload` object is described:
#   - before: kept as a plain string
#   - after:  struct with the record fields id, first_name, last_name, email
#   - op:     string
# NOTE(review): the before/after/op layout looks like a change-data-capture
# envelope (e.g. Debezium) — confirm against the producer.
# NOTE(review): every leaf is declared STRING, including `id`; confirm that
# this is the intended type for downstream consumers.
schema = """
payload STRUCT<
    before:STRING,
    after:STRUCT<
        id: STRING, 
        first_name: STRING, 
        last_name: STRING, 
        email: STRING
    >, 
    op: STRING
>
"""

In [39]:
# Parse the JSON text in `value` with `schema`, then flatten the nested
# `payload.after` fields and `payload.op` into top-level columns.
# The expression is left unassigned so the notebook displays the result.
(
    df
    .select(from_json(col('value'), schema).alias('json'))
    .select(
        col('json.payload.after.id').alias('id'),
        col('json.payload.after.first_name').alias('first_name'),
        col('json.payload.after.last_name').alias('last_name'),
        col('json.payload.after.email').alias('email'),
        col('json.payload.op').alias('op'),
    )
)

DataFrame[id: string, first_name: string, last_name: string, email: string, op: string]

In [13]:
# Print the first rows of the raw table, with long cell values truncated
# (the defaults of `show`, spelled out).
df.show(n=20, truncate=True)

+--------------------+--------------------+--------------------+
|                 key|               value|             new_col|
+--------------------+--------------------+--------------------+
|{"schema":{"type"...|{"schema":{"type"...|{{null, {"id":100...|
|{"schema":{"type"...|{"schema":{"type"...|{{null, {"id":100...|
|{"schema":{"type"...|{"schema":{"type"...|{{null, {"id":100...|
|{"schema":{"type"...|{"schema":{"type"...|{{null, {"id":100...|
|{"schema":{"type"...|{"schema":{"type"...|{{null, {"id":100...|
+--------------------+--------------------+--------------------+



In [3]:
# Shut down the Spark session held in `spark` once the notebook is done with it.
spark.stop()