In [3]:
from pyspark.sql import SparkSession
from JRDBDataParsingTools.parse import load_schema, parse_line, create_pyspark_schema
from pyspark.sql.types import StringType, StructType, StructField
from pyspark.sql import functions as f
import pandas as pd
from pyspark.sql.types import StructType, StructField, StringType
from typing import List
import functools

# pandas display settings for this notebook: show every column and keep each
# DataFrame repr on one unwrapped block instead of folding wide frames.
pd.options.display.max_columns = None
pd.options.display.expand_frame_repr = False


# Create (or reuse) the Spark session with the PostgreSQL JDBC driver jar on
# "spark.jars"; the JDBC writes in the cells below use that driver.
session_builder = SparkSession.builder.config("spark.jars", "postgresql-42.7.1.jar")
spark = session_builder.getOrCreate()

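Before running the load cells below, create the `bronze` schema in the target database and give the loading role (`myuser` in this example) access to it: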

```sql
CREATE SCHEMA IF NOT EXISTS bronze
    AUTHORIZATION myuser;

COMMENT ON SCHEMA bronze
    IS 'bronze layer schema';

GRANT ALL ON SCHEMA bronze TO myuser;
```

In [2]:
import os

# Connection options shared by every JDBC write in this notebook.
#
# The URL, user and password are read from the environment so that real
# credentials never have to be committed with the notebook. The fallbacks are
# the local development values this notebook has always used, so behaviour is
# unchanged when the variables are not set.
jdbc_common_options = {
    "url": os.environ.get("JRDB_JDBC_URL", "jdbc:postgresql://localhost:5432/jrdb"),
    "user": os.environ.get("JRDB_DB_USER", "myuser"),
    "password": os.environ.get("JRDB_DB_PASSWORD", "mypassword"),
    "driver": "org.postgresql.Driver",
}

In [10]:
# Load every line of the SED*.txt files into the bronze.sed table.

# Read the YAML schema once and reuse it for both parsing and the Spark schema.
sed_schema = load_schema("schemas/SED.yaml")
parse_sed_line = functools.partial(parse_line, schema=sed_schema)

# Each parsed record travels together with the path of the file it came from.
# f.input_file_name() cannot be used for this: it is evaluated after the rows
# have gone through .rdd / Python / toDF, where it is no longer reliably tied
# to the file a given line was read from. The binaryFile source exposes the
# file path as a regular `path` column, so that value is carried explicitly.
# Wrapping the record in a nested struct keeps this independent of the row
# shape that parse_line returns.
sed_row_type = StructType(
    [
        StructField("record", create_pyspark_schema(sed_schema), nullable=False),
        StructField("input_file_name", StringType(), nullable=False),
    ]
)

(
    spark.read.format("binaryFile")
    .load("jrdb-data/SED*.txt")
    # binaryFile yields one row per file: keep its path next to its bytes.
    .select("path", "content")
    .rdd.flatMap(
        lambda file_row: (
            (file_row.path, line) for line in file_row.content.splitlines()
        )
    )
    .map(lambda path_line: (parse_sed_line(path_line[1]), path_line[0]))
    .toDF(sed_row_type)
    # Flatten back to the record's own columns followed by the file name.
    .select("record.*", "input_file_name")
    .write.format("jdbc")
    .options(**jdbc_common_options)
    .option("dbtable", "bronze.sed")
    .mode("overwrite")  # replaces the existing table contents on every run
    .save()
)

                                                                                

In [13]:
# Load every line of the KYI*.txt files into the bronze.kyi table.

# Read the YAML schema once and reuse it for both parsing and the Spark schema.
kyi_schema = load_schema("schemas/KYI.yaml")
parse_kyi_line = functools.partial(parse_line, schema=kyi_schema)

# Each parsed record travels together with the path of the file it came from.
# f.input_file_name() cannot be used for this: it is evaluated after the rows
# have gone through .rdd / Python / toDF, where it is no longer reliably tied
# to the file a given line was read from. The binaryFile source exposes the
# file path as a regular `path` column, so that value is carried explicitly.
# Wrapping the record in a nested struct keeps this independent of the row
# shape that parse_line returns.
kyi_row_type = StructType(
    [
        StructField("record", create_pyspark_schema(kyi_schema), nullable=False),
        StructField("input_file_name", StringType(), nullable=False),
    ]
)

(
    spark.read.format("binaryFile")
    .load("jrdb-data/KYI*.txt")
    # binaryFile yields one row per file: keep its path next to its bytes.
    .select("path", "content")
    .rdd.flatMap(
        lambda file_row: (
            (file_row.path, line) for line in file_row.content.splitlines()
        )
    )
    .map(lambda path_line: (parse_kyi_line(path_line[1]), path_line[0]))
    .toDF(kyi_row_type)
    # Flatten back to the record's own columns followed by the file name.
    .select("record.*", "input_file_name")
    .write.format("jdbc")
    .options(**jdbc_common_options)
    .option("dbtable", "bronze.kyi")
    .mode("overwrite")  # replaces the existing table contents on every run
    .save()
)

                                                                                

In [14]:
# Load every line of the UKC*.txt files into the bronze.ukc table.

# Read the YAML schema once and reuse it for both parsing and the Spark schema.
ukc_schema = load_schema("schemas/UKC.yaml")
parse_ukc_line = functools.partial(parse_line, schema=ukc_schema)

# Each parsed record travels together with the path of the file it came from.
# f.input_file_name() cannot be used for this: it is evaluated after the rows
# have gone through .rdd / Python / toDF, where it is no longer reliably tied
# to the file a given line was read from. The binaryFile source exposes the
# file path as a regular `path` column, so that value is carried explicitly.
# Wrapping the record in a nested struct keeps this independent of the row
# shape that parse_line returns.
ukc_row_type = StructType(
    [
        StructField("record", create_pyspark_schema(ukc_schema), nullable=False),
        StructField("input_file_name", StringType(), nullable=False),
    ]
)

(
    spark.read.format("binaryFile")
    .load("jrdb-data/UKC*.txt")
    # binaryFile yields one row per file: keep its path next to its bytes.
    .select("path", "content")
    .rdd.flatMap(
        lambda file_row: (
            (file_row.path, line) for line in file_row.content.splitlines()
        )
    )
    .map(lambda path_line: (parse_ukc_line(path_line[1]), path_line[0]))
    .toDF(ukc_row_type)
    # Flatten back to the record's own columns followed by the file name.
    .select("record.*", "input_file_name")
    .write.format("jdbc")
    .options(**jdbc_common_options)
    .option("dbtable", "bronze.ukc")
    .mode("overwrite")  # replaces the existing table contents on every run
    .save()
)

                                                                                

In [15]:
# Load every line of the BAC*.txt files into the bronze.bac table.

# Read the YAML schema once and reuse it for both parsing and the Spark schema.
bac_schema = load_schema("schemas/BAC.yaml")
parse_bac_line = functools.partial(parse_line, schema=bac_schema)

# Each parsed record travels together with the path of the file it came from.
# f.input_file_name() cannot be used for this: it is evaluated after the rows
# have gone through .rdd / Python / toDF, where it is no longer reliably tied
# to the file a given line was read from. The binaryFile source exposes the
# file path as a regular `path` column, so that value is carried explicitly.
# Wrapping the record in a nested struct keeps this independent of the row
# shape that parse_line returns.
bac_row_type = StructType(
    [
        StructField("record", create_pyspark_schema(bac_schema), nullable=False),
        StructField("input_file_name", StringType(), nullable=False),
    ]
)

(
    spark.read.format("binaryFile")
    .load("jrdb-data/BAC*.txt")
    # binaryFile yields one row per file: keep its path next to its bytes.
    .select("path", "content")
    .rdd.flatMap(
        lambda file_row: (
            (file_row.path, line) for line in file_row.content.splitlines()
        )
    )
    .map(lambda path_line: (parse_bac_line(path_line[1]), path_line[0]))
    .toDF(bac_row_type)
    # Flatten back to the record's own columns followed by the file name.
    .select("record.*", "input_file_name")
    .write.format("jdbc")
    .options(**jdbc_common_options)
    .option("dbtable", "bronze.bac")
    .mode("overwrite")  # replaces the existing table contents on every run
    .save()
)

                                                                                

In [10]:
# Load every line of the OZ*.txt files into the bronze.oz table.

# Read the YAML schema once and reuse it for both parsing and the Spark schema.
oz_schema = load_schema("schemas/OZ.yaml")
parse_oz_line = functools.partial(parse_line, schema=oz_schema)

# Each parsed record travels together with the path of the file it came from.
# f.input_file_name() cannot be used for this: it is evaluated after the rows
# have gone through .rdd / Python / toDF, where it is no longer reliably tied
# to the file a given line was read from. The binaryFile source exposes the
# file path as a regular `path` column, so that value is carried explicitly.
# Wrapping the record in a nested struct keeps this independent of the row
# shape that parse_line returns.
oz_row_type = StructType(
    [
        StructField("record", create_pyspark_schema(oz_schema), nullable=False),
        StructField("input_file_name", StringType(), nullable=False),
    ]
)

# Parsed OZ records, one row per input line, with the source file path in the
# trailing `input_file_name` column.
df = (
    spark.read.format("binaryFile")
    .load("jrdb-data/OZ*.txt")
    # binaryFile yields one row per file: keep its path next to its bytes.
    .select("path", "content")
    .rdd.flatMap(
        lambda file_row: (
            (file_row.path, line) for line in file_row.content.splitlines()
        )
    )
    .map(lambda path_line: (parse_oz_line(path_line[1]), path_line[0]))
    .toDF(oz_row_type)
    # Flatten back to the record's own columns followed by the file name.
    .select("record.*", "input_file_name")
)

(
    df
    .write.format("jdbc")
    .options(**jdbc_common_options)
    .option("dbtable", "bronze.oz")
    .mode("overwrite")  # replaces the existing table contents on every run
    .save()
)

                                                                                