In [7]:
import functools
import os

import pandas as pd
from pyspark.sql import Row
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql.types import StringType, StructType, StructField

from JRDBDataParsingTools.parse import decode_cp932, load_schema, parse_line

# Pandas display settings: show every column and keep wide frames on a single
# row block instead of wrapping them.
display_options = {
    "display.max_columns": None,
    "display.expand_frame_repr": False,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

# Spark session with the PostgreSQL JDBC driver jar added via "spark.jars".
session_builder = SparkSession.builder.config("spark.jars", "postgresql-42.7.1.jar")
spark = session_builder.getOrCreate()

In [8]:
# Load the SED record layout once: the same definition builds the all-string
# DataFrame schema and drives the per-line parser.
sed_schema = load_schema("schemas/SED.yaml")

schema_fields = StructType(
    [StructField(field.name, StringType(), True) for field in sed_schema]
)

# Each binaryFile row is one whole file.  The file path is carried through the
# RDD explicitly as the key of a (path, value) pair: input_file_name() is only
# dependable directly on a file-based scan, not after the rows have been
# rebuilt from a Python RDD, so it must not be used to tag the parsed records.
parsed_by_path = (
    spark.read.format("binaryFile")
    .load("jrdb-data/SED*.txt")
    .select("path", "content")
    # (path, content) -> one (path, raw_line) pair per line of the file
    .rdd.flatMapValues(lambda content: content.splitlines())
    # (path, raw_line) -> (path, parsed_record)
    .mapValues(functools.partial(parse_line, schema=sed_schema))
)

df = (
    parsed_by_path.toDF(
        StructType(
            [
                StructField("input_file_name", StringType(), True),
                # Nesting the record keeps this independent of the exact
                # container type parse_line returns.
                StructField("record", schema_fields, True),
            ]
        )
    )
    # Flatten back to the original layout: parsed fields first, file name last.
    .select("record.*", "input_file_name")
)

df.show()

                                                                                

+-------------------+-------------+-------------+-------------+-------------+----+-------------------------+-------------------+------------------+---------------+--------------------------------------+----------------------------+----------------------------+-------------------+---------------+---------------+---------------+---------------+-------------------+-------------------+---------------+-----------------------+-----------+---------------+-------------+-----------+-------------+---------------+---------------------+-----------------------+---------------------+-------------------+---------------------+---------------------+-------------------+---------------------+-------------------+---------------------+---------------------+---------------------+---------------------+-------------------------+---------------------------+---------------------------+-------------------------+-------------------------+---------------------------+-----------------------+-----------------------+

In [9]:
# JDBC connection settings for the PostgreSQL target.  Each value can be
# overridden through an environment variable so real credentials do not have
# to be written into the notebook; the fallbacks are the previous literals.
url = os.environ.get("JRDB_JDBC_URL", "jdbc:postgresql://localhost:5432/mydatabase")
properties = {
    "user": os.environ.get("JRDB_DB_USER", "myuser"),
    "password": os.environ.get("JRDB_DB_PASSWORD", "mypassword"),
    "driver": "org.postgresql.Driver",
}

(
    df.write.format("jdbc")
    .option("url", url)
    # Schema-qualified target: table "sed" inside schema "jrdb".
    .option("dbtable", "jrdb.sed")
    .options(**properties)
    # "overwrite" replaces whatever the target table currently holds.
    .mode("overwrite")
    .save()
)

                                                                                