In [47]:
import functools
import os

import pandas as pd
from pyspark.sql import Row
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql.types import StringType, BinaryType, StructType, StructField

from JRDBDataParsingTools.parse import decode_cp932, load_schema, parse_text
from JRDBDataParsingTools.parse_SED import parse_line

# pandas rendering: never elide columns, and keep wide frames on a single
# line instead of wrapping them across several blocks.
pd.options.display.max_columns = None
pd.options.display.expand_frame_repr = False


# Reuse the active Spark session if one exists; otherwise start one with the
# PostgreSQL JDBC driver jar on the classpath.
spark = (
    SparkSession.builder
    .config("spark.jars", "postgresql-42.7.1.jar")
    .getOrCreate()
)

In [51]:
def parse_line3(line, schema):
    """Cut one fixed-width record into a ``Row`` of stripped text fields.

    Args:
        line: The raw record; it is sliced and each slice is ``.decode``d, so
            it must be a bytes-like object.
        schema: Iterable of field descriptors exposing ``relative`` (1-based
            start offset) and ``byte_length`` (width in bytes).

    Returns:
        A positional ``Row`` holding one cp932-decoded, whitespace-stripped
        string per schema field, in schema order.

    Raises:
        UnicodeDecodeError: If a field's bytes are not valid cp932.
    """

    def extract(spec):
        # Schema offsets are 1-based; Python slices are 0-based.
        offset = spec.relative - 1
        raw = line[offset:offset + spec.byte_length]
        return raw.decode("cp932").strip()

    return Row(*map(extract, schema))


# Load the SED record layout once and share it between the DataFrame schema
# and the line parser, so both are guaranteed to describe the same fields
# (previously the YAML file was loaded twice, independently).
sed_schema = load_schema("schemas/SED.yaml")

# Every parsed field becomes a nullable string column named after its schema entry.
schema_fields = StructType(
    [StructField(field.name, StringType(), True) for field in sed_schema]
)

# Read the whole file as one binary blob, split it into raw byte lines, and
# parse each line into a Row whose positions match `schema_fields`.
df1 = (
    spark.read.format("binaryFile")
    .load("SED040828.txt")
    .select("content")
    .rdd.flatMap(lambda x: x[0].splitlines())
    .map(functools.partial(parse_line3, schema=sed_schema))
    .toDF(schema_fields)
)

df1.show()

+-------------------+-------------+-------------+-------------+-------------+----+-------------------------+-------------------+------------------+---------------+--------------------------------------+----------------------------+----------------------------+-------------------+---------------+---------------+---------------+---------------+-------------------+-------------------+---------------+-----------------------+-----------+---------------+-------------+-----------+-------------+---------------+---------------------+-----------------------+---------------------+-------------------+---------------------+---------------------+-------------------+---------------------+-------------------+---------------------+---------------------+---------------------+---------------------+-------------------------+---------------------------+---------------------------+-------------------------+-------------------------+---------------------------+-----------------------+-----------------------+

In [2]:
# Build one row per text line of every input file, tagged with the file it
# came from.
#
# The file name is taken from the `path` column that the binaryFile source
# provides and is carried explicitly through the flatMap. Calling
# `input_file_name()` after the Python RDD round trip is not reliable: by then
# the rows are no longer tied to the file scan, so the function can yield an
# empty or mismatched name.
df = (
    spark.read.format("binaryFile")
    .load("jrdb-data/*.txt")
    .select("path", "content")
    .rdd
    .flatMap(
        lambda file_row: (
            (file_row["path"], decode_cp932(raw_line))
            for raw_line in file_row["content"].splitlines()
        )
    )
    .toDF(
        StructType(
            [
                StructField("input_file_name", StringType(), True),
                StructField("text", StringType(), True),
            ]
        )
    )
)

                                                                                

In [5]:
# Keep only lines that came from SED files (SED + six digits + ".txt").
# Raw string: "\d" is not a valid Python escape, and the dot is escaped and
# the pattern end-anchored so only a literal ".txt" suffix matches.
df_sed_lines = df.where(df.input_file_name.rlike(r"SED\d{6}\.txt$"))

# Expand the fixed-width `text` column into the fields described by the SED schema.
df_sed = parse_text(df_sed_lines, schema=load_schema("schemas/SED.yaml"), parse_column_name="text")

# Export the parsed fields without the raw source line. Spark writes a
# directory named "SED.csv" containing part files, replacing any previous export.
df_sed.drop("text").write.mode("overwrite").csv("SED.csv", header=True)

                                                                                

In [7]:
# Connection settings are read from the environment so real credentials never
# have to be committed with the notebook; the fallbacks are the local
# development placeholders.
url = os.environ.get("JRDB_JDBC_URL", "jdbc:postgresql://localhost:5432/mydatabase")
properties = {
    "user": os.environ.get("JRDB_DB_USER", "myuser"),
    "password": os.environ.get("JRDB_DB_PASSWORD", "mypassword"),
    "driver": "org.postgresql.Driver",
}

# Persist the parsed SED records, mirroring the CSV export. The previous
# version wrote `df` (the raw, unparsed lines of every input file) into the
# "SED" table, which does not match the table's name.
# NOTE(review): confirm that the parsed frame is what the table should hold.
# mode("overwrite") replaces the existing table contents on every run.
(
    df_sed.drop("text")
    .write
    .format("jdbc")
    .option("url", url)
    .option("dbtable", "SED")
    .options(**properties)
    .mode("overwrite")
    .save()
)

                                                                                