In [38]:
import functools
import os
from typing import List

import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as f

from JRDBDataParsingTools.parse import load_schema, parse_line, create_pyspark_schema


# Notebook-wide pandas display settings, applied one option at a time.
_PANDAS_DISPLAY_OPTIONS = {
    "display.max_columns": None,  # no cap on how many columns are rendered
    "display.expand_frame_repr": False,  # keep wide frames on one line instead of wrapping
}
for _option_name, _option_value in _PANDAS_DISPLAY_OPTIONS.items():
    pd.set_option(_option_name, _option_value)


# Create (or reuse) the Spark session with the PostgreSQL JDBC jar registered via spark.jars.
_spark_builder = SparkSession.builder.config("spark.jars", "postgresql-42.7.1.jar")
spark = _spark_builder.getOrCreate()

23/12/16 14:30:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


Add required schemas and roles to the database.

```sql
CREATE SCHEMA IF NOT EXISTS bronze
    AUTHORIZATION myuser;

COMMENT ON SCHEMA bronze
    IS 'bronze layer schema';

GRANT ALL ON SCHEMA bronze TO myuser;
```

In [39]:
# Connection settings shared by the JDBC writes below.
# The URL, user and password can each be overridden with an environment
# variable so real credentials do not have to live in the notebook; the
# fallbacks are the original local-development values.
jdbc_common_options = {
    "url": os.environ.get("JRDB_JDBC_URL", "jdbc:postgresql://localhost:5432/jrdb"),
    "user": os.environ.get("JRDB_JDBC_USER", "myuser"),
    "password": os.environ.get("JRDB_JDBC_PASSWORD", "mypassword"),
    "driver": "org.postgresql.Driver",
}

In [13]:
# Ingest the SED text files into the bronze.sed table.
# The schema file is loaded once and reused for both the line parser and the
# DataFrame schema instead of being read twice.
sed_schema = load_schema("schemas/SED.yaml")

(
    spark.read.format("binaryFile")
    .load("jrdb-data/SED*.txt")
    .select("content")
    # Split each file's raw content into individual lines before parsing.
    .rdd.flatMap(lambda x: x[0].splitlines())
    .map(functools.partial(parse_line, schema=sed_schema))
    .toDF(create_pyspark_schema(sed_schema))
    .withColumn("input_file_name", f.input_file_name())
    .write.format("jdbc")
    .options(**jdbc_common_options)
    .option("dbtable", "bronze.sed")
    .mode("overwrite")
    .save()
)

23/12/16 13:20:48 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

In [41]:
# Ingest the KYI text files into the bronze.kyi table.
# The schema file is loaded once and reused for both the line parser and the
# DataFrame schema instead of being read twice.
kyi_schema = load_schema("schemas/KYI.yaml")

(
    spark.read.format("binaryFile")
    .load("downloads/KYI*.txt")
    .select("content")
    # Split each file's raw content into individual lines before parsing.
    .rdd.flatMap(lambda x: x[0].splitlines())
    .map(functools.partial(parse_line, schema=kyi_schema))
    .toDF(create_pyspark_schema(kyi_schema))
    .withColumn("input_file_name", f.input_file_name())
    .write.format("jdbc")
    .options(**jdbc_common_options)
    .option("dbtable", "bronze.kyi")
    .mode("overwrite")
    .save()
)

                                                                                

In [15]:
# Ingest the UKC text files into the bronze.ukc table.
# The schema file is loaded once and reused for both the line parser and the
# DataFrame schema instead of being read twice.
ukc_schema = load_schema("schemas/UKC.yaml")

(
    spark.read.format("binaryFile")
    .load("jrdb-data/UKC*.txt")
    .select("content")
    # Split each file's raw content into individual lines before parsing.
    .rdd.flatMap(lambda x: x[0].splitlines())
    .map(functools.partial(parse_line, schema=ukc_schema))
    .toDF(create_pyspark_schema(ukc_schema))
    .withColumn("input_file_name", f.input_file_name())
    .write.format("jdbc")
    .options(**jdbc_common_options)
    .option("dbtable", "bronze.ukc")
    .mode("overwrite")
    .save()
)

                                                                                

In [16]:
# Ingest the BAC text files into the bronze.bac table.
# The schema file is loaded once and reused for both the line parser and the
# DataFrame schema instead of being read twice.
bac_schema = load_schema("schemas/BAC.yaml")

(
    spark.read.format("binaryFile")
    .load("jrdb-data/BAC*.txt")
    .select("content")
    # Split each file's raw content into individual lines before parsing.
    .rdd.flatMap(lambda x: x[0].splitlines())
    .map(functools.partial(parse_line, schema=bac_schema))
    .toDF(create_pyspark_schema(bac_schema))
    .withColumn("input_file_name", f.input_file_name())
    .write.format("jdbc")
    .options(**jdbc_common_options)
    .option("dbtable", "bronze.bac")
    .mode("overwrite")
    .save()
)

                                                                                

In [17]:
# Parse the OZ text files into a DataFrame, then write it to bronze.oz.
# The schema file is loaded once and reused for both the line parser and the
# DataFrame schema instead of being read twice.
oz_schema = load_schema("schemas/OZ.yaml")

df = (
    spark.read.format("binaryFile")
    .load("jrdb-data/OZ*.txt")
    .select("content")
    # Split each file's raw content into individual lines before parsing.
    .rdd.flatMap(lambda x: x[0].splitlines())
    .map(functools.partial(parse_line, schema=oz_schema))
    .toDF(create_pyspark_schema(oz_schema))
)

# Persist the parsed rows, tagging each with the value of input_file_name().
(
    df
    .withColumn("input_file_name", f.input_file_name())
    .write.format("jdbc")
    .options(**jdbc_common_options)
    .option("dbtable", "bronze.oz")
    .mode("overwrite")
    .save()
)

                                                                                

In [1]:
import os

from JRDBDataParsingTools.download import download_and_extract_files


# Index page passed to the downloader as the starting URL.
webpage_url = "http://www.jrdb.com/member/datazip/Kyi/index.html"

# Credentials for basic authentication are read from the environment so they
# are never stored in the notebook. Set JRDB_USERNAME and JRDB_PASSWORD before
# running this cell; a missing variable raises KeyError.
username = os.environ["JRDB_USERNAME"]
password = os.environ["JRDB_PASSWORD"]

# The directory where you want to download the files
download_dir = "downloads"

download_and_extract_files(webpage_url, username, password, download_dir)

{"event": "Downloading and extracting files from http://www.jrdb.com/member/datazip/Kyi/index.html", "level": "info", "timestamp": "2023-12-16T05:55:04.165962Z", "logger": "JRDBDataParsingTools.download"}
{"event": "Response status code: 200", "level": "info", "timestamp": "2023-12-16T05:55:04.811757Z", "logger": "JRDBDataParsingTools.download"}
{"event": "Parsing webpage", "level": "info", "timestamp": "2023-12-16T05:55:04.812876Z", "logger": "JRDBDataParsingTools.download"}
{"event": "Processing year files", "level": "info", "timestamp": "2023-12-16T05:55:05.054696Z", "logger": "JRDBDataParsingTools.download"}
{"event": "Extracted files from KYI_2022.zip", "level": "info", "timestamp": "2023-12-16T05:55:17.483159Z", "logger": "JRDBDataParsingTools.download"}
{"event": "Extracted files from KYI_2021.zip", "level": "info", "timestamp": "2023-12-16T05:55:30.329634Z", "logger": "JRDBDataParsingTools.download"}
{"event": "Extracted files from KYI_2020.zip", "level": "info", "timestamp": "