In [1]:
import functools
import os
from typing import List

from pyspark.sql import functions as f
from pyspark.sql import SparkSession
from JRDBDataParsingTools.data_schema import load_schema, create_pyspark_schema
from JRDBDataParsingTools.data_parser import parse_line
from JRDBDataParsingTools.file_downloader import download_and_extract_files
from JRDBDataParsingTools.structured_logger import logger

In [2]:
%load_ext dotenv

# Download files from the web

In [3]:
# JRDB credentials, read from the environment so they are never stored in the
# notebook. Each value is None when its variable is not set.
username = os.getenv("JRDB_USERNAME")
password = os.getenv("JRDB_PASSWORD")
# The directory where you want to download the files.
# Must be an absolute path, so the value is normalised with os.path.abspath.
# Set JRDB_DOWNLOAD_DIR to use a different location; the default keeps the
# path this notebook was originally written against.
download_dir = os.path.abspath(
    os.getenv(
        "JRDB_DOWNLOAD_DIR",
        "/Users/hankehly/Projects/JRDBDataParsingTools/downloads",
    )
)

In [4]:
# Index pages of the JRDB datasets to download.
# Taken from http://www.jrdb.com/member/dataindex.html
target_dataset_urls = [
    "http://www.jrdb.com/member/datazip/Kab/index.html",
    "http://www.jrdb.com/member/datazip/Bac/index.html",
    "http://www.jrdb.com/member/datazip/Kyi/index.html",
    "http://www.jrdb.com/member/datazip/Ukc/index.html",
    "http://www.jrdb.com/member/datazip/Oz/index.html",
    # The trailing comma matters: without it Python concatenates this string
    # with the next one, producing a single invalid URL.
    "http://www.jrdb.com/member/datazip/Oz/index2.html",  # "OW"
    "http://www.jrdb.com/member/datazip/Ou/index.html",
    "http://www.jrdb.com/member/datazip/Ot/index.html",
    "http://www.jrdb.com/member/datazip/Ov/index.html",
    "http://www.jrdb.com/member/datazip/Cyb/index.html",
    "http://www.jrdb.com/member/datazip/Cha/index.html",
    "http://www.jrdb.com/member/datazip/Sed/index.html",
    "http://www.jrdb.com/member/datazip/Skb/index.html",
    "http://www.jrdb.com/member/datazip/Tyb/index.html",
    "http://www.jrdb.com/member/datazip/Hjc/index.html",
]

# Process each index page in turn, passing the JRDB credentials and the
# download directory to the downloader.
for webpage_url in target_dataset_urls:
    download_and_extract_files(webpage_url, username, password, download_dir)

{"event": "Downloading and extracting files from http://www.jrdb.com/member/datazip/Kab/index.html", "level": "info", "timestamp": "2023-12-16T07:55:25.444945Z", "logger": "JRDBDataParsingTools.file_downloader"}
{"event": "Downloading http://www.jrdb.com/member/datazip/Kab/KAB_2022.zip", "level": "info", "timestamp": "2023-12-16T07:55:26.553216Z", "logger": "JRDBDataParsingTools.file_downloader"}
{"event": "Downloading http://www.jrdb.com/member/datazip/Kab/KAB_2021.zip", "level": "info", "timestamp": "2023-12-16T07:55:26.848891Z", "logger": "JRDBDataParsingTools.file_downloader"}
{"event": "Downloading http://www.jrdb.com/member/datazip/Kab/KAB_2020.zip", "level": "info", "timestamp": "2023-12-16T07:55:27.131184Z", "logger": "JRDBDataParsingTools.file_downloader"}
{"event": "Downloading http://www.jrdb.com/member/datazip/Kab/KAB_2019.zip", "level": "info", "timestamp": "2023-12-16T07:55:27.429306Z", "logger": "JRDBDataParsingTools.file_downloader"}
{"event": "Downloading http://www.jr

# Import files into Postgres

In [None]:
# Create the Spark session (or reuse one that already exists), registering the
# PostgreSQL JDBC driver jar via `spark.jars` for the JDBC writes further down.
# NOTE(review): the jar path is relative — presumably resolved against the
# notebook's working directory; confirm the jar is present there.
spark = SparkSession.builder.config("spark.jars", "postgresql-42.7.1.jar").getOrCreate()

In [None]:
# Connection options shared by every JDBC write in this notebook.
# The URL and credentials come from the environment so real secrets never have
# to be committed; the fallbacks are the original local-development values.
jdbc_common_options = {
    "url": os.getenv("JRDB_JDBC_URL", "jdbc:postgresql://localhost:5432/jrdb"),
    "user": os.getenv("JRDB_DB_USER", "myuser"),
    "password": os.getenv("JRDB_DB_PASSWORD", "mypassword"),
    "driver": "org.postgresql.Driver",
}

In [None]:
# Ingest the SED text files into the `bronze.sed` Postgres table.
# Load the schema definition once and reuse it for both the line parser and
# the DataFrame schema (it was previously read from disk twice).
sed_schema = load_schema("schemas/SED.yaml")

sed_df = (
    spark.read.format("binaryFile")
    .load("jrdb-data/SED*.txt")
    .select("content")
    # `content` holds a whole file as bytes; split it into individual lines.
    .rdd.flatMap(lambda x: x[0].splitlines())
    .map(functools.partial(parse_line, schema=sed_schema))
    .toDF(create_pyspark_schema(sed_schema))
)

(
    sed_df
    # NOTE(review): this DataFrame is rebuilt from an RDD, so confirm that
    # input_file_name() is actually populated here; the binaryFile source also
    # exposes a `path` column that could be carried through instead.
    .withColumn("input_file_name", f.input_file_name())
    .write.format("jdbc")
    .options(**jdbc_common_options)
    .option("dbtable", "bronze.sed")
    # "overwrite" replaces any existing contents of the target table.
    .mode("overwrite")
    .save()
)

In [None]:
# Ingest the KYI text files into the `bronze.kyi` Postgres table.
# Load the schema definition once and reuse it for both the line parser and
# the DataFrame schema (it was previously read from disk twice).
kyi_schema = load_schema("schemas/KYI.yaml")

kyi_df = (
    spark.read.format("binaryFile")
    # NOTE(review): this cell reads from `downloads/` while the other ingestion
    # cells read from `jrdb-data/` — confirm which directory is intended.
    .load("downloads/KYI*.txt")
    .select("content")
    # `content` holds a whole file as bytes; split it into individual lines.
    .rdd.flatMap(lambda x: x[0].splitlines())
    .map(functools.partial(parse_line, schema=kyi_schema))
    .toDF(create_pyspark_schema(kyi_schema))
)

(
    kyi_df
    # NOTE(review): this DataFrame is rebuilt from an RDD, so confirm that
    # input_file_name() is actually populated here; the binaryFile source also
    # exposes a `path` column that could be carried through instead.
    .withColumn("input_file_name", f.input_file_name())
    .write.format("jdbc")
    .options(**jdbc_common_options)
    .option("dbtable", "bronze.kyi")
    # "overwrite" replaces any existing contents of the target table.
    .mode("overwrite")
    .save()
)

In [None]:
# Ingest the UKC text files into the `bronze.ukc` Postgres table.
# Load the schema definition once and reuse it for both the line parser and
# the DataFrame schema (it was previously read from disk twice).
ukc_schema = load_schema("schemas/UKC.yaml")

ukc_df = (
    spark.read.format("binaryFile")
    .load("jrdb-data/UKC*.txt")
    .select("content")
    # `content` holds a whole file as bytes; split it into individual lines.
    .rdd.flatMap(lambda x: x[0].splitlines())
    .map(functools.partial(parse_line, schema=ukc_schema))
    .toDF(create_pyspark_schema(ukc_schema))
)

(
    ukc_df
    # NOTE(review): this DataFrame is rebuilt from an RDD, so confirm that
    # input_file_name() is actually populated here; the binaryFile source also
    # exposes a `path` column that could be carried through instead.
    .withColumn("input_file_name", f.input_file_name())
    .write.format("jdbc")
    .options(**jdbc_common_options)
    .option("dbtable", "bronze.ukc")
    # "overwrite" replaces any existing contents of the target table.
    .mode("overwrite")
    .save()
)

In [None]:
# Ingest the BAC text files into the `bronze.bac` Postgres table.
# Load the schema definition once and reuse it for both the line parser and
# the DataFrame schema (it was previously read from disk twice).
bac_schema = load_schema("schemas/BAC.yaml")

bac_df = (
    spark.read.format("binaryFile")
    .load("jrdb-data/BAC*.txt")
    .select("content")
    # `content` holds a whole file as bytes; split it into individual lines.
    .rdd.flatMap(lambda x: x[0].splitlines())
    .map(functools.partial(parse_line, schema=bac_schema))
    .toDF(create_pyspark_schema(bac_schema))
)

(
    bac_df
    # NOTE(review): this DataFrame is rebuilt from an RDD, so confirm that
    # input_file_name() is actually populated here; the binaryFile source also
    # exposes a `path` column that could be carried through instead.
    .withColumn("input_file_name", f.input_file_name())
    .write.format("jdbc")
    .options(**jdbc_common_options)
    .option("dbtable", "bronze.bac")
    # "overwrite" replaces any existing contents of the target table.
    .mode("overwrite")
    .save()
)

In [None]:
# Ingest the OZ text files into the `bronze.oz` Postgres table.
# Load the schema definition once and reuse it for both the line parser and
# the DataFrame schema (it was previously read from disk twice).
oz_schema = load_schema("schemas/OZ.yaml")

# Parsed OZ records; the name `df` is kept as-is in case later cells use it.
df = (
    spark.read.format("binaryFile")
    .load("jrdb-data/OZ*.txt")
    .select("content")
    # `content` holds a whole file as bytes; split it into individual lines.
    .rdd.flatMap(lambda x: x[0].splitlines())
    .map(functools.partial(parse_line, schema=oz_schema))
    .toDF(create_pyspark_schema(oz_schema))
)

(
    df
    # NOTE(review): this DataFrame is rebuilt from an RDD, so confirm that
    # input_file_name() is actually populated here; the binaryFile source also
    # exposes a `path` column that could be carried through instead.
    .withColumn("input_file_name", f.input_file_name())
    .write.format("jdbc")
    .options(**jdbc_common_options)
    .option("dbtable", "bronze.oz")
    # "overwrite" replaces any existing contents of the target table.
    .mode("overwrite")
    .save()
)