In [1]:
import pandas as pd
import pyspark.sql.functions as F
from loguru import logger
from pyspark.sql import Row, SparkSession
from pyspark.sql.types import StringType, StructField, StructType

from jhra.jrdb import create_pyspark_schema, load_schema, parse_line
from jhra.settings import settings

# Download Data

In [None]:
# Index pages of the JRDB member area that list the downloadable archives.
# Source: http://www.jrdb.com/member/dataindex.html
# Comment out any entry you do not want to download.
# TODO: measure and note how long downloading every dataset takes.
_DATAZIP_ROOT = "http://www.jrdb.com/member/datazip"
_index_pages = (
    "Kab/index.html",
    "Bac/index.html",
    "Kyi/index.html",
    "Ukc/index.html",
    "Oz/index.html",
    "Oz/index2.html",  # OW data
    "Ou/index.html",
    "Ot/index.html",
    "Ov/index.html",
    "Cyb/index.html",
    "Cha/index.html",
    "Sed/index.html",
    "Skb/index.html",
    "Tyb/index.html",
    "Hjc/index.html",
)
urls = [f"{_DATAZIP_ROOT}/{page}" for page in _index_pages]

# Download step, left disabled. The names it relies on
# (download_and_extract_files, username, password, JRDB_DATA_DIR, datetime)
# are not defined in the cells shown here; import/define them before enabling.
# for webpage_url in urls:
#     download_and_extract_files(
#         webpage_url, username, password, JRDB_DATA_DIR, start_date=datetime.date(2023, 12, 1)
#     )

# Full Load

In [2]:
# Hive metastore credentials: taken from `settings` when it defines them,
# otherwise the previous local-development literals are used so existing
# setups keep working unchanged.
# NOTE(review): HIVE_METASTORE_USER / HIVE_METASTORE_PASSWORD are not confirmed
# to exist on `settings` - add them there to stop relying on the defaults.
metastore_user = getattr(settings, "HIVE_METASTORE_USER", "admin")
metastore_password = getattr(settings, "HIVE_METASTORE_PASSWORD", "admin")

# Spark session with Hive support, backed by a PostgreSQL metastore connection.
spark = (
    SparkSession.builder.config("spark.driver.memory", "6g")
    .config("spark.executor.memory", "6g")
    .config("spark.sql.warehouse.dir", settings.SPARK_WAREHOUSE_DIR)
    .config("javax.jdo.option.ConnectionURL", settings.HIVE_METASTORE_URL)
    .config("javax.jdo.option.ConnectionDriverName", "org.postgresql.Driver")
    .config("javax.jdo.option.ConnectionUserName", metastore_user)
    .config("javax.jdo.option.ConnectionPassword", metastore_password)
    .config("datanucleus.schema.autoCreateTables", "true")
    .config("hive.metastore.schema.verification", "false")
    .config("spark.driver.extraClassPath", settings.POSTGRES_JDBC_JAR)
    .enableHiveSupport()
    .getOrCreate()
)

# Full load: the target schema is dropped (CASCADE removes its tables too)
# and recreated empty before any dataset is written.
logger.info(f"Dropping schema {settings.SCHEMA_NAME}")
spark.sql(f"DROP SCHEMA IF EXISTS {settings.SCHEMA_NAME} CASCADE")

logger.info(f"Creating schema {settings.SCHEMA_NAME}")
spark.sql(f"CREATE SCHEMA {settings.SCHEMA_NAME}")

# Dataset codes to load; each one needs a `<code>.yaml` file in SCHEMAS_DIR and
# data files named `<code>` + six digits + `.txt` in JRDB_DATA_DIR.
datasets = [
    # fmt: off
    "KAB", "BAC", "KYI", "UKC",
    "OZ",  "OW",  "OU",  "OT",
    "OV",  "CYB", "CHA", "SKB",
    "SRB", "HJC", "SED", "TYB",
    # fmt: on
]


def _file_row_parser(fields):
    """Build the flatMap function that expands one file row into parsed records.

    `fields` is bound as an argument instead of being resolved through the
    loop variable, so the RDD for each dataset always parses with that
    dataset's field definitions, regardless of when it is evaluated.

    The returned function takes a row with `file_name`, `sha256` and `content`
    attributes and yields one `Row` per non-empty line of `content`, carrying
    the file name, the file hash and every field produced by `parse_line`.
    """

    def parse_file(file_row):
        return (
            Row(
                file_name=file_row.file_name,
                sha256=file_row.sha256,
                **parse_line(line, fields).asDict(),
            )
            for line in file_row.content.splitlines()
            if line  # skip empty lines
        )

    return parse_file


for ds in datasets:
    data_path = (settings.JRDB_DATA_DIR / f"{ds}[0-9][0-9][0-9][0-9][0-9][0-9].txt").as_posix()
    table_name = f"{settings.SCHEMA_NAME}.raw_jrdb__{ds.lower()}"
    jrdb_fields = load_schema(settings.SCHEMAS_DIR / f"{ds}.yaml")
    parsed_struct = create_pyspark_schema(jrdb_fields)

    logger.info(f"Loading JRDB dataset {ds} from {data_path}")

    # Read every matching file whole, keeping only the file name (last path
    # segment), a SHA-256 of the raw content, and the content itself.
    base_df = (
        spark.read.format("binaryFile")
        .load(data_path)
        .withColumn("file_name", F.element_at(F.split(F.col("path"), "/"), -1))
        .withColumn("sha256", F.sha2(F.col("content"), 256))
        .select("file_name", "sha256", "content")
    )

    rdd = base_df.rdd.flatMap(_file_row_parser(jrdb_fields))

    # File-level provenance columns first, then the dataset-specific fields.
    final_schema = StructType([
        StructField("file_name", StringType(), False),
        StructField("sha256", StringType(), False),
        *parsed_struct.fields
    ])

    parsed_df = spark.createDataFrame(rdd, schema=final_schema)
    parsed_df.write.mode("overwrite").saveAsTable(table_name)

spark.sql(f"SHOW TABLES IN {settings.SCHEMA_NAME}").show()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


25/10/18 02:43:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


[32m2025-10-18 02:43:25.753[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m17[0m - [1mDropping schema jhra_raw[0m


25/10/18 02:43:27 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
25/10/18 02:43:27 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist


[32m2025-10-18 02:43:27.906[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m21[0m - [1mCreating schema jhra_raw[0m


25/10/18 02:43:27 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0
25/10/18 02:43:27 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore UNKNOWN@172.18.0.2
25/10/18 02:43:27 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException
25/10/18 02:43:27 WARN TxnHandler: Cannot perform cleanup since metastore table does not exist
25/10/18 02:43:27 WARN ObjectStore: Failed to get database jhra_raw, returning NoSuchObjectException
25/10/18 02:43:27 WARN ObjectStore: Failed to get database jhra_raw, returning NoSuchObjectException
25/10/18 02:43:27 WARN ObjectStore: Failed to get database jhra_raw, returning NoSuchObjectException


[32m2025-10-18 02:43:28.005[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m39[0m - [1mLoading JRDB dataset KAB from /workspace/data/jrdb/KAB[0-9][0-9][0-9][0-9][0-9][0-9].txt[0m
                                                                                

25/10/18 02:43:35 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


[32m2025-10-18 02:43:38.336[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m39[0m - [1mLoading JRDB dataset BAC from /workspace/data/jrdb/BAC[0-9][0-9][0-9][0-9][0-9][0-9].txt[0m


25/10/18 02:43:38 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.
25/10/18 02:43:38 WARN HiveConf: HiveConf of name hive.internal.ss.authz.settings.applied.marker does not exist
25/10/18 02:43:38 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
25/10/18 02:43:38 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist


[32m2025-10-18 02:43:43.648[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m39[0m - [1mLoading JRDB dataset KYI from /workspace/data/jrdb/KYI[0-9][0-9][0-9][0-9][0-9][0-9].txt[0m
[32m2025-10-18 02:44:17.055[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m39[0m - [1mLoading JRDB dataset UKC from /workspace/data/jrdb/UKC[0-9][0-9][0-9][0-9][0-9][0-9].txt[0m
[32m2025-10-18 02:44:27.562[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m39[0m - [1mLoading JRDB dataset OZ from /workspace/data/jrdb/OZ[0-9][0-9][0-9][0-9][0-9][0-9].txt[0m
[32m2025-10-18 02:44:33.283[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m39[0m - [1mLoading JRDB dataset OW from /workspace/data/jrdb/OW[0-9][0-9][0-9][0-9][0-9][0-9].txt[0m
[32m2025-10-18 02:44:38.227[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m39[0m - [1mLoading JRDB dataset OU from /workspace/data/jrdb/OU[0-9][0-9][0-9][0-9][0-9][0-9].txt[0m


+---------+-------------+-----------+
|namespace|    tableName|isTemporary|
+---------+-------------+-----------+
| jhra_raw|raw_jrdb__bac|      false|
| jhra_raw|raw_jrdb__cha|      false|
| jhra_raw|raw_jrdb__cyb|      false|
| jhra_raw|raw_jrdb__hjc|      false|
| jhra_raw|raw_jrdb__kab|      false|
| jhra_raw|raw_jrdb__kyi|      false|
| jhra_raw| raw_jrdb__ot|      false|
| jhra_raw| raw_jrdb__ou|      false|
| jhra_raw| raw_jrdb__ov|      false|
| jhra_raw| raw_jrdb__ow|      false|
| jhra_raw| raw_jrdb__oz|      false|
| jhra_raw|raw_jrdb__sed|      false|
| jhra_raw|raw_jrdb__skb|      false|
| jhra_raw|raw_jrdb__srb|      false|
| jhra_raw|raw_jrdb__tyb|      false|
| jhra_raw|raw_jrdb__ukc|      false|
+---------+-------------+-----------+



# Convert codes to CSV format

Copy the text of one of the code pages listed below into `code_text` in the following cell, run the cell, and save the printed output as a CSV file in the `seeds` directory.

* [ＪＲＤＢデータコード表](http://www.jrdb.com/program/jrdb_code.txt)
* [脚元コード表（2017.02.20）](http://www.jrdb.com/program/ashimoto_code.txt)
* [馬具コード表（2017.07.02）](http://www.jrdb.com/program/bagu_code.txt)
* [特記コード表（2008.02.23）](http://www.jrdb.com/program/tokki_code.txt)
* [系統コード表（2003.05.15）](http://www.jrdb.com/program/keito_code.txt)
* [調教コースコード表（2009.10.09）](http://www.jrdb.com/program/cyokyo_course_code.txt)
* [追い状態コード表（2008.09.28）](http://www.jrdb.com/program/oi_code.txt)

In [None]:
# Text pasted from one of the JRDB code pages: one entry per line, with the
# code and its label separated by whitespace.
code_text = """
01      流す
02      余力あり
03      終い抑え
04      一杯
05      バテる
06      伸びる
07      テンのみ
08      鋭く伸び
09      強目
10      終い重点
11      ８分追い
12      追って伸
13      向正面
14      ゲート
15      障害練習
16      中間軽め
17      キリ
21      引っ張る
22      掛かる
23      掛リバテ
24      テン掛る
25      掛り一杯
26      ササル
27      ヨレル
28      バカつく
29      手間取る
99      その他
"""


def code_table_rows(text: str) -> list[list[str]]:
    """Split pasted code-table text into one list of fields per line.

    Each line is split on runs of whitespace. Blank or whitespace-only lines
    are skipped, so spacing lines in the pasted text do not turn into empty
    rows in the CSV output.

    Args:
        text: Raw multi-line text of a code table.

    Returns:
        A list with one entry per non-blank line, each a list of that line's
        whitespace-separated fields.
    """
    candidate_rows = (line.split() for line in text.splitlines())
    return [fields for fields in candidate_rows if fields]


result = code_table_rows(code_text)

# Printed as CSV text (no index, no header) so it can be copied into a seed file.
print(pd.DataFrame(result).to_csv(index=False, header=False))