In [1]:
import functools
import os
import datetime
from typing import List
from pathlib import Path
import re
import pandas as pd
from pyspark.sql import functions as f
from pyspark.sql import SparkSession
from JapanHorseRaceAnalytics.data.data_schema import load_schema, create_pyspark_schema
from JapanHorseRaceAnalytics.data.data_parser import parse_line
from JapanHorseRaceAnalytics.data.file_downloader import download_and_extract_files
from JapanHorseRaceAnalytics.utilities.structured_logger import logger

# Download files from the web

In [2]:
# JRDB login, read from the environment; each value is None when its variable is unset.
username = os.environ.get("JRDB_USERNAME")
password = os.environ.get("JRDB_PASSWORD")
# Destination directory for the downloaded files.
# Must be an absolute path.
download_dir = "/Users/hankehly/Projects/JapanHorseRaceAnalytics/data/jrdb"

In [4]:
# Index pages of the JRDB datasets to download; entries that are commented out are skipped.
target_dataset_urls = [
    # Taken from http://www.jrdb.com/member/dataindex.html
    # Comment out the ones you don't want to download.
    # TODO: record roughly how long downloading all of them takes.
    "http://www.jrdb.com/member/datazip/Kab/index.html",
    "http://www.jrdb.com/member/datazip/Bac/index.html",
    "http://www.jrdb.com/member/datazip/Kyi/index.html",
    "http://www.jrdb.com/member/datazip/Ukc/index.html",
    "http://www.jrdb.com/member/datazip/Oz/index.html",
    # "http://www.jrdb.com/member/datazip/Oz/index2.html",  # OW data
    # "http://www.jrdb.com/member/datazip/Ou/index.html",
    # "http://www.jrdb.com/member/datazip/Ot/index.html",
    # "http://www.jrdb.com/member/datazip/Ov/index.html",
    "http://www.jrdb.com/member/datazip/Cyb/index.html",
    "http://www.jrdb.com/member/datazip/Cha/index.html",
    "http://www.jrdb.com/member/datazip/Sed/index.html",
    "http://www.jrdb.com/member/datazip/Skb/index.html",
    "http://www.jrdb.com/member/datazip/Tyb/index.html",
    "http://www.jrdb.com/member/datazip/Hjc/index.html",
]

# Hand each index page to download_and_extract_files together with the JRDB
# credentials and the download directory.
# NOTE(review): start_date presumably skips files dated before 2023-12-01 —
# confirm against file_downloader.
for webpage_url in target_dataset_urls:
    download_and_extract_files(
        webpage_url, username, password, download_dir, start_date=datetime.date(2023, 12, 1)
    )

{"event": "Downloading and extracting files from http://www.jrdb.com/member/datazip/Kab/index.html", "level": "info", "timestamp": "2023-12-30T19:01:21.038827Z", "logger": "JapanHorseRaceAnalytics.file_downloader"}
{"event": "Downloading http://www.jrdb.com/member/datazip/Kab/2024/KAB240108.zip", "level": "info", "timestamp": "2023-12-30T19:01:23.536576Z", "logger": "JapanHorseRaceAnalytics.file_downloader"}
{"event": "Downloading http://www.jrdb.com/member/datazip/Kab/2024/KAB240107.zip", "level": "info", "timestamp": "2023-12-30T19:01:23.536866Z", "logger": "JapanHorseRaceAnalytics.file_downloader"}
{"event": "Downloading http://www.jrdb.com/member/datazip/Kab/2024/KAB240106.zip", "level": "info", "timestamp": "2023-12-30T19:01:23.537184Z", "logger": "JapanHorseRaceAnalytics.file_downloader"}
{"event": "Downloading http://www.jrdb.com/member/datazip/Kab/2023/KAB231228.zip", "level": "info", "timestamp": "2023-12-30T19:01:23.537337Z", "logger": "JapanHorseRaceAnalytics.file_downloader

In [3]:
# Build (or reuse) the notebook's Hive-enabled Spark session.
# The PostgreSQL JDBC jar is listed under spark.jars and on both the driver
# and executor class paths.
spark = (
    SparkSession.builder
    .appName("PythonNotebookSparkSession")
    .config(
        "spark.sql.warehouse.dir",
        "/Users/hankehly/Projects/JapanHorseRaceAnalytics/spark-warehouse",
    )
    .config(
        "spark.jars",
        "/Users/hankehly/Projects/JapanHorseRaceAnalytics/jars/postgresql-42.7.1.jar",
    )
    .config(
        "spark.executor.extraClassPath",
        "/Users/hankehly/Projects/JapanHorseRaceAnalytics/jars/postgresql-42.7.1.jar",
    )
    .config(
        "spark.driver.extraClassPath",
        "/Users/hankehly/Projects/JapanHorseRaceAnalytics/jars/postgresql-42.7.1.jar",
    )
    .enableHiveSupport()
    .getOrCreate()
)

24/02/04 15:08:24 WARN Utils: Your hostname, Hanks-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.40.105 instead (on interface en0)
24/02/04 15:08:24 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
24/02/04 15:08:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/02/04 15:08:25 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


# Record which files were downloaded
# Import data into Postgres

## Todo: incremental processing

In [11]:
# from pyspark.sql.types import StringType
# df_files = spark.createDataFrame(os.listdir(download_dir), StringType())
# df_files.write.mode("overwrite").format("jdbc").options(
#     url="jdbc:postgresql://localhost:5432/jrdb",
#     user="admin",
#     password="admin",
#     driver="org.postgresql.Driver",
#     dbtable="jrdb_raw.processed_files",
# ).save()

# df_processed_files = (
#     spark.read.jdbc(
#         url="jdbc:postgresql://localhost:5432/jrdb",
#         table="jrdb_raw.processed_files",
#         properties={
#             "user": "admin",
#             "password": "admin",
#             "driver": "org.postgresql.Driver",
#         },
#     )
#     .alias("df_processed_files")
#     .select("value")
# )

# df_data = (
#     spark.read.format("binaryFile")
#     .load("file:///Users/hankehly/Projects/JapanHorseRaceAnalytics/downloads/BAC*.txt")
#     .withColumn("filename", f.element_at(f.split(f.col("path"), "/"), -1))
#     # .rdd.flatMap(lambda x: x[0].splitlines())
#     # .show(truncate=False)
# )

# unprocessed_files = df_data.join(df_processed_files, df_data.filename == df_processed_files.value, "left_anti")
# unprocessed_files.show(truncate=False)

In [4]:
def etl(
    spark,
    schema_path: str,
    data_path: str | List[str],
    dbtable: str,
    surrogate_key_name: str,
):
    logger.info(f"Processing dataset {dbtable}")
    schema = load_schema(schema_path)
    logger.info("Creating PySpark DataFrame")
    df = (
        spark.read.format("binaryFile")
        .load(data_path)
        .select("content")
        .rdd.flatMap(lambda x: x[0].splitlines())
        .map(functools.partial(parse_line, schema=schema))
        .toDF(create_pyspark_schema(schema))
        # Todo: monotonic increasing id does not mean files with lower dates will have lower ids!
        .withColumn(surrogate_key_name, f.monotonically_increasing_id())
        # Returns the wrong file name..
        # .withColumn("input_file_name", f.input_file_name())
    )
    logger.info("Writing to data warehouse")
    df.write.mode("overwrite").saveAsTable(dbtable)

The following TYB file in the annual pack contains null byte characters. Its daily file counterpart does not, so the annual-pack copy must be replaced before the file can be processed.
* TYB060121.txt

Starting 2021-09-04, TYB files are duplicated in the annual pack: one file name contains "_t" while the other does not. The daily file counterpart contains the same information as the annual pack file whose name does not contain "_t". In addition, some of the "_t" files contain null byte characters. The following files are affected. All files with "_t" in the name are ignored when parsing.
* TYB210904_t.txt
* TYB210905_t.txt
* TYB210911_t.txt

In [5]:
# Dataset codes to load. Each code is used as the file-name prefix, the schema
# file name and (lower-cased) part of the table and surrogate key names.
datasets = [
    "KAB", "BAC", "KYI", "UKC",
    "OZ", "OW", "OU", "OT", "OV",
    "CYB", "CHA", "SKB", "HJC",
    "SED",  # Run remove_sed_duplicates.sql after loading this dataset
    "TYB",
]

schema_name = "jhra_raw"
spark.sql(f"CREATE SCHEMA IF NOT EXISTS {schema_name}")

for dataset in datasets:
    schema_path = f"schemas/{dataset}.yaml"
    dbtable = f"{schema_name}.raw_jrdb__{dataset.lower()}"
    surrogate_key_name = f"{dataset.lower()}_sk"
    if dataset == "TYB":
        # TYB file names are not consistent, so the inputs are listed
        # explicitly: only names of the form TYB<6 digits>.txt are kept.
        tyb_pattern = re.compile(r"TYB\d{6}\.txt$")
        tyb_files = [
            str(candidate)
            for candidate in Path(download_dir).glob("TYB*.txt")
            if tyb_pattern.match(candidate.name)
        ]
        data_path = tyb_files
    else:
        # Every other dataset is loaded through one glob on its prefix.
        data_path = str(Path(download_dir) / f"{dataset}*.txt")
    etl(
        spark,
        schema_path=schema_path,
        data_path=data_path,
        dbtable=dbtable,
        surrogate_key_name=surrogate_key_name,
    )

24/02/04 15:08:30 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
24/02/04 15:08:30 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
{"event": "Processing dataset jhra_raw.raw_jrdb__kab", "level": "info", "timestamp": "2024-02-04T06:08:30.929864Z", "logger": "__main__"}
{"event": "Creating PySpark DataFrame", "level": "info", "timestamp": "2024-02-04T06:08:30.937790Z", "logger": "__main__"}
{"event": "Writing to data warehouse", "level": "info", "timestamp": "2024-02-04T06:08:36.155540Z", "logger": "__main__"}
24/02/04 15:08:36 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
24/02/04 15:08:37 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
24/02/04 15:08:37 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of 

# Convert codes to CSV format

Copy and paste the text from one of the code web pages below into the following block, run the cell, and save the printed output as a CSV file in the `seeds` directory.

* [ＪＲＤＢデータコード表](http://www.jrdb.com/program/jrdb_code.txt)
* [脚元コード表（2017.02.20）](http://www.jrdb.com/program/ashimoto_code.txt)
* [馬具コード表（2017.07.02）](http://www.jrdb.com/program/bagu_code.txt)
* [特記コード表（2008.02.23）](http://www.jrdb.com/program/tokki_code.txt)
* [系統コード表（2003.05.15）](http://www.jrdb.com/program/keito_code.txt)
* [調教コースコード表（2009.10.09）](http://www.jrdb.com/program/cyokyo_course_code.txt)
* [追い状態コード表（2008.09.28）](http://www.jrdb.com/program/oi_code.txt)

In [16]:
# Raw code table pasted from the JRDB code page: one "<code> <label>" pair per line.
code_text = """
01      流す
02      余力あり
03      終い抑え
04      一杯
05      バテる
06      伸びる
07      テンのみ
08      鋭く伸び
09      強目
10      終い重点
11      ８分追い
12      追って伸
13      向正面
14      ゲート
15      障害練習
16      中間軽め
17      キリ
21      引っ張る
22      掛かる
23      掛リバテ
24      テン掛る
25      掛り一杯
26      ササル
27      ヨレル
28      バカつく
29      手間取る
99      その他
"""

# Split every line on runs of whitespace; each resulting list becomes one CSV row.
result = [row_text.split() for row_text in code_text.strip().splitlines()]

# Emit the rows as CSV text without an index column or a header line.
csv_text = pd.DataFrame(result).to_csv(index=False, header=False)
print(csv_text)

01,流す
02,余力あり
03,終い抑え
04,一杯
05,バテる
06,伸びる
07,テンのみ
08,鋭く伸び
09,強目
10,終い重点
11,８分追い
12,追って伸
13,向正面
14,ゲート
15,障害練習
16,中間軽め
17,キリ
21,引っ張る
22,掛かる
23,掛リバテ
24,テン掛る
25,掛り一杯
26,ササル
27,ヨレル
28,バカつく
29,手間取る
99,その他

