In [2]:
import functools
import os
from typing import List

from pyspark.sql import functions as f
from pyspark.sql import SparkSession
from JRDBDataParsingTools.data_schema import load_schema, create_pyspark_schema
from JRDBDataParsingTools.data_parser import parse_line
from JRDBDataParsingTools.file_downloader import download_and_extract_files
from JRDBDataParsingTools.structured_logger import logger

In [3]:
%load_ext dotenv

# Download files from the web

In [4]:
# JRDB credentials
username = os.getenv("JRDB_USERNAME")
password = os.getenv("JRDB_PASSWORD")
# The directory where you want to download the files
# Must be an absolute path
download_dir = "/Users/hankehly/Projects/JRDBDataParsingTools/downloads"

In [5]:
target_dataset_urls = [
    # Taken from http://www.jrdb.com/member/dataindex.html
    # Comment out the ones you don't want to download.
    # Downloading all of them will take about 2-3 hours.
    "http://www.jrdb.com/member/datazip/Kab/index.html",
    "http://www.jrdb.com/member/datazip/Bac/index.html",
    "http://www.jrdb.com/member/datazip/Kyi/index.html",
    "http://www.jrdb.com/member/datazip/Ukc/index.html",
    "http://www.jrdb.com/member/datazip/Oz/index.html",
    "http://www.jrdb.com/member/datazip/Oz/index2.html",  # OW data
    "http://www.jrdb.com/member/datazip/Ou/index.html",
    "http://www.jrdb.com/member/datazip/Ot/index.html",
    "http://www.jrdb.com/member/datazip/Ov/index.html",
    "http://www.jrdb.com/member/datazip/Cyb/index.html",
    "http://www.jrdb.com/member/datazip/Cha/index.html",
    "http://www.jrdb.com/member/datazip/Sed/index.html",
    "http://www.jrdb.com/member/datazip/Skb/index.html",
    "http://www.jrdb.com/member/datazip/Tyb/index.html",
    "http://www.jrdb.com/member/datazip/Hjc/index.html",
]

for webpage_url in target_dataset_urls:
    download_and_extract_files(webpage_url, username, password, download_dir)

{"event": "Downloading and extracting files from http://www.jrdb.com/member/datazip/Tyb/index.html", "level": "info", "timestamp": "2023-12-16T10:34:09.144811Z", "logger": "JRDBDataParsingTools.file_downloader"}
{"event": "Downloading http://www.jrdb.com/member/datazip/Tyb/TYB_2022.zip", "level": "info", "timestamp": "2023-12-16T10:34:10.226188Z", "logger": "JRDBDataParsingTools.file_downloader"}
{"event": "Downloading http://www.jrdb.com/member/datazip/Tyb/TYB_2021.zip", "level": "info", "timestamp": "2023-12-16T10:34:16.447331Z", "logger": "JRDBDataParsingTools.file_downloader"}
{"event": "Downloading http://www.jrdb.com/member/datazip/Tyb/TYB_2020.zip", "level": "info", "timestamp": "2023-12-16T10:34:21.292717Z", "logger": "JRDBDataParsingTools.file_downloader"}
{"event": "Downloading http://www.jrdb.com/member/datazip/Tyb/TYB_2019.zip", "level": "info", "timestamp": "2023-12-16T10:34:25.057548Z", "logger": "JRDBDataParsingTools.file_downloader"}
{"event": "Downloading http://www.jr

# Import data into Postgres

In [22]:
spark = SparkSession.builder.config("spark.jars", "postgresql-42.7.1.jar").getOrCreate()

jdbc_common_options = {
    "url": "jdbc:postgresql://localhost:5432/jrdb",
    "user": "myuser",
    "password": "mypassword",
    "driver": "org.postgresql.Driver",
}

datasets = [
    # "KAB",
    # "BAC",
    # "KYI",
    # "UKC",
    # "OZ",
    # "OW",
    # "OU",
    # "OT",
    # "OV",
    # "CYB",
    # "CHA",
    # "SED",
    # "SKB",
    "TYB",
    # "HJC",
]

for dataset in datasets:
    logger.info(f"Processing dataset {dataset}")
    logger.info("Loading schema")
    schema = load_schema(f"schemas/{dataset}.yaml")
    logger.info("Creating PySpark schema")
    spark_schema = create_pyspark_schema(schema)
    logger.info("Creating PySpark DataFrame")
    df = (
        spark.read.format("binaryFile")
        .load(f"downloads/{dataset}*.txt")
        .select("content")
        .rdd.flatMap(lambda x: x[0].splitlines())
        .map(functools.partial(parse_line, schema=schema))
        .toDF(create_pyspark_schema(schema))
        .withColumn("input_file_name", f.input_file_name())
    )
    logger.info("Writing to Postgres")
    (
        df.write.mode("overwrite")
        .format("jdbc")
        .options(**jdbc_common_options)
        .option("dbtable", f"raw.{dataset.lower()}")
        .save()
    )

{"event": "Processing dataset TYB", "level": "info", "timestamp": "2023-12-16T11:01:42.922894Z", "logger": "JRDBDataParsingTools.file_downloader"}
{"event": "Loading schema", "level": "info", "timestamp": "2023-12-16T11:01:42.923358Z", "logger": "JRDBDataParsingTools.file_downloader"}
{"event": "Creating PySpark schema", "level": "info", "timestamp": "2023-12-16T11:01:42.933306Z", "logger": "JRDBDataParsingTools.file_downloader"}
{"event": "Creating PySpark DataFrame", "level": "info", "timestamp": "2023-12-16T11:01:42.933718Z", "logger": "JRDBDataParsingTools.file_downloader"}
{"event": "Writing to Postgres", "level": "info", "timestamp": "2023-12-16T11:01:46.509202Z", "logger": "JRDBDataParsingTools.file_downloader"}
23/12/16 20:01:48 ERROR Executor: Exception in task 11.0 in stage 51.0 (TID 55289)
java.sql.BatchUpdateException: Batch entry 755 INSERT INTO raw.tyb ("レースキー_場コード","レースキー_年","レースキー_回","レースキー_日","レースキー_Ｒ","馬番","ＩＤＭ","騎手指数","情報指数","オッズ指数","パドック指数","予備１","総合指数","馬具変更情報","脚元

Py4JJavaError: An error occurred while calling o704.save.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 11 in stage 51.0 failed 1 times, most recent failure: Lost task 11.0 in stage 51.0 (TID 55289) (192.168.40.105 executor driver): java.sql.BatchUpdateException: Batch entry 755 INSERT INTO raw.tyb ("レースキー_場コード","レースキー_年","レースキー_回","レースキー_日","レースキー_Ｒ","馬番","ＩＤＭ","騎手指数","情報指数","オッズ指数","パドック指数","予備１","総合指数","馬具変更情報","脚元情報","取消フラグ","騎手コード","騎手名","負担重量","見習い区分","馬場状態コード","天候コード","単勝オッズ","複勝オッズ","オッズ取得時間","馬体重","馬体重増減","オッズ印","パドック印","直前総合印","馬体コード","気配コード","発走時間","input_file_name") VALUES ('06','06','1','7','01','01','22.0','0.1','-1.0','0.0','0.0','0.0','21.1','0','0','0','10414','高山太郎','560','0','  ',' ','','','0000','','','','','','','','','file:///Users/hankehly/Projects/JRDBDataParsingTools/downloads/TYB090905.txt') was aborted: ERROR: invalid byte sequence for encoding "UTF8": 0x00
  Where: unnamed portal parameter $21  Call getNextException to see other errors in the batch.
	at org.postgresql.jdbc.BatchResultHandler.handleError(BatchResultHandler.java:165)
	at org.postgresql.core.v3.QueryExecutorImpl.processResults(QueryExecutorImpl.java:2401)
	at org.postgresql.core.v3.QueryExecutorImpl.processResults(QueryExecutorImpl.java:2133)
	at org.postgresql.core.v3.QueryExecutorImpl.flushIfDeadlockRisk(QueryExecutorImpl.java:1490)
	at org.postgresql.core.v3.QueryExecutorImpl.sendQuery(QueryExecutorImpl.java:1515)
	at org.postgresql.core.v3.QueryExecutorImpl.execute(QueryExecutorImpl.java:559)
	at org.postgresql.jdbc.PgStatement.internalExecuteBatch(PgStatement.java:896)
	at org.postgresql.jdbc.PgStatement.executeBatch(PgStatement.java:919)
	at org.postgresql.jdbc.PgPreparedStatement.executeBatch(PgPreparedStatement.java:1677)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.savePartition(JdbcUtils.scala:746)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.$anonfun$saveTable$1(JdbcUtils.scala:902)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.$anonfun$saveTable$1$adapted(JdbcUtils.scala:901)
	at org.apache.spark.rdd.RDD.$anonfun$foreachPartition$2(RDD.scala:1036)
	at org.apache.spark.rdd.RDD.$anonfun$foreachPartition$2$adapted(RDD.scala:1036)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2438)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642)
	at java.base/java.lang.Thread.run(Thread.java:1589)
Caused by: org.postgresql.util.PSQLException: ERROR: invalid byte sequence for encoding "UTF8": 0x00
  Where: unnamed portal parameter $21
	at org.postgresql.core.v3.QueryExecutorImpl.receiveErrorResponse(QueryExecutorImpl.java:2712)
	at org.postgresql.core.v3.QueryExecutorImpl.processResults(QueryExecutorImpl.java:2400)
	... 24 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2844)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2780)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2779)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2779)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1242)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1242)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1242)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3048)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2982)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2971)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:984)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2398)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2419)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2438)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2463)
	at org.apache.spark.rdd.RDD.$anonfun$foreachPartition$1(RDD.scala:1036)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:407)
	at org.apache.spark.rdd.RDD.foreachPartition(RDD.scala:1034)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.saveTable(JdbcUtils.scala:901)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider.createRelation(JdbcRelationProvider.scala:65)
	at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:48)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:75)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:73)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.executeCollect(commands.scala:84)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(origin.scala:76)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:437)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:85)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:83)
	at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:142)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:859)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:388)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:361)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:248)
	at jdk.internal.reflect.GeneratedMethodAccessor103.invoke(Unknown Source)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:52)
	at java.base/java.lang.reflect.Method.invoke(Method.java:578)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:1589)
Caused by: java.sql.BatchUpdateException: Batch entry 755 INSERT INTO raw.tyb ("レースキー_場コード","レースキー_年","レースキー_回","レースキー_日","レースキー_Ｒ","馬番","ＩＤＭ","騎手指数","情報指数","オッズ指数","パドック指数","予備１","総合指数","馬具変更情報","脚元情報","取消フラグ","騎手コード","騎手名","負担重量","見習い区分","馬場状態コード","天候コード","単勝オッズ","複勝オッズ","オッズ取得時間","馬体重","馬体重増減","オッズ印","パドック印","直前総合印","馬体コード","気配コード","発走時間","input_file_name") VALUES ('06','06','1','7','01','01','22.0','0.1','-1.0','0.0','0.0','0.0','21.1','0','0','0','10414','高山太郎','560','0','  ',' ','','','0000','','','','','','','','','file:///Users/hankehly/Projects/JRDBDataParsingTools/downloads/TYB090905.txt') was aborted: ERROR: invalid byte sequence for encoding "UTF8": 0x00
  Where: unnamed portal parameter $21  Call getNextException to see other errors in the batch.
	at org.postgresql.jdbc.BatchResultHandler.handleError(BatchResultHandler.java:165)
	at org.postgresql.core.v3.QueryExecutorImpl.processResults(QueryExecutorImpl.java:2401)
	at org.postgresql.core.v3.QueryExecutorImpl.processResults(QueryExecutorImpl.java:2133)
	at org.postgresql.core.v3.QueryExecutorImpl.flushIfDeadlockRisk(QueryExecutorImpl.java:1490)
	at org.postgresql.core.v3.QueryExecutorImpl.sendQuery(QueryExecutorImpl.java:1515)
	at org.postgresql.core.v3.QueryExecutorImpl.execute(QueryExecutorImpl.java:559)
	at org.postgresql.jdbc.PgStatement.internalExecuteBatch(PgStatement.java:896)
	at org.postgresql.jdbc.PgStatement.executeBatch(PgStatement.java:919)
	at org.postgresql.jdbc.PgPreparedStatement.executeBatch(PgPreparedStatement.java:1677)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.savePartition(JdbcUtils.scala:746)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.$anonfun$saveTable$1(JdbcUtils.scala:902)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.$anonfun$saveTable$1$adapted(JdbcUtils.scala:901)
	at org.apache.spark.rdd.RDD.$anonfun$foreachPartition$2(RDD.scala:1036)
	at org.apache.spark.rdd.RDD.$anonfun$foreachPartition$2$adapted(RDD.scala:1036)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2438)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642)
	... 1 more
Caused by: org.postgresql.util.PSQLException: ERROR: invalid byte sequence for encoding "UTF8": 0x00
  Where: unnamed portal parameter $21
	at org.postgresql.core.v3.QueryExecutorImpl.receiveErrorResponse(QueryExecutorImpl.java:2712)
	at org.postgresql.core.v3.QueryExecutorImpl.processResults(QueryExecutorImpl.java:2400)
	... 24 more


In [15]:
(
    df
    .where(f.col("input_file_name") == "file:///Users/hankehly/Projects/JRDBDataParsingTools/downloads/TYB090905.txt")
    .show()
)

                                                                                

+-------------------+-------------+-------------+-------------+-------------+----+------+--------+--------+----------+------------+------+--------+------------+--------+----------+----------+----------+--------+----------+--------------+----------+----------+----------+--------------+------+----------+--------+----------+----------+----------+----------+--------+--------------------+
|レースキー_場コード|レースキー_年|レースキー_回|レースキー_日|レースキー_Ｒ|馬番|ＩＤＭ|騎手指数|情報指数|オッズ指数|パドック指数|予備１|総合指数|馬具変更情報|脚元情報|取消フラグ|騎手コード|    騎手名|負担重量|見習い区分|馬場状態コード|天候コード|単勝オッズ|複勝オッズ|オッズ取得時間|馬体重|馬体重増減|オッズ印|パドック印|直前総合印|馬体コード|気配コード|発走時間|     input_file_name|
+-------------------+-------------+-------------+-------------+-------------+----+------+--------+--------+----------+------------+------+--------+------------+--------+----------+----------+----------+--------+----------+--------------+----------+----------+----------+--------------+------+----------+--------+----------+----------+----------+----------+--------+--------------------+
