# Data Preparation for Nobel Laureates datasets

In [1]:
import json
import os
import pathlib
import subprocess
import tempfile
import urllib.error
import urllib.request

from icecream import ic
import watermark

In [2]:
# Record the execution environment so the run can be reproduced later.
# Load the `watermark` IPython extension, which provides the `%watermark` magic.
%load_ext watermark
# Print the timestamp plus Python, compiler, OS, and hardware details.
%watermark
# Print the versions of the packages imported in this notebook.
%watermark --iversions

Last updated: 2024-11-01T10:55:19.918638-07:00

Python implementation: CPython
Python version       : 3.11.9
IPython version      : 8.29.0

Compiler    : Clang 13.0.0 (clang-1300.0.29.30)
OS          : Darwin
Release     : 23.6.0
Machine     : arm64
Processor   : arm
CPU cores   : 14
Architecture: 64bit

json     : 2.0.9
watermark: 2.5.0



## Init local disk space

Create a clean local directory, to be mounted later, that holds the SQLite database file.

In [3]:
# Start from a clean slate: remove any previous /tmp/sqlite directory and its
# contents (`-f` keeps this quiet when the directory does not exist yet).
!rm -rf /tmp/sqlite
# Recreate the directory, now empty.
!mkdir /tmp/sqlite
# Create an empty G2C.db file inside it as the SQLite database placeholder.
!touch /tmp/sqlite/G2C.db

Create a clean local directory, to be mounted later, that holds the input datasets.

In [4]:
# Start from a clean slate: remove any previous /tmp/data directory and its
# contents (`-f` keeps this quiet when the directory does not exist yet).
!rm -rf /tmp/data
# Recreate the directory, now empty; the cells below write the JSONL files here.
!mkdir /tmp/data

## Prepare the datasets

Download the `laureates.json` file and convert it to JSONL (one record per line) for the Senzing data load, tagging each record with `DATA_SOURCE` set to `LAUREATE`.

In [5]:
# Download `laureates.json` and rewrite it as JSONL: one compact JSON record
# per line, each tagged with the data source name used for the Senzing load.
url: str = "https://raw.githubusercontent.com/kuzudb/nobel-network/refs/heads/main/data/source_2/laureates.json"
out_path: pathlib.Path = pathlib.Path("/tmp/data") / "laureate.jsonl"

try:
    # Parse the document in-process instead of spooling it to a temp file and
    # shelling out to `jq`. The earlier approach never flushed the temp file
    # before `jq` read it (risking truncated input), never waited on the child
    # process, and silently produced an empty file when `jq` was missing.
    with urllib.request.urlopen(url) as url_fp:
        payload = json.loads(url_fp.read().decode("utf-8"))

    # Same iteration as `jq '.[]'`: the elements of an array, or the values
    # of an object.
    records = payload.values() if isinstance(payload, dict) else payload

    with out_path.open("w", encoding = "utf-8") as out_fp:
        for dat in records:
            # Label every record with its data source.
            dat["DATA_SOURCE"] = "LAUREATE"

            out_fp.write(json.dumps(dat))
            out_fp.write("\n")
except urllib.error.URLError as ex:
    # Best effort: report why the download failed; no output file is written.
    ic(ex.reason)

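Download the `scholars.json` file and convert it to JSONL (one record per line) for the Senzing data load, tagging each record with `DATA_SOURCE` set to `SCHOLARS` and a sequential, zero-based `id`.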

In [6]:
# Download `scholars.json` and rewrite it as JSONL: one compact JSON record
# per line, each tagged with the data source name and a sequential `id`.
url: str = "https://raw.githubusercontent.com/kuzudb/nobel-network/refs/heads/main/data/source_1/scholars.json"
out_path: pathlib.Path = pathlib.Path("/tmp/data") / "scholars.jsonl"

try:
    # Parse the document in-process instead of spooling it to a temp file and
    # shelling out to `jq`. The earlier approach never flushed the temp file
    # before `jq` read it (risking truncated input), never waited on the child
    # process, and silently produced an empty file when `jq` was missing.
    with urllib.request.urlopen(url) as url_fp:
        payload = json.loads(url_fp.read().decode("utf-8"))

    # Same iteration as `jq '.[]'`: the elements of an array, or the values
    # of an object.
    records = payload.values() if isinstance(payload, dict) else payload

    with out_path.open("w", encoding = "utf-8") as out_fp:
        # `record_id` rather than `id`, to avoid shadowing the builtin.
        for record_id, dat in enumerate(records):
            dat["DATA_SOURCE"] = "SCHOLARS"
            # Assign the zero-based position as the record's `id`; this
            # overwrites any `id` already present in the source record.
            dat["id"] = record_id

            out_fp.write(json.dumps(dat))
            out_fp.write("\n")
except urllib.error.URLError as ex:
    # Best effort: report why the download failed; no output file is written.
    ic(ex.reason)