In [1]:
import json
import typing
from pathlib import Path

In [2]:
ROOT = Path("../assets/pdfs/processed/processed/").resolve()

In [3]:
DictLikeJSON = typing.Dict[str, typing.Any]
ListLikeJSON = typing.List[DictLikeJSON]
JSON = typing.Union[DictLikeJSON, ListLikeJSON]

In [13]:
def read_folder(path: Path) -> typing.Iterator[Path]:
    """Lazily yield every path below ``path`` whose name matches ``*.json``.

    The search is recursive (``Path.rglob``), so matches inside nested
    sub-directories are included. Paths come out in whatever order
    ``rglob`` produces them; no sorting is applied.
    """
    yield from path.rglob("*.json")
        
        
def read_json(path: Path) -> JSON:
    """Load the UTF-8 encoded JSON document stored at ``path``.

    The file is read as text with ``encoding="utf-8"`` and the decoded
    JSON value is returned unchanged.
    """
    text = path.read_text(encoding="utf-8")
    return json.loads(text)

def rename_fields(data: JSON) -> DictLikeJSON:
    """Return a new dict whose top-level keys are prefixed with ``GROBID_``.

    Args:
        data: A dict-like JSON object. Only its top-level keys are renamed;
            the values are reused as-is (they are not copied or descended
            into).

    Returns:
        A new dict mapping ``f"GROBID_{key}"`` to the original value for
        every item of ``data``. ``data`` itself is left untouched.

    Raises:
        AttributeError: If ``data`` is a list-like JSON value, because lists
            have no ``items()`` method.
    """
    # The return annotation used to say ``None`` although a dict is returned.
    return {f"GROBID_{key}": value for key, value in data.items()}
        
        
def left_join(left: ListLikeJSON, right: DictLikeJSON) -> None:
    """Attach ``right`` to every record of ``left`` that shares its identifier.

    ``right["GROBID_paper_id"]`` is compared with the ``"uuid"`` of each
    record in ``left``. Every matching record receives ``right`` itself
    (the same object, not a copy) under the ``"GROBID"`` key, and the
    record's uuid is printed.

    ``right["join_status"]`` ends up as 1 when at least one record matched
    and as 0 otherwise. Both arguments are mutated in place and nothing is
    returned. All of ``left`` is scanned on every call; the original note
    says this was accepted because only a few parsed .pdf files exist.
    """
    right["join_status"] = 0
    wanted_uuid = right["GROBID_paper_id"]
    # Lazy filter: records are still visited and updated one at a time.
    matching = (record for record in left if wanted_uuid == record["uuid"])
    for record in matching:
        record["GROBID"] = right
        right["join_status"] = 1
        print(record["uuid"])

In [10]:
# Load the corpus that the parsed files are joined onto. left_join iterates
# this value and reads "uuid" from each entry, so it is expected to be a list
# of dicts.
left = read_json(Path("../assets/corpus-with-identifiers.json").resolve())

In [14]:
# For every *.json path found below ROOT: load it, prefix its top-level keys
# with "GROBID_", and attach it to the matching entries of `left`.
# `left` is mutated in place; left_join prints the uuid of each matched entry.
for file in read_folder(ROOT):
    right = rename_fields(read_json(file))
    left_join(left, right)

68b1d8557fb44235ac0be19016950e9b
f29a1bb4f8c44b49a9f0f10ae84dd09c
301ccfa4c95f4530922cc8764b27febd
f46c395e7b5f4fd49c673083cc0a0f9e
38e092806c274b3daa0a2ec039f95e7b
d95427cc7ecc4c83a4f3d33a7774d034
a226c39c074c4bb9b25417764d640142
ec16364bb78b4ab39fbee1c6dc037b17
2621c26d4fdb4e6690abb08c931e8e27
d9cbda28cad84618b5dda83db9b41c1a
5a86accf46dd469fbb61f3bf48ba7d0a
e1c9588a27794552a068db82ad76b4a5
bdfd96d98dcc4724bf692cf47589a611
e582165efaf7471f9b618da004f11be2
0a832856bdba483e9d7dc09ed001c916
fc48f912774c4451969581f1726e8156
d17e81427c1c44b59871113f847f6809
a210de3f8b4247f5ac398e94d3775c98
2e125e297579458dab6b02e3d35c660a
d978c026c03d4c0784fe2bcdeaf39c84
4c614a28eeae4208998558429321ad84
4128d7b8067c48ce880fafe70557a74b
f75c758c2bbc41d2a6662fce07839deb
c7cd5a8eeed94d1ba25e53c5e6384e0c
f2c76a6b3f244b13aac42d10891be681
dc27e2d6acb845069b2cd18b11105de3
5c0921babec24372990f450fd7217fdc
1a8d7fd0626d4d59a15b200504fd10b5
83247141d5444f2493376cab1754ca9b
15a5c8bcae214b6980c53059b5a340f8
93963c40ec

In [18]:
# uuids of the corpus entries to keep in the exported subset, one per line.
# NOTE(review): these look like the uuids printed by the join loop, pasted in
# by hand — confirm, and consider deriving them from the join result instead.
l = """68b1d8557fb44235ac0be19016950e9b
f29a1bb4f8c44b49a9f0f10ae84dd09c
301ccfa4c95f4530922cc8764b27febd
f46c395e7b5f4fd49c673083cc0a0f9e
38e092806c274b3daa0a2ec039f95e7b
d95427cc7ecc4c83a4f3d33a7774d034
a226c39c074c4bb9b25417764d640142
ec16364bb78b4ab39fbee1c6dc037b17
2621c26d4fdb4e6690abb08c931e8e27
d9cbda28cad84618b5dda83db9b41c1a
5a86accf46dd469fbb61f3bf48ba7d0a
e1c9588a27794552a068db82ad76b4a5
bdfd96d98dcc4724bf692cf47589a611
e582165efaf7471f9b618da004f11be2
0a832856bdba483e9d7dc09ed001c916
fc48f912774c4451969581f1726e8156
d17e81427c1c44b59871113f847f6809
a210de3f8b4247f5ac398e94d3775c98
2e125e297579458dab6b02e3d35c660a
d978c026c03d4c0784fe2bcdeaf39c84
4c614a28eeae4208998558429321ad84
4128d7b8067c48ce880fafe70557a74b
f75c758c2bbc41d2a6662fce07839deb
c7cd5a8eeed94d1ba25e53c5e6384e0c
f2c76a6b3f244b13aac42d10891be681
dc27e2d6acb845069b2cd18b11105de3
5c0921babec24372990f450fd7217fdc
1a8d7fd0626d4d59a15b200504fd10b5
83247141d5444f2493376cab1754ca9b
15a5c8bcae214b6980c53059b5a340f8
93963c40ec8a4b6bb1f11581bc2738d8
b776b54f5f9f43f199ff538df209b94b
9966d2af2ea844ffae4f11100226d55f
495b2630385d4221bf8ad86bb9d443e7
97c95e798dc54f139c32f261f1a42058
5f5213a36f1049e495861cde7a5c5c9f""".splitlines()

In [21]:
# Keep only the corpus entries whose uuid appears in `l`, in corpus order.
subset = []
for item in left:
    if item["uuid"] not in l:
        continue
    subset.append(item)

In [24]:
# Write the selected entries as JSON indented by 4 spaces; ensure_ascii=False
# keeps non-ASCII characters as they are instead of \uXXXX escapes.
with Path("../assets/corpus-subset-grobid.json").open(mode="w", encoding="utf-8") as f:
    json.dump(obj=subset, fp=f, indent=4, ensure_ascii=False)