In [1]:
import os
import json
import sys
sys.path.append("../")

from tqdm import tqdm
from sklearn.model_selection import train_test_split
from utils import file_util

In [2]:
# Root directory of the patent dataset; later cells read from and write to it.
data_dir = "../data/google_patents/us-25000"
# Passed as `random_state` to `train_test_split` so the split is repeatable.
seed = 6
# Passed as `test_size` to `train_test_split` when splitting each document file.
test_size = 0.2
# Show the contents of the dataset directory via the shell.
!ls $data_dir

crawl_wiki_concepts.log  document_label_ids.json  mask.pck
data.ndjson		 entities_ids.json	  name_mention_links.json
doc			 entities_labels.json	  name_mentions
doc_name_mentions.json	 entity_labels.json
document_ids.json	 graph.bin


Format of the data files:
- Each `.json` file in the `doc` directory holds multiple patents, one JSON object per line.
- Each patent has the following format:

```json
{
    "patent_id": str,
    "title": [{"text": str, "language": str, "truncated": bool}],
    "description": [{"text": str, "language": str, "truncated": bool}],
    "claims": [{"text": str, "language": str, "truncated": bool}],
    "classifications": [{"code": str, "inventive": bool, "first": bool, "tree": List[str]}, ...]
}

```

In [3]:
# Collect the raw patent files from the "doc" sub-directory of the dataset.
# NOTE(review): the helper presumably returns openable paths of files with the
# "json" extension -- confirm against utils.file_util.
doc_dir = os.path.join(data_dir, "doc")
doc_files = file_util.get_file_name_in_dir(doc_dir, "json")
print("Number of document files:", len(doc_files))

Number of document files: 10


In [4]:
# Peek at the first patent of the first document file to inspect its labels.
sample_doc_file = doc_files[0]
# Explicit UTF-8 so the read does not depend on the platform's default encoding.
with open(sample_doc_file, "r", encoding="utf-8") as f:
    first_line = f.readline()
# Fail loudly instead of leaving `sample_doc` undefined (or stale from an
# earlier run of this cell) when the file has no usable first line.
if not first_line.strip():
    raise ValueError(f"First line of {sample_doc_file} is empty; cannot load a sample patent")
sample_doc = json.loads(first_line)
sample_doc["classifications"]

[{'code': 'A46B11/0006', 'inventive': True, 'first': True, 'tree': []},
 {'code': 'A46B9/023', 'inventive': True, 'first': False, 'tree': []},
 {'code': 'A45D24/28', 'inventive': True, 'first': False, 'tree': []},
 {'code': 'A46B5/0095', 'inventive': True, 'first': False, 'tree': []},
 {'code': 'A45D24/22', 'inventive': False, 'first': False, 'tree': []},
 {'code': 'A46B11/002', 'inventive': True, 'first': False, 'tree': []},
 {'code': 'A46B2200/104', 'inventive': False, 'first': False, 'tree': []},
 {'code': 'A46B9/023', 'inventive': True, 'first': False, 'tree': []},
 {'code': 'A46B2200/104', 'inventive': False, 'first': False, 'tree': []},
 {'code': 'A45D24/22', 'inventive': False, 'first': False, 'tree': []},
 {'code': 'A46B11/0006', 'inventive': True, 'first': True, 'tree': []},
 {'code': 'A46B5/0095', 'inventive': True, 'first': False, 'tree': []},
 {'code': 'A46B11/002', 'inventive': False, 'first': False, 'tree': []}]

In [5]:
def get_id(doc):
    """Return the value stored under the ``"patent_id"`` key of ``doc``."""
    patent_id = doc["patent_id"]
    return patent_id

def get_content(doc):
    """Join the ``"text"`` of every entry in ``doc["claims"]`` with newlines.

    Returns an empty string when ``doc["claims"]`` is empty.
    """
    claim_texts = [claim["text"] for claim in doc["claims"]]
    return "\n".join(claim_texts)

def get_title(doc):
    """Join the ``"text"`` of every entry in ``doc["title"]`` with newlines.

    Returns an empty string when ``doc["title"]`` is empty.
    """
    title_parts = []
    for entry in doc["title"]:
        title_parts.append(entry["text"])
    return "\n".join(title_parts)

def get_label(doc):
    """Return the classification codes of ``doc`` and record them globally.

    The codes are returned in document order, duplicates included. As a side
    effect every code is added to the module-level ``labels`` set, which must
    exist before this function is called.
    """
    global labels
    codes = []
    for classification in doc["classifications"]:
        codes.append(classification["code"])
    labels.update(codes)
    return codes

In [6]:
# Global set that get_label() fills with every classification code it sees.
# Re-created here so that re-running this cell starts from a clean state.
labels = set()

# Convert every raw patent into one line of data.ndjson with a train/test flag.
# The context manager guarantees the output file is closed even if a document
# fails to parse. UTF-8 is explicit because ensure_ascii=False below writes
# non-ASCII characters verbatim.
with open(os.path.join(data_dir, "data.ndjson"), "w", encoding="utf-8") as fout:
    for i, filename in enumerate(doc_files):
        print(f"Process Compressed File: {i + 1}/{len(doc_files)}")
        with open(filename, "r", encoding="utf-8") as fin:
            # First pass: count the lines so the split can be drawn over row
            # indices; then rewind for the second pass that writes the samples.
            num_documents = sum(1 for _ in fin)
            fin.seek(0)
            train_doc_ids, _ = train_test_split(
                range(num_documents), test_size=test_size, random_state=seed
            )
            # A set gives constant-time membership tests in the loop below.
            train_doc_ids = set(train_doc_ids)
            for rid, line in enumerate(tqdm(fin, total=num_documents, desc="Writing")):
                doc = json.loads(line)
                sample = {
                    "doc_id": get_id(doc),
                    "title": get_title(doc),
                    "content": get_content(doc),
                    "labels": get_label(doc),
                    "is_train": rid in train_doc_ids,
                }
                fout.write(json.dumps(sample, ensure_ascii=False) + "\n")

Process Compressed File: 1/10


Writing: 100%|██████████| 2546/2546 [00:01<00:00, 1325.36it/s]


Process Compressed File: 2/10


Writing: 100%|██████████| 2504/2504 [00:01<00:00, 1722.82it/s]


Process Compressed File: 3/10


Writing: 100%|██████████| 2480/2480 [00:01<00:00, 1678.66it/s]


Process Compressed File: 4/10


Writing: 100%|██████████| 2456/2456 [00:01<00:00, 1592.34it/s]


Process Compressed File: 5/10


Writing: 100%|██████████| 2374/2374 [00:01<00:00, 1651.18it/s]


Process Compressed File: 6/10


Writing: 100%|██████████| 2544/2544 [00:01<00:00, 1820.28it/s]


Process Compressed File: 7/10


Writing: 100%|██████████| 2521/2521 [00:01<00:00, 1825.45it/s]


Process Compressed File: 8/10


Writing: 100%|██████████| 2517/2517 [00:01<00:00, 1631.56it/s]


Process Compressed File: 9/10


Writing: 100%|██████████| 2513/2513 [00:01<00:00, 1872.22it/s]


Process Compressed File: 10/10


Writing: 100%|██████████| 2545/2545 [00:01<00:00, 1445.00it/s]


In [7]:
# Assign each classification code an integer id and save the mapping.
# Sorting first makes the mapping reproducible: iterating a set of strings
# directly can yield a different order in every kernel session.
label_ids = {label: idx for idx, label in enumerate(sorted(labels))}
file_util.dump_json(label_ids, os.path.join(data_dir, "document_label_ids.json"))