In [1]:
import os
import json
import sys
sys.path.append("../")

from tqdm import tqdm
from sklearn.model_selection import train_test_split
from utils import file_util

In [2]:
# Root directory of the patent dataset: later cells read the documents from its
# `doc/` sub-directory and write `data.ndjson` / `document_label_ids.json` into it.
data_dir = "../data/google_patents/24054-jp-patents"
# Passed as `random_state` to train_test_split so the train/test assignment is repeatable.
seed = 6
# Passed as `test_size` to train_test_split (a float is interpreted as the held-out proportion).
test_size = 0.2
# List the current contents of the dataset directory.
!ls $data_dir

data.ndjson		 doc		     graphs
dict_name_mentions.json  entity_labels.json  name_mentions


Format of data file:
- A `.json` file contains multiple patents on different lines.
- Format of each patent:

```json
{
    "title": str,
    "pdf": str,
    "description": List[str],      # should be concatenated
    "claims": List[str],
    "classifications": List[str],   # each string uses "\n" to separate the classification code from its description
}

```

In [3]:
# Collect the JSON document files stored under <data_dir>/doc.
doc_dir = os.path.join(data_dir, "doc")
doc_files = file_util.get_file_name_in_dir(doc_dir, "json")
print(f"Number of document files: {len(doc_files)}")

Number of document files: 24054


In [4]:
# Load one document as a sample and list the attributes it exposes.
sample_doc_file = doc_files[2]
with open(sample_doc_file, "r") as sample_fp:
    sample_doc = json.load(sample_fp)
print("Document attributes:", sample_doc.keys())

Document attributes: dict_keys(['title', 'pdf', 'classifications', 'description', 'claims'])


In [5]:
# Inspect the raw classification strings of the sample document loaded above.
sample_doc["classifications"]

['A61N1/0456\nSpecially adapted for transcutaneous electrical nerve stimulation [TENS]',
 'A61N1/0526\nHead electrodes',
 'A61N1/0551\nSpinal or peripheral nerve electrodes',
 'A61N1/36017\nExternal stimulators, e.g. with patch electrodes with leads or electrodes penetrating the skin',
 'A61N1/36025\nExternal stimulators, e.g. with patch electrodes for treating a mental or cerebral condition',
 'A61N1/36031\nControl systems using physiological parameters for adjustment',
 'A61N1/36034\nControl systems specified by the stimulation parameters',
 'A61N1/36053\nImplantable neurostimulators for stimulating central or peripheral nerve system adapted for vagal stimulation',
 'A61N1/36064\nEpilepsy',
 'A61N1/36085\nEating disorders or obesity',
 'A61N1/36089\nAddiction or withdrawal from substance abuse such as alcohol or drugs',
 'A61N1/36096\nMood disorders, e.g. depression, anxiety or panic disorder',
 'A61N1/361\nPhantom sensations, e.g. tinnitus',
 'A61N1/36114\nCardiac control, e.g. by v

In [6]:
def get_id(doc_file_name):
    """Return the document ID derived from a document file path.

    Args:
        doc_file_name: Path (or bare file name) of a document file.

    Returns:
        The final path component with its last extension removed.
    """
    stem, _extension = os.path.splitext(os.path.basename(doc_file_name))
    return stem
    
def get_content(doc):
    """Build the text content of a document from its claims.

    Args:
        doc: Mapping with a "claims" entry holding a list of strings.

    Returns:
        All claims joined into one string, separated by newlines.
    """
    claims = doc["claims"]
    separator = "\n"
    return separator.join(claims)

def get_label(doc):
    """Extract the classification codes of one document and record them globally.

    Each entry of ``doc["classifications"]`` is a code followed by a newline
    and a description; only the part before the first newline is kept.

    Side effect: every returned code is added to the module-level ``labels``
    set, which must exist before this function is called.

    Args:
        doc: Mapping with a "classifications" entry holding a list of strings.

    Returns:
        List of classification codes, in the order they appear in the document.
    """
    global labels
    # Read from the `doc` argument rather than the notebook-level sample
    # document, so each document gets its own labels.
    codes = [entry.split("\n")[0] for entry in doc["classifications"]]
    labels.update(codes)
    return codes
    
def get_title(doc):
    """Return the title of a document.

    Args:
        doc: Mapping with a "title" entry.

    Returns:
        The value stored under "title".
    """
    title = doc["title"]
    return title

In [7]:
# Write every document as one JSON object per line (NDJSON), tagging each
# record with whether it belongs to the training split.
labels = set()  # filled as a side effect of get_label()
train_doc_ids, _ = train_test_split(range(len(doc_files)), test_size=test_size, random_state=seed)
# Set membership is O(1); testing against the returned list would rescan it
# for every document.
train_doc_id_set = set(train_doc_ids)

# `with` guarantees the output file is flushed and closed even if a document
# fails to load. UTF-8 is explicit because ensure_ascii=False emits raw
# non-ASCII characters, which a locale-dependent default encoding may reject.
with open(os.path.join(data_dir, "data.ndjson"), "w", encoding="utf-8") as fout:
    for i, filename in enumerate(tqdm(doc_files, desc="Writing")):
        with open(filename, "r", encoding="utf-8") as fin:
            doc = json.load(fin)
        sample = {
            "doc_id": get_id(filename),
            "title": get_title(doc),
            "content": get_content(doc),
            "labels": get_label(doc),
            "is_train": i in train_doc_id_set,
        }
        fout.write(json.dumps(sample, ensure_ascii=False) + "\n")

Writing: 100%|██████████| 24054/24054 [01:58<00:00, 202.47it/s]


In [8]:
# Assign every label an integer ID and persist the mapping.
# The labels are sorted first so the mapping is reproducible: the iteration
# order of a set of strings can differ from one interpreter run to the next.
label_ids = {label: idx for idx, label in enumerate(sorted(labels))}
file_util.dump_json(label_ids, os.path.join(data_dir, "document_label_ids.json"))