In [1]:
import sys
import os
sys.path.append("../")

import pandas as pd
import json
from tqdm import tqdm
import copy
from utils import text_util, file_util, excel_tree_level_export
from utils import graph_builder_from_wiki as graph_wiki_util
import crawl_wiki_tree
import extract_name_mentions

['nonplanar', 'amorphous']
[]


In [2]:
# Depth limit shared by the later steps: passed as `iteration` to the wiki
# crawl and as `max_level` when tracing parent paths.
max_level = 3

# Root folder of the patent dataset; the raw documents sit in its "doc" subfolder.
DATA_DIR = "../data/google_patents/us-25000"
DOC_DIR = os.path.join(DATA_DIR, "doc")

Format of data file:
- Each `.json` file contains multiple patents, one JSON object per line.
- Format of each patent:

```json
{
    "patent_id": str,
    "title": [{"text": str, "language": str, "truncated": bool}],
    "description": [{"text": str, "language": str, "truncated": bool}],
    "claims": [{"text": str, "language": str, "truncated": bool}],
    "classifications": [{"code": str, "inventive": bool, "first": bool, "tree": List[str]}, ...]
}

```

In [3]:
# Collect the patent document files found under DOC_DIR.
# NOTE(review): the "json" argument presumably filters by file extension and the
# returned entries are used directly as paths by `open()` below — confirm in file_util.
doc_files = file_util.get_file_name_in_dir(DOC_DIR, "json")
print("Number of document files:", len(doc_files))

Number of document files: 10


In [4]:
# Read the first patent record of the first document file as a sample.
# Each line of a document file holds one JSON-encoded patent.
sample_doc_file = doc_files[0]
sample_doc = None
# Explicit UTF-8: JSON text is UTF-8, while the platform default encoding may not be.
with open(sample_doc_file, "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            # Tolerate blank lines instead of failing inside json.loads.
            continue
        sample_doc = json.loads(line)
        break

# Fail loudly here rather than with a NameError (or a stale value) in a later cell.
if sample_doc is None:
    raise ValueError("No patent record found in {}".format(sample_doc_file))

In [6]:
# Inspect the classification entries attached to the sample patent.
sample_doc["classifications"]

[{'code': 'A46B11/0006', 'inventive': True, 'first': True, 'tree': []},
 {'code': 'A46B9/023', 'inventive': True, 'first': False, 'tree': []},
 {'code': 'A45D24/28', 'inventive': True, 'first': False, 'tree': []},
 {'code': 'A46B5/0095', 'inventive': True, 'first': False, 'tree': []},
 {'code': 'A45D24/22', 'inventive': False, 'first': False, 'tree': []},
 {'code': 'A46B11/002', 'inventive': True, 'first': False, 'tree': []},
 {'code': 'A46B2200/104', 'inventive': False, 'first': False, 'tree': []},
 {'code': 'A46B9/023', 'inventive': True, 'first': False, 'tree': []},
 {'code': 'A46B2200/104', 'inventive': False, 'first': False, 'tree': []},
 {'code': 'A45D24/22', 'inventive': False, 'first': False, 'tree': []},
 {'code': 'A46B11/0006', 'inventive': True, 'first': True, 'tree': []},
 {'code': 'A46B5/0095', 'inventive': True, 'first': False, 'tree': []},
 {'code': 'A46B11/002', 'inventive': False, 'first': False, 'tree': []}]

## Extract name mentions from each document

For later reference, save the name mentions of each document (identified by its `doc_id`) into a file named `name_mention_<doc_id>.txt`.

In [5]:
def get_id(doc):
    """Return the identifier stored under the ``"patent_id"`` key of ``doc``.

    Raises:
        KeyError: if ``doc`` has no ``"patent_id"`` key.
    """
    patent_id = doc["patent_id"]
    return patent_id

def get_claims(doc):
    """Return the ``"text"`` of every entry in ``doc["claims"]``, in order.

    Raises:
        KeyError: if ``doc`` has no ``"claims"`` key or an entry has no ``"text"``.
    """
    claim_texts = []
    for claim in doc["claims"]:
        claim_texts.append(claim["text"])
    return claim_texts
    
def get_name_mentions(doc, max_n_gram_range=3):
    """Extract the name mentions of a patent from its claim texts.

    The claim texts are handed to ``text_util.get_name_mention_from_claims_nltk``
    and the result is expanded by ``text_util.generate_n_gram_from_name_mentions``.

    Args:
        doc: patent record; its claims are read through ``get_claims``.
        max_n_gram_range: forwarded unchanged to the n-gram helper
            (presumably the longest n-gram to generate — confirm in text_util).

    Returns:
        Whatever ``generate_n_gram_from_name_mentions`` returns.
    """
    raw_mentions = text_util.get_name_mention_from_claims_nltk(get_claims(doc))
    return text_util.generate_n_gram_from_name_mentions(raw_mentions, max_n_gram_range)

In [6]:
# Try the extraction on the sample patent before running it over every document.
name_mentions = get_name_mentions(sample_doc)
print("Name mentions:", name_mentions)

Name mentions: ['wall', 'bristles', 'boot', 'medial section', 'fluid', 'secondary end', 'distal end', 'relative', 'outer', 'straight end section', 'handle', 'medial', 'communication', 'bristle', 'straight end', 'conduit', 'pump', 'undulated medial section', 'openings', 'adjacent', 'straight', 'boots', 'fluid communication', 'weight retains', 'hair', 'retains', 'outer surface', 'brush', 'surface', 'end', 'outer wall', 'undulated', 'tube', 'leading', 'weight', 'section', 'plurality', 'panel', 'primary end', 'claim', 'distal', 'respect', 'primary', 'leading end', 'end section', 'assembly', 'undulated medial', 'secondary']


In [None]:
NAME_MENTION_DIR = os.path.join(DATA_DIR, "name_mentions")
# Passed unchanged to split_name_mention_list and to the crawl step.
# NOTE(review): its exact meaning (files vs. mentions per file) is defined by those helpers.
num_name_mentions_file = 50

all_name_mentions = set()
dict_name_mentions = {}

# The existence of NAME_MENTION_DIR acts as the "already extracted" marker.
# When it exists this cell does nothing, so `all_name_mentions` and
# `dict_name_mentions` stay empty in memory.
if not os.path.exists(NAME_MENTION_DIR):
    for file_name in tqdm(doc_files, desc="Extracting name mentions"):
        # Explicit UTF-8: JSON text is UTF-8, while the platform default encoding may not be.
        with open(file_name, "r", encoding="utf-8") as f:
            for line in f:
                if not line.strip():
                    # Tolerate blank lines instead of failing inside json.loads.
                    continue
                doc = json.loads(line)
                doc_id, name_mentions = get_id(doc), get_name_mentions(doc)
                dict_name_mentions[doc_id] = name_mentions
                all_name_mentions.update(name_mentions)

    file_util.dump_json(dict_name_mentions, os.path.join(DATA_DIR, "dict_name_mentions.json"))
    # Sort so the split into files is the same on every run (set order is not stable).
    all_name_mentions = sorted(all_name_mentions)
    print("Number of name mentions to crawl:", len(all_name_mentions))

    # Create the marker directory only after the expensive extraction succeeded;
    # creating it up front would make a failed run look complete and cause every
    # later run to skip extraction silently.
    file_util.mkdir(NAME_MENTION_DIR)
    extract_name_mentions.split_name_mention_list(all_name_mentions, NAME_MENTION_DIR, num_name_mentions_file)

## Retrieve parents of name mentions

In [None]:
# Run the wiki crawl over the name-mention files in NAME_MENTION_DIR.
# NOTE(review): the positional arguments `0` and `num_name_mentions_file` and the
# `iteration` keyword are forwarded as-is; presumably they select a file range and
# the crawl depth — confirm against crawl_wiki_tree.search_wiki_with_threads.
crawl_wiki_tree.search_wiki_with_threads(NAME_MENTION_DIR, 0, num_name_mentions_file, iteration=max_level)

## Trace paths of parents

In [14]:
from collections import deque

def cache_linkto(parent_links):
    """Group raw parent links by their source entity.

    Args:
        parent_links: iterable of dicts with the keys ``"id"`` (source id),
            ``"label"`` (source label) and ``"link_to"`` (destination id, or
            ``""`` when the source has no outgoing link).

    Returns:
        dict mapping each source id to ``{"label": ..., "link_to": [...]}``.
        ``"label"`` is taken from the first link seen for that id, and
        ``"link_to"`` lists the distinct destination ids in first-seen order,
        so the result (and any traversal built on it) is reproducible.

    Raises:
        KeyError: if a link lacks one of the three keys.
    """
    linkto_infos = {}
    for link in parent_links:
        src_id, src_label, dest_id = link["id"], link["label"], link["link_to"]
        # setdefault keeps the entry (and label) created by the first link of src_id.
        info = linkto_infos.setdefault(src_id, {"label": src_label, "link_to": {}})
        if dest_id != "":
            # A dict is used as an insertion-ordered set: duplicates collapse
            # while the order stays deterministic, unlike a plain set.
            info["link_to"][dest_id] = None

    for info in linkto_infos.values():
        info["link_to"] = list(info["link_to"])

    return linkto_infos


def trace_path(root_id, entity, max_level):
    """Collect the parents of an entity that lie within ``max_level`` hops.

    A breadth-first search starts at ``root_id`` and follows the links built
    by ``cache_linkto(entity["parents"])``. Destinations that never appear as
    a source id are skipped because no label is known for them.

    Args:
        root_id: id of the entity the search starts from.
        entity: dict with the keys ``"label"`` (label of the root) and
            ``"parents"`` (raw links accepted by ``cache_linkto``).
        max_level: largest hop count to report; values below 1 yield no parents.

    Returns:
        ``(parents, labels)`` where ``parents`` is a list of dicts with the keys
        ``"id"``, ``"level"``, ``"label"`` and ``"path"`` (ids joined by
        ``" >> "`` starting at the root), in discovery order, and ``labels``
        maps the root and every reported parent id to its label.

    Raises:
        KeyError: if ``root_id`` never appears as a source id in
            ``entity["parents"]`` (callers in this notebook catch it to record
            untraceable entities), or if ``entity`` lacks a required key.
    """
    linkto_infos = cache_linkto(entity["parents"])
    labels = {root_id: entity["label"]}
    parents = []

    # Keep the contract callers rely on: an entity missing from its own link
    # table is reported through KeyError, whatever the depth limit is.
    if root_id not in linkto_infos:
        raise KeyError(root_id)

    # Without this guard the root would still be expanded and level-1 parents
    # returned even though the caller asked for none.
    if max_level < 1:
        return parents, labels

    # BFS to get parents up to max_level
    queue = deque([root_id])
    level = {root_id: 0}
    path = {root_id: str(root_id)}

    while queue:
        u = queue.popleft()
        for v in linkto_infos[u]["link_to"]:
            if v in level or v not in linkto_infos:
                # Already visited, or no entry (hence no label) for this id.
                continue
            level[v] = level[u] + 1
            path[v] = "{} >> {}".format(path[u], v)
            labels[v] = linkto_infos[v]["label"]
            parents.append({
                "id": v,
                "level": level[v],
                "label": labels[v],
                "path": path[v],
            })
            # Nodes already at the depth limit are reported but not expanded.
            if level[v] < max_level:
                queue.append(v)

    return parents, labels

In [23]:
# Load one crawled entity file for inspection.
# NOTE(review): index 5 looks like an arbitrary pick and depends on the order in
# which file_util lists the matching files — confirm. The ".pck" suffix suggests a
# pickle; only load such files from a trusted source.
sample_entity_file = file_util.get_file_name_in_dir_regex(NAME_MENTION_DIR, "entities.pck")[5]
sample_entity = file_util.load(sample_entity_file)
sample_entity

{}

In [None]:
# Trace every entity of the sample file; entities whose lookup fails with a
# KeyError are reported instead of stopping the loop.
for name_mention_eid, entity in sample_entity.items():
    try:
        traced = trace_path(name_mention_eid, entity, max_level=max_level)
    except KeyError:
        print("TRACING ERROR", name_mention_eid)
    else:
        print(traced)

In [33]:
ENTITY_LABEL_PATH = os.path.join(DATA_DIR, "entity_labels.json")

file_names = file_util.get_file_name_in_dir(NAME_MENTION_DIR, "txt")
# Labels of every traced entity and parent, accumulated across all files.
entity_labels = {}

for file_name in tqdm(file_names, desc="Tracing parents"):
    # Each name-mention file has sibling files that share its base name.
    base_name = os.path.splitext(file_name)[0]
    entity_path = base_name + "_entities.pck"
    parent_path = base_name + "_parents.json"
    error_path = base_name + "_entities_cannot_trace_parents.json"

    file_entity = file_util.load(entity_path)
    entity_parents = {}
    errors = []

    for name_mention_eid, entity in file_entity.items():
        try:
            parents, labels = trace_path(name_mention_eid, entity, max_level=max_level)
        except KeyError:
            # Untraceable entities are kept aside and written to error_path.
            errors.append(entity)
            continue
        entity_parents[name_mention_eid] = {
            "name_mention": entity["name_mention"],
            "label": entity["label"],
            "parents": parents,
        }
        entity_labels.update(labels)

    file_util.dump_json(entity_parents, parent_path)
    file_util.dump_json(errors, error_path)

# Written once after the loop: re-dumping the ever-growing dict for every input
# file made the total I/O quadratic in the number of files.
file_util.dump_json(entity_labels, ENTITY_LABEL_PATH)

Tracing parents:   0%|          | 0/10788 [00:00<?, ?it/s]
