In [2]:
import os
import sys
import random
sys.path.append("../")

import numpy as np
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from utils import file_util

In [3]:
DATA_DIR = "../data"
NAME_MENTION_DIR = os.path.join(DATA_DIR, "reuters_name_mentions")
entity_labels = file_util.load_json(os.path.join(DATA_DIR, "entity_labels.json"))

In [4]:
def extract_relations(parents, only_return_path_name=False):
    entity_path = {}
    for parent_info in parents:
        entity_path[parent_info["path"]] = parent_info
    
    for parent_info in parents:
        if parent_info["level"] == 3:
            path_name = " >> ".join(x for x in parent_info["path"].split(" >> ")[1:])
            if only_return_path_name:
                yield path_name
            else:
                rela = {}
                while True:
                    level = parent_info["level"]
                    rela[f"Level {level}"] = "\n".join((parent_info["id"], parent_info["label"], parent_info["path"]))
                    child_path = " >> ".join(x for x in parent_info["path"].split(" >> ")[:-1])
                    if level == 1:
                        break
                    parent_info = entity_path[child_path]
                yield rela, path_name

In [5]:
file_names = file_util.get_file_name_in_dir(NAME_MENTION_DIR, "txt")
file_name = random.sample(file_names, 1)[0]

In [6]:
freq = {}
for file_name in tqdm(file_names):
    base_name = os.path.splitext(file_name)[0]
    parent_path = base_name + "_parents.json"
    parents = file_util.load_json(parent_path)
    
    for name_mention_eid, entity in parents.items():
        for path_name in extract_relations(entity["parents"], only_return_path_name=True):
            freq[path_name] = freq.get(path_name, 0) + 1

100%|██████████| 10788/10788 [01:02<00:00, 173.44it/s]


In [13]:
entity_level_dict = {}
listed = {}

for file_name in tqdm(file_names):
    for name_mention_eid, entity in parents.items():
        if name_mention_eid not in entity_level_dict:
            entity_level_dict[name_mention_eid] = []
            listed[name_mention_eid] = set()
            
        for row, path_name in extract_relations(entity["parents"]):
            row["Level 0"] = entity["label"]
            row["Occurence"] = freq[path_name]
            if path_name not in listed[name_mention_eid]:
                entity_level_dict[name_mention_eid].append(row)
                listed[name_mention_eid].add(path_name)

100%|██████████| 10788/10788 [01:51<00:00, 96.81it/s] 


In [16]:
writer = pd.ExcelWriter('entity_level_dict.xlsx', engine='xlsxwriter')
for name_mention_eid, infor in tqdm(entity_level_dict.items()):
    df = pd.DataFrame(infor)
    df = df[sorted(df.columns)]
    df.to_excel(writer, sheet_name=name_mention_eid)
writer.save()

100%|██████████| 298/298 [00:00<00:00, 336.74it/s]
