In [1]:
import sys
import os
sys.path.append("../")

import pandas as pd
import json
from tqdm import tqdm
import copy
from utils import text_util, file_util, excel_tree_level_export
from utils import graph_builder_from_wiki as graph_wiki_util
import crawl_wiki_tree

['nonplanar', 'amorphous']
[]


In [2]:
# Root folder for the input CSV and for the files derived from it further down.
DATA_DIR = "../data"
# CSV file that is loaded into the working DataFrame.
DOC_PATH = os.path.join(DATA_DIR, "reuters.csv")
# Depth limit handed to trace_path (and to the commented-out wiki crawl) when
# following parent links away from a name mention.
max_level = 3

In [3]:
# Load the corpus into the working DataFrame.
df = pd.read_csv(DOC_PATH)
# Preview the first rows to check the column layout.
df.head()

Unnamed: 0,path,topic,subset,index,content,lead,tin,retail,fuel,propane,...,soy-meal,earn,sun-oil,instal-debt,cotton,heat,trade,dfl,palladium,iron-steel
0,test/14826,['trade'],test,14826,ASIAN EXPORTERS FEAR DAMAGE FROM U.S.-JAPAN RI...,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,test/14828,['grain'],test,14828,CHINA DAILY SAYS VERMIN EAT 7-12 PCT GRAIN STO...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,test/14829,"['nat-gas', 'crude']",test,14829,JAPAN TO REVISE LONG-TERM ENERGY DEMAND DOWNWA...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,test/14832,"['rubber', 'tin', 'sugar', 'corn', 'rice', 'gr...",test,14832,THAI TRADE DEFICIT WIDENS IN FIRST QUARTER\n ...,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,test/14833,"['palm-oil', 'veg-oil']",test,14833,INDONESIA SEES CPO PRICE RISING SHARPLY\n Ind...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# df = df.sample(5)
print("Data shape:", df.shape)
# Fixed seed so that the example article (and everything printed from it in the
# cells below) is the same on every Restart & Run All.
sample_content = df.sample(1, random_state=42)["content"].values[0]
print("A sample news content:")
# Cap the preview at 5000 characters to keep the cell output short.
print(sample_content[:5000])

Data shape: (10788, 95)
A sample news content:
TURKEY CALLS FOR DIALOGUE TO SOLVE DISPUTE
  Turkey said today its disputes with
  Greece, including rights on the continental shelf in the Aegean
  Sea, should be solved through negotiations.
      A Foreign Ministry statement said the latest crisis between
  the two NATO members stemmed from the continental shelf dispute
  and an agreement on this issue would effect the security,
  economy and other rights of both countries.
      "As the issue is basicly political, a solution can only be
  found by bilateral negotiations," the statement said. Greece has
  repeatedly said the issue was legal and could be solved at the
  International Court of Justice.
      The two countries approached armed confrontation last month
  after Greece announced it planned oil exploration work in the
  Aegean and Turkey said it would also search for oil.
      A face-off was averted when Turkey confined its research to
  territorrial waters. "The latest crise

In [5]:
# Columns that describe a document; every other column is treated as a label.
metadata_columns = ("path", "topic", "subset", "index", "content")
label_columns = [column for column in df.columns if column not in metadata_columns]

print("Number of labels:", len(label_columns))
print("Labels:")
print(", ".join(map(str, label_columns)))

Number of labels: 90
Labels:
lead, tin, retail, fuel, propane, crude, income, oat, copra-cake, barley, groundnut, cotton-oil, rand, cpi, lei, cocoa, groundnut-oil, jobs, nkr, livestock, castor-oil, palmkernel, money-fx, sunseed, hog, nat-gas, zinc, coconut-oil, gas, rape-oil, gold, orange, pet-chem, wheat, nickel, jet, interest, carcass, bop, l-cattle, potato, rapeseed, sugar, coffee, soy-oil, money-supply, platinum, yen, wpi, ship, soybean, sorghum, lin-oil, dmk, meal-feed, coconut, rice, dlr, alum, oilseed, acq, reserves, ipi, corn, grain, housing, nzdlr, naphtha, strategic-metal, palm-oil, sun-meal, lumber, tea, rye, rubber, gnp, veg-oil, cpu, silver, copper, soy-meal, earn, sun-oil, instal-debt, cotton, heat, trade, dfl, palladium, iron-steel


**Text normalization**

Each news article is split across multiple lines (separated by `\n`), so we first concatenate those lines into a single paragraph.
We then use a single newline `\n` to separate the news body from its heading, which is the first (capitalized) line.

In [6]:
def normalize(text):
    """Flatten a multi-line article into a heading line followed by one body line.

    The first line of ``text`` is taken as the heading. Every remaining line is
    stripped of surrounding whitespace and the pieces are joined with single
    spaces to form the body. Heading and body are returned joined by exactly
    one newline character.

    Args:
        text: Raw article text whose lines are separated by newline characters.

    Returns:
        The stripped heading, a newline, and the single-line body. The body is
        empty when ``text`` has only one line.
    """
    heading_line, *body_lines = text.split("\n")
    body = " ".join(line.strip() for line in body_lines).strip()
    return heading_line.strip() + "\n" + body

# Testing with a sample: print the normalized form of the article held in sample_content.
print(normalize(sample_content))

TURKEY CALLS FOR DIALOGUE TO SOLVE DISPUTE
Turkey said today its disputes with Greece, including rights on the continental shelf in the Aegean Sea, should be solved through negotiations. A Foreign Ministry statement said the latest crisis between the two NATO members stemmed from the continental shelf dispute and an agreement on this issue would effect the security, economy and other rights of both countries. "As the issue is basicly political, a solution can only be found by bilateral negotiations," the statement said. Greece has repeatedly said the issue was legal and could be solved at the International Court of Justice. The two countries approached armed confrontation last month after Greece announced it planned oil exploration work in the Aegean and Turkey said it would also search for oil. A face-off was averted when Turkey confined its research to territorrial waters. "The latest crises created an historic opportunity to solve the disputes between the two countries," the Foreign

In [7]:
# Overwrite the "content" column in place with its normalized text. Running the cell
# again is harmless: normalize() returns already-normalized text unchanged.
df["content"] = df["content"].apply(normalize)

## Extract name mentions from each document

For later reference, save the name mentions of each document into a file named `<doc_id>.txt` (one mention per line) inside the `reuters_name_mentions` folder.

In [8]:
# Folder used by the cells below for the per-document name-mention files and the
# entity / parent files looked up next to them.
NAME_MENTION_DIR = os.path.join(DATA_DIR, "reuters_name_mentions")

In [None]:
# # Testing with a sample
# name_mentions = text_util.get_name_mention_from_claims_nltk([sample_content])
# print(name_mentions)

In [None]:
# file_util.mkdir(NAME_MENTION_DIR)

# for rid, row in tqdm(df.iterrows(), desc="Getting name mentions", total=len(df)):
#     doc_id, doc_content = row["index"], row["content"]
#     name_mentions = text_util.get_name_mention_from_claims_nltk([doc_content])
    
#     with open(os.path.join(NAME_MENTION_DIR, f"{doc_id}.txt"), "w") as f:
#         for e in name_mentions:
#             f.write(e + "\n")

## Retrieve parents of name mentions

In [None]:
# crawl_wiki_tree.search_wiki_with_threads(NAME_MENTION_DIR, 0, None, iteration=max_level)

## Trace paths of parents

In [9]:
from collections import deque

def cache_linkto(parent_links):
    """Index a flat list of link records by their source id.

    Each record is a mapping with the keys ``"id"`` (source id), ``"label"``
    (source label) and ``"link_to"`` (destination id, or ``""`` for a record
    that carries no outgoing link).

    Destination ids are de-duplicated while keeping the order in which they
    first appear in ``parent_links``. Going through a ``set`` would make that
    order depend on string hashing, and therefore vary between interpreter
    runs, which in turn changes the traversal order of any caller that walks
    the lists.

    Args:
        parent_links: Iterable of link records as described above.

    Returns:
        dict mapping each source id to ``{"label": <label of the first record
        seen for that id>, "link_to": [<unique destination ids>]}``. A source
        whose records all have ``"link_to" == ""`` gets an empty list.

    Raises:
        KeyError: If a record lacks one of the three required keys.
    """
    linkto_infos = {}
    for link in parent_links:
        src_id, src_label, dest_id = link["id"], link["label"], link["link_to"]
        # A plain dict doubles as an insertion-ordered set of destination ids.
        info = linkto_infos.setdefault(src_id, {"label": src_label, "link_to": {}})
        if dest_id != "":
            info["link_to"][dest_id] = None

    # Expose the destinations as lists, as callers expect.
    for info in linkto_infos.values():
        info["link_to"] = list(info["link_to"])

    return linkto_infos


def trace_path(root_id, entity, max_level):
    """Collect the ancestors of ``root_id`` by walking its parent links breadth-first.

    Args:
        root_id: Id of the starting entity.
        entity: Mapping with a ``"label"`` entry (label of the root) and a
            ``"parents"`` entry (link records accepted by ``cache_linkto``).
        max_level: Ids found at this depth are recorded but not expanded
            further. The root itself is always expanded, so its direct parents
            are reported even when ``max_level`` is 0 or negative.

    Returns:
        A ``(parents, labels)`` tuple. ``parents`` is a list, in discovery
        order, of dicts with the keys ``"id"``, ``"level"`` (distance from the
        root), ``"label"`` and ``"path"`` (ids from the root to this ancestor
        joined by ``" >> "``). ``labels`` maps the root and every recorded
        ancestor to its label. A linked id that has no entry of its own in the
        link table is skipped.

    Raises:
        KeyError: If ``root_id`` has no entry in the link table built from
            ``entity["parents"]``, or ``entity`` lacks a required key.
    """
    link_index = cache_linkto(entity["parents"])
    labels = {root_id: entity["label"]}
    depth_of = {root_id: 0}
    trail_of = {root_id: str(root_id)}
    ancestors = []

    # Breadth-first walk: each id is recorded the first time it is reached.
    frontier = deque([root_id])
    while frontier:
        current = frontier.popleft()
        for parent_id in link_index[current]["link_to"]:
            # Skip ids already visited and ids without a link-table entry.
            if parent_id in depth_of or parent_id not in link_index:
                continue

            parent_depth = depth_of[current] + 1
            depth_of[parent_id] = parent_depth
            trail_of[parent_id] = "{} >> {}".format(trail_of[current], parent_id)
            labels[parent_id] = link_index[parent_id]["label"]
            ancestors.append({
                "id": parent_id,
                "level": parent_depth,
                "label": labels[parent_id],
                "path": trail_of[parent_id],
            })

            # Only ids strictly above the depth limit are expanded further.
            if parent_depth < max_level:
                frontier.append(parent_id)

    return ancestors, labels

In [35]:
# Dry run of trace_path on the entities of a single file before the full pass below.
# NOTE(review): the [0] fails if the helper finds no match for "entities.pck" in
# NAME_MENTION_DIR — presumably the crawl step above has to be run first; confirm.
sample_entity_file = file_util.get_file_name_in_dir_regex(NAME_MENTION_DIR, "entities.pck")[0]
sample_entity = file_util.load(sample_entity_file)

for name_mention_eid, entity in sample_entity.items():
    try:
        trace_path(name_mention_eid, entity, max_level=max_level)
    except KeyError:
        # trace_path raises KeyError e.g. when the entity's own id has no entry in
        # its parent-link table; report the id and carry on with the next entity.
        print("TRACING ERROR", name_mention_eid)

TRACING ERROR Q23947147


In [33]:
# Single JSON file collecting the id -> label mapping gathered over all documents.
ENTITY_LABEL_PATH = os.path.join(DATA_DIR, "entity_labels.json")

file_names = file_util.get_file_name_in_dir(NAME_MENTION_DIR, "txt")
# Accumulates the labels returned by trace_path across every file.
entity_labels = {}

try:
    for file_name in tqdm(file_names, desc="Tracing parents"):
        # Input and output files share the base name of the name-mention file.
        base_name = os.path.splitext(file_name)[0]
        entity_path = base_name + "_entities.pck"
        parent_path = base_name + "_parents.json"
        error_path = base_name + "_entities_cannot_trace_parents.json"

        file_entity = file_util.load(entity_path)
        entity_parents = {}
        errors = []

        for name_mention_eid, entity in file_entity.items():
            try:
                parents, labels = trace_path(name_mention_eid, entity, max_level=max_level)
            except KeyError:
                # Keep the untraceable entity for inspection and move on.
                errors.append(entity)
                continue
            entity_parents[name_mention_eid] = {
                "name_mention": entity["name_mention"],
                "label": entity["label"],
                "parents": parents,
            }
            entity_labels.update(labels)

        file_util.dump_json(entity_parents, parent_path)
        file_util.dump_json(errors, error_path)
finally:
    # Write the cumulative label file once instead of re-serialising the growing
    # mapping after every document. `finally` still saves whatever was collected
    # if the loop is interrupted or fails part-way.
    file_util.dump_json(entity_labels, ENTITY_LABEL_PATH)

Tracing parents:   0%|          | 0/10788 [00:00<?, ?it/s]
