# Requirements

ManualDataset requires the original dataset from [Wichmann et al.](https://github.com/pwichmann/supply_chain_mining). To obtain the dataset, please go to the following link (https://github.com/pwichmann/supply_chain_mining) and contact the author. When you have successfully obtained the dataset, move the 'training_data' folder of the dataset to the root of this repository.

In [1]:
import pandas as pd
import ftfy
import os
import re
from datasets import DatasetDict, Dataset
from pathlib import Path
import spacy
from tqdm.auto import tqdm

In [2]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
      --------------------------------------- 0.3/12.8 MB ? eta -:--:--
     --- ------------------------------------ 1.0/12.8 MB 3.1 MB/s eta 0:00:04
     ------------ --------------------------- 3.9/12.8 MB 7.6 MB/s eta 0:00:02
     ------------------------------- ------- 10.5/12.8 MB 14.5 MB/s eta 0:00:01
     --------------------------------------- 12.8/12.8 MB 15.7 MB/s eta 0:00:00
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [3]:
nlp = spacy.load("en_core_web_sm")
def split_sentences(v):
    return [x.text for x in nlp(v).sents]

In [4]:
cwd = os.getcwd()
original_dataset_path = Path(cwd, "../../training_data").resolve()
assert original_dataset_path.is_dir()
original_dataset_train_path = original_dataset_path / "01_initial_corpus" / "train.json"
assert original_dataset_train_path.is_file()
original_dataset_valid_path = original_dataset_path / "01_initial_corpus" / "dev.json"
assert original_dataset_valid_path.is_file()
original_dataset_test_path = original_dataset_path / "01_initial_corpus" / "test.json"
assert original_dataset_test_path.is_file()

In [5]:
def fix_text(v):
    v = ftfy.fix_text(v)
    v = v.replace("\xa0", "").replace("\n", " ").replace("—", "-")
    v = re.sub(
        r"\(.*?\)|\[.*?\]|<.*?>|\{.*?\}", "", v
    )
    v = re.sub(r"\s{2,}", " ", v).strip()
    return v

def whitespace_tokenizer(sentence):
    tokens = re.findall(r"\w+|[^\w\s]", sentence, re.UNICODE)
    spaces = []
    s = ""
    for token in tokens:
        if s == "":
            s = token
            continue
        c = sentence[len(s)]
        if c == " ":
            spaces.append(1)
            s += " " + token
        else:
            spaces.append(0)
            s += token
    return tokens, spaces


def reconstruct_sentence(tokens, spaces):
    sentence = ""
    for token, space in zip(tokens, spaces + [0]):
        sentence += token + " " * space
    return sentence


def find_ne_parts(list1, list2):
    ne_from_indices = []
    ne_to_indices = []
    ne_other_indices = []

    i1, i2 = 0, 0
    while i2 < len(list2):
        if list2[i2] == "__NE_FROM__":
            i2 += 1
            start_idx = i1
            while i1 < len(list1) and (i2 >= len(list2) or list1[i1] != list2[i2]):
                i1 += 1
            ne_from_indices.append((start_idx, i1))
        elif list2[i2] == "__NE_TO__":
            i2 += 1
            start_idx = i1
            while i1 < len(list1) and (i2 >= len(list2) or list1[i1] != list2[i2]):
                i1 += 1
            ne_to_indices.append((start_idx, i1))
        elif list2[i2] == "__NE_OTHER__":
            i2 += 1
            start_idx = i1
            while i1 < len(list1) and (i2 >= len(list2) or list1[i1] != list2[i2]):
                i1 += 1
            ne_other_indices.append((start_idx, i1))
        else:
            if i1 < len(list1) and list1[i1] == list2[i2]:
                i1 += 1
            i2 += 1

    return ne_from_indices, ne_to_indices, ne_other_indices


def reconstruct_entities(list1, spaces, indices):
    entities = []
    for start, end in indices:
        entity_tokens = list1[start:end]
        entity_spaces = spaces[start : end - 1] if end > start else []
        entity = reconstruct_sentence(entity_tokens, entity_spaces)
        entities.append(entity)
    return entities

def get_mask_order(sentence: str):
    words = sentence.split()
    result = []
    for word in words:
        if '__NE_FROM__' in word or '__NE_TO__' in word or '__NE_OTHER__' in word:
            result.append(word)
    return result


def reconstruct_masked_sentence(
    original_sentence: str, ne_from: list[str], ne_to: list[str], ne_other: list[str], mask_order: list[str]
):
    masked_sentence = original_sentence
    q_ne_from = ne_from.copy()
    q_ne_to = ne_to.copy()
    q_ne_other = ne_other.copy()
    q_mask_order = mask_order.copy()
    while len(q_mask_order) > 0:
        #print(masked_sentence, q_ne_from, q_ne_to, q_ne_other, q_mask_order)
        if q_mask_order[0] == "__NE_FROM__":
            assert masked_sentence.replace(q_ne_from[0], '__NE_FROM__', 1) != masked_sentence
            masked_sentence = masked_sentence.replace(q_ne_from[0], '__NE_FROM__', 1)
            q_ne_from.pop(0)
        elif q_mask_order[0] == "__NE_TO__":
            assert masked_sentence.replace(q_ne_to[0], '__NE_TO__', 1) != masked_sentence
            masked_sentence = masked_sentence.replace(q_ne_to[0], '__NE_TO__', 1)
            q_ne_to.pop(0)
        elif q_mask_order[0] == "__NE_OTHER__":
            assert masked_sentence.replace(q_ne_other[0], '__NE_OTHER__', 1) != masked_sentence
            masked_sentence = masked_sentence.replace(q_ne_other[0], '__NE_OTHER__', 1)
            q_ne_other.pop(0)
        q_mask_order.pop(0)
    return masked_sentence

In [6]:
def preprocess(file_path):
    processed_data = []
    for _, row in pd.read_json(file_path).iterrows():
        original_text = fix_text(row["originalText"])
        for relation in row["relations"].values():
            masked_text = fix_text(relation["x"])
            if "__NE_FROM__" in masked_text and "__NE_TO__" in masked_text:
                processed_data.append(
                    {
                        "original_text": original_text,
                        "masked_text": masked_text,
                        "label": relation["y"],
                    }
                )
    df_processed = pd.DataFrame(processed_data)
    final_data = []
    for _, row in tqdm(df_processed.iterrows(), total=len(df_processed)):
        o_text_list, o_text_spaces = whitespace_tokenizer(row["original_text"])
        m_text_list, _ = whitespace_tokenizer(row["masked_text"])
        from_indices, to_indices, other_indices = find_ne_parts(
            o_text_list, m_text_list
        )
        NE_FROM = reconstruct_entities(o_text_list, o_text_spaces, from_indices)
        NE_TO = reconstruct_entities(o_text_list, o_text_spaces, to_indices)
        NE_OTHER = reconstruct_entities(o_text_list, o_text_spaces, other_indices)
        if len(NE_FROM) != 1 or len(NE_TO) != 1:
            continue
        if NE_FROM[0] == "" or NE_TO[0] == "" or NE_FROM[0] == NE_TO[0]:
            continue
        if len(split_sentences(row["original_text"])) > 1:
            continue
        mask_order = get_mask_order(row['masked_text'])
        try:
            row_masked_text = reconstruct_masked_sentence(
                row["original_text"], NE_FROM, NE_TO, NE_OTHER, mask_order
            )
        except Exception:
            continue
        final_data.append(
            {
                "original_text": row["original_text"],
                "masked_text": row_masked_text,
                "label": row["label"],
                "NE_FROM": NE_FROM[0],
                "NE_TO": NE_TO[0],
                "NE_OTHER": NE_OTHER,
            }
        )
        df_final = pd.DataFrame(final_data)
        df_final = df_final.groupby('original_text').filter(
            lambda x: (x['label'] != 0).any()
        )
    return df_final

In [7]:
df_train = preprocess(original_dataset_train_path)
df_train["source"] = "train"
df_valid = preprocess(original_dataset_valid_path)
df_valid["source"] = "valid"
df_test = preprocess(original_dataset_test_path)
df_test["source"] = "test"
concatenated_df = pd.concat([df_train, df_valid, df_test])
duplicates = concatenated_df[concatenated_df.duplicated("masked_text", keep=False)]
conflicting_duplicates = duplicates.groupby("masked_text").filter(
    lambda x: x["label"].nunique() > 1
)
df_ds = concatenated_df[
    ~concatenated_df.index.isin(conflicting_duplicates.index)
]
df_ds = df_ds.drop_duplicates(subset=["masked_text"], keep="first")
ds = DatasetDict(
    {
        "train": Dataset.from_pandas(
            df_ds[df_ds["source"] == "train"]
        ),
        "valid": Dataset.from_pandas(
            df_ds[df_ds["source"] == "valid"]
        ),
        "test": Dataset.from_pandas(
            df_ds[df_ds["source"] == "test"]
        ),
    }
)

  0%|          | 0/4934 [00:00<?, ?it/s]

  0%|          | 0/758 [00:00<?, ?it/s]

  0%|          | 0/1441 [00:00<?, ?it/s]

In [8]:
df_ds

Unnamed: 0,original_text,masked_text,label,NE_FROM,NE_TO,NE_OTHER,source
2,Officials of the Naval Air Systems Command at ...,Officials of the __NE_FROM__ at Patuxent River...,3,Naval Air Systems Command,Cobham plc Advanced Electronic Solutions,[],train
3,UTC Aerospace Systems has been selected by The...,__NE_FROM__ has been selected by The __NE_OTHE...,2,UTC Aerospace Systems,Boeing,[Boeing Co.],train
4,UTC Aerospace Systems has been selected by The...,__NE_OTHER__ has been selected by The __NE_FRO...,0,Boeing Co.,Boeing,[UTC Aerospace Systems],train
5,UTC Aerospace Systems has been selected by The...,__NE_FROM__ has been selected by The __NE_TO__...,2,UTC Aerospace Systems,Boeing Co.,[Boeing],train
6,Tenneco Automotive said Tuesday one of its uni...,__NE_FROM__ said Tuesday one of its units had ...,4,Tenneco Automotive,Minuzzi,[],train
...,...,...,...,...,...,...,...
1257,Lockheed Martin received a $769.5 million modi...,__NE_OTHER__ received a $769.5 million modific...,0,Navy,Defense,"[Lockheed Martin, USAF, USMC]",test
1258,Lockheed Martin received a $769.5 million modi...,__NE_OTHER__ received a $769.5 million modific...,0,USAF,Defense,"[Lockheed Martin, USMC, Navy]",test
1261,"In April, HAECO Cabin Solutions signed a contr...","In April, __NE_FROM__ signed a contract with _...",3,HAECO Cabin Solutions,Airbus,[],test
1262,HITCO to Supply Composite Components for Boeing,__NE_FROM__ to Supply Composite Components for...,2,HITCO,Boeing,[],test


In [9]:
len(df_ds["original_text"])

3722

In [10]:
len(df_ds["original_text"].unique())

1940

In [13]:
ds.save_to_disk(str(Path(cwd, "../../datasets/ManualDataset").resolve()))

Saving the dataset (0/1 shards):   0%|          | 0/2559 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/418 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/745 [00:00<?, ? examples/s]