In [2]:
import json
import numpy as np
import random
from tqdm.auto import tqdm
import itertools
import os
from copy import deepcopy
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [38]:
def build_dicts(entities):
    entity2ind = dict()
    ind2entity = []
    for i in range(len(entities)):
        entity = entities[i]
        if not (entity in ind2entity):
            ind2entity.append(entity)
            entity2ind[entity] = len(ind2entity) - 1
    return ind2entity, entity2ind

def choose(arr, ratio_or_count):
    if type(ratio_or_count) == float:
        num = round(ratio_or_count*len(arr))
    elif type(ratio_or_count) == int:
        num = ratio_or_count
    else:
         assert False
    if num >= len(arr):
        return arr
    rand_inds = np.random.choice(len(arr), num, replace=False).tolist()
    return [arr[i] for i in rand_inds]
    
def split(arr, ratio_or_count):
    if type(ratio_or_count) == float:
        num = round(ratio_or_count*len(arr))
    elif type(ratio_or_count) == int:
        num = ratio_or_count
    else:
         assert False
    train, test = [], []
    rand_inds = np.random.choice(len(arr), num, replace=False).tolist()
    for i in tqdm(range(len(arr))):
        if i in rand_inds:
            train.append(arr[i])
        else:
            test.append(arr[i])
    return [train, test]

def form_items(c, t):
    input_text = "".join(c)
    target_text = input_text + "".join([t, "</a>"])
    item = {
        "input_text": input_text,
        "target_text": target_text
    }
    return item

In [39]:
with open("/home/jinho/repos/GrokkedTransformer/data/composition.2000.200.9.0-controlled/train.json") as f:
    train_items = json.load(f)
    
print(len(train_items))

inferred_fact_num = 0
for item in tqdm(train_items):
    temp = item['target_text'].strip("><").split("><")
    if len(temp) == 4:
        continue
    inferred_fact_num += 1
    
    
print(inferred_fact_num)

382000


100%|██████████| 382000/382000 [00:00<00:00, 1523078.86it/s]

342000





In [40]:
def build_dataset(num_entities, num_relations, out_degree=20, split_train_inferred=False):
 
    entities = ["<e_{}>".format(i) for i in range(num_entities)]
    ind2entity, entity2ind = build_dicts(entities)

    relations = ["<r_{}>".format(i) for i in range(num_relations)]
    ind2relation, relation2ind = build_dicts(relations)

    atomic_dict = dict()   # maps a head entity to a list of (r, t) pairs
    atomic_facts = []
    atomics = []

    for i in tqdm(range(num_entities)):
        # for each subject entity, randomly select some outgoing relations to some random object entity
        num_rows = out_degree
        selected_rows = np.random.choice(num_relations, size=num_rows, replace=False).tolist()
        for row_idx in selected_rows:
            col_idx = np.random.randint(num_entities)  # pick some random tail entity for each selected (h,r)
            h,r,t = ind2entity[i], ind2relation[row_idx], ind2entity[col_idx]
            atomic_facts.append(form_items([h, r], t))
            atomics.append((h,r,t))
            if h not in atomic_dict:
                atomic_dict[h] = []
            atomic_dict[h].append((r, t))
    if not split_train_inferred:
        inferred_facts = []
        for ent in tqdm(entities):
            for (r1, b) in atomic_dict[ent]:
                for (r2, t) in atomic_dict[b]:
                    inferred_facts.append(form_items([ent, r1, r2], t))
        return entities, relations, atomic_facts, inferred_facts
    
    # split ID/OOD
    OOD_ratio = 0.05
    OOD_facts, ID_facts = split(atomics, round(len(atomics)*OOD_ratio))
    OOD_facts, ID_facts = set(OOD_facts), set(ID_facts)
    only_used_ID_facts = random.sample(list(ID_facts), 2000)
    o2_ID_facts = only_used_ID_facts[:1000]
    o1_ID_facts = only_used_ID_facts[1000:]
    
    id_atomic_facts = [form_items([h, r], t) for (h,r,t) in ID_facts]
    ood_atomic_facts = [form_items([h, r], t) for (h,r,t) in OOD_facts]
    o1_id_atomic_facts = [form_items([h, r], t) for (h,r,t) in o1_ID_facts]
    o2_id_atomic_facts = [form_items([h, r], t) for (h,r,t) in o2_ID_facts]

    train_inferred, test_inferred_iid, test_inferred_iid_o2_error, test_inferred_iid_o1_error, test_inferred_ood, test_io, test_oi = [], [], [], [], [], [], []
    for ent in tqdm(entities):
        for (r1, b) in atomic_dict[ent]:
            for (r2, t) in atomic_dict[b]:
                if (ent, r1, b) in OOD_facts and (b, r2, t) in OOD_facts:
                    test_inferred_ood.append(form_items([ent, r1, r2], t))
                elif (ent, r1, b) in ID_facts and (b, r2, t) in OOD_facts:
                    test_io.append(form_items([ent, r1, r2], t))
                elif (ent, r1, b) in OOD_facts and (b, r2, t) in ID_facts:
                    test_oi.append(form_items([ent, r1, r2], t))
                elif (ent, r1, b) in o2_ID_facts:
                    test_inferred_iid_o2_error.append(form_items([ent, r1, r2], t))
                elif (b, r2, t) in o1_ID_facts:
                    test_inferred_iid_o1_error.append(form_items([ent, r1, r2], t))
                else:
                    if np.random.uniform() > 0.005:
                        train_inferred.append(form_items([ent, r1, r2], t))
                    else:
                        test_inferred_iid.append(form_items([ent, r1, r2], t))

    nonsenses = []
    for i in tqdm(range(num_entities)):
        # for each subject entity, randomly select some outgoing relations to some random object entity
        num_rows = out_degree
        selected_rows = np.random.choice(num_entities, size=num_rows, replace=False).tolist()
        for row_idx in selected_rows:
            col_idx = np.random.randint(num_entities)  # pick some random tail entity for each selected (h,r)
            e1, e2, e3 = ind2entity[i], ind2entity[row_idx], ind2entity[col_idx]
            nonsenses.append((e1,e2,e3))
            
    for i in tqdm(range(num_relations)):
        # for each subject entity, randomly select some outgoing relations to some random object entity
        num_rows = out_degree
        selected_rows = np.random.choice(num_relations, size=num_rows, replace=False).tolist()
        for row_idx in selected_rows:
            col_idx = np.random.randint(num_relations)  # pick some random tail entity for each selected (h,r)
            r1, r2, r3 = ind2relation[i], ind2relation[row_idx], ind2relation[col_idx]
            nonsenses.append((r1,r2,r3))
            
    nonsenses = set(nonsenses)
    nonsenses_facts = [form_items([t1, t2], t3) for (t1,t2,t3) in nonsenses]
    
    return entities, relations, id_atomic_facts, ood_atomic_facts, o1_id_atomic_facts, o2_id_atomic_facts, train_inferred, test_inferred_iid, test_inferred_iid_o2_error, test_inferred_iid_o1_error, test_inferred_ood, test_io, test_oi, nonsenses_facts


NUM_ENTITY_IN = 2000
NUM_RELATION = 200

# entities, relations, atomic_facts, inferred_facts = build_dataset(NUM_ENTITY_IN, NUM_RELATION, split_train_inferred=False)
train_entities, train_relations, id_atomic_facts, ood_atomic_facts, o1_id_atomic_facts, o2_id_atomic_facts, train_inferred, test_inferred_iid, test_inferred_iid_o2_error, test_inferred_iid_o1_error, test_inferred_ood, test_io, test_oi, nonsenses = build_dataset(NUM_ENTITY_IN, NUM_RELATION, split_train_inferred=True)

  0%|          | 0/2000 [00:00<?, ?it/s]

100%|██████████| 2000/2000 [00:00<00:00, 13374.72it/s]
100%|██████████| 40000/40000 [00:00<00:00, 58432.54it/s]
100%|██████████| 2000/2000 [00:38<00:00, 51.57it/s]
100%|██████████| 2000/2000 [00:00<00:00, 12925.62it/s]
100%|██████████| 200/200 [00:00<00:00, 17940.86it/s]


In [41]:
vocab = []
vocab = vocab + train_entities + train_relations
# special tokens
vocab = vocab + ["<mask>", "<sep>", "<a>", "</a>", "<q>", "</q>"]
assert len(vocab) == len(set(vocab))
print("vocab size:", len(vocab))

vocab size: 2206


In [42]:
# id_atomic_facts, ood_atomic_facts, o1_id_atomic_facts, o2_id_atomic_facts, train_inferred, test_inferred_iid, test_inferred_iid_o2_error, test_inferred_iid_o1_error, test_inferred_ood, test_io, test_oi, nonsenses = build_dataset(NUM_ENTITY_IN, NUM_RELATION, split_train_inferred=True)

test_size = 3000
id_atomic_facts_ds = choose(id_atomic_facts, test_size)
ood_atomic_facts_ds = choose(ood_atomic_facts, test_size)
test_inferred_iid_ds = choose(test_inferred_iid, test_size)
test_inferred_iid_controlled_ds = choose(test_inferred_iid_o2_error, test_size // 2) + choose(test_inferred_iid_o1_error, test_size // 2)
test_inferred_ood_ds = choose(test_inferred_ood, test_size)
test_io_ds = choose(test_io, test_size)
test_oi_ds = choose(test_oi, test_size)
nonsense_ds = choose(nonsenses, test_size)

all_atomics = id_atomic_facts + ood_atomic_facts
len(all_atomics)

40000

In [43]:
# downsampling train_inferred
# for phi in [18.0,12.6,9.0,7.2,5.4,3.6]:
dataset_name = "composition.{}.{}.{}-controlled".format(NUM_ENTITY_IN, NUM_RELATION, "inf")
os.makedirs("data/{}".format(dataset_name), exist_ok=True)
train_inferred_ds = choose(train_inferred, 382000)

probes = []
for item in id_atomic_facts_ds:
    probes.append(deepcopy(item))
    probes[-1]["type"] = "id_atomic"

for item in ood_atomic_facts_ds:
    probes.append(deepcopy(item))
    probes[-1]["type"] = "ood_atomic"

for item in choose(train_inferred_ds, test_size):
    probes.append(deepcopy(item))
    probes[-1]['type'] = 'train_inferred'

for item in test_inferred_iid_ds:
    probes.append(deepcopy(item))
    probes[-1]['type'] = 'test_inferred_iid'

for item in test_inferred_iid_controlled_ds:
    probes.append(deepcopy(item))
    probes[-1]['type'] = 'test_inferred_iid_controlled'

for item in test_inferred_ood_ds:
    probes.append(deepcopy(item))
    probes[-1]["type"] = "test_inferred_ood"
    
for item in test_io_ds:
    probes.append(deepcopy(item))
    probes[-1]["type"] = "test_IO"

for item in test_oi_ds:
    probes.append(deepcopy(item))
    probes[-1]["type"] = "test_OI"
    
for item in nonsense_ds:
    probes.append(deepcopy(item))
    probes[-1]["type"] = "test_nonsenses"

with open("data/{}/train.json".format(dataset_name), "w", encoding='utf-8') as f:
    json.dump(train_inferred_ds, f)
with open("data/{}/valid.json".format(dataset_name), "w", encoding='utf-8') as f:
    json.dump(test_inferred_ood_ds, f)
with open("data/{}/test.json".format(dataset_name), "w", encoding='utf-8') as f:
    json.dump(probes, f)
with open("data/{}/o1_id_atomic_facts.json".format(dataset_name), "w", encoding='utf-8') as f:
    json.dump(o1_id_atomic_facts, f)
with open("data/{}/o2_id_atomic_facts.json".format(dataset_name), "w", encoding='utf-8') as f:
    json.dump(o2_id_atomic_facts, f)
# add vocab
with open("data/{}/vocab.json".format(dataset_name), "w", encoding='utf-8') as f:
    json.dump(vocab, f)
    

In [None]:
with open("/home/jinho/repos/GrokkedTransformer/data/composition.2000.200.inf-controlled/train.json") as f:
    train_data = json.load(f)
    
atomic_num = 0
inferred_num = 0
for item in tqdm(train_data):
    temp = item['target_text'].strip("><").split("><")
    if len(temp) == 4:
        print(temp)
    else:
        inferred_num += 1
        
print(atomic_num)
print(inferred_num)


NameError: name 'train_items' is not defined