In [6]:
import json


def _read_jsonl(paths):
    """Parse every line of each file in ``paths`` as JSON and return one combined list.

    Files are read in the order given and decoded as UTF-8 so the result does
    not depend on the platform's default locale encoding.
    """
    records = []
    for path in paths:
        with open(path, 'r', encoding='utf-8') as f:
            records.extend([json.loads(line) for line in f])
    return records


def _count_relation_types(items):
    """Return a plain dict mapping each ``relation_text`` to its occurrence count.

    Each element of ``items`` is indexed with ``['relations']`` and each relation
    with ``['relation_text']``; a missing key raises ``KeyError``.
    """
    counts = {}
    for item in items:
        for relation in item['relations']:
            relation_text = relation['relation_text']
            counts[relation_text] = counts.get(relation_text, 0) + 1
    return counts


def _report_shared_labels(name, reference_counts, dataset_counts):
    """Print and return the set of labels that are keys of both count dicts.

    ``name`` is only used as the prefix of the printed line.
    """
    shared = set(reference_counts.keys()).intersection(dataset_counts.keys())
    print(f"\n{name}: {shared}")
    return shared


# Relation-type counts of the zero_rel data; only the keys (labels) are used below.
with open('zero_rel_type_counts.json', 'r', encoding='utf-8') as f:
    zero_rel_type_counts = json.load(f)

# Union of every zero_rel label that also appears in one of the datasets below.
intersection_labels = set()

# WikiZSL
wiki_zsl_all = _read_jsonl(['wiki_zsl_all.jsonl'])
wiki_zsl_rel_type_counts = _count_relation_types(wiki_zsl_all)
wiki_zsl_intersection_labels = _report_shared_labels(
    'WikiZSL', zero_rel_type_counts, wiki_zsl_rel_type_counts)
intersection_labels.update(wiki_zsl_intersection_labels)

# NYT
nyt_all = _read_jsonl(['nyt/nyt_all.jsonl'])
nyt_rel_type_counts = _count_relation_types(nyt_all)
nyt_intersection_labels = _report_shared_labels(
    'NYT', zero_rel_type_counts, nyt_rel_type_counts)
intersection_labels.update(nyt_intersection_labels)

# FewRel
fewrel_all = _read_jsonl(['few_rel_all.jsonl'])
fewrel_type_counts = _count_relation_types(fewrel_all)
fewrel_intersection_labels = _report_shared_labels(
    'FewRel', zero_rel_type_counts, fewrel_type_counts)
intersection_labels.update(fewrel_intersection_labels)

# Re-DocRED (train, dev and test splits are pooled before counting)
redocred_all = _read_jsonl(['redocred_train.jsonl', 'redocred_dev.jsonl', 'redocred_test.jsonl'])
redocred_type_counts = _count_relation_types(redocred_all)
redocred_intersection_labels = _report_shared_labels(
    'Re-DocRED', zero_rel_type_counts, redocred_type_counts)
intersection_labels.update(redocred_intersection_labels)


print(f"\nIntersection labels: {intersection_labels}\n")
print(len(intersection_labels))


WikiZSL: {'place of publication', 'place of birth', 'facet of', 'home venue', 'librettist', 'instrumentation', 'continent', 'shares border with', 'military rank', 'manifestation of', 'executive producer', 'given name', 'chief executive officer', 'country of citizenship', 'director of photography', 'subclass of', 'editor', 'headquarters location', 'highest point', 'worshipped by', 'conferred by', 'operating system', 'date of death', 'use', 'religion', 'father', 'official residence', 'instance of', 'studies', 'educated at', 'residence', 'represents', 'screenwriter', 'platform', 'family name', 'diplomatic relation', 'participant of', 'head coach', 'crosses', 'allegiance', 'author', 'illustrator', 'replaces', 'presenter', 'cathedral', 'conflict', 'signatory', 'student', 'employer', 'influenced by', 'significant event', 'mother', 'opposite of', 'genre', 'maintained by', 'head of government', 'performer', 'place of burial', 'drafted by', 'position held', 'diocese', 'candidate', 'occupation'

In [7]:
from tqdm import tqdm

# Input: the full zero_rel JSONL file. Output: the same records with every relation
# whose label is in `intersection_labels` (built in the previous cell) removed.
save_path = 'zero_rel_all.jsonl'
final_save_path = 'zero_rel_all_diff.jsonl'
skipped_items = 0      # records dropped because no relation survived the filter
skipped_relations = 0  # individual relations removed across all records

# Explicit UTF-8 so decoding does not depend on the platform's locale default.
with open(save_path, 'r', encoding='utf-8') as fr, \
        open(final_save_path, 'w', encoding='utf-8') as fw:
    for line in tqdm(fr):
        item = json.loads(line)
        relations = item['relations']
        new_relations = [
            relation for relation in relations
            if relation['relation_text'] not in intersection_labels
        ]
        skipped_relations += len(relations) - len(new_relations)

        item['relations'] = new_relations

        # Write the updated item to the new file; records left without any
        # relation are dropped entirely and only counted.
        if new_relations:
            fw.write(json.dumps(item) + '\n')
        else:
            skipped_items += 1

print(f'To not mingle with benchmark datasets, we skipped {skipped_items} items and {skipped_relations} relations')

63493it [03:58, 266.33it/s]

To not mingle with benchmark datasets, we skipped 2 items and 1355104 relations





In [8]:
import json
import tqdm

# Sanity check: recount relation labels in the filtered file and confirm that
# none of them still occurs in WikiZSL (an empty set is the expected output).
#
# NOTE(review): this cell rebinds `zero_rel_type_counts` and `intersection_labels`,
# which the filtering cell (the one writing 'zero_rel_all_diff.jsonl') reads.
# Re-running that cell after this one would filter against the set computed here
# rather than the original benchmark overlap — re-run the notebook top to bottom.
# NOTE(review): only WikiZSL is re-checked here; the other datasets used to build
# the original overlap are not.

# The bar is used as a context manager so it is always closed (and its final
# line flushed) even if parsing fails part-way through.
with open('zero_rel_all_diff.jsonl', 'r', encoding='utf-8') as f, tqdm.tqdm() as pbar:
    # Lazy generator: records are parsed one at a time while the file is open.
    zero_rel_wiki_diff = (json.loads(line) for line in f)

    zero_rel_type_counts = {}
    for item in zero_rel_wiki_diff:
        for relation in item['relations']:
            relation_text = relation['relation_text']
            zero_rel_type_counts[relation_text] = zero_rel_type_counts.get(relation_text, 0) + 1
        pbar.update(1)

with open('wiki_zsl_all.jsonl', 'r', encoding='utf-8') as f:
    wiki_zsl_all = [json.loads(line) for line in f]

wiki_zsl_rel_type_counts = {}
for item in wiki_zsl_all:
    for relation in item['relations']:
        relation_text = relation['relation_text']
        wiki_zsl_rel_type_counts[relation_text] = wiki_zsl_rel_type_counts.get(relation_text, 0) + 1

intersection_labels = set(zero_rel_type_counts.keys()).intersection(wiki_zsl_rel_type_counts.keys())
print(intersection_labels)

160it [00:00, 680.69it/s]

63415it [02:21, 774.49it/s]

set()
