In [1]:
import os
import json

parent_dir = 'supervised/supervised/'
file_list = [os.path.join(parent_dir, file) for file in os.listdir(parent_dir)]

In [2]:
raw_lines = []
for file in file_list:
    with open(file, encoding='utf-8') as f:
        lines = f.readlines()
    lines[-1] = lines[-1] + '\n'
    lines.extend(['\n'])
    lines = [line.strip() for line in lines]
    raw_lines.extend(lines)

all_sentences = []
beg, end = 0, 0
while end < len(raw_lines):
        if raw_lines[end]:
            end += 1
        else:
            sentence = raw_lines[beg:end]
            all_sentences.append(sentence)
            end += 1
            beg = end
print(len(all_sentences))

188239


In [3]:
washed_sentences = set()
no_entities_sentences = set()
for sentence in all_sentences:
    O_count = sum([1 if wt.endswith('\tO') else 0 for wt in sentence])
    if O_count != len(sentence):
        washed_sentences.add('\n'.join(sentence))
    else:
        no_entities_sentences.add('\n'.join(sentence))
print(len(washed_sentences))
print(len(no_entities_sentences))
all_sentences = list(washed_sentences)

154387
23482


In [4]:
coarse_classes = set()
fine_classes = set()
for k in range(len(all_sentences)):
    wts = all_sentences[k].split('\n')
    for i in range(len(wts)):
        word_tag = wts[i]
        tokens = word_tag.split('\t')
        if tokens[1] != 'O':
            if '/' in tokens[1]:
                tokens[1] = tokens[1].replace('/', ',')
                wts[i] = '\t'.join(tokens)
            fine_classes.add(tokens[1])
            coarse_classes.add(tokens[1].split('-')[0])
    all_sentences[k] = '\n'.join(wts)
print(f'coarse-grained classes: {coarse_classes}; \nfine-grained classes: {fine_classes}')

coarse_dict = {}
fine_dict = {}

coarse_sid_dict = {}
fine_sid_dict = {}

coarse_sentences = ['\n'.join([wt.replace(wt.split('\t')[1], wt.split('\t')[1].split('-')[0]) for wt in sentence.split('\n')]) for sentence in all_sentences]
print(coarse_sentences[0])

for coarse_class in coarse_classes:
    sentences = []
    sid_list = []
    out_sid = 0
    for sentence in coarse_sentences:
        wts = sentence.split('\n')
        for word_tag in wts:
            if word_tag.endswith(coarse_class):
                sentences.append(sentence)
                sid_list.append(out_sid)
                break
        out_sid += 1
    coarse_dict[coarse_class] = sentences
    coarse_sid_dict[coarse_class] = sid_list

for fine_class in fine_classes:
    sentences = []
    sid_list = []
    out_sid = 0
    for sentence in all_sentences:
        wts = sentence.split('\n')
        for word_tag in wts:
            if word_tag.endswith(fine_class):
                sentences.append(sentence)
                sid_list.append(out_sid)
                break
        out_sid += 1
    fine_dict[fine_class] = sentences
    fine_sid_dict[fine_class] = sid_list

coarse-grained classes: {'building', 'product', 'art', 'location', 'organization', 'other', 'person', 'event'}; 
fine-grained classes: {'person-scholar', 'location-bodiesofwater', 'product-airplane', 'building-library', 'person-politician', 'person-athlete', 'other-chemicalthing', 'product-other', 'other-biologything', 'organization-showorganization', 'art-film', 'organization-other', 'building-other', 'organization-politicalparty', 'event-election', 'building-hotel', 'location-other', 'art-music', 'other-livingthing', 'product-game', 'building-sportsfacility', 'other-law', 'event-sportsevent', 'event-protest', 'building-restaurant', 'art-painting', 'other-language', 'location-road,railway,highway,transit', 'product-weapon', 'product-car', 'other-educationaldegree', 'person-artist,author', 'person-other', 'person-soldier', 'other-disease', 'other-astronomything', 'other-god', 'person-director', 'product-software', 'other-currency', 'art-writtenart', 'building-theater', 'organization-co

In [5]:
# remove duplicated sentences among different classes
key_list = list(coarse_sid_dict.keys())
for i in range(len(key_list)):
    a_sids = set(coarse_sid_dict[key_list[i]])
    for j in range(i + 1, len(key_list)):
        b_sids = set(coarse_sid_dict[key_list[j]])
        c = a_sids.intersection(b_sids)
        m, n = len(a_sids), len(b_sids)
        for sid in c:
            removed_sentence = coarse_sentences[sid]
            if m > n:
                coarse_dict[key_list[i]].remove(removed_sentence)
                coarse_sid_dict[key_list[i]].remove(sid)
                a_sids.remove(sid)
                m -= 1
            else:
                coarse_dict[key_list[j]].remove(removed_sentence)
                coarse_sid_dict[key_list[j]].remove(sid)
                b_sids.remove(sid)
                n -= 1

total_sentence_num = 0

# remove duplicated sentences among different classes
key_list = list(fine_sid_dict.keys())
for i in range(len(key_list)):
    a_sids = set(fine_sid_dict[key_list[i]])
    for j in range(i + 1, len(key_list)):
        b_sids = set(fine_sid_dict[key_list[j]])
        c = a_sids.intersection(b_sids)
        m, n = len(a_sids), len(b_sids)
        for sid in c:
            removed_sentence = all_sentences[sid]
            if m > n:
                fine_dict[key_list[i]].remove(removed_sentence)
                fine_sid_dict[key_list[i]].remove(sid)
                a_sids.remove(sid)
                m -= 1
            else:
                fine_dict[key_list[j]].remove(removed_sentence)
                fine_sid_dict[key_list[j]].remove(sid)
                b_sids.remove(sid)
                n -= 1

coarse_tag_count, fine_tag_count = {}, {}
for c in coarse_classes:
    coarse_tag_count[c] = len(coarse_dict[c])
for c in fine_classes:
    fine_tag_count[c] = len(fine_dict[c])
print(coarse_tag_count)
print(fine_tag_count)

{'building': 14709, 'product': 15114, 'art': 12676, 'location': 29811, 'organization': 25248, 'other': 20353, 'person': 22352, 'event': 14124}
{'person-scholar': 2067, 'location-bodiesofwater': 2872, 'product-airplane': 2079, 'building-library': 1276, 'person-politician': 4417, 'person-athlete': 4470, 'other-chemicalthing': 2116, 'product-other': 3158, 'other-biologything': 2677, 'organization-showorganization': 1952, 'art-film': 1745, 'organization-other': 5981, 'building-other': 4482, 'organization-politicalparty': 2614, 'event-election': 719, 'building-hotel': 1079, 'location-other': 3560, 'art-music': 2099, 'other-livingthing': 1926, 'product-game': 1319, 'building-sportsfacility': 1580, 'other-law': 1782, 'event-sportsevent': 4065, 'event-protest': 675, 'building-restaurant': 770, 'art-painting': 213, 'other-language': 1983, 'location-road,railway,highway,transit': 3523, 'product-weapon': 1474, 'product-car': 1775, 'other-educationaldegree': 1175, 'person-artist,author': 3172, 'pe

In [None]:
corse_distribution = {}
for c in coarse_dict:
    distribution = {}
    for s in coarse_dict[c]:
        t_set = set()
        for w_t in s.split('\n'):
            t = w_t.split('\t')[1] 
            if t != 'O':
                t_set.add(t.split('-')[0])
        for t in t_set:
            distribution[t] = distribution.get(t, 0) + 1
    corse_distribution[c] = distribution
with open('./continual/coarse/distribution.json', encoding='utf-8', mode='w') as f:
    json.dump(corse_distribution, f)

fine_distribution = {}
for c in fine_dict:
    distribution = {}
    for s in fine_dict[c]:
        t_set = set()
        for w_t in s.split('\n'):
            t = w_t.split('\t')[1] 
            #\ .split('-')[0]
            if t != 'O':
                t_set.add(t)
        for t in t_set:
            distribution[t] = distribution.get(t, 0) + 1
    fine_distribution[c] = distribution

with open('./continual/fine/distribution.json', encoding='utf-8', mode='w') as f:
    json.dump(fine_distribution, f)

In [6]:
del washed_sentences

In [7]:
import random

In [8]:
no_coarse_dict, no_fine_dict = {}, {}
for coarse in coarse_classes:
    sample_ids = coarse_sid_dict[coarse]
    no_samples = []
    for sample_id in sample_ids:
        sample = coarse_sentences[sample_id]
        wts = sample.split('\n')
        for i in range(len(wts)):
            word_tag = wts[i].split('\t')
            if word_tag[1] != 'O' and word_tag[1] != coarse:
                word_tag[1] = 'O'
                wts[i] = '\t'.join(word_tag)
        no_samples.append('\n'.join(wts))
    no_coarse_dict[coarse] = no_samples
for fine in fine_classes:
    sample_ids = fine_sid_dict[fine]
    no_samples = []
    for sample_id in sample_ids:
        sample = all_sentences[sample_id]
        wts = sample.split('\n')
        for i in range(len(wts)):
            word_tag = wts[i].split('\t')
            if word_tag[1] != 'O' and word_tag[1] != fine:
                word_tag[1] = 'O'
                wts[i] = '\t'.join(word_tag)
        no_samples.append('\n'.join(wts))
    no_fine_dict[fine] = no_samples

In [12]:
no_coarse_train_samples, no_coarse_valid_samples, no_coarse_test_samples = {}, {}, {}
no_coarse_train_indices, no_coarse_valid_indices, no_coarse_test_indices = {}, {}, {}
no_fine_train_samples, no_fine_valid_samples, no_fine_test_samples = {}, {}, {}
no_fine_train_indices, no_fine_valid_indices, no_fine_test_indices = {}, {}, {}

for c in coarse_classes:
    sample_ids = coarse_sid_dict[c]
    train_sample_id_indices, valid_sample_id_indices = set(), set()
    train_count, valid_count = 0, 0
    while train_count < 0.7 * coarse_tag_count[c]:
        sample_id_idx = random.choice(range(len(sample_ids)))
        if sample_id_idx not in train_sample_id_indices:
            train_sample_id_indices.add(sample_id_idx)
            train_count += 1
    while valid_count < 0.1 * coarse_tag_count[c]:
        sample_id_idx = random.choice(range(len(sample_ids)))
        if sample_id_idx not in train_sample_id_indices and sample_id_idx not in valid_sample_id_indices:
            valid_sample_id_indices.add(sample_id_idx)
            valid_count += 1
    test_sample_id_indices = set(range(len(sample_ids))).difference(train_sample_id_indices.union(valid_sample_id_indices))
    no_coarse_train_samples[c] = [no_coarse_dict[c][sample_id_idx] for sample_id_idx in train_sample_id_indices]
    no_coarse_valid_samples[c] = [no_coarse_dict[c][sample_id_idx] for sample_id_idx in valid_sample_id_indices]
    no_coarse_test_samples[c] = [no_coarse_dict[c][sample_id_idx] for sample_id_idx in test_sample_id_indices]
    no_coarse_train_indices[c] = train_sample_id_indices
    no_coarse_valid_indices[c] = valid_sample_id_indices
    no_coarse_test_indices[c] = test_sample_id_indices
for c in coarse_tag_count:
    print(f'{c} : train : {len(no_coarse_train_samples[c])}; valid: {len(no_coarse_valid_samples[c])}; test : {len(no_coarse_test_samples[c])}; total : {coarse_tag_count[c]}; ratio : {len(no_coarse_train_samples[c]) / coarse_tag_count[c]}')

    
print('================================================')

    
for c in fine_classes:
    sample_ids = fine_sid_dict[c]
    train_sample_id_indices, valid_sample_id_indices = set(), set()
    train_count, valid_count = 0, 0
    while train_count < 0.7 * fine_tag_count[c]:
        sample_id_idx = random.choice(range(len(sample_ids)))
        if sample_id_idx not in train_sample_id_indices:
            train_sample_id_indices.add(sample_id_idx)
            train_count += 1
    while valid_count < 0.1 * fine_tag_count[c]:
        sample_id_idx = random.choice(range(len(sample_ids)))
        if sample_id_idx not in train_sample_id_indices and sample_id_idx not in valid_sample_id_indices:
            valid_sample_id_indices.add(sample_id_idx)
            valid_count += 1
    test_sample_id_indices = set(range(len(sample_ids))).difference(train_sample_id_indices.union(valid_sample_id_indices))
    no_fine_train_samples[c] = [no_fine_dict[c][sample_id_idx] for sample_id_idx in train_sample_id_indices]
    no_fine_valid_samples[c] = [no_fine_dict[c][sample_id_idx] for sample_id_idx in valid_sample_id_indices]
    no_fine_test_samples[c] = [no_fine_dict[c][sample_id_idx] for sample_id_idx in test_sample_id_indices]
    no_fine_train_indices[c] = train_sample_id_indices
    no_fine_valid_indices[c] = valid_sample_id_indices
    no_fine_test_indices[c] = test_sample_id_indices
for c in fine_tag_count:
    print(f'{c} : train : {len(no_fine_train_samples[c])}; valid: {len(no_fine_valid_samples[c])}; test : {len(no_fine_test_samples[c])}; total : {fine_tag_count[c]}; ratio : {len(no_fine_train_samples[c]) / fine_tag_count[c]}')

building : train : 10297; valid: 1471; test : 2941; total : 14709; ratio : 0.7000475899109389
product : train : 10580; valid: 1512; test : 3022; total : 15114; ratio : 0.7000132327643245
art : train : 8874; valid: 1268; test : 2534; total : 12676; ratio : 0.7000631113916062
location : train : 20868; valid: 2982; test : 5961; total : 29811; ratio : 0.7000100633994163
organization : train : 17674; valid: 2525; test : 5049; total : 25248; ratio : 0.7000158428390367
other : train : 14248; valid: 2036; test : 4069; total : 20353; ratio : 0.7000442195253771
person : train : 15647; valid: 2236; test : 4469; total : 22352; ratio : 0.7000268432355047
event : train : 9887; valid: 1413; test : 2824; total : 14124; ratio : 0.7000141602945341
person-scholar : train : 1447; valid: 207; test : 413; total : 2067; ratio : 0.7000483792936623
location-bodiesofwater : train : 2011; valid: 288; test : 573; total : 2872; ratio : 0.700208913649025
product-airplane : train : 1456; valid: 208; test : 415; tota

In [13]:
coarse_train_samples, coarse_test_samples = {}, {}
fine_train_samples, fine_test_samples = {}, {}
for c in coarse_classes:
    train_samples = [coarse_dict[c][i] for i in no_coarse_train_indices[c]]
    coarse_train_samples[c] = train_samples
for c in coarse_tag_count:
    print(f'{c} : train : {len(coarse_train_samples[c])}; total : {coarse_tag_count[c]}; ratio : {len(coarse_train_samples[c]) / coarse_tag_count[c]}')
# remained_samples = set()
for c in fine_classes:
    train_samples = [fine_dict[c][i] for i in no_fine_train_indices[c]]
    fine_train_samples[c] = train_samples
#     remained_samples = remained_samples.union(set([fine_dict[c][i] for i in no_fine_test_indices[c]]))
for c in fine_tag_count:
    print(f'{c} : train : {len(fine_train_samples[c])}; total : {fine_tag_count[c]}; ratio : {len(fine_train_samples[c]) / fine_tag_count[c]}')

# for c in fine_tag_count:
#     train_samples = fine_train_samples[c]
#     train_dist = {}
#     for train_sample in train_samples:
#         tags = set(wt .split('\t')[1] for wt in train_sample.split('\n'))
#         tags.remove('O')
#         for tag in tags:
#             train_dist[tag] = train_dist.get(tag, 0) + 1
#     test_samples, test_tag_count = set(), {}
#     for train_tag in train_dist:
#         visited = set()
#         while test_tag_count.get(train_tag, 0) < (0.25 * train_dist[train_tag] if train_dist[train_tag] > 4 else 1):
#             test_sample = random.choice(list(remained_samples))
#             visited.add(test_sample)
#             if visited == remained_samples:
#                 print('No satisfied sample. Exit search.')
#                 break
#             if test_sample not in test_samples and test_sample not in train_samples:
#                 satisfied = True
#                 tags = set([wt.split('\t')[1] for wt in test_sample.split('\n')])
#                 tags.remove('O')
#                 if tags.issubset(train_dist.keys()):
#                     for t in tags:
#                         if test_tag_count.get(t, 0) + 1 > (0.25 * train_dist[t] if train_dist[t] > 4 else 1):
#                             satisfied = False
#                             break
#                 else:
#                     satisfied = False
#                 if satisfied:
#                     test_samples.add(test_sample)
#                     remained_samples.remove(test_sample)
#                     visited.remove(test_sample)
#                     test_tag_count[train_tag] = test_tag_count.get(train_tag, 0) + 1
#                     for t in tags:
#                         if t != 'O' and t != train_tag:
#                             test_tag_count[t] = test_tag_count.get(t, 0) + 1
#     fine_test_samples[c] = test_samples
#     print(f'{c} test samples found!')
# for c in tag_count:
#     print(f'{c} : test : {len(fine_test_samples[c])}; total : {fine_tag_count[c]}; ratio : {len(fine_test_samples[c]) / fine_tag_count[c]}')



building : train : 10297; total : 14709; ratio : 0.7000475899109389
product : train : 10580; total : 15114; ratio : 0.7000132327643245
art : train : 8874; total : 12676; ratio : 0.7000631113916062
location : train : 20868; total : 29811; ratio : 0.7000100633994163
organization : train : 17674; total : 25248; ratio : 0.7000158428390367
other : train : 14248; total : 20353; ratio : 0.7000442195253771
person : train : 15647; total : 22352; ratio : 0.7000268432355047
event : train : 9887; total : 14124; ratio : 0.7000141602945341
person-scholar : train : 1447; total : 2067; ratio : 0.7000483792936623
location-bodiesofwater : train : 2011; total : 2872; ratio : 0.700208913649025
product-airplane : train : 1456; total : 2079; ratio : 0.7003367003367004
building-library : train : 894; total : 1276; ratio : 0.700626959247649
person-politician : train : 3092; total : 4417; ratio : 0.7000226398007697
person-athlete : train : 3129; total : 4470; ratio : 0.7
other-chemicalthing : train : 1482; tot

In [14]:
def save_file(parent_dir, samples_dict, mode):
    for fname in samples_dict:
        path = os.path.join(parent_dir, fname)
        if not os.path.exists(path):
            os.makedirs(path)
        with open(os.path.join(path, mode+'.txt'), 'w', encoding='utf-8') as f:
            f.writelines('\n\n'.join(samples_dict[fname]))
save_file('./continual/coarse/non-overlapping', no_coarse_train_samples, mode='train')
save_file('./continual/coarse/non-overlapping', no_coarse_valid_samples, mode='dev')
save_file('./continual/coarse/non-overlapping', no_coarse_test_samples, mode='test')
save_file('./continual/fine/non-overlapping', no_fine_train_samples, mode='train')
save_file('./continual/fine/non-overlapping', no_fine_valid_samples, mode='dev')
save_file('./continual/fine/non-overlapping', no_fine_test_samples, mode='test')
save_file('./continual/fine/overlapping', fine_train_samples, mode='train')
save_file('./continual/coarse/overlapping', coarse_train_samples, mode='train')

# save_file('./continual/fine/overlapping', fine_test_samples, mode='test')