In [8]:
import argparse
from transformers import BertTokenizer
from collections import namedtuple
import random
import torch
import numpy as np
import re
import nltk
from neural_jacana.model import *

In [9]:
def preprocess_texts(texts):
    '''
    Tokenize each text after stripping parentheses, backticks and quotes.

    The characters ( ) ` ' " are deleted from the raw string first, then the
    result is passed to nltk.word_tokenize.

    Args:
        texts: iterable of raw sentence strings.

    Returns:
        A list holding one token list per input text, in input order.
    '''
    return [
        nltk.word_tokenize(re.sub(r'[\(\)\`\'\"]', '', raw_text))
        for raw_text in texts
    ]

def get_unique_list(seq):
    '''
    Return the items of seq with repeated items removed, keeping first-seen order.

    Membership is checked against a list (equality based), so unhashable
    items such as lists are supported.
    '''
    unique_items = []
    for item in seq:
        if item not in unique_items:
            unique_items.append(item)
    return unique_items

def check_inclusion(list1, list2):
    '''
    Return True when list1 and list2 share at least one element.

    Sharing an element is symmetric, so a single pass over list1 is enough;
    the scan stops at the first common element. Empty inputs yield False.
    '''
    return any(item in list2 for item in list1)

def merge_sent2_ids(align_id_pairs):
    '''
    merge sent2 ids aligned to sent1 ids.

    Args:
        align_id_pairs: iterable of strings such as '3-4'; the first two
            integers found in each string are read as (sent1 id, sent2 id).

    Returns:
        A list of ([sent1_id], [sent2_id, ...]) tuples, ordered by sent1 id,
        with each sent2 list in ascending order.

    Raises:
        IndexError: if a string contains fewer than two integers.
    '''
    # Parse every pair once instead of re-running the regex in the sort key
    # and again when building the tuples.
    parsed_pairs = []
    for pair in align_id_pairs:
        numbers = re.findall(r"\d+", pair)
        parsed_pairs.append((int(numbers[0]), int(numbers[1])))
    parsed_pairs.sort()

    # dicts keep insertion order, so grouping the sorted pairs keeps the
    # sent1 ids ascending without rescanning the result list for each pair.
    sent2_by_sent1 = {}
    for sent1_id, sent2_id in parsed_pairs:
        sent2_by_sent1.setdefault(sent1_id, []).append(sent2_id)
    return [([sent1_id], sent2_ids) for sent1_id, sent2_ids in sent2_by_sent1.items()]

def merge_sent1_ids(merged_sent2_align_id_pairs):
    '''
    merge sent1 ids having the same sent2 ids.

    Args:
        merged_sent2_align_id_pairs: list of ([sent1_id], [sent2_id, ...])
            tuples, e.g. the output of merge_sent2_ids. Only the first sent1
            id of each pair is used when pairs are merged. The sent2 ids must
            be hashable (ints in this notebook).

    Returns:
        The input list itself when no two pairs share an identical sent2 id
        list. Otherwise a new list: the untouched pairs in their original
        order, followed by one ([sent1_id, ...], [sent2_id, ...]) tuple per
        group of pairs whose sent2 id lists are equal (groups ordered by the
        position of their first member).
    '''
    # Group pair positions by their sent2 id list in a single pass.
    positions_by_sent2 = {}
    for position, pair in enumerate(merged_sent2_align_id_pairs):
        positions_by_sent2.setdefault(tuple(pair[1]), []).append(position)

    duplicate_groups = [group for group in positions_by_sent2.values() if len(group) > 1]
    if not duplicate_groups:
        return merged_sent2_align_id_pairs

    merged_positions = {position for group in duplicate_groups for position in group}
    merged_sent1_align_id_pairs = [
        pair
        for position, pair in enumerate(merged_sent2_align_id_pairs)
        if position not in merged_positions
    ]
    for group in duplicate_groups:
        sent1_ids = [merged_sent2_align_id_pairs[position][0][0] for position in group]
        # The sent2 list of the first member represents the whole group.
        merged_sent1_align_id_pairs.append((sent1_ids, merged_sent2_align_id_pairs[group[0]][1]))
    return merged_sent1_align_id_pairs

def merge_align_ids_crossing(merged_ids):
    '''
    Merge alignment pairs that overlap on the sent1 side or the sent2 side.

    merged_ids is a list of (sent1_id_list, sent2_id_list) tuples. The result
    is a new list of such tuples: groups built by the second pass come first,
    followed by every pair that was not absorbed by either pass.

    NOTE(review): the id lists inside merged_ids are aliased below and
    extended in place, so the caller's lists are mutated by this function.
    '''
    # These hold references to the caller's lists, not copies.
    sent1_ids = [pair[0] for pair in merged_ids]
    sent2_ids = [pair[1] for pair in merged_ids]
    res = []
    # Pass 1: look for later pairs that share a sent1 id with pair i.
    added_sent1 = [0 for i in range(len(sent1_ids))]
    for i in range(len(sent1_ids)):
        sent2_ids_to_add = sent2_ids[i]
        sent1_correspond = sent1_ids[i]
        for j in range(i, len(sent1_ids)):
            if check_inclusion(sent1_ids[i], sent1_ids[j]) == True:
                if added_sent1[j] == 0 and i != j:
                    added_sent1[j] = 1
                    sent2_ids_to_add.extend(sent2_ids[j])
                    # The first extend mutates sent1_ids[i]; afterwards the name is
                    # rebound to a fresh de-duplicated list.
                    sent1_correspond.extend(sent1_ids[j])
                    sent1_correspond = get_unique_list(sent1_correspond)
        # NOTE(review): `added_sent1 == 0` compares the whole list with an int and is
        # always False, so this branch never runs and pass 1 never appends to res.
        # `added_sent1[i] == 0` was probably intended, but that change would also emit
        # every pair having more than one sent2 id here - confirm intent before fixing.
        if len(sent2_ids_to_add) > 1 and added_sent1 == 0:
            res.append((sent1_correspond, sent2_ids_to_add))
            added_sent1[i] = 1

    # Pass 2: same idea on the sent2 side; pairs sharing a sent2 id are merged into pair i.
    added_sent2 = [0 for i in range(len(sent2_ids))]
    for i in range(len(sent2_ids)):
        sent1_ids_to_add = sent1_ids[i]
        sent2_correspond = sent2_ids[i]
        for j in range(i, len(sent2_ids)):
            if check_inclusion(sent2_ids[i], sent2_ids[j]) == True:
                if added_sent2[j] == 0 and i != j:
                    added_sent2[j] = 1
                    sent1_ids_to_add.extend(sent1_ids[j])
                    sent2_correspond.extend(sent2_ids[j])
                    sent2_correspond = get_unique_list(sent2_correspond)
        # Emit pair i when it has (or has collected) more than one sent1 id and was not
        # itself absorbed by an earlier pair.
        if len(sent1_ids_to_add) > 1 and added_sent2[i] == 0:
            res.append((sent1_ids_to_add, sent2_correspond))
            added_sent2[i] = 1

    # Finally keep every pair that neither pass flagged.
    for i in range(len(merged_ids)):
        if added_sent1[i] == 0 and added_sent2[i] == 0:
            res.append((sent1_ids[i], sent2_ids[i]))
    return res

def ids_to_words(merged_id_pairs, tokenized_sent1, tokenized_sent2):
    '''
    Turn aligned id pairs into aligned word pairs.

    Args:
        merged_id_pairs: iterable of (sent1_ids, sent2_ids) pairs.
        tokenized_sent1: token list indexed by the sent1 ids.
        tokenized_sent2: token list indexed by the sent2 ids.

    Returns:
        A list of (sent1_words, sent2_words) tuples, one per input pair.
    '''
    def pick(tokens, ids):
        return [tokens[token_id] for token_id in ids]

    return [
        (pick(tokenized_sent1, id_pair[0]), pick(tokenized_sent2, id_pair[1]))
        for id_pair in merged_id_pairs
    ]

In [211]:
# Hand-written alignment strings ("sent1_id-sent2_id") for trying out the merge helpers.
# NOTE: the first assignment is immediately overwritten; only the second list is used below.
a = ['0-0', '1-0', '2-0', '1-1', '2-2', '3-3']
a = ['0-0', '1-0', '1-1', '2-0', '2-2', '3-3', '3-4', '3-5', '4-4', '5-5', '6-6', '7-7']

In [212]:
merge_sent2_ids(a)

[([0], [0]),
 ([1], [0, 1]),
 ([2], [0, 2]),
 ([3], [3, 4, 5]),
 ([4], [4]),
 ([5], [5]),
 ([6], [6]),
 ([7], [7])]

In [215]:
merge_align_ids_crossing(merge_sent1_ids(merge_sent2_ids(a)))

[([0, 1, 2], [0, 1, 2]), ([3, 4, 5], [3, 4, 5]), ([6], [6]), ([7], [7])]

In [31]:
# Build the hyper-parameter namespace that is handed to NeuralWordAligner and create_Data_Loader.
parser = argparse.ArgumentParser()
parser.add_argument("--batchsize", default=1, type=int)
parser.add_argument("--learning_rate", default=1e-5, type=float)
parser.add_argument("--max_epoch", default=6, type=int)
parser.add_argument("--max_span_size", default=4, type=int)
parser.add_argument("--max_seq_length", default=128, type=int)
parser.add_argument("--max_sent_length", default=70, type=int)
parser.add_argument("--seed", default=1234, type=int)
parser.add_argument("--dataset", default='mtref', type=str)
# NOTE: the two flags below are strings ('True' / 'False'), not booleans.
parser.add_argument("--sure_and_possible", default='True', type=str)
parser.add_argument("--distance_embedding_size", default=128, type=int)
parser.add_argument("--use_transition_layer", default='False', type=str, help='if False, will set transition score to 0.')
# An empty argument list is parsed, so every option takes its default (sys.argv is not read).
args = parser.parse_args(args=[])

# Seed the Python, NumPy and PyTorch random number generators.
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
# NeuralWordAligner is presumably provided by `from neural_jacana.model import *` - verify.
model = NeuralWordAligner(args)
my_device = torch.device('cpu')
model = model.to(my_device)

# NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted source.
# The path is relative to the notebook's working directory.
checkpoint = torch.load('./neural_jacana/Checkpoint_sure_and_possible_True_dataset_mtref_batchsize_1_max_span_size_4_use_transition_layer_False_epoch_2_0.9150.pt', map_location=my_device)
model.load_state_dict(checkpoint['model_state_dict'])
# Switch the model to evaluation mode before running inference.
model.eval()

#sources = ["Military experts say the line between combat is getting blurry.", "Their eyes are quite small, and their visual acuity is poor.", 
#            "According to Ledford, Northrop executives said they would build substantial parts of the bomber in Palmdale, creating about 1,500 jobs.",
#            "In return, Rollo swore fealty to Charles, converted to Christianity, and undertook to defend the northern region of France against the Incursions of other Viking groups.",
#            "A fee is the price one pays as remuneration for services, especially the Honorarium paid to a doctor, lawyer, consultant, or other member of a learned profession.",
#            "Thereafter the county's administration was conducted at Duns or Lauder until Greenlaw became the county town in 1596.",
#            "MacGruber starts asking for simple objects to make something to defuse the bomb, but he is later distracted by something (usually involving his personal life) that makes him run out of time.",
#            "As the largest sub-region in Mesoamerica, it encompassed a vast and varied landscape, from the mountainous regions of the Sierra Madre to the semi-arid plains of northern Yucatán.",
#            "Together they formed New Music Manchester, a group committed to contemporary music."]
#targets = ["Military experts say war is changing.", "Their eyes are very small, and they do not see well.",
#            "According to Ledford, Northrop said they would build most of the bomber parts in Palmdale. It would create 1,500 jobs.",
#            "Rollo swore to be loyal to Charles, then he changed his religion to Christianity. Rollo protected northern France by fighting Viking invaders.",
#            "A price one might pay for services is a called a fee.",
#            "After that, the county offices were at Duns or Lauder. In 1596 they moved to Greenlaw.",
#            "Macgruber starts by asking for simple objects to stop the bomb from working. later he is distracted by an event from his personal life. as a result, he runs out of time to stop the bomb.",
#            "As the largest sub-region in Mesoamerica, it was a vast and varied landscape.",
#            "Both of the formed a group committed to contemporary music called new music Manchester."]
#sources = ["As the largest sub-region in Mesoamerica, it encompassed a vast and varied landscape, from the mountainous regions of the Sierra Madre to the semi-arid plains of northern Yucatán."]
#targets = ["As the largest sub-region in Mesoamerica, it was a vast and varied landscape."]
#sources = ['Together they formed New Music Manchester, a group committed to contemporary music.']
#targets = ['Both of the formed a group committed to contemporary music called new music Manchester.']
sources = ['Characteristics Radar observations indicate a fairly pure iron-nickel composition.']
targets = ['A mainly pure Iron-Nickel composition was observed by radar.']
#sources = ["MacGruber starts asking for simple objects to make something to defuse the bomb, but he is later distracted by something (usually involving his personal life) that makes him run out of time."]
#targets = ["Macgruber starts by asking for simple objects to stop the bomb from working. later he is distracted by an event from his personal life. as a result, he runs out of time to stop the bomb."]
# Fetch the tokenizer data used by nltk.word_tokenize (no-op when already installed).
nltk.download('punkt')
tokenized_sources = preprocess_texts(sources)
tokenized_targets = preprocess_texts(targets)

# Wrap every (source, target) pair in the record type passed to create_Data_Loader.
data = []
example = namedtuple('example', 'ID, text_a, text_b, label')
for i, (tokenized_source, tokenized_target) in enumerate(zip(tokenized_sources, tokenized_targets)):
    # '0-0' is presumably a placeholder label, since no gold alignment exists here - TODO confirm.
    data.append(example(i, ' '.join(tokenized_source), ' '.join(tokenized_target), '0-0'))
# NOTE(review): batchsize and max_seq_length are hard-coded here and duplicate the args defaults.
test_dataloader = create_Data_Loader(data_examples=data, args=args, set_type='test', batchsize=1, max_seq_length=128, tokenizer=tokenizer)

for step, batch in enumerate(test_dataloader):
    batch = tuple(t.to(my_device) for t in batch)
    input_ids_a_and_b, input_ids_b_and_a, input_mask, segment_ids_a_and_b, segment_ids_b_and_a, sent1_valid_ids, sent2_valid_ids, sent1_wordpiece_length, sent2_wordpiece_length = batch
    # Inference only: no gradients are tracked.
    with torch.no_grad():
        decoded_results = model(input_ids_a_and_b=input_ids_a_and_b, input_ids_b_and_a=input_ids_b_and_a,
                                    attention_mask=input_mask, token_type_ids_a_and_b=segment_ids_a_and_b,
                                    token_type_ids_b_and_a=segment_ids_b_and_a,
                                    sent1_valid_ids=sent1_valid_ids, sent2_valid_ids=sent2_valid_ids,
                                    sent1_wordpiece_length=sent1_wordpiece_length,
                                    sent2_wordpiece_length=sent2_wordpiece_length)
    # Only the first element of the decoded output is used; presumably the alignment
    # strings ("i-j") for the single sentence pair in this batch - TODO confirm.
    align_id_pairs = list(decoded_results[0])
    # Post-process: group by sent1 id, merge identical sent2 groups, merge crossing pairs.
    merged_sent2_align_id_pairs = merge_sent2_ids(align_id_pairs)
    merged_sent1_align_id_pairs = merge_sent1_ids(merged_sent2_align_id_pairs)
    merged_id_pairs = merge_align_ids_crossing(merged_sent1_align_id_pairs)
    # NOTE(review): indexing the token lists by `step` assumes one example per batch
    # (batchsize=1) and that the loader keeps the input order - verify before changing batchsize.
    align_word_pairs = ids_to_words(merged_id_pairs, tokenized_sources[step], tokenized_targets[step])
    print(merged_id_pairs, align_word_pairs)

Some weights of BertModel were not initialized from the model checkpoint at neural_jacana/spanbert_hf_base and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/yamanaka.h.ac/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Processed test examples 0/1
[([1, 2], [8]), ([3], [6, 7]), ([4], [0]), ([5], [1]), ([6], [2]), ([7], [3]), ([8], [4]), ([9], [9])] [(['Radar', 'observations'], ['radar']), (['indicate'], ['observed', 'by']), (['a'], ['A']), (['fairly'], ['mainly']), (['pure'], ['pure']), (['iron-nickel'], ['Iron-Nickel']), (['composition'], ['composition']), (['.'], ['.'])]


In [6]:
def edit_distance(sent1, sent2, max_id=4999):
    '''
    Build the insert/delete edit-distance table between two token lists.

    Tokens are compared case-insensitively. Only insertions and deletions are
    counted; there is no substitution operation, so a mismatching pair costs
    one deletion plus one insertion.

    Args:
        sent1: source token list (strings).
        sent2: target token list (strings).
        max_id: unused; kept so existing callers keep working.

    Returns:
        A (len(sent1)+1) x (len(sent2)+1) list of lists of ints where
        dp[i][j] is the number of edits turning sent1[:i] into sent2[:j].
    '''
    m = len(sent1)
    n = len(sent2)
    dp = [[0] * (n + 1) for _ in range(m + 1)]
    # Turning an empty prefix into sent2[:j] takes j insertions.
    for j in range(n + 1):
        dp[0][j] = j
    for i in range(1, m + 1):
        # Turning sent1[:i] into an empty prefix takes i deletions.
        dp[i][0] = i
        for j in range(1, n + 1):
            if sent1[i-1].lower() == sent2[j-1].lower():
                dp[i][j] = dp[i-1][j-1]
            else:
                # Plain min() keeps the table made of Python ints and avoids
                # allocating a numpy array for every cell.
                dp[i][j] = 1 + min(dp[i][j-1],  # insert
                                   dp[i-1][j])  # remove
    return dp

def sent2edit(sent1, sent2):
    '''
    Derive the edit sequence that turns sent1 into sent2.

    The table from edit_distance is walked backwards from its bottom-right
    corner. Each step emits one label:
      * 'KEEP' - the current sent1 token equals the sent2 token (case-insensitive);
      * 'DEL'  - the current sent1 token is removed;
      * the sent2 token itself - that token is inserted.

    NOTE: a sent2 token that is literally 'KEEP' or 'DEL' cannot be told
    apart from the corresponding label in the result.

    Args:
        sent1: source token list (strings).
        sent2: target token list (strings).

    Returns:
        The list of edit labels in left-to-right order.
    '''
    dp = edit_distance(sent1, sent2)
    edits = []
    m, n = len(sent1), len(sent2)
    while m > 0 or n > 0:
        if m == 0:
            # sent1 is exhausted: the rest of sent2 can only be inserted.
            # (The old code read dp[1][...] here, which raised IndexError for an empty sent1.)
            edits.append(sent2[n-1])
            n -= 1
        elif n == 0:
            # sent2 is exhausted: the rest of sent1 can only be deleted.
            edits.append('DEL')
            m -= 1
        elif sent2[n-1].lower() == sent1[m-1].lower():
            edits.append('KEEP')
            m -= 1
            n -= 1
        elif dp[m][n] == dp[m-1][n] + 1:
            # When a deletion explains the current cost it is chosen over an insertion.
            edits.append('DEL')
            m -= 1
        else:
            edits.append(sent2[n-1])
            n -= 1
    # The walk runs from the end of both sentences, so reverse it.
    edits.reverse()
    return edits


def edit2sent(sent, edits, last=False):
    '''
    Apply an edit sequence (as produced by sent2edit) to a token list.

    'KEEP' copies the next token of sent, 'DEL' skips it, and any other label
    is inserted as a new token. KEEP/DEL labels arriving after sent has been
    consumed are ignored. Insertions are always applied, including those
    after the last source token and those for an empty sent; the old code
    dropped them, so edit2sent(s1, sent2edit(s1, s2)) could not reproduce s2.
    Source tokens not reached by the edits are appended unchanged.

    Args:
        sent: source token list.
        edits: list of edit labels.
        last: unused; kept so existing callers keep working.

    Returns:
        A new list with the edited tokens.
    '''
    new_sent = []
    sent_pointer = 0  # counts consumed source tokens (KEEP + DEL)
    for edit in edits:
        if edit == "KEEP":
            if sent_pointer < len(sent):
                new_sent.append(sent[sent_pointer])
                sent_pointer += 1
        elif edit == "DEL":
            if sent_pointer < len(sent):
                sent_pointer += 1
        else:  # insert the word
            new_sent.append(edit)
    # Keep whatever part of the source the edits did not cover.
    new_sent.extend(sent[sent_pointer:])
    return new_sent

In [10]:
#sent1 = "Military experts say the line between combat is getting blurry."
#sent2 = "Military experts say war is changing."
#sent1 = "According to Ledford, Northrop executives said they would build substantial parts of the bomber in Palmdale, creating about 1,500 jobs."
#sent2 = "According to Ledford, Northrop said they would build most of the bomber parts in Palmdale. It would create 1,500 jobs."
#sent1 = "Their eyes are quite small, and their visual acuity is poor."
#sent2 = "Their eyes are very small, and they do not see well."
sent1 = "In return, Rollo swore fealty to Charles, converted to Christianity, and undertook to defend the northern region of France against the Incursions of other Viking groups."
sent2 = "Rollo swore to be loyal to Charles, then he changed his religion to Christianity. Rollo protected northern France by fighting Viking invaders."
sent1 = "A fee is the price one pays as remuneration for services, especially the Honorarium paid to a doctor, lawyer, consultant, or other member of a learned profession."
sent2 = "A price one might pay for services is a called a fee."
sent1 = "Thereafter the county's administration was conducted at Duns or Lauder until Greenlaw became the county town in 1596."
sent2 = "After that, the county offices were at Duns or Lauder. In 1596 they moved to Greenlaw."
sent1 = "MacGruber starts asking for simple objects to make something to defuse the bomb, but he is later distracted by something (usually involving his personal life) that makes him run out of time."
sent2 = "Macgruber starts by asking for simple objects to stop the bomb from working. later he is distracted by an event from his personal life. as a result, he runs out of time to stop the bomb."
sent1 = "As the largest sub-region in Mesoamerica, it encompassed a vast and varied landscape, from the mountainous regions of the Sierra Madre to the semi-arid plains of northern Yucatán."
sent2 = "As the largest sub-region in Mesoamerica, it was a vast and varied landscape."
sent1 = 'The tongue is sticky because of the presence of glycoprotein-rich mucous, which both lubricates movement in and out of the snout and helps to catch ants and termites, which adhere to it.'
sent2 = 'The sticky tongue helps to catch bugs.'
sent1 = 'The polymer is most often epoxy, but other polymers, such as polyester, vinyl Ester or nylon, are also sometimes used.'
sent2 = 'The most popular polymer to use is epoxy.'
sent1 = 'Together they formed New Music Manchester, a group committed to contemporary music.'
sent2 = 'Both of the formed a group committed to contemporary music called new music Manchester.'
sent1_tok = nltk.word_tokenize(re.sub(r'[\(\)\`\'\"]', '', sent1))
sent2_tok = nltk.word_tokenize(re.sub(r'[\(\)\`\'\"]', '', sent2))

A = edit_distance(sent1_tok, sent2_tok, max_id=4999)

In [32]:
sent1 = "The International fight League was an American mixed martial arts( Mma) promotion billed as the world's first Mma League."
sent2 = "The International fight League was billed as the world's first mixed martial arts (Mma) League."

In [23]:
sent1 = "Aside from this, Cameron has often worked in Christian-Themed productions, among them the Post-Rapture films left behind: the movie, left behind II: tribulation force, and left behind: world at war, in which he plays Cameron `` Buck'' Williams."
sent2 = 'Cameron has often worked in Christian-Themed productions, among them are left behind: the movie, left behind II: tribulation force, and left behind: world at war, in which he plays Cameron "Buck" Williams.'

In [5]:
sent1 = 'Characteristics Radar observations indicate a fairly pure iron-nickel composition.'
sent2 = 'A mainly pure Iron-Nickel composition was observed by radar.'

In [33]:
sent1_tok = nltk.word_tokenize(re.sub(r'[\(\)\`\'\"]', '', sent1))
sent2_tok = nltk.word_tokenize(re.sub(r'[\(\)\`\'\"]', '', sent2))

In [36]:
B = sent2edit(sent1_tok, sent2_tok)
print(B)
print(edit2sent(sent1_tok,B))

['KEEP', 'KEEP', 'by', 'KEEP', 'KEEP', 'KEEP', 'KEEP', 'KEEP', 'stop', 'DEL', 'DEL', 'DEL', 'DEL', 'KEEP', 'KEEP', 'from', 'working', '.', 'later', 'DEL', 'DEL', 'KEEP', 'KEEP', 'DEL', 'KEEP', 'KEEP', 'an', 'event', 'from', 'DEL', 'DEL', 'DEL', 'DEL', 'KEEP', 'KEEP', 'KEEP', '.', 'as', 'a', 'result', ',', 'he', 'runs', 'DEL', 'DEL', 'DEL', 'DEL', 'DEL', 'KEEP', 'KEEP', 'KEEP', 'to', 'stop', 'the', 'bomb', 'KEEP']
['MacGruber', 'starts', 'by', 'asking', 'for', 'simple', 'objects', 'to', 'stop', 'the', 'bomb', 'from', 'working', '.', 'later', 'he', 'is', 'distracted', 'by', 'an', 'event', 'from', 'his', 'personal', 'life', '.', 'as', 'a', 'result', ',', 'he', 'runs', 'out', 'of', 'time', 'to', 'stop', 'the', 'bomb', '.']


In [3]:
def extract_ad_spans(edits):
    '''
    Find "add then delete" spans in an edit sequence.

    A span is a run of one or more inserted tokens (labels other than 'KEEP'
    and 'DEL') immediately followed by a run of one or more 'DEL' labels.

    Returns:
        A list of (start, end) positions in edits, end inclusive, in order of
        appearance.
    '''
    def is_insert(label):
        return label != 'KEEP' and label != 'DEL'

    spans = []
    total = len(edits)
    pos = 0
    # A span needs at least two labels, so the last position never starts one.
    while pos < total - 1:
        if not is_insert(edits[pos]):
            pos += 1
            continue
        span_start = pos
        cursor = pos + 1
        # Consume the rest of the insert run.
        while cursor < total and is_insert(edits[cursor]):
            cursor += 1
        insert_run_end = cursor
        # Consume the DEL run that directly follows, if there is one.
        while cursor < total and edits[cursor] == 'DEL':
            cursor += 1
        if cursor > insert_run_end:
            spans.append((span_start, cursor - 1))
        pos = cursor
    return spans

In [4]:
def extract_d_starts_from_ad_spans(edits, ad_spans):
    '''
    For every span, locate the first 'DEL' label at or after the span start.

    Args:
        edits: list of edit labels.
        ad_spans: list of (start, end) positions in edits.

    Returns:
        One position per span; len(edits) is used when no 'DEL' is found
        before the end of edits.
    '''
    total = len(edits)
    first_del_positions = []
    for span in ad_spans:
        cursor = span[0]
        while cursor < total and edits[cursor] != 'DEL':
            cursor += 1
        first_del_positions.append(cursor)
    return first_del_positions

In [37]:
extract_ad_spans(B)

[(8, 12), (15, 20), (26, 32), (36, 47)]

In [38]:
# The helper is defined in this notebook as extract_d_starts_from_ad_spans; the old
# name only existed in a previous kernel state and fails after Restart & Run All.
extract_d_starts_from_ad_spans(B, extract_ad_spans(B))

[9, 19, 29, 43]

In [30]:
def extract_d_ids(edits):
    '''
    List the 'DEL' positions of every add-then-delete span.

    Returns:
        A list of (positions, span_index) tuples, where positions are the
        indices in edits from the first 'DEL' of the span to the span end
        (inclusive) and span_index is the running number of the span.
    '''
    ad_spans = extract_ad_spans(edits)
    # Was extract_d_starts(...), a name that is not defined in this notebook.
    d_starts = extract_d_starts_from_ad_spans(edits, ad_spans)
    return [
        (list(range(d_start, span[1] + 1)), span_index)
        for span_index, (span, d_start) in enumerate(zip(ad_spans, d_starts))
    ]

In [11]:
sent1 = "MacGruber starts asking for simple objects to make something to defuse the bomb, but he is later distracted by something (usually involving his personal life) that makes him run out of time."
sent2 = "Macgruber starts by asking for simple objects to stop the bomb from working. later he is distracted by an event from his personal life. as a result, he runs out of time to stop the bomb."
sent1_tok = nltk.word_tokenize(re.sub(r'[\(\)\`\'\"]', '', sent1))
sent2_tok = nltk.word_tokenize(re.sub(r'[\(\)\`\'\"]', '', sent2))
edits = sent2edit(sent1_tok, sent2_tok)
ad_spans = extract_ad_spans(edits)
aligns = [([21, 22], [21]), ([0], [0]), ([1], [1, 2]), ([2], [3]), ([3], [4]), ([4], [5]), ([5], [6]), ([6], [7]), ([10], [8]), ([11], [37]), ([12], [38]), ([13], [13]), ([15], [15]), ([16], [16]), ([17], [14]), ([18], [17]), ([19], [18]), ([20], [19, 20]), ([23], [22]), ([24], [23]), ([25], [24]), ([28], [30]), ([29], [31]), ([30], [32]), ([31], [33]), ([32], [34]), ([33], [39])] 

In [12]:
print(sent1_tok)
print(sent2_tok)

['MacGruber', 'starts', 'asking', 'for', 'simple', 'objects', 'to', 'make', 'something', 'to', 'defuse', 'the', 'bomb', ',', 'but', 'he', 'is', 'later', 'distracted', 'by', 'something', 'usually', 'involving', 'his', 'personal', 'life', 'that', 'makes', 'him', 'run', 'out', 'of', 'time', '.']
['Macgruber', 'starts', 'by', 'asking', 'for', 'simple', 'objects', 'to', 'stop', 'the', 'bomb', 'from', 'working', '.', 'later', 'he', 'is', 'distracted', 'by', 'an', 'event', 'from', 'his', 'personal', 'life', '.', 'as', 'a', 'result', ',', 'he', 'runs', 'out', 'of', 'time', 'to', 'stop', 'the', 'bomb', '.']


In [13]:
ad_spans = extract_ad_spans(edits)

In [14]:
def assign_ids_to_edits(edits, sent2_tok):
    '''
    Map every edit label to a token index.

    'KEEP' and 'DEL' labels receive the index of the sent1 token they consume;
    inserted tokens receive the index of that token in sent2.

    The edit sequence walks sent2 from left to right: every 'KEEP' and every
    insertion corresponds to exactly one sent2 token. The sent2 index of an
    insertion is therefore the number of KEEP/insert labels before it. The
    previous implementation searched sent2_tok for an equal token without
    advancing past kept tokens, so it could return the index of an earlier
    identical token, and it emitted nothing when the token was not found.

    Args:
        edits: edit labels, as produced by sent2edit(sent1_tok, sent2_tok).
        sent2_tok: target tokens; kept for interface compatibility.

    Returns:
        A list of ints with the same length as edits.
    '''
    edits_ids = []
    sent1_pointer = 0
    sent2_pointer = 0
    for edit in edits:
        if edit == 'KEEP':
            edits_ids.append(sent1_pointer)
            sent1_pointer += 1
            sent2_pointer += 1  # a kept token also consumes one sent2 token
        elif edit == 'DEL':
            edits_ids.append(sent1_pointer)
            sent1_pointer += 1
        else:
            edits_ids.append(sent2_pointer)
            sent2_pointer += 1
    return edits_ids

In [15]:
print(edits)
print(assign_ids_to_edits(edits, sent2_tok))

['KEEP', 'KEEP', 'by', 'KEEP', 'KEEP', 'KEEP', 'KEEP', 'KEEP', 'stop', 'DEL', 'DEL', 'DEL', 'DEL', 'KEEP', 'KEEP', 'from', 'working', '.', 'later', 'DEL', 'DEL', 'KEEP', 'KEEP', 'DEL', 'KEEP', 'KEEP', 'an', 'event', 'from', 'DEL', 'DEL', 'DEL', 'KEEP', 'KEEP', 'KEEP', '.', 'as', 'a', 'result', ',', 'he', 'runs', 'DEL', 'DEL', 'DEL', 'DEL', 'KEEP', 'KEEP', 'KEEP', 'to', 'stop', 'the', 'bomb', 'KEEP']
[0, 1, 2, 2, 3, 4, 5, 6, 8, 7, 8, 9, 10, 11, 12, 11, 12, 13, 14, 13, 14, 15, 16, 17, 18, 19, 19, 20, 21, 20, 21, 22, 23, 24, 25, 25, 26, 27, 28, 29, 30, 31, 26, 27, 28, 29, 30, 31, 32, 35, 36, 37, 38, 33]


In [16]:
sent1 = "MacGruber starts asking for simple objects to make something to defuse the bomb, but he is later distracted by something (usually involving his personal life) that makes him run out of time."
sent2 = "Macgruber starts by asking for simple objects to stop the bomb from working. later he is distracted by an event from his personal life. as a result, he runs out of time to stop the bomb."
sent1_tok = nltk.word_tokenize(re.sub(r'[\(\)\`\'\"]', '', sent1))
sent2_tok = nltk.word_tokenize(re.sub(r'[\(\)\`\'\"]', '', sent2))
edits = sent2edit(sent1_tok, sent2_tok)
ad_spans = extract_ad_spans(edits)
aligns = [([21, 22], [21]), ([0], [0]), ([1], [1, 2]), ([2], [3]), ([3], [4]), ([4], [5]), ([5], [6]), ([6], [7]), ([10], [8]), ([11], [37]), ([12], [38]), ([13], [13]), ([15], [15]), ([16], [16]), ([17], [14]), ([18], [17]), ([19], [18]), ([20], [19, 20]), ([23], [22]), ([24], [23]), ([25], [24]), ([28], [30]), ([29], [31]), ([30], [32]), ([31], [33]), ([32], [34]), ([33], [39])] 

In [83]:
print(len(edits))
print(len(assign_ids_to_edits(edits, sent2_tok)))

54
54


In [17]:
ad_spans

[(8, 12), (15, 20), (26, 31), (35, 45)]

In [18]:
def extract_splr_ids(edits, ad_spans, sent2_tok):
    '''
    Collect the add-then-delete spans that contain an inserted '.' label.

    Args:
        edits: list of edit labels.
        ad_spans: list of (start, end) positions in edits, end inclusive.
        sent2_tok: target tokens, forwarded to assign_ids_to_edits.

    Returns:
        A list of (sent1_ids, sent2_ids, span_index) tuples: the ids assigned
        to the KEEP/DEL labels of the span, the ids assigned to its inserted
        labels, and the index of the span in ad_spans.
    '''
    ids_for_edits = assign_ids_to_edits(edits, sent2_tok)
    found = []
    for span_index, span in enumerate(ad_spans):
        positions = range(span[0], span[1] + 1)
        labels_in_span = [edits[position] for position in positions]
        if '.' not in labels_in_span:
            continue
        sent1_span = []
        sent2_span = []
        for position in positions:
            if edits[position] == 'KEEP' or edits[position] == 'DEL':
                sent1_span.append(ids_for_edits[position])
            else:
                sent2_span.append(ids_for_edits[position])
        found.append((sent1_span, sent2_span, span_index))
    return found

In [23]:
splr_ids = extract_splr_ids(edits, ad_spans, sent2_tok)
print(splr_ids)

[([13, 14], [11, 12, 13, 14], 1), ([26, 27, 28, 29], [25, 26, 27, 28, 29, 30, 31], 3)]


In [24]:
ad_spans_done = [i[2] for i in splr_ids]
print(ad_spans_done)

[1, 3]


In [87]:
ad_spans

[(8, 12), (15, 20), (26, 31), (35, 45)]

In [88]:
d_starts = extract_d_starts_from_ad_spans(edits, ad_spans)
print(d_starts)

[9, 19, 29, 42]


In [91]:
sent1_tok[9:13]

['to', 'defuse', 'the', 'bomb']

In [93]:
edits[8:13]

['stop', 'DEL', 'DEL', 'DEL', 'DEL']

In [19]:
sent1 = "MacGruber starts asking for simple objects to make something to defuse the bomb, but he is later distracted by something (usually involving his personal life) that makes him run out of time."
sent2 = "Macgruber starts by asking for simple objects to stop the bomb from working. later he is distracted by an event from his personal life. as a result, he runs out of time to stop the bomb."
sent1_tok = nltk.word_tokenize(re.sub(r'[\(\)\`\'\"]', '', sent1))
sent2_tok = nltk.word_tokenize(re.sub(r'[\(\)\`\'\"]', '', sent2))
edits = sent2edit(sent1_tok, sent2_tok)
ad_spans = extract_ad_spans(edits)
aligns = [([21, 22], [21]), ([0], [0]), ([1], [1, 2]), ([2], [3]), ([3], [4]), ([4], [5]), ([5], [6]), ([6], [7]), ([10], [8]), ([11], [37]), ([12], [38]), ([13], [13]), ([15], [15]), ([16], [16]), ([17], [14]), ([18], [17]), ([19], [18]), ([20], [19, 20]), ([23], [22]), ([24], [23]), ([25], [24]), ([28], [30]), ([29], [31]), ([30], [32]), ([31], [33]), ([32], [34]), ([33], [39])] 

In [20]:
def extract_rep_ids(edits, ad_spans, sent2_tok, aligns, ad_spans_done):
    '''
    Collect the add-then-delete spans that the word alignment links together.

    Spans whose index is listed in ad_spans_done are skipped. For each
    remaining span, the deleted sent1 ids are looked up in aligns; the span is
    reported when at least one sent2 word aligned to a deleted sent1 token is
    also one of the words inserted by the span.

    Args:
        edits: list of edit labels.
        ad_spans: list of (start, end) positions in edits, end inclusive.
        sent2_tok: target tokens.
        aligns: list of (sent1_ids, sent2_ids) alignment pairs.
        ad_spans_done: span indices to skip.

    Returns:
        A list of (deleted_sent1_ids, inserted_sent2_ids, span_index) tuples.
    '''
    ids_for_edits = assign_ids_to_edits(edits, sent2_tok)
    first_del_positions = extract_d_starts_from_ad_spans(edits, ad_spans)
    found = []
    for span_index, span in enumerate(ad_spans):
        if span_index in ad_spans_done:
            continue
        first_del = first_del_positions[span_index]

        # Ids on the deleted side (sent1) and on the inserted side (sent2) of the span.
        deleted_sent1_ids = [
            ids_for_edits[position]
            for position in range(first_del, span[1] + 1)
            if edits[position] == 'DEL'
        ]
        inserted_sent2_ids = [
            ids_for_edits[position]
            for position in range(span[0], first_del)
            if edits[position] != 'DEL'
        ]

        inserted_words = [sent2_tok[sent2_id] for sent2_id in inserted_sent2_ids]
        # sent2 words that the alignment attaches to any deleted sent1 token.
        aligned_words = [
            sent2_tok[sent2_id]
            for sent1_id in deleted_sent1_ids
            for align in aligns
            if sent1_id in align[0]
            for sent2_id in align[1]
        ]

        if any(word in inserted_words for word in aligned_words):
            found.append((deleted_sent1_ids, inserted_sent2_ids, span_index))

    return found

In [25]:
rep_ids = extract_rep_ids(edits, ad_spans, sent2_tok, aligns, ad_spans_done)
print(rep_ids)

[([7, 8, 9, 10], [8], 0), ([20, 21, 22], [19, 20, 21], 2)]


In [108]:
print(sent1_tok[7:11], sent2_tok[8:9])

['make', 'something', 'to', 'defuse'] ['stop']


In [109]:
print(sent1_tok[20:23], sent2_tok[19:22])

['something', 'usually', 'involving'] ['an', 'event', 'from']


In [26]:
test_d_span = [9, 10, 11, 12]
test_d_span = [29, 30, 31]
edits_ids = assign_ids_to_edits(edits, sent2_tok)
sent1_span = [edits_ids[i] for i in test_d_span if edits[i] == 'DEL']
aligned_words_in_sent2 = []
for i in range(len(sent1_span)):
    for j in range(len(aligns)):
        if sent1_span[i] in aligns[j][0]:
            for aligned_word_id in aligns[j][1]:
                aligned_words_in_sent2.append(sent2_tok[aligned_word_id])
print(aligned_words_in_sent2)

['an', 'event', 'from', 'from']


In [32]:
sent1 = 'Characteristics Radar observations indicate a fairly pure iron-nickel composition.'
sent2 = 'A mainly pure Iron-Nickel composition was observed by radar.'
sent1_tok = nltk.word_tokenize(re.sub(r'[\(\)\`\'\"]', '', sent1))
sent2_tok = nltk.word_tokenize(re.sub(r'[\(\)\`\'\"]', '', sent2))
edits = sent2edit(sent1_tok, sent2_tok)
ad_spans = extract_ad_spans(edits)
aligns = [([1, 2], [8]), ([3], [6, 7]), ([4], [0]), ([5], [1]), ([6], [2]), ([7], [3]), ([8], [4]), ([9], [9])]

In [33]:
print(edits)
print(ad_spans)

['DEL', 'DEL', 'DEL', 'DEL', 'KEEP', 'mainly', 'DEL', 'KEEP', 'KEEP', 'KEEP', 'was', 'observed', 'by', 'radar', 'KEEP']
[(5, 6)]


In [29]:
def extract_d_spans(edits):
    '''
    Find runs of two or more consecutive 'DEL' labels that are not preceded by
    an insertion since the last 'KEEP'.

    A 'DEL' run is skipped when any inserted token occurred after the most
    recent 'KEEP' (or after the start of edits). Runs consisting of a single
    'DEL' are not reported.

    Returns:
        A list of (start, end) positions in edits, end inclusive.
    '''
    spans = []
    total = len(edits)
    insert_since_keep = False
    pos = 0
    while pos < total:
        label = edits[pos]
        if label == 'KEEP':
            insert_since_keep = False
            pos += 1
        elif label != 'DEL':
            insert_since_keep = True
            pos += 1
        elif insert_since_keep:
            # This DEL follows an insertion, so it is not a pure deletion run.
            pos += 1
        else:
            run_end = pos
            while run_end + 1 < total and edits[run_end + 1] == 'DEL':
                run_end += 1
            if run_end > pos:
                spans.append((pos, run_end))
            pos = run_end + 1
    return spans

In [34]:
extract_d_spans(edits)

[(0, 3)]

In [43]:
def extract_a_spans(edits):
    '''
    Find runs of inserted tokens that are not directly followed by a 'DEL'.

    An inserted token is any label other than 'KEEP' and 'DEL'. A run that is
    followed by 'KEEP' or that reaches the end of edits is reported; a run
    followed by 'DEL' is left out.

    The previous version assigned its "followed by DEL" flag only inside the
    scanning loop, so a run starting at the very last position either raised
    UnboundLocalError or reused the flag of an earlier run.

    Returns:
        A list of (start, end) positions in edits, end inclusive.
    '''
    def is_insert(label):
        return label != 'KEEP' and label != 'DEL'

    a_spans = []
    total = len(edits)
    pos = 0
    while pos < total:
        if not is_insert(edits[pos]):
            pos += 1
            continue
        run_end = pos
        while run_end + 1 < total and is_insert(edits[run_end + 1]):
            run_end += 1
        followed_by_del = run_end + 1 < total and edits[run_end + 1] == 'DEL'
        if not followed_by_del:
            a_spans.append((pos, run_end))
        pos = run_end + 1
    return a_spans

In [44]:
extract_a_spans(edits)

[(10, 13)]

In [None]:
def extract_mvr_ids(edits, sent2_tok, aligns):
    '''
    Unfinished stub: computes the per-edit token ids and then returns None.

    TODO: implement the extraction; `aligns` is not used yet.
    '''
    edits_ids = assign_ids_to_edits(edits, sent2_tok)

In [None]:
def insert_rep_span(edits, ad_spans, aligns, sent1_tok, sent2_tok, slp_spans):
    '''
    Work-in-progress helper that builds ['REP-S', <words...>, 'REP-E'] edit
    groups for add-then-delete spans. It has no return statement, so it
    always returns None, and sent1_tok and slp_spans are never read.

    NOTE(review): this function cannot run as written; see the notes below.
    '''
    # NOTE(review): extract_d_starts is not defined in this notebook; the helper with
    # the matching signature is extract_d_starts_from_ad_spans.
    d_starts = extract_d_starts(edits, ad_spans)
    aligns_sent1_ids = [i[0] for i in aligns]
    aligns_sent2_ids = [i[1] for i in aligns]
    for span, d_start in zip(ad_spans, d_starts):
        # NOTE(review): if d_start is an int (as extract_d_starts_from_ad_spans returns),
        # len(d_start) raises TypeError; the number of DEL positions was presumably meant.
        d_seen = [0 for i in range(len(d_start))]
        # NOTE(review): span[0]..d_start are positions in `edits`, but they are used here
        # to index sent2_tok - presumably sent2 ids were intended; confirm.
        a_wordlist = [sent2_tok[i] for i in range(span[0], d_start)]
        # Rebuilt for every span and never stored or returned.
        edit_to_inserts = []
        #d_pointer = d_start
        # NOTE(review): this range stops before span[1], while the other span helpers in
        # this notebook treat span[1] as inclusive (span[1] + 1).
        for d_pointer in range(d_start, span[1]):
            for i in range(len(aligns_sent1_ids)):
                # NOTE(review): d_seen starts as all zeros and is only set to 1 inside this
                # branch, so `!= 0` can never be true; `== 0` was probably intended.
                if d_pointer in aligns_sent1_ids[i] and d_seen[d_pointer - d_start] != 0:
                    d_seen[d_pointer - d_start] = 1
                    sent2_ids = aligns_sent2_ids[i]
                    sent2_words = [sent2_tok[i] for i in sent2_ids]
                    edit_to_insert = ['REP-S']
                    for sent2_word in sent2_words:
                        if sent2_word in a_wordlist:
                            edit_to_insert.append(sent2_word)
                    edit_to_insert.append('REP-E')
                    # Keep the group only when at least one word sits between the markers.
                    if len(edit_to_insert) > 2:
                        edit_to_inserts.append(edit_to_insert)


        # NOTE(review): this second loop repeats the body above. It reuses d_pointer as left
        # by the for loop (unbound when that range is empty) and increments it once per
        # alignment entry - looks like a leftover from an earlier draft.
        for i in range(len(aligns_sent1_ids)):
            if d_pointer in aligns_sent1_ids[i] and d_seen[d_pointer - d_start] != 0:
                d_seen[d_pointer - d_start] = 1
                sent2_ids = aligns_sent2_ids[i]
                sent2_words = [sent2_tok[i] for i in sent2_ids]
                edit_to_insert = ['REP-S']
                for sent2_word in sent2_words:
                    if sent2_word in a_wordlist:
                        edit_to_insert.append(sent2_word)
                edit_to_insert.append('REP-E')
                if len(edit_to_insert) > 2:
                    edit_to_inserts.append(edit_to_insert)
            d_pointer += 1
            