In [1]:
from collections import Counter
from collections import defaultdict as dd
import copy
import unicodecsv as csv
import numpy as np

In [2]:
class Hyperparameters:
    r"""
    Plain container for the model's hyperparameters, with one getter and one
    setter per value.

    Naming note (case matters): p_S stands for p_{\$}, whereas p_s really is p_s.

    This is a raw docstring so that the backslash above is kept literally
    instead of being parsed as an (invalid) escape sequence.
    """
    def __init__(self,p_S,b,p_phi,alpha,p_s):
        # Values are stored exactly as passed in; no validation or conversion.
        self.p_S = p_S
        self.b = b
        self.p_phi = p_phi
        self.alpha = alpha
        self.p_s = p_s

    # --- getters: return the stored value unchanged ---
    def get_p_S(self):
        return self.p_S
    def get_b(self):
        return self.b
    def get_p_phi(self):
        return self.p_phi
    def get_alpha(self):
        return self.alpha
    def get_p_s(self):
        return self.p_s

    # --- setters: overwrite the stored value with `new` ---
    def set_p_S(self,new):
        self.p_S = new
    def set_b(self,new):
        self.b = new
    def set_p_phi(self,new):
        self.p_phi = new
    def set_alpha(self,new):
        self.alpha = new
    def set_p_s(self,new):
        self.p_s = new

In [16]:
# To use later to improve code - make each sentence pair an object, with easy ways to modify
# phrases and alignments
class SentencePair:
    """
    One sentence pair of the parallel text together with its current phrase
    segmentation and phrase alignment.

    A phrase is a tuple of (index, word) tuples, e.g. ((0,"Michael"),(1,"assumes")).
    Wherever a language selector `lg` is expected it is the string "e" or "f".

    Attributes:
        ind             -- line number of the sentence in the parallel text
        e_sent, f_sent  -- text of the english ("target") / foreign ("source") sentence
        phrase_pair_set -- the phrase pairs the object was built from (kept as passed in)
        phrases         -- phrases[lg] is the set of phrases on side lg
        bounds          -- bounds[lg] is the set of indices at which a phrase starts
        alignment       -- set of (e_phrase, f_phrase) tuples that are aligned
        aligned         -- aligned[lg] is the set of word indices covered by an aligned phrase
        phrase_map      -- phrase_map[lg][index] is the phrase containing that word index
        alignment_dict  -- alignment_dict["e-f"][e_phrase] = f_phrase, and "f-e" the reverse
    """
    def __init__(self,ind,e_sent,f_sent,phrase_pair_set):
        """
        phrase_pair_set holds tuples of (e_phrase, f_phrase), e.g.
        ( ((0,"Michael"),(1,"assumes")), ((0,"Michael"),(1,"geht"),(1,"davon"),(1,"aus")) )
        One (or both) sides of a pair may be None, signifying an unaligned
        phrase. No None entries are added to the sets of phrases.
        """
        self.ind = ind                # the line number of the sentence in the parallel text
        self.e_sent = e_sent          # text of english ("target") sentence
        self.f_sent = f_sent          # text of foreign ("source") sentence
        self.phrase_pair_set = phrase_pair_set
        self.phrases = dd(set)
        self.alignment = set()
        self.bounds = dd(set)
        for pair in phrase_pair_set:
            e_phrase, f_phrase = pair[0], pair[1]
            if e_phrase:
                self.phrases["e"].add(e_phrase)
                self.bounds["e"].add(e_phrase[0][0])   # index of the phrase's first word
            if f_phrase:
                self.phrases["f"].add(f_phrase)
                self.bounds["f"].add(f_phrase[0][0])
            if e_phrase and f_phrase:
                self.alignment.add((e_phrase,f_phrase))
        # The derived lookups are produced by the same methods that refresh
        # them later, so construction and updates cannot drift apart.
        self.update_phrase_map()
        self.update_alignment_dict()

    def _find_phrase(self,lg,index):
        """
        Return the phrase on side lg that contains word index `index`, or None
        if no phrase does. If several phrases contain it, the last one in set
        iteration order is returned.
        """
        found = None
        for phrase in self.phrases[lg]:
            for (ind, word) in phrase:
                if ind == index:
                    found = phrase
        return found

    def build_phrase_map(self):
        """
        Build and return {"e": {index: phrase}, "f": {index: phrase}} from self.phrases.
        """
        phrase_map = {"e":dict(), "f":dict()}
        for lg in ("e","f"):
            for phrase in self.phrases[lg]:
                for (ind, word) in phrase:
                    phrase_map[lg][ind] = phrase
        return phrase_map

    def get_phrase_map(self):
        return self.phrase_map

    def update_phrase_map(self):
        """Recompute self.phrase_map from the current phrases."""
        self.phrase_map = self.build_phrase_map()

    def query_phrase_map(self,lg,index):
        """Return the phrase covering `index` on side lg (KeyError if there is none)."""
        return self.phrase_map[lg][index]

    def build_alignment_dict(self):
        """
        Build the two-way phrase lookup and the sets of aligned word indices
        from self.alignment.
        Returns the tuple (alignment_dict, aligned).
        """
        aligned = dd(set)
        alignment_dict = {"e-f":dict(), "f-e":dict()}
        for pair in self.alignment:
            alignment_dict["e-f"][pair[0]] = pair[1]
            alignment_dict["f-e"][pair[1]] = pair[0]
            for (ind, word) in pair[0]:
                aligned["e"].add(ind)
            for (ind, word) in pair[1]:
                aligned["f"].add(ind)
        return alignment_dict, aligned

    def update_alignment_dict(self):
        """Recompute self.alignment_dict and self.aligned from the current alignment."""
        self.alignment_dict, self.aligned = self.build_alignment_dict()

    def query_alignment_dict(self,direction,phrase):
        """
        direction must be either "e-f" or "f-e"
        Returns the phrase aligned to `phrase`, or None if it is unaligned.
        """
        return self.alignment_dict[direction].get(phrase)

    def get_ind(self):
        return self.ind

    def get_e_sent(self):
        return self.e_sent

    def get_f_sent(self):
        return self.f_sent

    def get_e_phrases(self):
        return self.phrases["e"]

    def get_f_phrases(self):
        return self.phrases["f"]

    def get_aligned(self):
        return self.aligned

    def get_alignments(self):
        return self.alignment

    def remove_alignment(self,pair):
        """
        pair must be a tuple of the shape (e_phrase,f_phrase)
        Raises KeyError if the pair is not currently aligned.
        """
        self.alignment.remove(pair)
        self.update_alignment_dict()

    def add_alignment(self,pair):
        """
        pair must be a tuple of the shape (e_phrase,f_phrase)
        """
        self.alignment.add(pair)
        self.update_alignment_dict()

    def query_boundary(self,lg,bound):
        """
        lg should be "e" or "f"
        Returns True if a phrase starts at index `bound`, else False.
        """
        return bound in self.bounds[lg]

    def remove_boundary(self,lg,bound):
        """
        Merge the phrase ending at bound-1 with the phrase starting at bound.

        lg must be either the string "e" or "f" (english or foreign)
        bound must be an integer

        Raises KeyError if `bound` is not a recorded boundary, or if there are
        not two distinct phrases on either side of it (e.g. bound == 0). All
        checks happen before anything is modified, so a failed call leaves the
        object unchanged. The alignment is not touched; callers re-align the
        merged phrase themselves.
        """
        if bound not in self.bounds[lg]:
            raise KeyError(bound)
        first_phrase = self._find_phrase(lg,bound-1)
        second_phrase = self._find_phrase(lg,bound)
        if first_phrase is None or second_phrase is None or first_phrase == second_phrase:
            raise KeyError(bound)
        self.bounds[lg].remove(bound)
        self.phrases[lg].remove(first_phrase)
        self.phrases[lg].remove(second_phrase)
        self.phrases[lg].add(first_phrase + second_phrase)
        self.update_phrase_map()

    def add_boundary(self,lg,bound):
        """
        Split the phrase containing index `bound` so that a phrase starts at `bound`.

        lg must be either the string "e" or "f" (english or foreign)
        bound must be an integer

        Raises KeyError (before modifying anything) if no phrase contains
        `bound`. If a phrase already starts at `bound` only the boundary is
        recorded; no empty phrase is created. The alignment is not touched.
        """
        target_phrase = self._find_phrase(lg,bound)
        if target_phrase is None:
            raise KeyError(bound)
        self.bounds[lg].add(bound)
        first_phrase = tuple((ind, word) for (ind, word) in target_phrase if ind < bound)
        second_phrase = tuple((ind, word) for (ind, word) in target_phrase if ind >= bound)
        if not first_phrase:
            # `bound` is already the first word of its phrase: nothing to split.
            return
        self.phrases[lg].remove(target_phrase)
        self.phrases[lg].add(first_phrase)
        self.phrases[lg].add(second_phrase)
        self.update_phrase_map()

In [28]:
class TextLevelMutableObjects:
    """
    Holder for the text-level state shared across all sentence pairs:
    vocabularies, phrase-pair counts, word-alignment probabilities and the
    sentence pairs themselves.

    There should only be one of these objects, and we typically name it TLMO in code that follows.

    Every container argument defaults to None, meaning "create a fresh empty
    container for this instance". (Container objects used directly as default
    values would be created once and shared by every instance.)
    """
    def __init__(self, vocab=None, pp_counts=None, P_WA=None, P_WA_table=None, lowest_WA_prob=1.0, sent_pair_dict=None):
        """
        vocab          -- mapping lg -> set of words (default: empty defaultdict(set))
        pp_counts      -- Counter of phrase pairs (default: empty Counter)
        P_WA           -- nested mapping word1 -> word2 -> probability (default: empty defaultdict(dict))
        P_WA_table     -- nested mapping phrase1 -> phrase2 -> probability (default: empty defaultdict(dict))
        lowest_WA_prob -- smallest word-alignment probability seen so far
        sent_pair_dict -- mapping sentence index -> sentence pair object (default: empty dict)
        """
        self.vocab = dd(set) if vocab is None else vocab
        self.pp_counts = Counter() if pp_counts is None else pp_counts   # pp_counts = phrase pair counts
        self.P_WA = dd(dict) if P_WA is None else P_WA
        self.lowest_WA_prob = lowest_WA_prob    # initialize at highest possible probability to eventually decrease it
        self.sent_pair_dict = dict() if sent_pair_dict is None else sent_pair_dict
        self.P_WA_table = dd(dict) if P_WA_table is None else P_WA_table

    def get_n_e(self):
        """Number of distinct words recorded for "e"."""
        return len(self.vocab["e"])

    def get_n_f(self):
        """Number of distinct words recorded for "f"."""
        return len(self.vocab["f"])

    def get_vocab(self):
        return self.vocab

    def vocab_add(self,lg,word):
        """
        lg must be "e" or "f", though this isn't enforced right now.
        No reason to have a counterpart remove method right now...
        """
        self.vocab[lg].add(word)

    def init_vocab_from_text(self,lg,text):
        """Add every whitespace-separated token of `text` to the vocabulary of lg."""
        self.vocab[lg].update(text.split())

    def get_pp_counts(self):
        return self.pp_counts

    def decrement_pp_counts(self,phrase_pair):
        """
        Decrease the count of phrase_pair by one, but never below zero.

        The guard is a kludgy way to protect against an improper balance of incrementing
        and decrementing phrase counts. Need to closely examine how this is done in the Gibbs
        sampling operations!
        """
        if self.pp_counts.get(phrase_pair, 0) > 0:
            self.pp_counts[phrase_pair] -= 1

    def increment_pp_counts(self,phrase_pair):
        self.pp_counts[phrase_pair] += 1

    def get_pp_count(self,phrase_pair):
        """Return the count of phrase_pair, 0 if it has never been counted."""
        return self.pp_counts.get(phrase_pair, 0)

    def init_pp_counts_from_pair_set(self,phrase_pair_set):
        """
        For every pair with both sides present, count the pair once and add the
        words of its first element to the "e" vocabulary and those of its
        second element to the "f" vocabulary. Pairs with a None/empty side are skipped.
        """
        for pair in phrase_pair_set:
            if pair[0] and pair[1]:
                self.pp_counts[pair] += 1
                for (ind, word) in pair[0]:
                    self.vocab["e"].add(word)
                for (ind, word) in pair[1]:
                    self.vocab["f"].add(word)

    def get_lowest_WA_prob(self):
        return self.lowest_WA_prob

    def set_lowest_WA_prob(self,new_val):
        self.lowest_WA_prob = new_val

    def get_P_WA(self,word1,word2):
        """Return the stored P_WA[word1][word2], or None if it was never set."""
        # .get() is used so that a lookup never inserts an empty row into the defaultdict.
        row = self.P_WA.get(word1)
        if row is None:
            return None
        return row.get(word2)

    def set_P_WA(self,word1,word2,prob):
        self.P_WA[word1][word2] = prob

    def get_P_WA_table(self,phrase1,phrase2):
        """Return the stored P_WA_table[phrase1][phrase2], or None if it was never set."""
        row = self.P_WA_table.get(phrase1)
        if row is None:
            return None
        return row.get(phrase2)

    def set_P_WA_table(self,phrase1,phrase2,prob):
        self.P_WA_table[phrase1][phrase2] = prob

    def get_sent_pair_dict(self):
        return self.sent_pair_dict

    def get_sent_pair(self,index):
        """Return the sentence pair stored under `index` (KeyError if absent)."""
        return self.sent_pair_dict[index]

    def set_sent_pair(self,index,SentPair):
        """
        SentPair should be a formal SentPair object
        """
        self.sent_pair_dict[index] = SentPair

In [5]:
def process_IBM_Model_1_data(src_vocab_file,trg_vocab_file,src_trg_alignment_file,trg_src_alignment_file,TLMO):
    """
    Load IBM Model 1 output (as produced by GIZA++, per the original author)
    into TLMO: the two vocabularies and the word translation probabilities in
    both directions.

    Here, src = foreign (f), trg = english (e).

    All four files are read as space-delimited, utf-8 encoded rows; only rows
    with exactly three fields are used.
      * vocabulary rows: field 0 is an integer word id, field 1 the word
      * probability rows: fields 0 and 1 are word ids, field 2 the probability
    For each probability row whose ids are both known, the value is stored with
    TLMO.set_P_WA(word1, word2, prob), where word1 is the word of the first id,
    and TLMO's lowest_WA_prob is lowered whenever a smaller probability is seen.
    # NOTE(review): the original author was unsure whether word1|word2 is the
    # correct reading of the GIZA++ table orientation -- confirm.

    TLMO = TextLevelMutableObject for holding the vocab, P_WA, lowest_WA_prob
    Returns the same TLMO object, updated in place.
    """
    def load_vocab(path, lg):
        # Returns {word id: word} and registers every word in TLMO under language lg.
        id_to_word = {}
        with open(path,"rb") as handle:
            for row in csv.reader(handle,delimiter=" ",encoding="utf-8"):
                if len(row) != 3:
                    continue
                id_to_word[int(row[0])] = row[1]
                TLMO.vocab_add(lg,row[1])
        return id_to_word

    def load_probabilities(path, first_lut, second_lut):
        # first_lut resolves the id in column 0, second_lut the id in column 1.
        with open(path,"rb") as handle:
            for row in csv.reader(handle,delimiter=" ",encoding="utf-8"):
                if len(row) != 3:
                    continue
                # Column 1 is only parsed once column 0 is known to be a valid id.
                first_id = int(row[0])
                if first_id not in first_lut:
                    continue
                second_id = int(row[1])
                if second_id not in second_lut:
                    continue
                word1 = first_lut[first_id]
                word2 = second_lut[second_id]
                prob = float(row[2])
                TLMO.set_P_WA(word1,word2,prob)
                if prob < TLMO.get_lowest_WA_prob():
                    TLMO.set_lowest_WA_prob(prob)

    # Target vocabulary is read first, then source, then the two tables.
    trg_lut = load_vocab(trg_vocab_file,"e")
    src_lut = load_vocab(src_vocab_file,"f")
    load_probabilities(trg_src_alignment_file,trg_lut,src_lut)
    load_probabilities(src_trg_alignment_file,src_lut,trg_lut)
    return TLMO

In [6]:
def initialize_tlmo(srctext_file,trgtext_file,alignments_file,TLMO):
    """
    Populate TLMO with one SentencePair per line of the parallel text.

    For now, the inputs are the two text files given to the Berkeley aligner
    and its training.align output (alignments_file). The three files are read
    in lockstep, so they are assumed to have the same number of lines, each
    with alignment output. Assume src = Foreign (f) and trg = English (e).

    For every line triple, the phrases are extracted, a SentencePair is built
    and stored in TLMO under its line number, and TLMO's vocabularies and
    phrase-pair counts are updated.
    Returns the same TLMO object, updated in place.
    """
    def unaligned_runs(words, aligned_indices):
        # Helper: maximal runs of consecutive words whose index is not in
        # aligned_indices, each as a tuple of (index, word) in ascending order.
        runs = []
        run = []
        for idx, word in enumerate(words):
            if idx in aligned_indices:
                if run:
                    runs.append(tuple(run))
                    run = []
            else:
                run.append((idx, word))
        if run:
            runs.append(tuple(run))
        return runs

    def extract_phrases(srctext,trgtext,alignment_output_line):
        """
        Helper Function:
        Extracts phrases from the alignment output for a single sentence.
        alignment_output_line is a whitespace-separated list of "f-e" index pairs.
        Returns a set of (e_phrase, f_phrase) tuples, each phrase being a tuple
        of (index, word) tuples. For unaligned phrases, the null side is None.
        """
        src_words = srctext.split()
        trg_words = trgtext.split()
        src_dict = dict(enumerate(src_words))
        trg_dict = dict(enumerate(trg_words))

        links = []
        for token in alignment_output_line.split():
            f_str, e_str = token.split("-")
            links.append((int(f_str), int(e_str)))
        links.sort()
        f_aligned = {f for (f, e) in links}
        e_aligned = {e for (f, e) in links}

        # Walk the links in (f, e) order; a jump of more than one position on
        # either side closes the current group and opens a new one.
        groups = set()
        current = set()
        last_e = 0
        last_f = 0
        for f, e in links:
            if abs(e - last_e) > 1 or abs(f - last_f) > 1:
                groups.add(frozenset(current))
                current = {(e, f)}
            else:
                current.add((e, f))
            last_e, last_f = e, f
        groups.add(frozenset(current))

        result = set()
        for group in groups:
            if not group:      # an empty group can be produced by the very first jump
                continue
            e_items = set()
            f_items = set()
            for e, f in sorted(group):
                e_items.add((e, trg_dict[e]))
                f_items.add((f, src_dict[f]))
            result.add((tuple(sorted(e_items)), tuple(sorted(f_items))))

        for run in unaligned_runs(src_words, f_aligned):
            result.add((None, run))
        for run in unaligned_runs(trg_words, e_aligned):
            result.add((run, None))
        return result
        # end function extract_phrases

    # Main code within function initialize_tlmo()
    with open(srctext_file,"r",encoding="utf-8") as srctext_f, \
    open(trgtext_file,"r",encoding="utf-8") as trgtext_f, \
    open(alignments_file,"r") as alignments_f:
        line_triples = zip(srctext_f,trgtext_f,alignments_f)
        for line_no, (sline,tline,aline) in enumerate(line_triples):
            src = sline.strip(" \n\r")
            trg = tline.strip(" \n\r")
            links_text = aline.strip(" \n\r")
            sent_pair = SentencePair(line_no,trg,src,extract_phrases(src,trg,links_text))
            TLMO.init_vocab_from_text("e",trg)
            TLMO.init_vocab_from_text("f",src)
            # Only aligned pairs are counted; unaligned phrases are not in get_alignments().
            TLMO.init_pp_counts_from_pair_set(sent_pair.get_alignments())
            TLMO.set_sent_pair(line_no,sent_pair)

    return TLMO

In [73]:
def collapsed_gibbs_aligner(srctext_file,trgtext_file,TLMO,hp):
    """
    Will want to repeatedly call this for N iterations.
    """
    # Supporting functions:
    def p_f(src_phrase,n_f,hp):
        return (hp.get_p_s()*(1-hp.get_p_s())**(len(src_phrase)))*((1/n_f)**(len(src_phrase)))
    
    def p_e(trg_phrase,n_e,hp):
        return (hp.get_p_s()*(1-hp.get_p_s())**(len(trg_phrase)))*((1/n_e)**(len(trg_phrase)))
    
    def calc_P_WA(trg_phrase, src_phrase, TLMO):
        """
        p(e|f)
        src_phrase = conditioning phrase = f
        trg_phrase = target phrase = e
        """
        pre_computed_value = TLMO.get_P_WA_table(trg_phrase, src_phrase)
        if not pre_computed_value:
            lowest_prob = TLMO.get_lowest_WA_prob()
            P_WA_value = 1.0
            for i, (ind_i,word_i) in enumerate(trg_phrase):
                i_prob = 1.0
                for j, (ind_j,word_j) in enumerate(src_phrase):
                    p_i_given_j = TLMO.get_P_WA(word_j,word_i)
                    if p_i_given_j:
                        i_prob *= p_i_given_j
                    else:
                        i_prob = lowest_prob
                P_WA_value *= i_prob
            TLMO.set_P_WA_table(trg_phrase, src_phrase, P_WA_value)
            return P_WA_value
        else:
            return pre_computed_value
    
    def delta(trg_phrase,src_phrase,tline,sline,hp):
        """
        
        """
        s = float(len(tline))/float(len(sline))
        delta = hp.get_b()**abs(trg_phrase[0][0]-(src_phrase[0][0]*s))
        return delta
    
    def tau(trg_phrase,src_phrase,tline,sline,sent_pair,TLMO,hp):
        tau_value = (TLMO.get_pp_count((trg_phrase,src_phrase)) + \
                     (hp.get_alpha() * ((p_f(src_phrase,TLMO.get_n_f(),hp) * \
                              calc_P_WA(trg_phrase, src_phrase, TLMO)) * \
                             (p_e(trg_phrase,TLMO.get_n_e(),hp) * \
                             calc_P_WA(src_phrase, trg_phrase, TLMO))))) / \
        (len(TLMO.get_pp_counts()) + hp.get_alpha())
        return tau_value
    
    def theta_N(trg_phrase,src_phrase,TLMO,hp):
        if trg_phrase and not src_phrase:
            theta_N_value = 0.5*p_e(trg_phrase,TLMO.get_n_e(),hp)
        elif src_phrase and not trg_phrase:
            theta_N_value = 0.5*p_f(src_phrase,TLMO.get_n_f(),hp)
        return theta_N_value
        
    def potential(trg_phrase,src_phrase,tline,sline,TLMO,sent_pair,hp):
        """
        This calculates the potential function's value.
        Watch out! Need to be able to modify to accommodate Markov blanket...
        """
        if (trg_phrase and src_phrase):
            pot_value = (1-hp.get_p_S())*(1-hp.get_p_phi())*tau(trg_phrase,src_phrase,tline,sline,sent_pair,TLMO,hp)*delta(trg_phrase,src_phrase,tline,sline,hp)
        else:
            pot_value = (1-hp.get_p_S())*hp.get_p_phi()*theta_N(trg_phrase,src_phrase,TLMO,hp)
    
        return pot_value
    
    #def create_phrase_map(sline,tline,full_phrases):
    #    """
    #    full_phrases is the value of a key in the dictionary phrase_table, e.g. phrase_tables[0]
    #    to get the full_phrases of the first sentence.
    #    """
    #    src_phrase_map = {}
    #    trg_phrase_map = {}
    #    for phrase_ind, pair in enumerate(full_phrases):
    #        trg = pair[0]
    #        src = pair[1]
    #        if trg:
    #            for entry in trg:
    #                trg_phrase_map[entry[0]] = trg
    #        if src:
    #            for entry in src:
    #                src_phrase_map[entry[0]] = src
    #    for src_ind, word in enumerate(sline):
    #        if src_ind not in src_phrase_map:
    #            src_phrase_map[src_ind] = None
    #    for trg_ind, word in enumerate(tline):
    #        if trg_ind not in trg_phrase_map:
    #            trg_phrase_map[trg_ind] = None
    #    return (src_phrase_map, trg_phrase_map)
    
    def SWAP(sent_ind,e_phrase,f_phrase,e_phrase_2,f_phrase_2,tline,sline,sent_pair,TLMO,hp):
        """
        
        """
        # initialize scorekeeping dict
        pair_score = dict()
        # Set up configurations as tuples. Names of variables will be keys in dict.
        ident = (e_phrase,f_phrase,e_phrase_2,f_phrase_2)
        swapped = (e_phrase,f_phrase_2,e_phrase_2,f_phrase)
        out_conf_dict = {"ident":ident,"swapped":swapped}
        # First, modifying counts and alignments so that TLMO and sent_pair are the Markov blanket.
        TLMO.decrement_pp_counts((e_phrase,f_phrase))
        TLMO.decrement_pp_counts((e_phrase_2,f_phrase_2))
        TLMO.decrement_pp_counts((e_phrase,f_phrase_2))
        TLMO.decrement_pp_counts((e_phrase_2,f_phrase))
        sent_pair.remove_alignment((e_phrase,f_phrase))
        sent_pair.remove_alignment((e_phrase_2,f_phrase_2))
        # Do I need to decrement the counts for the swapped configuration?
        # Second, calculate potential scores.
        #     First, swapped configuration: increment phrase counts for it, 
        #     then decrement them to restore to Markov blanket
        TLMO.increment_pp_counts((e_phrase,f_phrase_2))
        TLMO.increment_pp_counts((e_phrase_2,f_phrase))
        pair_score["swapped"] = (potential(e_phrase,f_phrase_2,tline,sline,TLMO,sent_pair,hp),
                                 potential(e_phrase_2,f_phrase,tline,sline,TLMO,sent_pair,hp))
        TLMO.decrement_pp_counts((e_phrase,f_phrase_2))
        TLMO.decrement_pp_counts((e_phrase_2,f_phrase))
        #     Second, identify configuration: increment phrase_counts for identity, 
        #     then decrement them to restore to Markov blanket
        TLMO.increment_pp_counts((e_phrase,f_phrase))
        TLMO.increment_pp_counts((e_phrase_2,f_phrase_2))
        pair_score["ident"] = (potential(e_phrase,f_phrase,tline,sline,TLMO,sent_pair,hp),
                               potential(e_phrase_2,f_phrase_2,tline,sline,TLMO,sent_pair,hp))
        TLMO.decrement_pp_counts((e_phrase,f_phrase))
        TLMO.decrement_pp_counts((e_phrase_2,f_phrase_2))
        # calculate output configuration probabilities
        ident_prob = (pair_score["ident"][0]*pair_score["ident"][1])/((pair_score["ident"][0]*pair_score["ident"][1])+(pair_score["swapped"][0]*pair_score["swapped"][1]))
        swapped_prob = (pair_score["swapped"][0]*pair_score["swapped"][1])/((pair_score["ident"][0]*pair_score["ident"][1])+(pair_score["swapped"][0]*pair_score["swapped"][1]))
        #print(ident_prob+swapped_prob,ident_prob,swapped_prob)
        # choose one configuration probabilistically
        out_conf_decision = np.random.choice(["ident","swapped"],1,p=[ident_prob,swapped_prob])[0]
        out_conf = out_conf_dict[out_conf_decision]
        # update phrase_counts, phrase_table depending on out_conf, which is a tuple of (f,e,f,e) phrases
        TLMO.increment_pp_counts((out_conf[0],out_conf[1]))
        TLMO.increment_pp_counts((out_conf[2],out_conf[3]))
        # update alignments
        sent_pair.add_alignment((out_conf[0],out_conf[1]))
        sent_pair.add_alignment((out_conf[2],out_conf[3]))
        sent_pair.update_alignment_dict()
        TLMO.set_sent_pair(sent_ind,sent_pair)
        return (sent_pair, TLMO)
    
    def FLIP(lg,seg_ind,e_line,f_line,sent_pair,TLMO,hp):
        """
        There's some sort of bug with this where it's always assigning too high a probability to the "is_not_a_bound"
        condition regardless of where you start, and the resulting action takes out all the alignments!
        Bug both with probability mass assignment and with the behavior it causes...
        """
        pair_score = {}
        if lg == "e":
            if sent_pair.query_boundary(lg,seg_ind):    # If there's a boundary, and ...
                e_phrase_1 = sent_pair.query_phrase_map(lg,seg_ind-1)
                e_phrase_2 = sent_pair.query_phrase_map(lg,seg_ind)
                f_phrase_1 = sent_pair.query_alignment_dict("e-f",e_phrase_1)
                f_phrase_2 = sent_pair.query_alignment_dict("e-f",e_phrase_2)
                if f_phrase_1 and f_phrase_2:    # If both phrases are aligned:
                    #print("lg:e, Both aligned!")
                    return (sent_pair, TLMO)             # do nothing! Pass completely to preserve Gibbs integrity.
                pair_score["is_a_bound"] = potential(e_phrase_1,f_phrase_1,e_line,f_line,TLMO,sent_pair,hp)*potential(e_phrase_2,f_phrase_2,e_line,f_line,TLMO,sent_pair,hp)
                sent_pair.remove_boundary(lg,seg_ind)    # Then, take scores for not having a boundary... update sent_pair.phrase_map
                new_e_phrase = sent_pair.query_phrase_map(lg,seg_ind)
                if f_phrase_2:                   # If the first phrase is aligned, score new phrase aligned to it
                    sent_pair.remove_alignment((e_phrase_2,f_phrase_2))
                    TLMO.decrement_pp_counts((e_phrase_2,f_phrase_2))
                    sent_pair.add_alignment((new_e_phrase,f_phrase_2))
                    TLMO.increment_pp_counts((new_e_phrase,f_phrase_2))
                    pair_score["is_not_a_bound"] = potential(new_e_phrase,f_phrase_2,e_line,f_line,TLMO,sent_pair,hp)
                    sent_pair.remove_alignment((new_e_phrase,f_phrase_2))
                    TLMO.decrement_pp_counts((new_e_phrase,f_phrase_2))
                elif f_phrase_1:
                    sent_pair.remove_alignment((e_phrase_1,f_phrase_1))
                    TLMO.decrement_pp_counts((e_phrase_1,f_phrase_1))
                    sent_pair.add_alignment((new_e_phrase,f_phrase_1))
                    TLMO.increment_pp_counts((new_e_phrase,f_phrase_1))
                    pair_score["is_not_a_bound"] = potential(new_e_phrase,f_phrase_1,e_line,f_line,TLMO,sent_pair,hp)
                    sent_pair.remove_alignment((new_e_phrase,f_phrase_1))
                    TLMO.decrement_pp_counts((new_e_phrase,f_phrase_1))
                else:    # if e_phrase is not aligned to either f_phrase
                    #print("lg e: triggered elsewhere case")
                    pair_score["is_not_a_bound"] = potential(new_e_phrase,None,e_line,f_line,TLMO,sent_pair,hp)
                bound_prob = pair_score["is_a_bound"]/(pair_score["is_a_bound"]+pair_score["is_not_a_bound"])
                no_bound_prob = pair_score["is_not_a_bound"]/(pair_score["is_a_bound"]+pair_score["is_not_a_bound"])
                #print("lg: e, start:bound", no_bound_prob+bound_prob,no_bound_prob,bound_prob)
                out_conf_decision = np.random.choice(["is_a_bound","is_not_a_bound"],1,p=[bound_prob,no_bound_prob])[0]
                if out_conf_decision == "is_a_bound":
                    # restore boundary
                    if f_phrase_2:
                        sent_pair.add_boundary(lg,seg_ind)
                        sent_pair.add_alignment((e_phrase_2,f_phrase_2))
                        TLMO.increment_pp_counts((e_phrase_2,f_phrase_2))
                    elif f_phrase_1:
                        sent_pair.add_boundary(lg,seg_ind)
                        sent_pair.add_alignment((e_phrase_1,f_phrase_1))
                        TLMO.increment_pp_counts((e_phrase_1,f_phrase_1))
                    else:
                        sent_pair.add_boundary(lg,seg_ind)
                    return (sent_pair, TLMO)
                elif out_conf_decision == "is_not_a_bound":
                    # keep the boundary removed
                    if f_phrase_2:
                        sent_pair.add_alignment((new_e_phrase,f_phrase_2))
                        TLMO.increment_pp_counts((new_e_phrase,f_phrase_2))
                    elif f_phrase_1:
                        sent_pair.add_alignment((new_e_phrase,f_phrase_1))
                        TLMO.increment_pp_counts((new_e_phrase,f_phrase_1))                    
                    return (sent_pair, TLMO)
            else:    # If the position is not a phrase boundary, and lg is "e":
                # FILL IN!!! WORK HERE!
                e_phrase = sent_pair.query_phrase_map(lg,seg_ind)
                f_phrase = sent_pair.query_alignment_dict("e-f",e_phrase)
                pair_score["is_not_a_bound"] = potential(e_phrase,f_phrase,e_line,f_line,TLMO,sent_pair,hp)
                if f_phrase:
                    sent_pair.remove_alignment((e_phrase,f_phrase))
                    sent_pair.add_boundary(lg,seg_ind)
                    e_phrase_1 = sent_pair.query_phrase_map(lg,seg_ind-1)
                    e_phrase_2 = sent_pair.query_phrase_map(lg,seg_ind)
                    sent_pair.add_alignment((e_phrase_1,f_phrase))
                    TLMO.increment_pp_counts((e_phrase_1,f_phrase))
                    pair_score["is_a_bound_1"] = potential(e_phrase_1,f_phrase,e_line,f_line,TLMO,sent_pair,hp)*potential(e_phrase_2,None,e_line,f_line,TLMO,sent_pair,hp)
                    sent_pair.remove_alignment((e_phrase_1,f_phrase))
                    TLMO.decrement_pp_counts((e_phrase_1,f_phrase))
                    sent_pair.add_alignment((e_phrase_2,f_phrase))
                    TLMO.increment_pp_counts((e_phrase_2,f_phrase))
                    pair_score["is_a_bound_2"] = potential(e_phrase_1,None,e_line,f_line,TLMO,sent_pair,hp)*potential(e_phrase_2,f_phrase,e_line,f_line,TLMO,sent_pair,hp)
                    sent_pair.remove_alignment((e_phrase_2,f_phrase))
                    TLMO.decrement_pp_counts((e_phrase_2,f_phrase))
                    sent_pair.remove_boundary(lg,seg_ind)
                else:
                    sent_pair.add_boundary(lg,seg_ind)
                    e_phrase_1 = sent_pair.query_phrase_map(lg,seg_ind-1)
                    e_phrase_2 = sent_pair.query_phrase_map(lg,seg_ind)
                    pair_score["is_a_bound_1"] = potential(e_phrase_1,f_phrase,e_line,f_line,TLMO,sent_pair,hp)*potential(e_phrase_2,None,e_line,f_line,TLMO,sent_pair,hp)
                    pair_score["is_a_bound_2"] = potential(e_phrase_1,None,e_line,f_line,TLMO,sent_pair,hp)*potential(e_phrase_2,f_phrase,e_line,f_line,TLMO,sent_pair,hp)
                    sent_pair.remove_boundary(lg,seg_ind)
                bound_1_prob = pair_score["is_a_bound_1"]/(pair_score["is_a_bound_1"]+pair_score["is_a_bound_2"]+pair_score["is_not_a_bound"])
                bound_2_prob = pair_score["is_a_bound_2"]/(pair_score["is_a_bound_1"]+pair_score["is_a_bound_2"]+pair_score["is_not_a_bound"])
                no_bound_prob = pair_score["is_not_a_bound"]/(pair_score["is_a_bound_1"]+pair_score["is_a_bound_2"]+pair_score["is_not_a_bound"])
                #print("lg: e, start:no_bound", no_bound_prob+bound_1_prob+bound_2_prob,no_bound_prob,bound_1_prob,bound_2_prob)
                out_conf_decision = np.random.choice(["is_a_bound_1","is_a_bound_2","is_not_a_bound"],1,p=[bound_1_prob,bound_2_prob,no_bound_prob])[0]
                if out_conf_decision == "is_a_bound_1":
                    # Need to control for whether f_phrase is aligned or not?
                    sent_pair.add_boundary(lg,seg_ind)
                    if f_phrase:
                        e_phrase_1 = sent_pair.query_phrase_map(lg,seg_ind-1)
                        e_phrase_2 = sent_pair.query_phrase_map(lg,seg_ind)
                        sent_pair.add_alignment((e_phrase_1,f_phrase))
                        TLMO.increment_pp_counts((e_phrase_1,f_phrase))
                elif out_conf_decision == "is_a_bound_2":
                    # Need to control for whether f_phrase is aligned or not?
                    sent_pair.add_boundary(lg,seg_ind)
                    if f_phrase:
                        e_phrase_1 = sent_pair.query_phrase_map(lg,seg_ind-1)
                        e_phrase_2 = sent_pair.query_phrase_map(lg,seg_ind)
                        sent_pair.add_alignment((e_phrase_2,f_phrase))
                        TLMO.increment_pp_counts((e_phrase_2,f_phrase))
                elif out_conf_decision == "is_not_a_bound":
                    # leave the state as no boundary
                    if f_phrase:
                        sent_pair.add_alignment((e_phrase,f_phrase))
                        TLMO.increment_pp_counts((e_phrase,f_phrase))
                return (sent_pair, TLMO)
        elif lg == "f":
            if sent_pair.query_boundary(lg,seg_ind):
                f_phrase_1 = sent_pair.query_phrase_map(lg,seg_ind-1)
                f_phrase_2 = sent_pair.query_phrase_map(lg,seg_ind)
                e_phrase_1 = sent_pair.query_alignment_dict("f-e",f_phrase_1)
                e_phrase_2 = sent_pair.query_alignment_dict("f-e",f_phrase_2)
                if e_phrase_1 and e_phrase_2:    # If both phrases are aligned:
                    #print("lg:f, Both aligned!")
                    return (sent_pair, TLMO)             # do nothing! Pass completely to preserve Gibbs integrity.
                pair_score["is_a_bound"] = potential(e_phrase_1,f_phrase_1,e_line,f_line,TLMO,sent_pair,hp)*potential(e_phrase_2,f_phrase_2,e_line,f_line,TLMO,sent_pair,hp)
                sent_pair.remove_boundary(lg,seg_ind)    # Then, take scores for not having a boundary... update sent_pair.phrase_map
                new_f_phrase = sent_pair.query_phrase_map(lg,seg_ind)
                if e_phrase_2:                   # If the first phrase is aligned, score new phrase aligned to it
                    sent_pair.remove_alignment((e_phrase_2,f_phrase_2))
                    TLMO.decrement_pp_counts((e_phrase_2,f_phrase_2))
                    sent_pair.add_alignment((e_phrase_2,new_f_phrase))
                    TLMO.increment_pp_counts((e_phrase_2,new_f_phrase))
                    pair_score["is_not_a_bound"] = potential(e_phrase_2,new_f_phrase,e_line,f_line,TLMO,sent_pair,hp)
                    sent_pair.remove_alignment((e_phrase_2,new_f_phrase))
                    TLMO.decrement_pp_counts((e_phrase_2,new_f_phrase))
                elif e_phrase_1:
                    sent_pair.remove_alignment((e_phrase_1,f_phrase_1))
                    TLMO.decrement_pp_counts((e_phrase_1,f_phrase_1))
                    sent_pair.add_alignment((e_phrase_1,new_f_phrase))
                    TLMO.increment_pp_counts((e_phrase_1,new_f_phrase))
                    pair_score["is_not_a_bound"] = potential(e_phrase_1,new_f_phrase,e_line,f_line,TLMO,sent_pair,hp)
                    sent_pair.remove_alignment((e_phrase_1,new_f_phrase))
                    TLMO.decrement_pp_counts((e_phrase_1,new_f_phrase))
                else:   # if neither f_phrase is aligned
                    pair_score["is_not_a_bound"] = potential(None,new_f_phrase,e_line,f_line,TLMO,sent_pair,hp)
                bound_prob = pair_score["is_a_bound"]/(pair_score["is_a_bound"]+pair_score["is_not_a_bound"])
                no_bound_prob = pair_score["is_not_a_bound"]/(pair_score["is_a_bound"]+pair_score["is_not_a_bound"])
                #print("lg: f, start:bound", no_bound_prob+bound_prob,no_bound_prob,bound_prob)
                out_conf_decision = np.random.choice(["is_a_bound","is_not_a_bound"],1,p=[bound_prob,no_bound_prob])[0]
                if out_conf_decision == "is_a_bound":
                    # restore boundary
                    if e_phrase_2:
                        sent_pair.add_boundary(lg,seg_ind)
                        sent_pair.add_alignment((e_phrase_2,f_phrase_2))
                        TLMO.increment_pp_counts((e_phrase_2,f_phrase_2))
                    elif e_phrase_1:
                        sent_pair.add_boundary(lg,seg_ind)
                        sent_pair.add_alignment((e_phrase_1,f_phrase_1))
                        TLMO.increment_pp_counts((e_phrase_1,f_phrase_1))
                    else:
                        sent_pair.add_boundary(lg,seg_ind)
                    return (sent_pair, TLMO)
                elif out_conf_decision == "is_not_a_bound":
                    # keep the boundary removed
                    if e_phrase_2:
                        sent_pair.add_alignment((e_phrase_2,new_f_phrase))
                        TLMO.increment_pp_counts((e_phrase_2,new_f_phrase))
                    elif e_phrase_1:
                        sent_pair.add_alignment((e_phrase_1,new_f_phrase))
                        TLMO.increment_pp_counts((e_phrase_1,new_f_phrase))
                    return (sent_pair, TLMO)
            else:    # If the position is not a phrase boundary, and lg is "f":
                f_phrase = sent_pair.query_phrase_map(lg,seg_ind)
                e_phrase = sent_pair.query_alignment_dict("f-e",f_phrase)
                pair_score["is_not_a_bound"] = potential(e_phrase,f_phrase,e_line,f_line,TLMO,sent_pair,hp)
                if e_phrase:
                    sent_pair.remove_alignment((e_phrase,f_phrase))
                    sent_pair.add_boundary(lg,seg_ind)
                    f_phrase_1 = sent_pair.query_phrase_map(lg,seg_ind-1)
                    f_phrase_2 = sent_pair.query_phrase_map(lg,seg_ind)
                    sent_pair.add_alignment((e_phrase,f_phrase_1))
                    TLMO.increment_pp_counts((e_phrase,f_phrase_1))
                    pair_score["is_a_bound_1"] = potential(e_phrase,f_phrase_1,e_line,f_line,TLMO,sent_pair,hp)*potential(None,f_phrase_2,e_line,f_line,TLMO,sent_pair,hp)
                    sent_pair.remove_alignment((e_phrase,f_phrase_1))
                    TLMO.decrement_pp_counts((e_phrase,f_phrase_1))
                    sent_pair.add_alignment((e_phrase,f_phrase_2))
                    TLMO.increment_pp_counts((e_phrase,f_phrase_2))
                    pair_score["is_a_bound_2"] = potential(None,f_phrase_1,e_line,f_line,TLMO,sent_pair,hp)*potential(e_phrase,f_phrase_2,e_line,f_line,TLMO,sent_pair,hp)
                    sent_pair.remove_alignment((e_phrase,f_phrase_2))
                    TLMO.decrement_pp_counts((e_phrase,f_phrase_2))
                    sent_pair.remove_boundary(lg,seg_ind)
                else:
                    sent_pair.add_boundary(lg,seg_ind)
                    f_phrase_1 = sent_pair.query_phrase_map(lg,seg_ind-1)
                    f_phrase_2 = sent_pair.query_phrase_map(lg,seg_ind)
                    pair_score["is_a_bound_1"] = potential(e_phrase,f_phrase_1,e_line,f_line,TLMO,sent_pair,hp)*potential(None,f_phrase_2,e_line,f_line,TLMO,sent_pair,hp)
                    pair_score["is_a_bound_2"] = potential(None,f_phrase_1,e_line,f_line,TLMO,sent_pair,hp)*potential(e_phrase,f_phrase_2,e_line,f_line,TLMO,sent_pair,hp)
                    sent_pair.remove_boundary(lg,seg_ind)
                bound_1_prob = pair_score["is_a_bound_1"]/(pair_score["is_a_bound_1"]+pair_score["is_a_bound_2"]+pair_score["is_not_a_bound"])
                bound_2_prob = pair_score["is_a_bound_2"]/(pair_score["is_a_bound_1"]+pair_score["is_a_bound_2"]+pair_score["is_not_a_bound"])
                no_bound_prob = pair_score["is_not_a_bound"]/(pair_score["is_a_bound_1"]+pair_score["is_a_bound_2"]+pair_score["is_not_a_bound"])
                #print("lg: f, start:no_bound", no_bound_prob+bound_1_prob+bound_2_prob,no_bound_prob,bound_1_prob,bound_2_prob)
                out_conf_decision = np.random.choice(["is_a_bound_1","is_a_bound_2","is_not_a_bound"],1,p=[bound_1_prob,bound_2_prob,no_bound_prob])[0]
                if out_conf_decision == "is_a_bound_1":
                    sent_pair.add_boundary(lg,seg_ind)
                    if e_phrase:
                        f_phrase_1 = sent_pair.query_phrase_map(lg,seg_ind-1)
                        f_phrase_2 = sent_pair.query_phrase_map(lg,seg_ind)
                        sent_pair.add_alignment((e_phrase,f_phrase_1))
                        TLMO.increment_pp_counts((e_phrase,f_phrase_1))
                    return (sent_pair, TLMO)
                elif out_conf_decision == "is_a_bound_2":
                    sent_pair.add_boundary(lg,seg_ind)
                    if e_phrase:
                        f_phrase_1 = sent_pair.query_phrase_map(lg,seg_ind-1)
                        f_phrase_2 = sent_pair.query_phrase_map(lg,seg_ind)
                        sent_pair.add_alignment((e_phrase,f_phrase_2))
                        TLMO.increment_pp_counts((e_phrase,f_phrase_2))
                    return (sent_pair, TLMO)
                elif out_conf_decision == "is_not_a_bound":
                    # leave the state as no boundary
                    if e_phrase:
                        sent_pair.add_alignment((e_phrase,f_phrase))
                        TLMO.increment_pp_counts((e_phrase,f_phrase))
                    return (sent_pair, TLMO)
    
    def TOGGLE(trg_ph,src_ph,e_line,f_line,sent_pair,TLMO,hp):
        """
        Resample whether the phrase pair (trg_ph, src_ph) is aligned.

        The pair is first taken out of the current state (if it is present in
        sent_pair's alignments), then scored with potential() twice: once with
        the pair absent and once with the pair temporarily added to both
        sent_pair and the TLMO counts. One outcome is drawn with probability
        proportional to its score, applied, and (sent_pair, TLMO) is returned.

        Known issue noted by the original author: there seems to be a bug
        here where TOGGLE consistently lowers the number of aligned phrase
        pairs, if allowed to run enough time.
        """
        candidate = (trg_ph,src_ph)

        # Take the pair out of the state so both outcomes are scored from the
        # same starting point.
        if candidate in sent_pair.get_alignments():
            TLMO.decrement_pp_counts(candidate)
            sent_pair.remove_alignment(candidate)

        # Score with the pair absent.
        score_unaligned = potential(trg_ph,src_ph,e_line,f_line,TLMO,sent_pair,hp)

        # Score with the pair present, then undo the temporary change.
        TLMO.increment_pp_counts(candidate)
        sent_pair.add_alignment(candidate)
        score_aligned = potential(trg_ph,src_ph,e_line,f_line,TLMO,sent_pair,hp)
        TLMO.decrement_pp_counts(candidate)
        sent_pair.remove_alignment(candidate)

        # Normalise the two scores into a distribution and draw one outcome.
        normaliser = score_unaligned+score_aligned
        outcome_probs = [score_unaligned/normaliser, score_aligned/normaliser]
        outcome = np.random.choice(["unaligned","aligned"],1,p=outcome_probs)[0]

        # The state is already "unaligned"; only the other outcome needs work.
        if outcome == "aligned":
            sent_pair.add_alignment(candidate)
            TLMO.increment_pp_counts(candidate)
        sent_pair.update_alignment_dict()
        return (sent_pair, TLMO)
    
    
    # Main code for collapsed Gibbs sampler
    # Reads the source and target text files in lockstep (one sentence pair per
    # line) and, for every (f word index, e word index) combination in the
    # sentence, applies the SWAP, FLIP and TOGGLE operators in that order.
    with open(srctext_file,"r") as srctext_f, \
    open(trgtext_file,"r") as trgtext_f:
        for sent_ind, (fline, eline) in enumerate(zip(srctext_f,trgtext_f)):
            #print(sent_ind)
            sent_pair = TLMO.get_sent_pair(sent_ind)
            # Whitespace-tokenise both sides of the sentence pair.
            e_line = eline.strip(" \n\r").split()
            f_line = fline.strip(" \n\r").split()
            # e_prev holds the e phrase seen in the previous inner iteration.
            # It is reset once per sentence, not once per f_ind.
            e_prev = None
            for f_ind, f_word in enumerate(f_line):
                for e_ind, e_word in enumerate(e_line):
                    # Apply SWAP, if applicable
                    # Not done: CHECK TO ENSURE THAT WE'RE NOT MISSING ANY PHRASES IN EITHER LANGUAGE!
                    e_phrase = sent_pair.query_phrase_map("e",e_ind)
                    f_phrase = sent_pair.query_alignment_dict("e-f",e_phrase)
                    # Only consider SWAP when this e phrase is aligned and is
                    # not the same phrase handled in the previous iteration.
                    if f_phrase and (e_phrase != e_prev):
                        # The swap partner is the next e phrase in sorted order.
                        sorted_e_ph = sorted(sent_pair.get_e_phrases())
                        next_e_ind = (sorted_e_ph.index(e_phrase))+1
                        if next_e_ind <= len(sorted_e_ph)-1:
                            e_phrase_2 = sorted_e_ph[next_e_ind]
                            f_phrase_2 = sent_pair.query_alignment_dict("e-f",e_phrase_2)
                            # actual call to SWAP.
                            if f_phrase_2:
                                sent_pair, TLMO = SWAP(sent_ind,e_phrase,f_phrase,e_phrase_2,f_phrase_2,e_line,f_line,sent_pair,TLMO,hp)
                                #print("SWAP: ",sent_pair.get_aligned()["f"])
                    # Drop the name; f_phrase is rebound below before TOGGLE.
                    del f_phrase
                    # Apply FLIP, if applicable
                    # Index 0 is skipped on both sides.
                    if e_ind != 0:
                        sent_pair, TLMO = FLIP("e",e_ind,e_line,f_line,sent_pair,TLMO,hp)
                    if f_ind != 0:
                        sent_pair, TLMO = FLIP("f",f_ind,e_line,f_line,sent_pair,TLMO,hp)
                    # Apply TOGGLE, if applicable
                    # NOTE(review): e_phrase was looked up before SWAP/FLIP ran;
                    # if FLIP changed the segmentation at e_ind it may be stale
                    # here - confirm.
                    f_phrase = sent_pair.query_phrase_map("f",f_ind)
                    # NOTE(review): f_prev is assigned but never read in this loop.
                    if f_ind != 0:
                        f_prev = sent_pair.query_phrase_map("f",f_ind-1)
                    if e_phrase != e_prev:
                        # NOTE(review): TOGGLE is only invoked when the pair is
                        # already in the alignment set (or both phrases are
                        # falsy), so from this call site it cannot align a pair
                        # that is currently unaligned. This may be related to
                        # the issue described in TOGGLE's docstring - confirm.
                        if ((e_phrase,f_phrase) in sent_pair.get_alignments()) or (not e_phrase and not f_phrase):
                            sent_pair, TLMO = TOGGLE(e_phrase,f_phrase,e_line,f_line,sent_pair,TLMO,hp)
                            #print("TOGGLE: ",sent_pair.get_aligned()["f"])
                    # Apply FLIP TWO, if applicable
                    
                    # Apply MOVE, if applicable
                    
                    e_prev = e_phrase
#            final_probability = 0   # PLACEHOLDER - fill with model calculation.
    # Hand the TLMO object back to the caller after one pass over the corpus.
    return TLMO

In [43]:
def print_alignments(TLMO):
    """
    Print the alignments of every sentence pair stored in TLMO.

    For each key of TLMO.get_sent_pair_dict() the key is printed followed by
    ": ", then one tab-indented line per (e, f) alignment, ordered by the
    index of the first word of the e phrase.
    """
    def first_e_index(pair):
        # pair[0] is the e phrase; its first element is an (index, word) tuple.
        return pair[0][0][0]

    all_pairs = TLMO.get_sent_pair_dict()
    for sent_id in all_pairs:
        print(str(sent_id)+": ")
        ordered = sorted(all_pairs[sent_id].get_alignments(),key=first_e_index)
        for pair in ordered:
            e, f = pair
            print("\t",e,"\t",f)

In [60]:
# Intended command-line entry point: build the hyperparameters and the TLMO
# object, load the IBM Model 1 data and initialise the sampler state.
# NOTE(review): the guard compares __name__ against "main" rather than
# "__main__", so this block is skipped when the file is run as a script -
# confirm whether that is intentional. The hyperparameter names (p_S, b, p_phi,
# alpha, p_s) and the *_file names are not assigned inside this block, so they
# must already exist when it does run.
if __name__ == "main":
    # Use argparse to allow user to enter hyperparameters and input files
    hp = Hyperparameters(p_S,b,p_phi,alpha,p_s)
    TLMO = TextLevelMutableObjects()
    TLMO = process_IBM_Model_1_data(src_vocab_file,trg_vocab_file,src_trg_alignment_file,trg_src_alignment_file,TLMO)
    TLMO = initialize_tlmo(srctext_file,trgtext_file,alignments_file,TLMO)

In [74]:
# Example run on the two-line data set under ../data/example/two_line/
# (file names suggest Spanish source text and English target text).
src_vocab_file = "../data/example/two_line/f-srctext-spa.vcb"
trg_vocab_file = "../data/example/two_line/e-trgtext-eng.vcb"
src_trg_alignment_file = "../data/example/two_line/f_e_word_alignment_probs.txt"
trg_src_alignment_file = "../data/example/two_line/e_f_word_alignment_probs.txt"
srctext_file = "../data/example/two_line/f-srctext-spa.txt"
trgtext_file = "../data/example/two_line/e-trgtext-eng.txt"
alignments_file = "../data/example/two_line/berkeley_alignments_spa_eng.txt"

# Positional order is (p_S, b, p_phi, alpha, p_s).
# NOTE(review): 10e-10 evaluates to 1e-09 - confirm 1e-10 was not intended.
hp = Hyperparameters(0.1,0.85,10e-10,100,0.8)
# Pipeline: load the IBM Model 1 tables, initialise the sampler state from the
# alignments file, then run one call of the collapsed Gibbs aligner.
TLMO = TextLevelMutableObjects()
TLMO = process_IBM_Model_1_data(src_vocab_file,trg_vocab_file,src_trg_alignment_file,trg_src_alignment_file,TLMO)
TLMO = initialize_tlmo(srctext_file,trgtext_file,alignments_file,TLMO)
TLMO = collapsed_gibbs_aligner(srctext_file,trgtext_file,TLMO,hp)
# Need to resolve bug in which FLIP is assigning 99%-100% probability to "is_not_a_bound" in almost every case! Why?
# Need to resolve bug in which potential() is being asked (in FLIP) to evaluate a <None, None> "phrase" pair.
#print_alignments(TLMO)

In [146]:
# Scratch check of the phrase-pair tuple layout: new[0] is the e phrase,
# new[0][0] its first (index, word) entry and new[0][0][0] the integer index.
new = (((15, '.'),), None)
# Calling len() on that integer index is what raises the TypeError recorded
# in the cell output below.
print(new[0][0][0],len(new[0][0][0]),type(new[0][0][0]))

TypeError: object of type 'int' has no len()

In [None]:
def _unaligned_runs(words, aligned_indices):
    """
    Return the maximal runs of consecutive unaligned words, each run as a
    tuple of (index, word) pairs in sentence order.
    """
    runs = []
    run = []
    for idx, word in enumerate(words):
        if idx not in aligned_indices:
            run.append((idx, word))
        elif run:
            # An aligned word closes the run collected so far.
            runs.append(tuple(run))
            run = []
    if run:
        runs.append(tuple(run))
    return runs


def extract_phrases(srctext,trgtext,alignment_output_line):
    """
    Helper Function:
    Extracts phrases from the alignment output for a single sentence.
    For unaligned phrases, creates a phrase pair where null side is None.

    srctext and trgtext are whitespace-tokenised sentences (the "f" and "e"
    sides respectively). alignment_output_line is a whitespace-separated list
    of "f-e" word-index pairs.

    Returns a set of (e_phrase, f_phrase) tuples. Each phrase is a tuple of
    (index, word) pairs sorted by index, or None on the side that has no
    counterpart. Consecutive unaligned words on one side form one phrase.

    Raises ValueError if an alignment token is not of the form "int-int" and
    KeyError if an alignment index is outside the corresponding sentence.
    """
    src_words = srctext.split()
    trg_words = trgtext.split()
    src_dict = dict(enumerate(src_words))
    trg_dict = dict(enumerate(trg_words))

    # Alignment links as (f, e) integer pairs, ordered by f and then e.
    links = sorted(
        (int(f), int(e))
        for f, e in (token.split("-") for token in alignment_output_line.split())
    )
    f_aligned = {f for f, _ in links}
    e_aligned = {e for _, e in links}

    # Walk the sorted links and start a new group whenever either side jumps
    # by more than one position relative to the previous link.
    groups = []
    current = set()
    prev_e = 0
    prev_f = 0
    for f, e in links:
        if abs(e - prev_e) > 1 or abs(f - prev_f) > 1:
            if current:
                groups.append(current)
            current = {(e, f)}
        else:
            current.add((e, f))
        prev_e = e
        prev_f = f
    if current:
        groups.append(current)

    full_phrase_set = set()

    # Aligned phrase pairs: attach the words to the indices on both sides.
    for group in groups:
        phrase_e = tuple(sorted({(e, trg_dict[e]) for e, _ in group}))
        phrase_f = tuple(sorted({(f, src_dict[f]) for _, f in group}))
        full_phrase_set.add((phrase_e, phrase_f))

    # Unaligned phrases: the missing side is represented by None.
    for run in _unaligned_runs(src_words, f_aligned):
        full_phrase_set.add((None, run))
    for run in _unaligned_runs(trg_words, e_aligned):
        full_phrase_set.add((run, None))

    return full_phrase_set

# Ad-hoc check of extract_phrases on one example sentence pair. Each token of
# alignment_output_line is an "f-e" pair of word indices into srctext and
# trgtext respectively.
srctext = "Judá fue padre de Fares y de Zérah , y su madre fue Tamar . Fares fue padre de Hesrón y éste de Aram ."
trgtext = "Judah and Tamar were the father and mother of Perez and Zerah . Perez was the father of Hezron , Hezron the father of Ram ,"
alignment_output_line = "0-0 17-16 7-11 16-14 5-10 13-2 15-13 4-9 14-12 3-8 13-11 23-24 22-23 11-7 9-6 19-20 19-18 18-17"
full_phrase_set = extract_phrases(srctext,trgtext,alignment_output_line)
# Print every extracted (e_phrase, f_phrase) pair; set iteration order is
# not defined, so the output order may vary between runs.
for i in full_phrase_set:
    print(i)