In [2]:
from collections import Counter
from collections import defaultdict as dd
import copy
import unicodecsv as csv
import numpy as np

In [9]:
class Hyperparameters:
    """
    Plain container for the five scalar hyperparameters handed to the sampler.

    The values are stored exactly as given; nothing is validated or converted.

    NOTE(review): judging only by the formulas in collapsed_gibbs_aligner,
    p_s feeds the phrase-length terms p_f / p_e, b is the base of the
    distortion term delta, alpha weights the base measure in tau, and
    p_S / p_phi scale the objective - confirm against the model description.
    """
    def __init__(self,p_S,b,p_phi,alpha,p_s):
        self.p_S = p_S
        self.b = b
        self.p_phi = p_phi
        self.alpha = alpha
        self.p_s = p_s

    # Getters. Each one needs ``self``: without it, calling the method on an
    # instance raised TypeError (and the body could not resolve ``self``).
    def get_p_S(self):
        """Return the stored p_S value."""
        return self.p_S
    def get_b(self):
        """Return the stored b value."""
        return self.b
    def get_p_phi(self):
        """Return the stored p_phi value."""
        return self.p_phi
    def get_alpha(self):
        """Return the stored alpha value."""
        return self.alpha
    def get_p_s(self):
        """Return the stored p_s value."""
        return self.p_s

    # Setters. Each replaces one stored value with ``new``.
    def set_p_S(self,new):
        """Replace the stored p_S value."""
        self.p_S = new
    def set_b(self,new):
        """Replace the stored b value."""
        self.b = new
    def set_p_phi(self,new):
        """Replace the stored p_phi value."""
        self.p_phi = new
    def set_alpha(self,new):
        """Replace the stored alpha value."""
        self.alpha = new
    def set_p_s(self,new):
        """Replace the stored p_s value."""
        self.p_s = new

In [10]:
# To use later to improve code - make each sentence pair an object, with easy ways to modify
# phrases and alignments
class SentencePair:
    """
    One sentence pair of the parallel text, with its phrase segmentation and
    phrase-level alignment.

    A phrase is a tuple of (word_index, word) tuples, e.g.
    ((0, "Michael"), (1, "assumes")). Languages are addressed by the strings
    "e" (english / target) and "f" (foreign / source).

    Attributes
    ----------
    ind : the line number of the sentence in the parallel text
    e_sent, f_sent : text of the english and the foreign sentence
    phrases : dict "e"/"f" -> set of phrases on that side
    alignment : set of (e_phrase, f_phrase) pairs where both sides are non-null
    aligned : dict "e"/"f" -> set of word indices covered by an aligned phrase
    boundaries : dict "e"/"f" -> set of word indices at which a phrase starts
    phrase_map : dict "e"/"f" -> {word_index: phrase containing that word}
    alignment_dict : {"e-f": {e_phrase: f_phrase}, "f-e": {f_phrase: e_phrase}}
    """
    def __init__(self,ind,e_sent,f_sent,phrase_pair_set):
        self.ind = ind                # the line number of the sentence in the parallel text
        self.e_sent = e_sent          # text of english ("target") sentence
        self.f_sent = f_sent          # text of foreign ("source") sentence
        self.phrases, self.alignment, self.aligned = self.build_ph_align_objs(phrase_pair_set)
        self.boundaries = self.build_boundaries(self.phrases)
        self.phrase_map = self.build_phrase_map(self.phrases)
        self.alignment_dict = self.build_alignment_dict(self.alignment)

    def build_ph_align_objs(self,phrase_pair_set):
        """
        Split a set of phrase pairs into phrases, alignment and aligned indices.

        Assumes pairs in the form of tuples of (e_phrase, f_phrase) =
        ( ((0,"Michael"),(1,"assumes")), ((0,"Michael"),(1,"geht"),(2,"davon"),(3,"aus")) )
        A null side is None (or any other falsy value) and is skipped.

        Returns the tuple (phrases, alignment, aligned); nothing on self is modified.
        """
        phrases = dd(set)
        alignment = set()
        aligned = dd(set)
        for pair in phrase_pair_set:
            e_phrase, f_phrase = pair[0], pair[1]
            if e_phrase:
                phrases["e"].add(e_phrase)
            if f_phrase:
                phrases["f"].add(f_phrase)
            if e_phrase and f_phrase:
                alignment.add((e_phrase,f_phrase))
                for (ind, word) in e_phrase:
                    aligned["e"].add(ind)
                for (ind, word) in f_phrase:
                    aligned["f"].add(ind)
        return phrases, alignment, aligned

    def build_boundaries(self,phrases=None):
        """
        Return {"e"/"f": set of the first word index of every phrase on that side}.
        Uses self.phrases when no phrases mapping is passed.
        """
        if phrases is None:
            phrases = self.phrases
        boundaries = dd(set)
        for lg in ("e","f"):
            for phrase in phrases[lg]:
                boundaries[lg].add(phrase[0][0])
        return boundaries

    def build_phrase_map(self,phrases=None):
        """
        Return {"e"/"f": {word_index: phrase}} so the phrase containing a word
        can be looked up by index. Uses self.phrases when no mapping is passed.
        """
        if phrases is None:
            phrases = self.phrases
        phrase_map = {"e":dict(), "f":dict()}
        for lg in ("e","f"):
            for phrase in phrases[lg]:
                for (ind, word) in phrase:
                    phrase_map[lg][ind] = phrase
        return phrase_map

    def update_phrase_map(self):
        """Rebuild self.phrase_map from the current self.phrases."""
        self.phrase_map = self.build_phrase_map(self.phrases)

    def query_phrase_map(self,lg,index):
        """
        Return the phrase of language lg that contains word number index,
        or None when no phrase covers that index.
        """
        return self.phrase_map[lg].get(index)

    def build_alignment_dict(self,alignment=None):
        """
        Return {"e-f": {e_phrase: f_phrase}, "f-e": {f_phrase: e_phrase}}.
        Uses self.alignment when no alignment set is passed. If a phrase takes
        part in several pairs only one partner survives in each direction.
        """
        if alignment is None:
            alignment = self.alignment
        alignment_dict = {"e-f":dict(), "f-e":dict()}
        for pair in alignment:
            alignment_dict["e-f"][pair[0]] = pair[1]
            alignment_dict["f-e"][pair[1]] = pair[0]
        return alignment_dict

    def update_alignment_dict(self):
        """Rebuild self.alignment_dict from the current self.alignment."""
        self.alignment_dict = self.build_alignment_dict(self.alignment)

    def query_alignment_dict(self,direction,phrase):
        """
        direction must be either "e-f" or "f-e"
        Returns the partner phrase, or None if phrase is not aligned.
        """
        if phrase in self.alignment_dict[direction]:
            return self.alignment_dict[direction][phrase]
        else:
            return None

    def get_ind(self):
        return self.ind

    def get_e_sent(self):
        return self.e_sent

    def get_f_sent(self):
        return self.f_sent

    def get_e_ph(self):
        """Return the set of english phrases."""
        return self.phrases["e"]

    def get_f_ph(self):
        """Return the set of foreign phrases."""
        return self.phrases["f"]

    # Longer aliases; the sampler cell refers to get_e_phrases().
    def get_e_phrases(self):
        """Alias of get_e_ph()."""
        return self.get_e_ph()

    def get_f_phrases(self):
        """Alias of get_f_ph()."""
        return self.get_f_ph()

    def get_alignment(self):
        return self.alignment

    def get_aligned(self):
        return self.aligned

    def remove_alignment(self,pair):
        """
        pair must be a tuple of the shape (e_phrase,f_phrase)
        Raises KeyError if the pair is not currently aligned.
        """
        self.alignment.remove(pair)
        # discard, not remove: an index may already be gone when phrases overlap
        for (ind,word) in pair[0]:
            self.aligned["e"].discard(ind)
        for (ind,word) in pair[1]:
            self.aligned["f"].discard(ind)
        # keep the lookup table in step, otherwise queries return the removed partner
        self.update_alignment_dict()

    def add_alignment(self,pair):
        """
        pair must be a tuple of the shape (e_phrase,f_phrase)
        """
        self.alignment.add(pair)
        for (ind,word) in pair[0]:
            self.aligned["e"].add(ind)
        for (ind,word) in pair[1]:
            self.aligned["f"].add(ind)
        self.update_alignment_dict()

    def query_alignment_e(self,e_phrase):
        """Return the foreign phrase aligned to e_phrase, or None if it is unaligned."""
        return self.query_alignment_dict("e-f",e_phrase)

    def remove_boundary(self,lg,bound):
        """
        Merge the phrase ending at bound-1 with the phrase starting at bound.

        lg must be either the string "e" or "f" (english or foreign)
        bound must be an integer

        Raises KeyError if bound is not a recorded boundary and ValueError if
        the two phrases around it cannot be found; nothing is modified then.
        NOTE(review): entries of self.alignment that mention the two old
        phrases are not rewritten - remove those alignments before merging.
        """
        if bound not in self.boundaries[lg]:
            raise KeyError(bound)
        first_phrase = None
        second_phrase = None
        for phrase in self.phrases[lg]:
            for (ind,word) in phrase:
                if ind == bound-1:
                    first_phrase = phrase
                if ind == bound:
                    second_phrase = phrase
        if first_phrase is None or second_phrase is None or first_phrase == second_phrase:
            raise ValueError("no two distinct phrases meet at boundary %r" % (bound,))
        self.boundaries[lg].remove(bound)
        self.phrases[lg].remove(first_phrase)
        self.phrases[lg].remove(second_phrase)
        new_phrase = (*first_phrase,*second_phrase)
        self.phrases[lg].add(new_phrase)
        self.update_phrase_map()

    def add_boundary(self,lg,bound):
        """
        Split the phrase containing word number bound so that bound starts a phrase.

        lg must be either the string "e" or "f" (english or foreign)
        bound must be an integer

        Does nothing to the phrases when bound already starts a phrase.
        Raises ValueError if no phrase contains bound.
        NOTE(review): entries of self.alignment that mention the old phrase
        are not rewritten - remove that alignment before splitting.
        """
        target_phrase = None
        for phrase in self.phrases[lg]:
            if any(ind == bound for (ind, word) in phrase):
                target_phrase = phrase
                break
        if target_phrase is None:
            raise ValueError("no %r phrase contains word index %r" % (lg, bound))
        first_phrase = tuple(item for item in target_phrase if item[0] < bound)
        second_phrase = tuple(item for item in target_phrase if item[0] >= bound)
        self.boundaries[lg].add(bound)
        if not first_phrase:
            # bound is already the first word of its phrase; splitting would
            # only create an empty phrase
            return
        self.phrases[lg].remove(target_phrase)
        self.phrases[lg].add(first_phrase)
        self.phrases[lg].add(second_phrase)
        self.update_phrase_map()

In [11]:
class TextLevelMutableObjects:
    """
    Holds the corpus-level state that would otherwise be global variables.

    Attributes
    ----------
    vocab : dict "e"/"f" -> set of word types seen on that side
    pp_counts : Counter of (e_phrase, f_phrase) phrase pairs
    P_WA : nested dict, P_WA[word1][word2] = stored probability for the word pair
    lowest_WA_prob : smallest probability recorded so far (starts at 1.0)
    sent_pair_dict : dict sentence index -> sentence pair object
    """
    def __init__(self):
        self.vocab = dd(set)
        self.pp_counts = Counter()      # pp_counts = phrase pair counts
        self.P_WA = dd(dict)
        self.lowest_WA_prob = 1.0       # initialize at highest possible probability to eventually decrease it
        self.sent_pair_dict = dict()

    def get_n_e(self):
        """Return the number of distinct english words."""
        return len(self.vocab["e"])

    def get_n_f(self):
        """Return the number of distinct foreign words."""
        return len(self.vocab["f"])

    def get_vocab(self):
        return self.vocab

    def vocab_add(self,lg,word):
        """
        lg must be "e" or "f", though this isn't enforced right now.
        No reason to have a counterpart remove method right now...
        """
        self.vocab[lg].add(word)

    def init_vocab_from_text(self,lg,text):
        """Add every whitespace-separated token of text to the vocabulary of lg."""
        for word in text.split():
            self.vocab[lg].add(word)

    def get_pp_counts(self):
        return self.pp_counts

    def decrement_pp_counts(self,phrase_pair):
        """Lower the count of phrase_pair by one. Counts are not clamped, so they can reach zero or go negative."""
        self.pp_counts[phrase_pair] -= 1

    def increment_pp_counts(self,phrase_pair):
        """Raise the count of phrase_pair by one."""
        self.pp_counts[phrase_pair] += 1

    def get_pp_count(self,phrase_pair):
        """Return the count of phrase_pair (0 if it was never counted)."""
        return self.pp_counts[phrase_pair]

    def init_pp_counts_from_pair_set(self,phrase_pair_set):
        """
        Count every pair once and add its words to the vocabularies.

        Careful! This assumes that these are true phrase pairs, where both sides are non-null.
        Guard against violations of this if necessary.
        """
        for pair in phrase_pair_set:
            self.pp_counts[pair] += 1
            for (ind, word) in pair[0]:
                self.vocab["e"].add(word)
            for (ind, word) in pair[1]:
                self.vocab["f"].add(word)

    def get_lowest_WA_prob(self):
        return self.lowest_WA_prob

    def set_lowest_WA_prob(self,new_val):
        self.lowest_WA_prob = new_val

    def get_P_WA(self,word1,word2):
        """
        Return the stored probability for (word1, word2).
        Raises KeyError if the pair was never stored.
        """
        # .get() bypasses the defaultdict factory, so a failed lookup no longer
        # leaves an empty row behind that would make ``word1 in P_WA`` true.
        row = self.P_WA.get(word1)
        if row is None:
            raise KeyError(word1)
        return row[word2]

    def set_P_WA(self,word1,word2,prob):
        """Store prob for (word1, word2), overwriting any earlier value."""
        self.P_WA[word1][word2] = prob

    def get_sent_pair_dict(self):
        return self.sent_pair_dict

    def get_sent_pair(self,index):
        """Return the sentence pair stored under index; raises KeyError if absent."""
        return self.sent_pair_dict[index]

    def set_sent_pair(self,index,SentPair):
        """
        Store SentPair under index.
        SentPair should be a SentencePair object
        """
        self.sent_pair_dict[index] = SentPair

In [None]:
def process_IBM_Model_1_data(src_vocab_file,trg_vocab_file,src_trg_alignment_file,trg_src_alignment_file,TLMO):
    """
    Takes the output of IBM Model 1 alignments from GIZA++ and fills TLMO with
    the vocabularies and a lookup table for the translation probabilities of
    f|e and e|f.
    Here, src = foreign (f), trg = english (e)

    All four files are read as space-delimited UTF-8; only rows with exactly
    three fields are used. Vocabulary rows are read as (id, word, <ignored>),
    probability rows as (id1, id2, probability). Probability rows whose ids
    are missing from the vocabularies are skipped.

    The table is filled through TLMO.set_P_WA(word1, word2, prob), i.e.
    P_WA[word1][word2] = alignment probability of word1|word2
    # TODO: Is this correct given how GIZA++ does things?
    The smallest probability seen is recorded through TLMO.set_lowest_WA_prob.

    TLMO = TextLevelMutableObject for holding P_WA, lowest_WA_prob
    Returns the same TLMO object.
    """
    def three_field_rows(path):
        # Yield only the rows that have exactly three fields.
        with open(path,"rb") as handle:
            for row in csv.reader(handle,delimiter=" ",encoding="utf-8"):
                if len(row) == 3:
                    yield row

    def load_vocab(path,lg):
        # Build {word id: word} and register every word under language lg.
        lut = {}
        for row in three_field_rows(path):
            lut[int(row[0])] = row[1]
            TLMO.vocab_add(lg,row[1])
        return lut

    def load_probabilities(path,lut1,lut2):
        # Store the probability of (word1, word2) for every row whose ids are known.
        for row in three_field_rows(path):
            id1, id2 = int(row[0]), int(row[1])
            if id1 in lut1 and id2 in lut2:
                prob = float(row[2])
                # set_P_WA is a method; subscripting it raised TypeError
                TLMO.set_P_WA(lut1[id1],lut2[id2],prob)
                if prob < TLMO.get_lowest_WA_prob():
                    TLMO.set_lowest_WA_prob(prob)

    trg_lut = load_vocab(trg_vocab_file,"e")
    src_lut = load_vocab(src_vocab_file,"f")
    load_probabilities(trg_src_alignment_file,trg_lut,src_lut)
    load_probabilities(src_trg_alignment_file,src_lut,trg_lut)
    return TLMO

In [4]:
def initialize_tlmo(srctext_file,trgtext_file,alignments_file,TLMO):
    """
    Build one SentencePair per line of the parallel text and register it,
    its vocabulary and its aligned phrase pairs in TLMO.

    For now, these are the input files to the Berkeley aligner, as well as
    the output training.align file for alignments_file.
    Assumes that the two text input files have the same number of lines, and that these all have alignment output.
    (Reading stops at the end of the shortest of the three files.)
    Assume src = Foreign (f) and trg = English (e)

    Each alignment line is read as whitespace-separated "f-e" word index pairs.
    Returns the same TLMO object.
    """
    def extract_phrases(srctext,trgtext,alignment_output_line):
        """
        Helper Function:
        Extracts phrases from the alignment output for a single sentence.
        For unaligned phrases, creates a phrase pair where null side is None.

        Returns a set of (e_phrase, f_phrase) tuples; each phrase is a tuple
        of (word_index, word) tuples sorted by index.
        """
        srctext_lst = srctext.split()
        trgtext_lst = trgtext.split()

        # Word links as (f, e) index pairs, ordered by f and then e.
        links = sorted(
            (int(f), int(e))
            for f, e in (token.split("-") for token in alignment_output_line.split())
        )

        # Consecutive links stay in one phrase pair as long as neither index
        # moves by more than one position; a larger jump starts a new pair.
        link_groups = []
        current = set()
        prev_e = 0
        prev_f = 0
        for f, e in links:
            if abs(e - prev_e) > 1 or abs(f - prev_f) > 1:
                if current:
                    link_groups.append(current)
                current = set([(e,f)])
            else:
                current.add((e,f))
            prev_e = e
            prev_f = f
        if current:
            link_groups.append(current)

        full_phrase_set = set()
        for group in link_groups:
            phrase_e = sorted(set((e, trgtext_lst[e]) for e, f in group))
            phrase_f = sorted(set((f, srctext_lst[f]) for e, f in group))
            full_phrase_set.add((tuple(phrase_e), tuple(phrase_f)))

        def unaligned_runs(words,covered):
            # Maximal runs of consecutive word positions that no phrase covers.
            runs = []
            run = []
            for k, word in enumerate(words):
                if k in covered:
                    if run:
                        runs.append(tuple(run))
                        run = []
                else:
                    run.append((k, word))
            if run:
                runs.append(tuple(run))
            return runs

        # Every word outside the aligned phrases goes into a null-aligned
        # phrase. Working from the covered indices also catches words before
        # the first and after the last aligned phrase, and never reports a
        # word that an overlapping phrase already covers.
        covered_e = set(ind for e_side, f_side in full_phrase_set for ind, word in e_side)
        covered_f = set(ind for e_side, f_side in full_phrase_set for ind, word in f_side)
        for run in unaligned_runs(trgtext_lst,covered_e):
            full_phrase_set.add((run, None))
        for run in unaligned_runs(srctext_lst,covered_f):
            full_phrase_set.add((None, run))

        return full_phrase_set
        # end function extract_phrases

    # Main code within function initialize_tlmo()
    with open(srctext_file,"r",encoding="utf-8") as srctext_f, \
    open(trgtext_file,"r",encoding="utf-8") as trgtext_f, \
    open(alignments_file,"r") as alignments_f:
        for i, (sline,tline,aline) in enumerate(zip(srctext_f,trgtext_f,alignments_f)):
            src = sline.strip("\n\r")
            trg = tline.strip("\n\r")
            A = aline.strip("\n\r")
            full_phrase_set = extract_phrases(src,trg,A)
            sent_pair = SentencePair(i,trg,src,full_phrase_set)
            TLMO.init_vocab_from_text("e",trg)
            TLMO.init_vocab_from_text("f",src)
            # only true (non-null) pairs are counted
            TLMO.init_pp_counts_from_pair_set(sent_pair.get_alignment())
            # set_sent_pair takes (index, pair); the index was missing
            TLMO.set_sent_pair(i,sent_pair)

    return TLMO

In [None]:
def collapsed_gibbs_aligner(srctext_file,trgtext_file,TLMO,hp):
    """
    Run one sweep of the collapsed Gibbs phrase aligner over the parallel text.
    Will want to repeatedly call this for N iterations.

    For every sentence pair, each English phrase (visited in sentence order)
    is offered to SWAP together with the English phrase that follows it,
    provided both are currently aligned. TOGGLE is defined but, like
    FLIP / FLIP TWO / MOVE, not applied yet.

    Parameters
    ----------
    srctext_file, trgtext_file : paths of the foreign and the English text,
        one sentence per line; line i belongs to TLMO.get_sent_pair(i).
    TLMO : corpus-level state (phrase pair counts, P_WA table, vocabularies,
        sentence pairs). It is modified in place.
    hp : object with the attributes p_S, b, p_phi, alpha and p_s.

    Returns the same TLMO object.
    """
    # Hyperparameters and corpus-level constants used by the helpers below.
    p_S, b, p_phi, alpha, p_s = hp.p_S, hp.b, hp.p_phi, hp.alpha, hp.p_s
    n_e = TLMO.get_n_e()
    n_f = TLMO.get_n_f()
    lowest_prob = TLMO.get_lowest_WA_prob()

    # Supporting functions:
    def p_f(src_phrase,n_f):
        return (p_s*(1-p_s)**(len(src_phrase)))*((1/n_f)**(len(src_phrase)))

    def p_e(trg_phrase,n_e):
        return (p_s*(1-p_s)**(len(trg_phrase)))*((1/n_e)**(len(trg_phrase)))

    def word_prob(word1,word2):
        # P_WA[word1][word2], backing off to the lowest probability on record.
        # .get() keeps a defaultdict table from growing on failed lookups.
        row = TLMO.P_WA.get(word1)
        if row is None:
            return lowest_prob
        return row.get(word2,lowest_prob)

    def calc_P_WA(phrase,given_phrase):
        """
        IBM Model 1 style likelihood of phrase given given_phrase: for every
        word of phrase, the average of P_WA[word][other] over the words of
        given_phrase; the per-word averages are multiplied together.
        Both arguments are sequences of (index, word) 2-tuples.
        NOTE(review): the earlier draft read a per-sentence word alignment
        table (trg_align_dict / src_align_dict) that is defined nowhere in the
        notebook; scoring against the partner phrase needs no such table.
        Confirm this is the intended P_WA.
        """
        likelihood = 1.0
        for (ind, word) in phrase:
            total = sum(word_prob(word,other) for (other_ind, other) in given_phrase)
            likelihood *= total/len(given_phrase)
        return likelihood

    def calc_P_WA_trg_src(trg_phrase,src_phrase):
        """P_WA(e|f), e.g. trg_phrase = ((0,"Michael"),(1,"assumes"))."""
        return calc_P_WA(trg_phrase,src_phrase)

    def calc_P_WA_src_trg(src_phrase,trg_phrase):
        """P_WA(f|e), e.g. src_phrase = ((0,"Michael"),(1,"geht"),(2,"davon"),(3,"aus"))."""
        return calc_P_WA(src_phrase,trg_phrase)

    def delta(trg_phrase,src_phrase,tline,sline):
        """
        Distortion term: b raised to the distance between the first target
        index and the length-scaled first source index.
        tline / sline are the target and source sentences as word lists.
        """
        s = float(len(tline))/float(len(sline))
        return b**abs(trg_phrase[0][0]-(src_phrase[0][0]*s))

    def tau(trg_phrase,src_phrase,phrase_counts):
        base_measure = (p_f(src_phrase,n_f) * calc_P_WA_trg_src(trg_phrase,src_phrase)) * \
                       (p_e(trg_phrase,n_e) * calc_P_WA_src_trg(src_phrase,trg_phrase))
        # NOTE(review): len(phrase_counts) is the number of distinct keys
        # (including keys whose count is 0), not the total count - confirm.
        return (phrase_counts[(trg_phrase,src_phrase)] + (alpha * base_measure)) / \
               (len(phrase_counts) + alpha)

    def theta_N(trg_phrase,src_phrase):
        if trg_phrase and not src_phrase:
            return 0.5*p_e(trg_phrase,n_e)
        if src_phrase and not trg_phrase:
            return 0.5*p_f(src_phrase,n_f)
        raise ValueError("theta_N needs exactly one non-null phrase")

    def objective(trg_phrase,src_phrase,sent_pair):
        """
        This calculates the potential function's value for one phrase pair
        (either side may be None for a null alignment), using the current
        phrase pair counts in TLMO.
        Watch out! Need to be able to modify to accommodate Markov blanket...
        """
        if src_phrase and trg_phrase:
            tline = sent_pair.e_sent.split()
            sline = sent_pair.f_sent.split()
            return (1-p_S)*(1-p_phi)*tau(trg_phrase,src_phrase,TLMO.get_pp_counts())*delta(trg_phrase,src_phrase,tline,sline)
        return (1-p_S)*p_phi*theta_N(trg_phrase,src_phrase)

    def choose_config(labels,scores):
        # Draw one label with probability proportional to its score. When
        # every score is zero nothing can be normalised, so the first label
        # (by convention the current configuration) is kept.
        total = sum(scores)
        if not total > 0:
            return labels[0]
        probs = [score/total for score in scores]
        return str(np.random.choice(labels,1,p=probs)[0])

    def create_phrase_map(sline,tline,full_phrases):
        """
        Map every word index of sline / tline to the phrase of full_phrases
        that contains it, or to None. Returns (src_phrase_map, trg_phrase_map).
        Not used by the sampler below.
        """
        src_phrase_map = {}
        trg_phrase_map = {}
        for pair in full_phrases:
            trg = pair[0]
            src = pair[1]
            if trg:
                for entry in trg:
                    trg_phrase_map[entry[0]] = trg
            if src:
                for entry in src:
                    src_phrase_map[entry[0]] = src
        for src_ind in range(len(sline)):
            src_phrase_map.setdefault(src_ind,None)
        for trg_ind in range(len(tline)):
            trg_phrase_map.setdefault(trg_ind,None)
        return (src_phrase_map, trg_phrase_map)

    def SWAP(sent_ind,e_phrase,f_phrase,e_phrase_2,f_phrase_2,sent_pair,TLMO):
        """
        Resample between keeping (e_phrase,f_phrase),(e_phrase_2,f_phrase_2)
        and exchanging the two foreign sides. All four phrases must be
        non-null and both pairs must currently be aligned.
        """
        configs = {"ident":((e_phrase,f_phrase),(e_phrase_2,f_phrase_2)),
                   "swapped":((e_phrase,f_phrase_2),(e_phrase_2,f_phrase))}
        # Markov blanket: take the two current pairs out of the counts and the
        # alignment. The swapped pairs were never counted, so they are left alone.
        for pair in configs["ident"]:
            TLMO.decrement_pp_counts(pair)
            sent_pair.remove_alignment(pair)
        # Score each configuration with its own pairs temporarily counted,
        # then restore the Markov blanket.
        # NOTE(review): the earlier draft also scored with the candidate pairs
        # included in the counts; that is kept as is - confirm.
        pair_score = dict()
        for label in ("swapped","ident"):
            first, second = configs[label]
            TLMO.increment_pp_counts(first)
            TLMO.increment_pp_counts(second)
            pair_score[label] = objective(first[0],first[1],sent_pair) * \
                                objective(second[0],second[1],sent_pair)
            TLMO.decrement_pp_counts(first)
            TLMO.decrement_pp_counts(second)
        # choose one configuration probabilistically
        decision = choose_config(["ident","swapped"],[pair_score["ident"],pair_score["swapped"]])
        # update counts and alignments for the chosen configuration
        for pair in configs[decision]:
            TLMO.increment_pp_counts(pair)
            sent_pair.add_alignment(pair)
        sent_pair.update_alignment_dict()
        TLMO.set_sent_pair(sent_ind,sent_pair)

    def FLIP():
        pass

    def TOGGLE(src_ph,trg_ph,sent_ind,sent_pair,TLMO):
        """
        Resample whether trg_ph and src_ph are aligned to each other or both
        null-aligned. Does nothing if either phrase is null, or if they are not
        linked to each other and one of them is aligned to a third phrase.
        """
        if not src_ph or not trg_ph:
            return
        pair = (trg_ph,src_ph)
        linked_now = pair in sent_pair.get_alignment()
        if not linked_now and (sent_pair.query_alignment_dict("e-f",trg_ph) is not None
                               or sent_pair.query_alignment_dict("f-e",src_ph) is not None):
            return
        if linked_now:
            # Take out the pair to accommodate the Markov blanket
            TLMO.decrement_pp_counts(pair)
            sent_pair.remove_alignment(pair)
        pair_score = dict()
        # see what effect delinking (i.e. "unaligning") them has
        pair_score["delinked"] = objective(trg_ph,None,sent_pair)*objective(None,src_ph,sent_pair)
        # count the pair while scoring the linked configuration, then restore
        TLMO.increment_pp_counts(pair)
        pair_score["linked"] = objective(trg_ph,src_ph,sent_pair)
        TLMO.decrement_pp_counts(pair)
        # current configuration first, so it is kept when no score is positive
        labels = ["linked","delinked"] if linked_now else ["delinked","linked"]
        decision = choose_config(labels,[pair_score[label] for label in labels])
        if decision == "linked":
            TLMO.increment_pp_counts(pair)
            sent_pair.add_alignment(pair)
        sent_pair.update_alignment_dict()
        TLMO.set_sent_pair(sent_ind,sent_pair)


    # Main code for collapsed Gibbs sampler
    with open(srctext_file,"r",encoding="utf-8") as srctext_f, \
    open(trgtext_file,"r",encoding="utf-8") as trgtext_f:
        for sent_ind, (f_line, e_line) in enumerate(zip(srctext_f,trgtext_f)):
            eline = e_line.strip("\n\r").split()
            sent_pair = TLMO.get_sent_pair(sent_ind)
            # SWAP never changes the phrases themselves, so this order is
            # valid for the whole sentence.
            sorted_e_ph = sorted(sent_pair.phrases["e"])
            e_prev = None
            for e_ind in range(len(eline)):
                # Apply SWAP, if applicable
                # CHECK TO ENSURE THAT WE'RE NOT MISSING ANY PHRASES IN EITHER LANGUAGE!
                e_phrase = sent_pair.phrase_map["e"].get(e_ind)
                if e_phrase is None or e_phrase == e_prev:
                    # word not covered by a phrase, or phrase already handled
                    continue
                e_prev = e_phrase
                next_e_ind = sorted_e_ph.index(e_phrase)+1
                if next_e_ind >= len(sorted_e_ph):
                    continue    # last phrase: nothing to swap with
                e_phrase_2 = sorted_e_ph[next_e_ind]
                f_phrase = sent_pair.query_alignment_dict("e-f",e_phrase)
                f_phrase_2 = sent_pair.query_alignment_dict("e-f",e_phrase_2)
                if f_phrase is None or f_phrase_2 is None:
                    continue    # don't do SWAP on null-aligned phrases!
                # actual call to SWAP.
                SWAP(sent_ind,e_phrase,f_phrase,e_phrase_2,f_phrase_2,sent_pair,TLMO)

                # Apply TOGGLE, if applicable
                #TOGGLE()
                # Apply FLIP, if applicable

                # Apply FLIP TWO, if applicable

                # Apply MOVE, if applicable

#            final_probability = 0   # PLACEHOLDER - fill with model calculation.
    return TLMO

In [14]:
# Scratch check: sorted() orders a collection of phrase tuples by their first
# (index, word) element, so the pair starting with (1, "michael") is listed
# before the pair starting with (3, "lol").
print(sorted((((3,"lol"),(2,"god...")),((1,"michael"),(0,"something else")))))

[((1, 'michael'), (0, 'something else')), ((3, 'lol'), (2, 'god...'))]


In [None]:
def build_arg_parser():
    """
    Build the command line parser for the aligner.

    Every value is a required ``--name value`` option: the five
    hyperparameters (parsed as floats) and the seven input file paths.
    """
    import argparse    # local import: only the command line entry point needs it

    parser = argparse.ArgumentParser(
        description="Initialise the phrase aligner state from GIZA++ and aligner output.",
        allow_abbrev=False,    # --p_S and --p_s differ only by case; never guess
    )
    for name in ("p_S", "b", "p_phi", "alpha", "p_s"):
        parser.add_argument("--" + name, type=float, required=True,
                            help="hyperparameter " + name)
    for name in ("src_vocab_file", "trg_vocab_file",
                 "src_trg_alignment_file", "trg_src_alignment_file",
                 "srctext_file", "trgtext_file", "alignments_file"):
        parser.add_argument("--" + name, required=True,
                            help="path of the " + name.replace("_", " "))
    return parser


def main(argv=None):
    """
    Parse argv (sys.argv[1:] when None), build the hyperparameters and the
    text-level state, and return the initialised TLMO.

    From inside a notebook, pass the options explicitly, e.g.
    main(["--p_S", "0.1", ...]); the kernel's own command line does not
    carry them.
    """
    args = build_arg_parser().parse_args(argv)
    hp = Hyperparameters(args.p_S,args.b,args.p_phi,args.alpha,args.p_s)
    # Instantiate the class: TLMO holds what would have been global variables.
    TLMO = TextLevelMutableObjects()
    TLMO = process_IBM_Model_1_data(args.src_vocab_file,args.trg_vocab_file,
                                    args.src_trg_alignment_file,args.trg_src_alignment_file,TLMO)
    TLMO = initialize_tlmo(args.srctext_file,args.trgtext_file,args.alignments_file,TLMO)
    # NOTE(review): hp is built but the sampler (collapsed_gibbs_aligner) is
    # not called yet, as in the earlier draft of this cell.
    return TLMO


# The module name of a running script is "__main__"; the earlier comparison
# against "main" could never be true.
if __name__ == "__main__":
    main()
    