In [1]:
# TODO:
# Make sure that you include unaligned phrases as phrases, not just gaps in coverage.
# It's very important that *everything be in a phrase*, even if it's unaligned...

In [2]:
from collections import Counter
from collections import defaultdict as dd
import copy
import unicodecsv as csv
import numpy as np

In [3]:
def model_one_processing(src_vocab_file,trg_vocab_file,src_trg_alignment,trg_src_alignment):
    """
    Takes the output of IBM Model 1 alignments from GIZA++ and creates a lookup
    table for the translation probabilities of f|e and e|f.
    Here, src = foreign (f), trg = english (e)
    Returns a lookup dictionary of the form P_WA[word1][word2] = alignment probability of word1|word2
    # Is this correct given how GIZA++ does things?
    """
    src_vocab_lut = {}
    trg_vocab_lut = {}
    P_WA = dd(dict)
    lowest_prob = 1.0
    with open(src_vocab_file,"rb") as src_vocab_f:
        src_vocab = csv.reader(src_vocab_f,delimiter=" ",encoding="utf-8")
        for line in src_vocab:
            if len(line) == 3:
                src_vocab_lut[int(line[0])] = line[1]
    with open(trg_vocab_file,"rb") as trg_vocab_f:
        trg_vocab = csv.reader(trg_vocab_f,delimiter=" ",encoding="utf-8")
        for line in trg_vocab:
            if len(line) == 3:
                trg_vocab_lut[int(line[0])] = line[1]
    with open(src_trg_alignment,"rb") as src_trg_alignment_f:
        src_trg_alignment_loop = csv.reader(src_trg_alignment_f,delimiter=" ")
        for line in src_trg_alignment_loop:
            if len(line) == 3:
                if int(line[0]) in src_vocab_lut and int(line[1]) in trg_vocab_lut:
                    P_WA[src_vocab_lut[int(line[0])]][trg_vocab_lut[int(line[1])]] = float(line[2])
                    if float(line[2]) < lowest_prob:
                        lowest_prob = float(line[2])
    with open(trg_src_alignment,"rb") as trg_src_alignment_f:
        trg_src_alignment_loop = csv.reader(trg_src_alignment_f,delimiter=" ")
        for line in trg_src_alignment_loop:
            if len(line) == 3:
                if int(line[0]) in trg_vocab_lut and int(line[1]) in src_vocab_lut:
                    P_WA[trg_vocab_lut[int(line[0])]][src_vocab_lut[int(line[1])]] = float(line[2])
                    if float(line[2]) < lowest_prob:
                        lowest_prob = float(line[2])
    return (P_WA, lowest_prob)

In [4]:
def initialize_phrases_from_text(srctext_file,trgtext_file,alignments_file):
    """
    For now, these are the input files to the Berkeley aligner, as well as 
    the output training.align file for alignments_file.
    Assumes that the two text input files have the same number of lines, and that these all have alignment output.
    Assume src = Foreign (f) and trg = English (e)
    """
    def extract_phrases(srctext,trgtext,alignment_output_line):
        """
        Extracts phrases from the alignment output for a single sentence.
        For unaligned phrases, creates a phrase pair where null side is None.
        """
        srclen = len(srctext.split())
        trglen = len(trgtext.split())
        srctext_lst = srctext.split()
        trgtext_lst = trgtext.split()
        src_dict = {}
        trg_dict = {}
        for i, word in enumerate(srctext.split()):
            src_dict[i] = word
        for i, word in enumerate(trgtext.split()):
            trg_dict[i] = word
        f_aligned = set()
        e_aligned = set()
        line_lst = alignment_output_line.split()
        alignment = list()
        for pair in line_lst:
            [f, e] = pair.split("-")
            f_aligned.add(int(f))
            e_aligned.add(int(e))
            alignment.append((int(f),int(e)))
        alignment = sorted(alignment)

        phrase_set = set()
        phrase = set()
        prev_e = 0
        prev_f = 0
        for f,e in alignment:
            if abs(e - prev_e) > 1:
                phrase_set.add(frozenset(phrase))
                phrase = set([(e,f)])
            elif abs(f - prev_f) > 1:
                phrase_set.add(frozenset(phrase))
                phrase = set([(e,f)])
            else:
                phrase.add((e,f))
            prev_e = e
            prev_f = f
        phrase_set.add(frozenset(phrase))

        full_phrases = []
        for i in sorted(phrase_set):
            if len(i) > 0:
                phrase_e = set()
                phrase_f = set()
                for pair in sorted(i):
                    phrase_e.add((pair[0],trg_dict[pair[0]]))
                    phrase_f.add((pair[1],src_dict[pair[1]]))
                phrase_pair = [sorted(phrase_e),sorted(phrase_f)]
                full_phrases.append(phrase_pair)
        unaligned_phrases = []
        for i, phrase_pair in enumerate(sorted(full_phrases,key=lambda x:x[0][0])):
            #print(i,phrase_pair[0],phrase_pair[0][-1][0],phrase_pair[1])
            if i == 0:
                prev_pair = phrase_pair
                continue
            else:
                if prev_pair[0][-1][0] != phrase_pair[0][0][0]-1:
                    unaligned_sequence = []
                    for k in range(prev_pair[0][-1][0]+1,phrase_pair[0][0][0]):
                        unaligned_sequence.append((k,trgtext_lst[k]))
                    unaligned_phrases.append([unaligned_sequence,None])
                prev_pair = phrase_pair
        for i, phrase_pair in enumerate(sorted(full_phrases,key=lambda x:x[1][0])):
            if i == 0:
                prev_pair = phrase_pair
                continue
            else:
                if prev_pair[1][-1][0] != phrase_pair[1][0][0]-1:
                    unaligned_sequence = []
                    for k in range(prev_pair[1][-1][0]+1,phrase_pair[1][0][0]):
                        unaligned_sequence.append((k,srctext_lst[k]))
                    unaligned_phrases.append([None,unaligned_sequence])
                prev_pair = phrase_pair
        full_phrases.extend(unaligned_phrases)
        
        full_phrase_set = set()
        for i in full_phrases:
            new_e = tuple(i[0]) if i[0] else None
            new_f = tuple(i[1]) if i[1] else None
            new_phrase = (new_e, new_f)
            full_phrase_set.add(new_phrase)
            
        return full_phrase_set
    
    phrase_counts = Counter()
    phrase_table = {}
    src_align_dict = {}
    trg_align_dict = {}
    unigram_e_len = Counter()
    unigram_f_len = Counter()
    n_e_set = set()
    n_f_set = set()
    with open(srctext_file,"r",encoding="utf-8") as srctext_f, \
    open(trgtext_file,"r",encoding="utf-8") as trgtext_f, \
    open(alignments_file,"r") as alignments_f:
        for i, (sline,tline,aline) in enumerate(zip(srctext_f,trgtext_f,alignments_f)):
            src = sline.strip("\n\r")
            for word in src.split():
                n_f_set.add(word)
            trg = tline.strip("\n\r")
            for word in trg.split():
                n_e_set.add(word)
            A = aline.strip("\n\r")
            full_phrases = extract_phrases(src,trg,A)
            phrase_table[i] = full_phrases
            for phrase_pair in full_phrases:
                if len(phrase_pair) == 2:
                    trg_phrase_lst = []
                    src_phrase_lst = []
                    if phrase_pair[0]:
                        target = phrase_pair[0]
                        #print("target =", target, type(target))
                        unigram_e_len[target] += len(target)
                        for ind,word in target:
                            trg_phrase_lst.append(word)
                        trg_phrase = " ".join(trg_phrase_lst)
                    else:
                        trg_phrase = None
                    if phrase_pair[1]:
                        source = phrase_pair[1]
                        #print("source =", source, type(source), len(source))
                        unigram_f_len[source] += len(source)
                        for ind,word in source:
                            src_phrase_lst.append(word)
                        src_phrase = " ".join(src_phrase_lst)
                    else:
                        src_phrase = None
                    phrase_counts[(trg_phrase,src_phrase)] += 1
            src_align_dict[i] = dd(set)
            trg_align_dict[i] = dd(set)
            for align_pair in A.split():
                pair = align_pair.split("-")
                src_ind = int(pair[0])
                trg_ind = int(pair[1])
                src_align_dict[i][src_ind].add(trg_ind)
                trg_align_dict[i][trg_ind].add(src_ind)
        n_e = len(n_e_set)
        n_f = len(n_f_set)
        
    return (phrase_counts, phrase_table, src_align_dict, trg_align_dict, unigram_e_len, unigram_f_len, n_e, n_f)

In [4]:
"""
def extract_phrases(srctext,trgtext,alignment_output_line):
        """
        #WORKING: List Version!
        #Extracts phrases from the alignment output for a single sentence.
        #For unaligned phrases, creates a phrase pair where null side is None.
        """
        srclen = len(srctext.split())
        trglen = len(trgtext.split())
        srctext_lst = srctext.split()
        trgtext_lst = trgtext.split()
        src_dict = {}
        trg_dict = {}
        for i, word in enumerate(srctext.split()):
            src_dict[i] = word
        for i, word in enumerate(trgtext.split()):
            trg_dict[i] = word
        f_aligned = set()
        e_aligned = set()
        line_lst = alignment_output_line.split()
        alignment = list()
        for pair in line_lst:
            [f, e] = pair.split("-")
            f_aligned.add(int(f))
            e_aligned.add(int(e))
            alignment.append((int(f),int(e)))
        alignment = sorted(alignment)

        phrase_set = set()
        phrase = set()
        prev_e = 0
        prev_f = 0
        for f,e in alignment:
            if abs(e - prev_e) > 1:
                phrase_set.add(frozenset(phrase))
                phrase = set([(e,f)])
            elif abs(f - prev_f) > 1:
                phrase_set.add(frozenset(phrase))
                phrase = set([(e,f)])
            else:
                phrase.add((e,f))
            prev_e = e
            prev_f = f
        phrase_set.add(frozenset(phrase))

        full_phrases = []
        for i in sorted(phrase_set):
            if len(i) > 0:
                phrase_e = set()
                phrase_f = set()
                for pair in sorted(i):
                    phrase_e.add((pair[0],trg_dict[pair[0]]))
                    phrase_f.add((pair[1],src_dict[pair[1]]))
                phrase_pair = [sorted(phrase_e),sorted(phrase_f)]
                full_phrases.append(phrase_pair)
        unaligned_phrases = []
        for i, phrase_pair in enumerate(sorted(full_phrases,key=lambda x:x[0][0])):
            #print(i,phrase_pair[0],phrase_pair[0][-1][0],phrase_pair[1])
            if i == 0:
                prev_pair = phrase_pair
                continue
            else:
                if prev_pair[0][-1][0] != phrase_pair[0][0][0]-1:
                    unaligned_sequence = []
                    for k in range(prev_pair[0][-1][0]+1,phrase_pair[0][0][0]):
                        unaligned_sequence.append((k,trgtext_lst[k]))
                    unaligned_phrases.append([unaligned_sequence,None])
                prev_pair = phrase_pair
        for i, phrase_pair in enumerate(sorted(full_phrases,key=lambda x:x[1][0])):
            if i == 0:
                prev_pair = phrase_pair
                continue
            else:
                if prev_pair[1][-1][0] != phrase_pair[1][0][0]-1:
                    unaligned_sequence = []
                    for k in range(prev_pair[1][-1][0]+1,phrase_pair[1][0][0]):
                        unaligned_sequence.append((k,srctext_lst[k]))
                    unaligned_phrases.append([None,unaligned_sequence])
                prev_pair = phrase_pair
        full_phrases.extend(unaligned_phrases)

        return full_phrases
"""

IndentationError: unexpected indent (<ipython-input-4-598f9ccf5435>, line 7)

In [7]:
# QUICK TESTER CODE just for Extract function, now private within 
# initialize_phrases_from_text function

def extract_phrases_test(srctext,trgtext,alignment_output_line):
        """
        Extracts phrases from the alignment output for a single sentence.
        For unaligned phrases, creates a phrase pair where null side is None.
        """
        srclen = len(srctext.split())
        trglen = len(trgtext.split())
        srctext_lst = srctext.split()
        trgtext_lst = trgtext.split()
        src_dict = {}
        trg_dict = {}
        for i, word in enumerate(srctext.split()):
            src_dict[i] = word
        for i, word in enumerate(trgtext.split()):
            trg_dict[i] = word
        f_aligned = set()
        e_aligned = set()
        line_lst = alignment_output_line.split()
        alignment = list()
        for pair in line_lst:
            [f, e] = pair.split("-")
            f_aligned.add(int(f))
            e_aligned.add(int(e))
            alignment.append((int(f),int(e)))
        alignment = sorted(alignment)

        phrase_set = set()
        phrase = set()
        prev_e = 0
        prev_f = 0
        for f,e in alignment:
            if abs(e - prev_e) > 1:
                phrase_set.add(frozenset(phrase))
                phrase = set([(e,f)])
            elif abs(f - prev_f) > 1:
                phrase_set.add(frozenset(phrase))
                phrase = set([(e,f)])
            else:
                phrase.add((e,f))
            prev_e = e
            prev_f = f
        phrase_set.add(frozenset(phrase))

        full_phrases = []
        for i in sorted(phrase_set):
            if len(i) > 0:
                phrase_e = set()
                phrase_f = set()
                for pair in sorted(i):
                    phrase_e.add((pair[0],trg_dict[pair[0]]))
                    phrase_f.add((pair[1],src_dict[pair[1]]))
                phrase_pair = [sorted(phrase_e),sorted(phrase_f)]
                full_phrases.append(phrase_pair)
        unaligned_phrases = []
        for i, phrase_pair in enumerate(sorted(full_phrases,key=lambda x:x[0][0])):
            #print(i,phrase_pair[0],phrase_pair[0][-1][0],phrase_pair[1])
            if i == 0:
                prev_pair = phrase_pair
                continue
            else:
                if prev_pair[0][-1][0] != phrase_pair[0][0][0]-1:
                    unaligned_sequence = []
                    for k in range(prev_pair[0][-1][0]+1,phrase_pair[0][0][0]):
                        unaligned_sequence.append((k,trgtext_lst[k]))
                    unaligned_phrases.append([unaligned_sequence,None])
                prev_pair = phrase_pair
        for i, phrase_pair in enumerate(sorted(full_phrases,key=lambda x:x[1][0])):
            if i == 0:
                prev_pair = phrase_pair
                continue
            else:
                if prev_pair[1][-1][0] != phrase_pair[1][0][0]-1:
                    unaligned_sequence = []
                    for k in range(prev_pair[1][-1][0]+1,phrase_pair[1][0][0]):
                        unaligned_sequence.append((k,srctext_lst[k]))
                    unaligned_phrases.append([None,unaligned_sequence])
                prev_pair = phrase_pair
        full_phrases.extend(unaligned_phrases)
        
        full_phrase_set = set()
        for i in full_phrases:
            new_e = tuple(i[0]) if i[0] else None
            new_f = tuple(i[1]) if i[1] else None
            new_phrase = (new_e, new_f)
            full_phrase_set.add(new_phrase)
            
        return full_phrase_set

srctext = "Michael geht davon aus , dass er im haus bleibt"
trgtext = "Michael assumes that he will stay in the house"
alignment_output_line = "0-0 1-1 2-1 3-1 5-2 6-3 7-6 7-7 8-8 9-4 9-5"

full_phrase_set = extract_phrases_test(srctext,trgtext,alignment_output_line)
for i in full_phrase_set:
    print(i)

(((0, 'Michael'), (1, 'assumes')), ((0, 'Michael'), (1, 'geht'), (2, 'davon'), (3, 'aus')))
(((4, 'will'), (5, 'stay')), ((9, 'bleibt'),))
(((2, 'that'), (3, 'he')), ((5, 'dass'), (6, 'er')))
(None, ((4, ','),))
(((6, 'in'), (7, 'the'), (8, 'house')), ((7, 'im'), (8, 'haus')))


In [12]:
print(len((None, ((4, ','),))[0]))

TypeError: object of type 'NoneType' has no len()

In [None]:
# Supporting functions:
def p_f(src_phrase,n_f):
    return (p_s*(1-p_s)**(len(src_phrase)))*((1/n_f)**(len(src_phrase)))

def p_e(trg_phrase,n_e):
    return (p_s*(1-p_s)**(len(trg_phrase)))*((1/n_e)**(len(trg_phrase)))

def calc_P_WA_trg_src(sent_ind, trg_phrase, sline):
    """
    Expects both phrase arguments to be a list of 2-tuples, e.g.
    if looking for P_WA(e|f), as in this function, trg_phrase is [(0,"Michael"),(1,"assumes")]
    and sline is the source sentence in the form of a list: ["Michael", "geht", "davon", "aus", ...]
    Should use dynamic programming to store calculations of phrase probabilities
    based on IBM Model 1 word alignment probabilities.
    """
    for i, pair in enumerate(trg_phrase):
        if word_ind in trg_align_dict[sent_ind][pair[0]]:
            for word_ind in trg_align_dict[sent_ind][pair[0]]:
                if i == 0:
                    if pair[1] in P_WA:
                        if sline[word_ind] in P_WA[pair[1]]:
                            P_WA_value = P_WA[pair[1]][sline[word_ind]]
                        else:
                            P_WA_value = 0
                    else:
                        P_WA_value = 0
                else:
                    if pair[1] in P_WA:
                        if sline[word_ind] in P_WA[pair[1]]:
                            P_WA_value *= P_WA[pair[1]][sline[word_ind]]
                        else:
                            P_WA_value *= 0
                    else:
                        P_WA_value *= 0
        else:
            P_WA_value = 0
    return P_WA_value

def calc_P_WA_src_trg(sent_ind, src_phrase, tline):
    """
    Expects both phrase arguments to be a list of 2-tuples, e.g.
    if looking for P_WA(f|e), as in this function, src_phrase is [(0,"Michael"),(1,"geht"),(2,"davon"),(3,"aus")]
    and tline is the target sentence in the form of a list: ["Michael", "assumes", ...]
    Should use dynamic programming to store calculations of phrase probabilities
    based on IBM Model 1 word alignment probabilities. 
    """
    for i, pair in enumerate(src_phrase):
        if word_ind in src_align_dict[sent_ind][pair[0]]:
            for word_ind in src_align_dict[sent_ind][pair[0]]:
                if i == 0:
                    if pair[1] in P_WA:
                        if tline[word_ind] in P_WA[pair[1]]:
                            P_WA_value = P_WA[pair[1]][tline[word_ind]]
                        else:
                            P_WA_value = 0
                    else:
                        P_WA_value = 0
                else:
                    if pair[1] in P_WA:
                        if tline[word_ind] in P_WA[pair[1]]:
                            P_WA_value *= P_WA[pair[1]][tline[word_ind]]
                        else:
                            P_WA_value *= 0
                    else:
                        P_WA_value *= 0
        else:
            P_WA_value = 0
    return P_WA_value

def delta(trg_phrase,src_phrase,tline,sline):
    """

    """
    s = float(len(tline))/float(len(sline))
    delta = b**abs(trg_phrase[0][0]-(src_phrase[0][0]*s))
    return delta

def tau(trg_phrase,src_phrase,phrase_counts):
    if src_phrase and trg_phrase:
        tau_value = (phrase_counts[(trg_phrase,src_phrase)] + \
                     (alpha * ((p_f(src_phrase,n_f) * \
                              calc_P_WA_trg_src(sent_ind, trg_phrase, sline)) * \
                             (p_e(trg_phrase,n_e) * \
                             calc_P_WA_src_trg(sent_ind, src_phrase, tline))))) / \
        (len(phrase_counts) + alpha)
    else:
        tau_value = 0
    return tau_value

def theta_N(trg_phrase,src_phrase):
    if trg_phrase and src_phrase:
        theta_N_value = 0
    elif trg_phrase and not src_phrase:
        theta_N_value = 0.5*p_e(trg_phrase,n_e)
    elif src_phrase and not trg_phrase:
        theta_N_value = 0.5*p_f(src_phrase,n_f)
    return theta_N_value

In [None]:
def objective(src_phrase,trg_phrase,phrase_counts,tline,sline,sent_ind):
    """
    This calculates the objective function's value.
    Watch out! Need to be able to modify to accommodate Markov blanket...
    """
    if src_phrase and trg_phrase:
        obj_value = (1-p_S)*(1-p_phi)*tau(trg_phrase,src_phrase,phrase_counts)*delta(trg_phrase,src_phrase,tline,sline)
    else:
        obj_value = (1-p_S)*p_phi*theta_N(trg_phrase,src_phrase)
        
    return obj_value

In [None]:
#def model(L,):
#    """
#    INCOMPLETE! Need to bring in phrases as input, and these
#    will come from knowing sent_ind, sline, tline.
#    L is the number of phrases in a sentence.
#    """
#    for i, phrase in enumerate(phrase_pairs):
#        delta(trg_phrase,src_phrase,tline,sline)
#    first_term = p_S*((1-p_S)^(L-1))

In [None]:
np.random.choice(5,1,p=[0,0.5,0.5,0,0])[0]

In [None]:
def SWAP(src_ph_1,trg_ph_1,src_ph_2,trg_ph_2,phrase_counts,phrase_table):
    """
    
    """
    # initialize scorekeeping dict
    pair_score = dict()
    # Set up configurations as tuples. Names of variables will be keys in dict.
    ident = (src_ph_1,trg_ph_1,src_ph_2,trg_ph_2)
    swapped = (src_ph_2,trg_ph_1,src_ph_1,trg_ph_2)
    # First, modifying counts to take into account the Markov blanket.
    phrase_counts[(src_ph_1,trg_ph_1)] -= 1
    phrase_counts[(src_ph_2,trg_ph_2)] -= 1
    # Do I need to decrement the counts for the swapped configuration?
    # Second, calculate objective scores.
    #     First, swapped configuration: increment phrase counts for it, 
    #     then decrement them to restore to Markov blanket
    phrase_counts[(src_ph_2,trg_ph_1)] += 1
    phrase_counts[(src_ph_1,trg_ph_2)] += 1
    pair_score["swapped"] = (objective(src_ph_2,trg_ph_1,phrase_counts,tline,sline,sent_ind),
                             objective(src_ph_1,trg_ph_2,phrase_counts,tline,sline,sent_ind))
    phrase_counts[(src_ph_2,trg_ph_1)] -= 1
    phrase_counts[(src_ph_1,trg_ph_2)] -= 1
    #     Second, identify configuration: increment phrase_counts for identity, 
    #     then decrement them to restore to Markov blanket
    phrase_counts[(src_ph_1,trg_ph_1)] += 1
    phrase_counts[(src_ph_2,trg_ph_2)] += 1
    pair_score["ident"] = (objective(src_ph_1,trg_ph_1,phrase_counts,tline,sline,sent_ind),
                           objective(src_ph_2,trg_ph_2,phrase_counts,tline,sline,sent_ind))
    phrase_counts[(src_ph_1,trg_ph_1)] -= 1
    phrase_counts[(src_ph_2,trg_ph_2)] -= 1
    # calculate output configuration probabilities
    ident_prob = (pair_score["ident"][0]*pair_score["ident"][1])/((pair_score["ident"][0]*pair_score["ident"][1])+(pair_score["swapped"][0]*pair_score["swapped"][1]))
    swapped_prob = (pair_score["swapped"][0]*pair_score["swapped"][1])/((pair_score["ident"][0]*pair_score["ident"][1])+(pair_score["swapped"][0]*pair_score["swapped"][1]))
    # choose one configuration probabilistically
    out_conf = np.random.choice([ident,swapped],p=[ident_prob,swapped_prob])[0]
    # update phrase_counts, phrase_table depending on out_conf, which is a tuple of (f,e,f,e) phrases
    phrase_counts[(out_conf[1],out_conf[0])] += 1
    phrase_counts[(out_conf[3],out_conf[2])] += 1
    # update phrase_table
    pt = phrase_table[sent_id]
    pt.remove((src_ph_1,trg_ph_1))
    pt.remove((src_ph_2,trg_ph_2))
    pt.add((out_conf[0],out_conf[1]))
    pt.add((out_conf[2],out_conf[3]))
    
    return (phrase_counts, phrase_table)

In [5]:
def collapsed_gibbs_aligner(srctext_file,trgtext_file,phrase_table,phrase_counts):
    """
    This function assumes that relevant hyperparameters are set as 
    global variables with the appropriate names. This can be fixed
    if necessary (or preferred)!
    Will want to repeatedly call this for N iterations.
    """
    # Supporting functions:
    def p_f(src_phrase,n_f):
        return (p_s*(1-p_s)**(len(src_phrase)))*((1/n_f)**(len(src_phrase)))
    
    def p_e(trg_phrase,n_e):
        return (p_s*(1-p_s)**(len(trg_phrase)))*((1/n_e)**(len(trg_phrase)))
    
    def calc_P_WA_trg_src(sent_ind, trg_phrase, sline):
        """
        Expects both phrase arguments to be a list of 2-tuples, e.g.
        if looking for P_WA(e|f), as in this function, trg_phrase is [(0,"Michael"),(1,"assumes")]
        and sline is the source sentence in the form of a list: ["Michael", "geht", "davon", "aus", ...]
        Should use dynamic programming to store calculations of phrase probabilities
        based on IBM Model 1 word alignment probabilities.
        """
        for i, pair in enumerate(trg_phrase):
            # trg_phrase = ((0, 'This'), (1, 'is'))
            # i = 0, pair = (0, 'This')
            # i = 1, pair = (1, 'is')
            if i in trg_align_dict[sent_ind][pair[0]]:
                for word_ind in trg_align_dict[sent_ind][pair[0]]:
                    if i == 0:
                        if pair[1] in P_WA:
                            if sline[word_ind] in P_WA[pair[1]]:
                                P_WA_value = P_WA[pair[1]][sline[word_ind]]
                            else:
                                P_WA_value = lowest_prob
                        else:
                            P_WA_value = lowest_prob
                    else:
                        if pair[1] in P_WA:
                            if sline[word_ind] in P_WA[pair[1]]:
                                P_WA_value *= P_WA[pair[1]][sline[word_ind]]
                            else:
                                P_WA_value *= lowest_prob
                        else:
                            P_WA_value *= lowest_prob
            else:
                P_WA_value = lowest_prob
        return P_WA_value
    
    def calc_P_WA_src_trg(sent_ind, src_phrase, tline):
        """
        Expects both phrase arguments to be a list of 2-tuples, e.g.
        if looking for P_WA(f|e), as in this function, src_phrase is [(0,"Michael"),(1,"geht"),(2,"davon"),(3,"aus")]
        and tline is the target sentence in the form of a list: ["Michael", "assumes", ...]
        Should use dynamic programming to store calculations of phrase probabilities
        based on IBM Model 1 word alignment probabilities. 
        """
        for i, pair in enumerate(src_phrase):
            if i in src_align_dict[sent_ind][pair[0]]:
                for word_ind in src_align_dict[sent_ind][pair[0]]:
                    if i == 0:
                        if pair[1] in P_WA:
                            if tline[word_ind] in P_WA[pair[1]]:
                                P_WA_value = P_WA[pair[1]][tline[word_ind]]
                            else:
                                P_WA_value = lowest_prob
                        else:
                            P_WA_value = lowest_prob
                    else:
                        if pair[1] in P_WA:
                            if tline[word_ind] in P_WA[pair[1]]:
                                P_WA_value *= P_WA[pair[1]][tline[word_ind]]
                            else:
                                P_WA_value *= lowest_prob
                        else:
                            P_WA_value *= lowest_prob
            else:
                P_WA_value = lowest_prob
        return P_WA_value
    
    def delta(trg_phrase,src_phrase,tline,sline):
        """

        """
        s = float(len(tline))/float(len(sline))
        delta = b**abs(trg_phrase[0][0]-(src_phrase[0][0]*s))
        return delta
    
    def tau(trg_phrase,src_phrase,phrase_counts):
        tau_value = (phrase_counts[(trg_phrase,src_phrase)] + \
                     (alpha * ((p_f(src_phrase,n_f) * \
                              calc_P_WA_trg_src(sent_ind, trg_phrase, sline)) * \
                             (p_e(trg_phrase,n_e) * \
                             calc_P_WA_src_trg(sent_ind, src_phrase, tline))))) / \
        (len(phrase_counts) + alpha)
        return tau_value
    
    def theta_N(trg_phrase,src_phrase):
        if trg_phrase and not src_phrase:
            theta_N_value = 0.5*p_e(trg_phrase,n_e)
        elif src_phrase and not trg_phrase:
            theta_N_value = 0.5*p_f(src_phrase,n_f)
        return theta_N_value
    
    def objective(trg_phrase,src_phrase,phrase_counts,tline,sline,sent_ind):
        """
        This calculates the potential function's value.
        Watch out! Need to be able to modify to accommodate Markov blanket...
        """
        if src_phrase and trg_phrase:
            obj_value = (1-p_S)*(1-p_phi)*tau(trg_phrase,src_phrase,phrase_counts)*delta(trg_phrase,src_phrase,tline,sline)
        else:
            obj_value = (1-p_S)*p_phi*theta_N(trg_phrase,src_phrase)

        return obj_value
    
    def create_phrase_map(sline,tline,full_phrases):
        """
        full_phrases is the value of a key in the dictionary phrase_table, e.g. phrase_tables[0]
        to get the full_phrases of the first sentence.
        """
        src_phrase_map = {}
        trg_phrase_map = {}
        for phrase_ind, pair in enumerate(full_phrases):
            trg = pair[0]
            src = pair[1]
            if trg:
                for entry in trg:
                    trg_phrase_map[entry[0]] = trg
            if src:
                for entry in src:
                    src_phrase_map[entry[0]] = src
        for src_ind, word in enumerate(sline):
            if src_ind not in src_phrase_map:
                src_phrase_map[src_ind] = None
        for trg_ind, word in enumerate(tline):
            if trg_ind not in trg_phrase_map:
                trg_phrase_map[trg_ind] = None
        return (src_phrase_map, trg_phrase_map)
    
    def SWAP(src_ph_1,trg_ph_1,src_ph_2,trg_ph_2,phrase_counts,phrase_table):
        """

        """
        # initialize scorekeeping dict
        pair_score = dict()
        # Set up configurations as tuples. Names of variables will be keys in dict.
        ident = (src_ph_1,trg_ph_1,src_ph_2,trg_ph_2)
        swapped = (src_ph_2,trg_ph_1,src_ph_1,trg_ph_2)
        out_conf_dict = {"ident":ident,"swapped":swapped}
        # First, modifying counts to take into account the Markov blanket.
        phrase_counts[(src_ph_1,trg_ph_1)] -= 1
        phrase_counts[(src_ph_2,trg_ph_2)] -= 1
        # Do I need to decrement the counts for the swapped configuration?
        # Second, calculate objective scores.
        #     First, swapped configuration: increment phrase counts for it, 
        #     then decrement them to restore to Markov blanket
        phrase_counts[(src_ph_2,trg_ph_1)] += 1
        phrase_counts[(src_ph_1,trg_ph_2)] += 1
        pair_score["swapped"] = (objective(trg_ph_1,src_ph_2,phrase_counts,tline,sline,sent_ind),
                                 objective(trg_ph_2,src_ph_1,phrase_counts,tline,sline,sent_ind))
        phrase_counts[(src_ph_2,trg_ph_1)] -= 1
        phrase_counts[(src_ph_1,trg_ph_2)] -= 1
        #     Second, identify configuration: increment phrase_counts for identity, 
        #     then decrement them to restore to Markov blanket
        phrase_counts[(src_ph_1,trg_ph_1)] += 1
        phrase_counts[(src_ph_2,trg_ph_2)] += 1
        pair_score["ident"] = (objective(trg_ph_1,src_ph_1,phrase_counts,tline,sline,sent_ind),
                               objective(trg_ph_2,src_ph_2,phrase_counts,tline,sline,sent_ind))
        phrase_counts[(src_ph_1,trg_ph_1)] -= 1
        phrase_counts[(src_ph_2,trg_ph_2)] -= 1
        # calculate output configuration probabilities
        ident_prob = (pair_score["ident"][0]*pair_score["ident"][1])/((pair_score["ident"][0]*pair_score["ident"][1])+(pair_score["swapped"][0]*pair_score["swapped"][1]))
        swapped_prob = (pair_score["swapped"][0]*pair_score["swapped"][1])/((pair_score["ident"][0]*pair_score["ident"][1])+(pair_score["swapped"][0]*pair_score["swapped"][1]))
        # choose one configuration probabilistically
        out_conf_decision = np.random.choice(["ident","swapped"],1,p=[ident_prob,swapped_prob])[0]
        out_conf = out_conf_dict[out_conf_decision]
        # update phrase_counts, phrase_table depending on out_conf, which is a tuple of (f,e,f,e) phrases
        phrase_counts[(out_conf[1],out_conf[0])] += 1
        phrase_counts[(out_conf[3],out_conf[2])] += 1
        # update phrase_table
        pt = phrase_table[sent_ind]
#        for i in pt:
#            print(i)
        pt.remove((trg_ph_1,src_ph_1))
        pt.remove((trg_ph_2,src_ph_2))
        pt.add((out_conf[1],out_conf[0]))
        pt.add((out_conf[3],out_conf[2]))
        phrase_table[sent_ind] = pt
        
        return (phrase_counts, phrase_table)
    
    def FLIP():
        pass
    
    def TOGGLE(src_ph,trg_ph,phrase_counts,phrase_table):
        pair_score = dict()
        prob_score[] = dict()
        if src_ph_1 != None and trg_ph_1 != None:
            delinked = ((trg_ph_1,None),(None,src_ph_1))
            ident = (trg_ph_1,src_ph_1)
            # Take out phrase_count for this pair to accommodate markov blanket
            phrase_counts((trg_ph_1,src_ph_1)) -= 1
            # see what effect delinking (i.e. "unaligning") them has
            pair_score["delinked"] = objective(trg_ph_1,None,phrase_counts,tline,sline,sent_ind)*objective(None,src_ph_1,phrase_counts,tline,sline,sent_ind)
            # Augment phrase_counts to capture "new" pairing (which was originally there, in this case)
            phrase_counts((trg_ph_1,src_ph_1)) += 1
            pair_score["ident"] = objective(trg_ph_1,src_ph_1,phrase_counts,tline,sline,sent_ind)
            # Restore Markov blanket, and await decision as to shape this should take below.
            phrase_counts((trg_ph_1,src_ph_1)) -= 1
            prob_score["ident"] = pair_score["ident"]/(pair_score["ident"]+pair_score["delinked"])
            prob_score["delinked"] = pair_score["delinked"]/(pair_score["ident"]+pair_score["delinked"])
            out_conf_decision = np.random.choice(["ident","delinked"],1,p=[prob_score["ident"],prob_score["delinked"]])
            if out_conf_decision == "ident":
                phrase_counts((trg_ph_1,src_ph_1)) += 1
            elif out_conf_decision == "delinked":
                phrase_table[sent_ind].remove((trg_ph_1,src_ph_1))
        elif src_ph_1 == None and trg_ph_1 == None:
            # create Markov blanket
            phrase_counts((trg_ph_1,src_ph_1)) -= 1
            # see what effect aligning them has
            aligned = (trg_ph_1,src_ph_1)
            pair_score["delinked"] = objective(trg_ph_1,src_ph_1,phrase_counts,tline,sline,sent_ind)
            # START WORKING HERE ON 1/13
        else:
            pass
        
        return (phrase_counts, phrase_table)
    
    
    
    prob_table = {}   # keys are sent_ind, values are final probabilities.
    with open(srctext_file,"r") as srctext_f, \
    open(trgtext_file,"r") as trgtext_f:
        for sent_ind, (src_line, trg_line) in enumerate(zip(srctext_f,trgtext_f)):
            sline = src_line.strip("\n\r").split()
            tline = trg_line.strip("\n\r").split()
            # Need to loop through indices of both source and target sentences
            # since their lengths are likely not equal.
#            initial_probability = 0   # PLACEHOLDER - fill with model calculation. Not clear this is needed...
            # Need to write a function for here that maps each index in the english sentence 
            # and in the foreign sentence to the phrase in which it is included.
            (src_phrase_map, trg_phrase_map) = create_phrase_map(sline,tline,phrase_table[sent_ind])
            # SPEED ISSUE would probably be here because of nested for-loop.
            src_prev_phrase = ""
            trg_prev_phrase = ""
            for src_ind, src_word in enumerate(sline):
                for trg_ind, trg_word in enumerate(tline):
                    # Apply SWAP, if applicable - could remove some of these for-loops with a dictionary 
                    # of phrase alignments.
                    # CHECK TO ENSURE THAT WE'RE NOT MISSING ANY PHRASES IN EITHER LANGUAGE!
                    src_ph_1 = src_phrase_map[src_ind]
                    for pair_1 in phrase_table[sent_ind]:
                        if pair_1[1] == src_ph_1:
                            trg_ph_1 = pair_1[0]
                        else:
                            trg_ph_1 = None
                    # We do this if-loop to make sure that we're not unnecessarily recalculating
                    # SWAP when we're changing sentence index, but not actually phrase boundaries
                    if src_phrase_map[src_ind] != src_prev_phrase and trg_phrase_map[trg_ind] != trg_prev_phrase:
                        for ind in range(src_ind,len(sline)):
                            if src_phrase_map[ind] != src_ph_1:
                                src_ph_2 = src_phrase_map[ind]
                                for pair_2 in phrase_table[sent_ind]:
                                    if pair_2[1] == src_ph_2:
                                        trg_ph_2 = pair_2[0]
                                break
                        if src_ph_2 and trg_ph_2:
                            # Actual call to SWAP
                            phrase_counts, phrase_table = SWAP(src_ph_1,trg_ph_1,src_ph_2,trg_ph_2,phrase_counts,phrase_table)
                            src_prev_phrase = src_ph_1
                            trg_prev_phrase = trg_ph_1
                            src_ph_2 = None
                            trg_ph_2 = None
                    # Apply TOGGLE, if applicable
                    #TOGGLE()
                    # Apply FLIP, if applicable
                    
                    
                    # Apply FLIP TWO, if applicable
                    
                    # Apply MOVE, if applicable
                    
#            final_probability = 0   # PLACEHOLDER - fill with model calculation.
    return phrase_table

SyntaxError: invalid syntax (<ipython-input-5-7f97f8903d34>, line 199)

In [174]:
if __name__ == "__main__":
    # Data Structures
    # Hyperparameters - safe to set as global variables since no function should change these on a given run.
    p_S = 0.1    # p_$
    b = 0.85
    p_phi = 10e-10
    alpha = 100
    p_s = 0.8    # p_s -- note difference from p_S = p_$ = 0.1
    
    # First, initialize the model.
    src_vocab_file = "../data/example/two_line/f-srctext-spa.vcb"
    trg_vocab_file = "../data/example/two_line/e-trgtext-eng.vcb"
    src_trg_alignment = "../data/example/two_line/f_e_word_alignment_probs.txt"
    trg_src_alignment = "../data/example/two_line/e_f_word_alignment_probs.txt"
    srctext_file = "../data/example/two_line/f-srctext-spa.txt"
    trgtext_file = "../data/example/two_line/e-trgtext-eng.txt"
    alignments_file = "../data/example/two_line/berkeley_alignments_spa_eng.txt"
    
    P_WA, lowest_prob = model_one_processing(src_vocab_file,trg_vocab_file,src_trg_alignment,trg_src_alignment)
    lowest_prob *= 0.1
    phrase_counts, phrase_table, src_align_dict, trg_align_dict, unigram_e_len, unigram_f_len, n_e, n_f = initialize_phrases_from_text(srctext_file,trgtext_file,alignments_file)
    orig_phrase_table = copy.deepcopy(phrase_table)    # DEBUGGING
    phrase_table = collapsed_gibbs_aligner(srctext_file,trgtext_file,phrase_table,phrase_counts)
    

0.5341612078038177 0.46583879219618224 1.0
0.43199502836797 0.5680049716320299 0.9999999999999999
0.5299019725920529 0.4700980274079471 1.0
0.5595908179171032 0.4404091820828968 1.0
0.5085527935361984 0.49144720646380163 1.0
0.679770921747692 0.32022907825230806 1.0
1.0 2.731865482750061e-19 1.0
1.0 1.1585706763125876e-23 1.0
0.6053198708907699 0.39468012910923006 1.0
0.5469066148801611 0.4530933851198389 1.0
0.5929947898610891 0.407005210138911 1.0
0.5837141848077462 0.41628581519225377 1.0
0.5421546145467243 0.45784538545327574 1.0
0.45784538545327574 0.5421546145467242 1.0
0.4578453854532757 0.5421546145467243 1.0
0.4578453854532758 0.5421546145467242 1.0
1.0 3.186585618285712e-19 1.0
1.0 1.2661915423640027e-32 1.0
1.0 3.230072617011702e-28 1.0
0.5837141848077463 0.41628581519225377 1.0
