# Extract Noise Embedding

## Get the Longest Common Subsequence

In [20]:
def LCS(seq1, seq2):
    """
    Return the length of the longest common subsequence of seq1 and seq2.

    A table with len(seq1)+1 rows and len(seq2)+1 columns is filled bottom-up.
    table[r][c] holds the LCS length of the suffixes seq1[r:] and seq2[c:];
    the extra last row and column stay 0 and represent an empty suffix.
    """
    n_rows, n_cols = len(seq1), len(seq2)
    table = [[0] * (n_cols + 1) for _ in range(n_rows + 1)]

    # Walk from the bottom-right corner towards the top-left so that every
    # cell we read (right, below, diagonal) has already been computed.
    for r in reversed(range(n_rows)):
        for c in reversed(range(n_cols)):
            if seq1[r] != seq2[c]:
                # No match here: the best answer drops one element from
                # either sequence, whichever sub-problem is larger.
                table[r][c] = max(table[r][c + 1], table[r + 1][c])
            else:
                # Matching elements extend the LCS of the two remaining suffixes.
                table[r][c] = table[r + 1][c + 1] + 1

    # The top-left cell covers both full sequences.
    return table[0][0]

In [21]:
# Sanity check: 'ace' is a subsequence of 'abcde', so the expected LCS length is 3
LCS('abcde','ace')

3

In [28]:
def LCS_return_seq(seq1, seq2):
    """
    Return one longest common subsequence of seq1 and seq2 as a list of elements.

    Step 1 fills a (len(seq1)+1) x (len(seq2)+1) table bottom-up, where
    table[r][c] is the LCS length of the suffixes seq1[r:] and seq2[c:].
    Step 2 walks the table from the top-left corner to read the elements back.
    When moving down and moving right are equally good, the walk moves down
    (i.e. it skips an element of seq1 first).
    """
    len1, len2 = len(seq1), len(seq2)
    table = [[0] * (len2 + 1) for _ in range(len1 + 1)]

    # Step 1: fill the table from the bottom-right towards the top-left.
    for r in reversed(range(len1)):
        for c in reversed(range(len2)):
            if seq1[r] == seq2[c]:
                table[r][c] = table[r + 1][c + 1] + 1
            else:
                table[r][c] = max(table[r][c + 1], table[r + 1][c])

    # Step 2: recover the subsequence itself.
    common = []
    r = c = 0
    while r < len1 and c < len2:
        if seq1[r] == seq2[c]:
            # A match is part of the LCS; continue with both remaining suffixes.
            common.append(seq1[r])
            r += 1
            c += 1
            continue
        # No match: follow the neighbour holding the larger LCS length.
        # Only a strictly larger right-hand cell moves right, so ties go down.
        if table[r][c + 1] > table[r + 1][c]:
            c += 1
        else:
            r += 1

    return common

In [29]:
# Sanity check: the common tokens of 'abcde' and 'ace' should come back in order
LCS_return_seq('abcde','ace')

['a', 'c', 'e']

## Pad 2 sequences

In [32]:
def force_align(seq1,seq2):
    """
    Align two token sequences around their longest common subsequence (LCS).

    Big Idea:
    - tokens belonging to the LCS are placed in the same column of both outputs
    - every out-of-LCS token of one sequence is paired with a "Pad" token in
      the other sequence, marking token-level noise

    Returns the pair (seq1_aligned, seq2_aligned); both lists have equal length.
    """
    pad_token = "Pad"
    anchors = LCS_return_seq(seq1, seq2)

    out1, out2 = [], []
    p1 = p2 = 0

    for anchor in anchors:
        # First emit the seq1 tokens that precede the anchor; seq2 gets padding.
        while seq1[p1] != anchor:
            out1.append(seq1[p1])
            out2.append(pad_token)
            p1 += 1

        # Then emit the seq2 tokens that precede the anchor; seq1 gets padding.
        while seq2[p2] != anchor:
            out1.append(pad_token)
            out2.append(seq2[p2])
            p2 += 1

        # Both pointers now sit on the anchor: emit it in the same column.
        out1.append(seq1[p1])
        out2.append(seq2[p2])
        p1 += 1
        p2 += 1

    # Everything left after the last anchor is out-of-LCS: seq1 leftovers
    # first, then seq2 leftovers, each paired with padding on the other side.
    for k in range(p1, len(seq1)):
        out1.append(seq1[k])
        out2.append(pad_token)

    for k in range(p2, len(seq2)):
        out1.append(pad_token)
        out2.append(seq2[k])

    return out1, out2


In [33]:
# Case where the sequences disagree at the same position: 'j' in seq1 versus
# 'z','b' in seq2 are all outside the LCS, so each is paired with padding
seq1 = ['a','j','c','d','e']
seq2 = ['a','z','b','c','e']
seq1_aligned, seq2_aligned = force_align(seq1,seq2)
print(seq1_aligned)
print(seq2_aligned)

**Given example**

Input:

```
1. a b c d e
2. a b c e
3. a b c d e
4. a b c e
5. a z b c e
```

Output:


```
1. a pad b c d   e
2. a pad b c pad e
3. a pad b c d   e
4. a pad b c pad e
5. a z   b c pad e
```

In [38]:
# Example sequences 1 and 2: seq2 lacks 'd', so it receives one padding token there
seq1 = ['a','b','c','d','e']
seq2 = ['a','b','c','e']
seq1_aligned, seq2_aligned = force_align(seq1,seq2)
print(seq1_aligned)
print(seq2_aligned)

['a', 'b', 'c', 'd', 'e']
['a', 'b', 'c', 'Pad', 'e']


In [40]:
# Example sequence 1 against an arbitrary sequence that has two extra tokens
# ('z' and 'g') and lacks 'd'
seq1 = ['a','b','c','d','e']
seq2 = ['a','z','b','g','c','e']
seq1_aligned, seq2_aligned = force_align(seq1,seq2)
print(seq1_aligned)
print(seq2_aligned)

['a', 'Pad', 'b', 'Pad', 'c', 'd', 'e']
['a', 'z', 'b', 'g', 'c', 'Pad', 'e']


In [43]:
# Example sequences 1 and 5: seq2 has an extra 'z' and lacks 'd'
seq1 = ['a','b','c','d','e']
seq2 = ['a','z','b','c','e']
seq1_aligned, seq2_aligned = force_align(seq1,seq2)
print(seq1_aligned)
print(seq2_aligned)

['a', 'Pad', 'b', 'c', 'd', 'e']
['a', 'z', 'b', 'c', 'Pad', 'e']


In [44]:
# Sequence 1 as it looks after being aligned with sequence 5, now aligned with
# sequence 2. The 'Pad' already present in seq1 is an ordinary out-of-LCS token
# here, so seq2 receives a 'Pad' opposite it.
seq1 = ['a', 'Pad', 'b', 'c', 'd', 'e']
seq2 = ['a','b','c','e']
seq1_aligned, seq2_aligned = force_align(seq1,seq2)
print(seq1_aligned)
print(seq2_aligned)

['a', 'Pad', 'b', 'c', 'd', 'e']
['a', 'Pad', 'b', 'c', 'Pad', 'e']


## Pad n sequences. Example 5 sequences

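Big Idea: keep seq1 as the running reference and force align every other sequence against it, one at a time.

Force align seq1 and seq2 and get the new (possibly padded) seq1.

Force align seq1 and seq3 and get the new seq1.

If this step added new padding tokens to seq1:
- insert padding at the same indexes in the aligned seq2 to re-align it

Force align seq1 and seq4 and get the new seq1.

If this step added new padding tokens to seq1:
- insert padding at the same indexes in the aligned seq2 to re-align it
- insert padding at the same indexes in the aligned seq3 to re-align it

Force align seq1 and seq5 and get the new seq1.

If this step added new padding tokens to seq1:
- insert padding at the same indexes in the aligned seq2 to re-align it
- insert padding at the same indexes in the aligned seq3 to re-align it
- insert padding at the same indexes in the aligned seq4 to re-align it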


In [55]:
def force_align(seq1,seq2,padding="Pad"):
    """
    Align seq2 against the reference ("GT") sequence seq1 around their LCS.

    Note: this redefinition replaces the earlier two-value force_align in this
    notebook; it additionally reports where padding was inserted into seq1.

    Big Idea:
    - align the LCS tokens so they share a column in both outputs
    - for each out-of-LCS token in one sequence, align it with a padding token
      in the other sequence to denote token-level noise

    Parameters
    ----------
    seq1 : sequence of tokens used as the reference.
    seq2 : sequence of tokens to align against seq1.
    padding : token inserted opposite an out-of-LCS token (default "Pad").

    Returns
    -------
    seq1_aligned : list, seq1 with padding inserted opposite seq2-only tokens.
    seq2_aligned : list, seq2 with padding inserted opposite seq1-only tokens.
    seq1_pad_indexes : list of int, ascending indexes into seq1_aligned of
        every padding token inserted by this call, including padding appended
        after the last LCS token. Tokens that were already in seq1 are never
        listed, even if they equal `padding`.
    """
    # Get the lcs between the 2 sequences
    lcs = LCS_return_seq(seq1, seq2)

    seq1_aligned = []
    seq2_aligned = []

    # Indexes (in seq1_aligned) where new padding tokens are inserted
    seq1_pad_indexes = []

    i = 0
    j = 0

    for x in lcs:
        next_x = False
        while not next_x:

            # Case 1: seq1[i]==seq2[j]==x
            # Both pointers sit on the LCS token: emit it in the same column
            # and move on to the next LCS token.
            if seq1[i]==x and seq2[j]==x:
                seq1_aligned.append(seq1[i])
                seq2_aligned.append(seq2[j])
                i+=1
                j+=1
                next_x = True

            # Case 2: seq1[i]==x but seq2[j]!=x
            # seq2[j] is out-of-LCS: pair it with new padding in seq1 and
            # remember where that padding went.
            elif seq1[i]==x and seq2[j]!=x:
                seq1_pad_indexes.append(len(seq1_aligned))
                seq1_aligned.append(padding)
                seq2_aligned.append(seq2[j])
                j+=1

            # Case 3 and 4: seq1[i]!=x (whether or not seq2[j]==x)
            # seq1[i] is out-of-LCS: pair it with padding in seq2. When both
            # sides are out-of-LCS, seq1's tokens are emitted first and seq2's
            # tokens are then handled by Case 2.
            else:
                seq1_aligned.append(seq1[i])
                seq2_aligned.append(padding)
                i+=1

    # Once all the lcs tokens have been aligned, the leftovers of each
    # sequence are out-of-LCS and are paired with padding on the other side.
    while i < len(seq1):
        seq1_aligned.append(seq1[i])
        seq2_aligned.append(padding)
        i+=1

    while j < len(seq2):
        # Trailing padding in seq1 is new padding too, so it must be recorded;
        # otherwise callers cannot re-align previously aligned sequences.
        seq1_pad_indexes.append(len(seq1_aligned))
        seq1_aligned.append(padding)
        seq2_aligned.append(seq2[j])
        j+=1

    return seq1_aligned, seq2_aligned, seq1_pad_indexes


**Testing indexes**

In [56]:
# seq2 has one extra token ('z'), so seq1 gains a single padding token and
# exactly one index is expected
seq1 = ['a','b','c','d','e']
seq2 = ['a','z','b','c','e']
seq1_aligned, seq2_aligned, seq1_pad_indexes = force_align(seq1,seq2)
print(seq1_aligned)
print(seq2_aligned)
print(seq1_pad_indexes)

['a', 'Pad', 'b', 'c', 'd', 'e']
['a', 'z', 'b', 'c', 'Pad', 'e']
[1]


In [57]:
# 'z' and 'b' in seq2 are both out-of-LCS, so seq1 gains two consecutive
# padding tokens; 'j' from seq1 is emitted before them
seq1 = ['a','j','c','d','e']
seq2 = ['a','z','b','c','e']
seq1_aligned, seq2_aligned, seq1_pad_indexes = force_align(seq1,seq2)
print(seq1_aligned)
print(seq2_aligned)
print(seq1_pad_indexes)

['a', 'j', 'Pad', 'Pad', 'c', 'd', 'e']
['a', 'Pad', 'z', 'b', 'c', 'Pad', 'e']
[2, 3]


In [59]:
# seq2 already contains a literal 'Pad' token plus an extra 'd'; both are
# out-of-LCS for seq1, so seq1 gains padding at two separate positions
seq1 = ['a','b','c','e']
seq2 = ['a', 'Pad', 'b', 'c', 'd', 'e']
seq1_aligned, seq2_aligned, seq1_pad_indexes = force_align(seq1,seq2)
print(seq1_aligned)
print(seq2_aligned)
print(seq1_pad_indexes)

['a', 'Pad', 'b', 'c', 'Pad', 'e']
['a', 'Pad', 'b', 'c', 'd', 'e']
[1, 4]


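Force align seq1 and seq2 and get the new (possibly padded) seq1.

Force align seq1 and seq3 and get the new seq1.

If this step added new padding tokens to seq1:
- insert padding at the same indexes in the aligned seq2 to re-align it

Force align seq1 and seq4 and get the new seq1.

If this step added new padding tokens to seq1:
- insert padding at the same indexes in the aligned seq2 to re-align it
- insert padding at the same indexes in the aligned seq3 to re-align it

Force align seq1 and seq5 and get the new seq1.

If this step added new padding tokens to seq1:
- insert padding at the same indexes in the aligned seq2 to re-align it
- insert padding at the same indexes in the aligned seq3 to re-align it
- insert padding at the same indexes in the aligned seq4 to re-align it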

In [62]:
seq1 = ['a', 'b', 'c', 'd', 'e']
seq2 = ['a', 'b', 'c', 'e']
seq3 = ['a', 'b', 'c', 'd', 'e']
seq4 = ['a', 'b', 'c', 'e']
seq5 = ['a', 'z', 'b', 'c', 'e']

compare_seqs = [seq2,seq3,seq4,seq5]
aligned_seqs = []

# Running reference: starts as a copy of seq1 and accumulates every padding
# token introduced so far. Each new sequence must be aligned against THIS
# (not the raw seq1) so that all pad indexes share one coordinate system and
# padding from earlier comparisons is not lost.
seq1_aligned = list(seq1)

for seq in compare_seqs:
    # Force align the running reference and the next sequence to compare
    seq1_aligned, seq2_aligned, seq1_pad_indexes = force_align(seq1_aligned, seq)

    # Re-align the previously aligned sequences with the new reference
    for prev_aligned in aligned_seqs:
        # Indexes are ascending positions in the new reference, so inserting
        # them in order keeps later indexes valid.
        for new_pad_idx in seq1_pad_indexes:
            prev_aligned.insert(new_pad_idx, "Pad")
        # Padding appended after the reference's last token only lengthens the
        # tail, so top up any sequence that is still shorter than the reference.
        prev_aligned.extend(["Pad"] * (len(seq1_aligned) - len(prev_aligned)))

    # Append the sequence that has just been aligned
    aligned_seqs.append(seq2_aligned)

# The fully padded reference goes first, followed by seq2..seq5 in order
aligned_seqs.insert(0, seq1_aligned)

In [63]:
# Show the result: the aligned reference (seq1) first, then seq2..seq5 in order
aligned_seqs

[['a', 'Pad', 'b', 'c', 'd', 'e'],
 ['a', 'Pad', 'b', 'c', 'Pad', 'e'],
 ['a', 'Pad', 'b', 'c', 'd', 'e'],
 ['a', 'Pad', 'b', 'c', 'Pad', 'e'],
 ['a', 'z', 'b', 'c', 'Pad', 'e']]

Input:

```
1. a b c d e
2. a b c e
3. a b c d e
4. a b c e
5. a z b c e
```

Output:


```
1. a pad b c d   e
2. a pad b c pad e
3. a pad b c d   e
4. a pad b c pad e
5. a z   b c pad e
```