# Extract Noise Embedding

## Get the Longest Common Subsequence

In [1]:
def LCS(seq1, seq2):
    """
    Return the length of the longest common subsequence of seq1 and seq2.

    The table has len(seq1)+1 rows and len(seq2)+1 columns. The extra last
    row and last column represent the empty suffix and are left at 0, which
    is the base case. table[r][c] is the LCS length of seq1[r:] and seq2[c:].
    """
    n_rows, n_cols = len(seq1), len(seq2)
    table = [[0] * (n_cols + 1) for _ in range(n_rows + 1)]

    # Fill from the bottom-right corner towards the top-left corner so that
    # every cell only depends on cells that are already computed.
    for r in reversed(range(n_rows)):
        row_here, row_below = table[r], table[r + 1]
        for c in reversed(range(n_cols)):
            if seq1[r] == seq2[c]:
                # Matching elements extend the LCS of the two shorter
                # suffixes (the diagonal neighbour) by one.
                row_here[c] = row_below[c + 1] + 1
            else:
                # Otherwise drop one element from either sequence and keep
                # whichever sub-problem has the longer LCS.
                row_here[c] = max(row_here[c + 1], row_below[c])

    # The top-left cell covers both full sequences.
    return table[0][0]

In [2]:
# Sanity check: the longest common subsequence of the two strings is 'ace',
# so the expected length is 3.
LCS('abcde','ace')

3

In [3]:
def LCS_return_seq(seq1, seq2):
    """
    Return one longest common subsequence of seq1 and seq2 as a list.

    A table with len(seq1)+1 rows and len(seq2)+1 columns is filled first;
    the extra last row and column stand for the empty suffix (base case, 0).
    table[r][c] is the LCS length of seq1[r:] and seq2[c:]. The subsequence
    itself is then read off by walking the table from the top-left corner.
    """
    n_rows, n_cols = len(seq1), len(seq2)
    table = [[0] * (n_cols + 1) for _ in range(n_rows + 1)]

    # Phase 1: bottom-up fill, starting at the bottom-right corner.
    for r in reversed(range(n_rows)):
        for c in reversed(range(n_cols)):
            if seq1[r] == seq2[c]:
                # A match extends the diagonal sub-problem by one.
                table[r][c] = table[r + 1][c + 1] + 1
            else:
                # No match: best of skipping an element of either sequence.
                table[r][c] = max(table[r][c + 1], table[r + 1][c])

    # Phase 2: walk the table from the top-left to collect the elements.
    common = []
    r = c = 0
    while r < n_rows and c < n_cols:
        if seq1[r] == seq2[c]:
            # Matching elements belong to the subsequence; step diagonally.
            common.append(seq1[r])
            r += 1
            c += 1
            continue
        # Follow the neighbour with the larger LCS length. On a tie the walk
        # advances in seq1 (moves down), so only a strictly larger right
        # neighbour makes it advance in seq2.
        if table[r][c + 1] > table[r + 1][c]:
            c += 1
        else:
            r += 1

    return common

In [4]:
# Sanity check: same inputs as the length-only version, but this time the
# subsequence itself is returned as a list of elements.
LCS_return_seq('abcde','ace')

['a', 'c', 'e']

## Aligning 2 sequences

In [5]:
def force_align(seq1,seq2):
    """
    Align two sequences column by column around their longest common subsequence.

    Big idea:
    - tokens of the LCS are placed in the same column of both outputs
    - every out-of-LCS token of one sequence is paired with a padding token
      in the other sequence, to denote token-level noise

    Returns a pair (seq1_aligned, seq2_aligned) of equal-length lists.
    """
    padding = "Pad"

    # The LCS tokens act as anchors that both sequences must line up on.
    anchors = LCS_return_seq(seq1, seq2)

    left_out = []
    right_out = []
    p = 0  # read position in seq1
    q = 0  # read position in seq2

    for anchor in anchors:
        # Keep emitting columns until the current anchor has been placed.
        while True:
            if seq1[p] == anchor:
                if seq2[q] == anchor:
                    # Both sequences are at the anchor: emit it in one
                    # column and move on to the next anchor.
                    left_out.append(seq1[p])
                    right_out.append(seq2[q])
                    p += 1
                    q += 1
                    break
                # seq1 is waiting at the anchor while seq2 still has an
                # out-of-LCS token: pad seq1 opposite that token.
                left_out.append(padding)
                right_out.append(seq2[q])
                q += 1
            else:
                # seq1 has an out-of-LCS token (whether or not seq2 is at
                # the anchor): pad seq2 opposite it. When both sequences
                # have extra tokens, seq1's are flushed first and seq2's
                # follow through the branch above.
                left_out.append(seq1[p])
                right_out.append(padding)
                p += 1

    # All anchors are placed; whatever is left in either sequence is
    # out-of-LCS and gets a padding partner in the other sequence.
    leftover_left = seq1[p:]
    left_out.extend(leftover_left)
    right_out.extend([padding] * len(leftover_left))

    leftover_right = seq2[q:]
    left_out.extend([padding] * len(leftover_right))
    right_out.extend(leftover_right)

    return left_out, right_out


**Testing Case where 1 col doesn't match**

In [6]:
# Case where one column doesn't match: between the shared 'a' and 'c',
# seq1 has 'j' while seq2 has 'z','b'; 'd' appears only in seq1.
seq1 = ['a','j','c','d','e']
seq2 = ['a','z','b','c','e']
seq1_aligned, seq2_aligned = force_align(seq1,seq2)
print(seq1_aligned)
print(seq2_aligned)

['a', 'j', 'Pad', 'Pad', 'c', 'd', 'e']
['a', 'Pad', 'z', 'b', 'c', 'Pad', 'e']


**Testing 2 Sequences using the given example**

**Given example**

Input:

```
1. a b c d e
2. a b c e
3. a b c d e
4. a b c e
5. a z b c e
```

Output:


```
1. a pad b c d   e
2. a pad b c pad e
3. a pad b c d   e
4. a pad b c pad e
5. a z   b c pad e
```

In [7]:
# Sequences 1 and 2 of the given example: seq2 is seq1 without 'd',
# so seq2 should receive a single padding token opposite 'd'.
seq1 = ['a','b','c','d','e']
seq2 = ['a','b','c','e']
seq1_aligned, seq2_aligned = force_align(seq1,seq2)
print(seq1_aligned)
print(seq2_aligned)

['a', 'b', 'c', 'd', 'e']
['a', 'b', 'c', 'Pad', 'e']


In [8]:
# Sequence 1 against an arbitrary sequence that has extra tokens ('z', 'g')
# and is missing 'd'.
seq1 = ['a','b','c','d','e']
seq2 = ['a','z','b','g','c','e']
seq1_aligned, seq2_aligned = force_align(seq1,seq2)
print(seq1_aligned)
print(seq2_aligned)

['a', 'Pad', 'b', 'Pad', 'c', 'd', 'e']
['a', 'z', 'b', 'g', 'c', 'Pad', 'e']


In [9]:
# Sequences 1 and 5 of the given example: seq2 has an extra 'z' and is
# missing 'd'.
seq1 = ['a','b','c','d','e']
seq2 = ['a','z','b','c','e']
seq1_aligned, seq2_aligned = force_align(seq1,seq2)
print(seq1_aligned)
print(seq2_aligned)

['a', 'Pad', 'b', 'c', 'd', 'e']
['a', 'z', 'b', 'c', 'Pad', 'e']


In [10]:
# Sequence 1 as it looks after being aligned with sequence 5 (it already
# contains a 'Pad' token), aligned against sequence 2. The existing 'Pad'
# is handled like any other out-of-LCS token, so seq2 gets a padding
# token opposite it.
seq1 = ['a', 'Pad', 'b', 'c', 'd', 'e']
seq2 = ['a','b','c','e']
seq1_aligned, seq2_aligned = force_align(seq1,seq2)
print(seq1_aligned)
print(seq2_aligned)

['a', 'Pad', 'b', 'c', 'd', 'e']
['a', 'Pad', 'b', 'c', 'Pad', 'e']


<br/>
<br/>
<br/>

## Aligning N hypotheses

**What if different pairs of sequences have different LCSs?**

In [11]:
# seq1 and seq2 have 'a','c','e' in common; seq3 and seq4 have 'a','g','p','e'
# in common; only 'a' and 'e' appear in all four sequences.
seq1 = ['a','q','c','r','e']
seq2 = ['a','s','c','t','e']
seq3 = ['a','u','g','v','p','b','e']
seq4 = ['a','z','w','g','p','x','e']

In [12]:
# Pairwise alignment of seq1 and seq2.
seq1_aligned, seq2_aligned = force_align(seq1,seq2)
print(seq1_aligned)
print(seq2_aligned)

['a', 'q', 'Pad', 'c', 'r', 'Pad', 'e']
['a', 'Pad', 's', 'c', 'Pad', 't', 'e']


In [13]:
# Pairwise alignment of seq1 and seq3.
# Note: the variable seq2_aligned holds the aligned version of seq3 here.
seq1_aligned, seq2_aligned = force_align(seq1,seq3)
print(seq1_aligned)
print(seq2_aligned)

['a', 'q', 'c', 'r', 'Pad', 'Pad', 'Pad', 'Pad', 'Pad', 'e']
['a', 'Pad', 'Pad', 'Pad', 'u', 'g', 'v', 'p', 'b', 'e']


In [14]:
# Pairwise alignment of seq3 and seq4.
# Note: seq1_aligned / seq2_aligned hold the aligned seq3 / seq4 here.
seq1_aligned, seq2_aligned = force_align(seq3,seq4)
print(seq1_aligned)
print(seq2_aligned)

['a', 'u', 'Pad', 'Pad', 'g', 'v', 'p', 'b', 'Pad', 'e']
['a', 'Pad', 'z', 'w', 'g', 'Pad', 'p', 'Pad', 'x', 'e']


<br/>
<br/>
<br/>

**Approach**:

Step 1:

Align sequence 1 and sequence 2. We get:
```
seq1 = ['a', 'q', 'Pad', 'c', 'r', 'Pad', 'e']
seq2 = ['a', 'Pad', 's', 'c', 'Pad', 't', 'e']
```

Step 2:

Extract the non-padding token from each column of the aligned sequences (within a column, all non-padding tokens are identical). The resulting sequence lets us align the new sequence to both sequence 1 and sequence 2 at once:
```
seq_intersection = ['a', 'q', 's', 'c', 'r', 't', 'e']
```

Step 3:

Align the new sequence to the seq_intersection of the results. 

This ensures that we align the new sequence with all other sequences in the results:
```
seq_intersection = ['a', 'q', 's', 'c', 'r', 't', 'e']
seq3 = ['a','u','g','v','p','b','e']
```

The result is:
```
seq_intersection = ['a', 'q', 's', 'c', 'r', 't', 'Pad', 'Pad', 'Pad', 'Pad', 'Pad', 'e']
seq3 = ['a', 'Pad', 'Pad', 'Pad', 'Pad', 'Pad', 'u', 'g', 'v', 'p', 'b', 'e']
```

Step 4:

Re-align sequence 1 and sequence 2 by inserting 'Pad' tokens at the positions where they were inserted into the seq_intersection.

These are exactly the padding tokens that sequence 1 and sequence 2 need in order to stay aligned with the new sequence.


```
seq1      = ['a', 'q', 'Pad', 'c', 'r', 'Pad',                                       'e'] (index i)
seq2      = ['a', 'Pad', 's', 'c', 'Pad', 't',                                       'e'] (index j)
token_seq = ['a', 'q', 's', 'c', 'r', 't',        'Pad', 'Pad', 'Pad', 'Pad', 'Pad', 'e'] (index k)
```

In [15]:
def align_new_seq(results,new_seq):
    """
    Add new_seq to a set of already aligned sequences.

    results is a list of equal-length aligned sequences (containing 'Pad'
    tokens); new_seq is an unaligned sequence. Returns a new list holding the
    re-padded versions of the sequences in results followed by the aligned
    new_seq. Intermediate values are printed along the way.
    """
    def show(title, value):
        # Print a titled value followed by an empty separator line.
        print(title)
        print(value)
        print("")

    # Collapse the aligned sequences into a single row: for every column take
    # its first non-padding token (the non-padding tokens of a column are
    # the same throughout the column).
    seq_intersection = [
        [token for token in column if token != 'Pad'][0]
        for column in zip(*results)
    ]

    print("Results")
    for row in results:
        print(row)
    print("")

    show("Sequence Intersection: ", seq_intersection)
    show("New Sequence:", new_seq)

    # Aligning the new sequence against the collapsed row aligns it with
    # every sequence in results at once.
    seq_intersection_aligned, new_seq_aligned = force_align(seq_intersection,new_seq)

    show("Sequence Intersection aligned: ", seq_intersection_aligned)
    show("New Sequence Aligned:", new_seq_aligned)

    # Rebuild every existing sequence along the aligned collapsed row.
    aligned_results = [[] for _ in results]
    source_col = 0  # next column of results that still has to be copied
    for marker in seq_intersection_aligned:
        if marker == 'Pad':
            # Padding in the collapsed row means the new sequence has a token
            # here that no existing sequence has: pad all of them.
            for rebuilt in aligned_results:
                rebuilt.append('Pad')
            continue
        # Otherwise the existing column is still valid: copy it over and
        # advance to the next existing column.
        for rebuilt, previous in zip(aligned_results, results):
            rebuilt.append(previous[source_col])
        source_col += 1

    aligned_results.append(new_seq_aligned)

    print("Aligned Results")
    for row in aligned_results:
        print(row)

    # Five empty lines separate the output of consecutive calls.
    for _ in range(5):
        print("")

    return aligned_results


def force_align_n_seqs(seqs):
    """
    Progressively align any number of sequences.

    The first two sequences are aligned with force_align; every further
    sequence is then merged into the running result with align_new_seq.

    Parameters:
        seqs: a list of sequences (e.g. lists of tokens).

    Returns:
        A list with one aligned sequence per input sequence, in input order.
        An empty input yields an empty list, and a single input sequence is
        returned (as a copied list) without any padding, since there is
        nothing to align it against.
    """
    # Guard the degenerate cases; previously they raised IndexError on seqs[1].
    if len(seqs) == 0:
        return []
    if len(seqs) == 1:
        return [list(seqs[0])]

    # First force align 2 sequences
    seq1, seq2 = force_align(seqs[0], seqs[1])
    result = [seq1, seq2]

    # Next, force align the remaining sequences progressively
    for seq in seqs[2:]:
        result = align_new_seq(result, seq)

    return result

**Test Case 1: seq1 and seq2 share one LCS, seq3 and seq4 share a different LCS, and all 4 sequences share a common subsequence**

```
seq1 = ['a','q','c','r','e']
seq2 = ['a','s','c','t','e']

seq3 = ['a','u','g','v','p','b','e']
seq4 = ['a','z','w','g','p','x','e']
```

In [16]:
# Test case 1: align all four sequences progressively.
# The intermediate steps are printed by align_new_seq.
seq1 = ['a','q','c','r','e']
seq2 = ['a','s','c','t','e']
seq3 = ['a','u','g','v','p','b','e']
seq4 = ['a','z','w','g','p','x','e']
seqs = [seq1,seq2,seq3,seq4]
results = force_align_n_seqs(seqs)

Results
['a', 'q', 'Pad', 'c', 'r', 'Pad', 'e']
['a', 'Pad', 's', 'c', 'Pad', 't', 'e']

Sequence Intersection: 
['a', 'q', 's', 'c', 'r', 't', 'e']

New Sequence:
['a', 'u', 'g', 'v', 'p', 'b', 'e']

Sequence Intersection aligned: 
['a', 'q', 's', 'c', 'r', 't', 'Pad', 'Pad', 'Pad', 'Pad', 'Pad', 'e']

New Sequence Aligned:
['a', 'Pad', 'Pad', 'Pad', 'Pad', 'Pad', 'u', 'g', 'v', 'p', 'b', 'e']

Aligned Results
['a', 'q', 'Pad', 'c', 'r', 'Pad', 'Pad', 'Pad', 'Pad', 'Pad', 'Pad', 'e']
['a', 'Pad', 's', 'c', 'Pad', 't', 'Pad', 'Pad', 'Pad', 'Pad', 'Pad', 'e']
['a', 'Pad', 'Pad', 'Pad', 'Pad', 'Pad', 'u', 'g', 'v', 'p', 'b', 'e']





Results
['a', 'q', 'Pad', 'c', 'r', 'Pad', 'Pad', 'Pad', 'Pad', 'Pad', 'Pad', 'e']
['a', 'Pad', 's', 'c', 'Pad', 't', 'Pad', 'Pad', 'Pad', 'Pad', 'Pad', 'e']
['a', 'Pad', 'Pad', 'Pad', 'Pad', 'Pad', 'u', 'g', 'v', 'p', 'b', 'e']

Sequence Intersection: 
['a', 'q', 's', 'c', 'r', 't', 'u', 'g', 'v', 'p', 'b', 'e']

New Sequence:
['a', 'z', 'w', 'g', 'p', 'x', 'e']

... (output truncated)

In [17]:
# Show the original sequences followed by their final aligned versions.
print("Input:")
for seq in seqs:
    print(seq)
print("")
print("Output:")
for seq in results:
    print(seq)

Input:
['a', 'q', 'c', 'r', 'e']
['a', 's', 'c', 't', 'e']
['a', 'u', 'g', 'v', 'p', 'b', 'e']
['a', 'z', 'w', 'g', 'p', 'x', 'e']

Output:
['a', 'q', 'Pad', 'c', 'r', 'Pad', 'Pad', 'Pad', 'Pad', 'Pad', 'Pad', 'Pad', 'Pad', 'Pad', 'e']
['a', 'Pad', 's', 'c', 'Pad', 't', 'Pad', 'Pad', 'Pad', 'Pad', 'Pad', 'Pad', 'Pad', 'Pad', 'e']
['a', 'Pad', 'Pad', 'Pad', 'Pad', 'Pad', 'u', 'Pad', 'Pad', 'g', 'v', 'p', 'b', 'Pad', 'e']
['a', 'Pad', 'Pad', 'Pad', 'Pad', 'Pad', 'Pad', 'z', 'w', 'g', 'Pad', 'p', 'Pad', 'x', 'e']


<br/>

**Test Case 2 using the given example**

**Given example**

Input:

```
1. a b c d e
2. a b c e
3. a b c d e
4. a b c e
5. a z b c e
```

Output:


```
1. a pad b c d   e
2. a pad b c pad e
3. a pad b c d   e
4. a pad b c pad e
5. a z   b c pad e
```

In [18]:
# Test case 2: the five sequences of the given example, aligned progressively.
# The intermediate steps are printed by align_new_seq.
seq1 = ['a','b','c','d','e']
seq2 = ['a','b','c','e']
seq3 = ['a','b','c','d','e']
seq4 = ['a','b','c','e']
seq5 = ['a','z','b','c','e']
seqs = [seq1,seq2,seq3,seq4,seq5]
results = force_align_n_seqs(seqs)

Results
['a', 'b', 'c', 'd', 'e']
['a', 'b', 'c', 'Pad', 'e']

Sequence Intersection: 
['a', 'b', 'c', 'd', 'e']

New Sequence:
['a', 'b', 'c', 'd', 'e']

Sequence Intersection aligned: 
['a', 'b', 'c', 'd', 'e']

New Sequence Aligned:
['a', 'b', 'c', 'd', 'e']

Aligned Results
['a', 'b', 'c', 'd', 'e']
['a', 'b', 'c', 'Pad', 'e']
['a', 'b', 'c', 'd', 'e']





Results
['a', 'b', 'c', 'd', 'e']
['a', 'b', 'c', 'Pad', 'e']
['a', 'b', 'c', 'd', 'e']

Sequence Intersection: 
['a', 'b', 'c', 'd', 'e']

New Sequence:
['a', 'b', 'c', 'e']

Sequence Intersection aligned: 
['a', 'b', 'c', 'd', 'e']

New Sequence Aligned:
['a', 'b', 'c', 'Pad', 'e']

Aligned Results
['a', 'b', 'c', 'd', 'e']
['a', 'b', 'c', 'Pad', 'e']
['a', 'b', 'c', 'd', 'e']
['a', 'b', 'c', 'Pad', 'e']





Results
['a', 'b', 'c', 'd', 'e']
['a', 'b', 'c', 'Pad', 'e']
['a', 'b', 'c', 'd', 'e']
['a', 'b', 'c', 'Pad', 'e']

Sequence Intersection: 
['a', 'b', 'c', 'd', 'e']

New Sequence:
['a', 'z', 'b', 'c', 'e']

Sequence Intersection aligned: 

... (output truncated)

In [19]:
# Show the original sequences followed by their final aligned versions.
print("Input:")
for seq in seqs:
    print(seq)
print("")
print("Output:")
for seq in results:
    print(seq)

Input:
['a', 'b', 'c', 'd', 'e']
['a', 'b', 'c', 'e']
['a', 'b', 'c', 'd', 'e']
['a', 'b', 'c', 'e']
['a', 'z', 'b', 'c', 'e']

Output:
['a', 'Pad', 'b', 'c', 'd', 'e']
['a', 'Pad', 'b', 'c', 'Pad', 'e']
['a', 'Pad', 'b', 'c', 'd', 'e']
['a', 'Pad', 'b', 'c', 'Pad', 'e']
['a', 'z', 'b', 'c', 'Pad', 'e']


<br/>
<br/>
<br/>

<br/>
<br/>
<br/>

```
# Deprecated code: earlier versions of align_new_seq / force_align_n_seqs, kept for reference only.
def align_new_seq(results,new_seq):
    # Get the intersection of the sequences i.e. only the tokens and no padding tokens
    seq_intersection = []
    # For each column in the aligned sequences in the results,
    # get the non-padding token, which is the same throughout the column
    for col in zip(*results):
        tokens = [t for t in col if t != 'Pad']
        seq_intersection.append(tokens[0])

    print("Results")
    for res in results:
        print(res)
    print("")


    print("Sequence Intersection: ")
    print(seq_intersection)
    print("")

    print("New Sequence:")
    print(new_seq)
    print("")

    # Align the new sequence with the sequence intersection. This ensures that we align the new sequence
    # with all other sequences in the results
    seq_intersection_aligned, new_seq_aligned = force_align(seq_intersection,new_seq)

    print("Sequence Intersection aligned: ")
    print(seq_intersection_aligned)
    print("")

    print("New Sequence Aligned:")
    print(new_seq_aligned)
    print("")

    # Initialise the column pointer
    col_pointer = 0
    # Initialise the newly aligned results
    aligned_results = [[] for _ in range(len(results))] 

    # For each token in the aligned sequence intersection
    for k in range(len(seq_intersection_aligned)):
        # Check if it at least one of the tokens in the column of the result matches the current sequence intersection token
        aligned = False
        # For each result, check if its value in the current column matches the current sequence intersection token
        for i in range(len(results)): 
            if results[i][col_pointer]==seq_intersection_aligned[k]:
                # This means that there is no new padding in the modified sequence intersection at this position,
                # as it still uses the tokens from our aligned sequence
                aligned = True 
                break
        # If there was an alignment with the modified sequence intersection at this position
        if aligned:
             # Add the current tokens in the column to the newly aligned results and 
             # advance the column pointer to compare the next column
             for i in range(len(results)):
                 aligned_results[i].append(results[i][col_pointer])
             col_pointer+=1
        # If there was no alignment with the modified sequence intersection at this position
        # it means we have to add a padding token
        else:
            for i in range(len(results)):
                 aligned_results[i].append('Pad')

    
    aligned_results.append(new_seq_aligned)

    print("Aligned Results")
    for res in aligned_results:
        print(res)

    print("")
    print("")
    print("")
    print("")
    print("")

    return aligned_results


def force_align_n_seqs(seqs):
    # First force align 2 sequences
    seq1,seq2 = force_align(seqs[0],seqs[1])

    result = [seq1,seq2]

    # Next, force align the sequences progressively
    for seq in seqs[2:]:
        result = align_new_seq(result,seq)
        
    return result
```

<br/>
<br/>
<br/>

<br/>
<br/>
<br/>