# Extract Noise Embedding

## Get the Longest Common Subsequence

In [1]:
def LCS(seq1, seq2):
    """
    Return the length of the longest common subsequence of seq1 and seq2.

    A table of len(seq1)+1 rows by len(seq2)+1 columns is used; the extra
    row and column stay at 0 and stand for the empty sequence (base case).
    """
    rows, cols = len(seq1), len(seq2)
    table = [[0] * (cols + 1) for _ in range(rows + 1)]

    # Bottom-up fill, from the bottom-right corner towards the top-left:
    # table[r][c] holds the LCS length of seq1[r:] and seq2[c:]
    for r in reversed(range(rows)):
        for c in reversed(range(cols)):
            if seq1[r] == seq2[c]:
                # Equal elements: count this one and continue with the
                # diagonal subproblem (r+1, c+1)
                table[r][c] = table[r + 1][c + 1] + 1
            else:
                # Different elements: skip one element of either sequence
                # and keep whichever subproblem gives the longer answer
                table[r][c] = max(table[r][c + 1], table[r + 1][c])

    # The top-left cell covers both complete sequences
    return table[0][0]

In [2]:
LCS('abcde','ace')

3

In [74]:
def LCS_return_seq(seq1, seq2):
    """
    Return one longest common subsequence of seq1 and seq2 as a list.

    Phase 1 fills a table of len(seq1)+1 rows by len(seq2)+1 columns (the
    extra row/column is the empty-sequence base case). Phase 2 walks the
    table from the top-left corner to recover the actual elements.
    """
    rows, cols = len(seq1), len(seq2)
    table = [[0] * (cols + 1) for _ in range(rows + 1)]

    # Phase 1: table[r][c] is the LCS length of seq1[r:] and seq2[c:]
    for r in reversed(range(rows)):
        for c in reversed(range(cols)):
            if seq1[r] == seq2[c]:
                # Equal elements extend the diagonal subproblem by one
                table[r][c] = table[r + 1][c + 1] + 1
            else:
                # Otherwise take the better of "skip in seq2" / "skip in seq1"
                table[r][c] = max(table[r][c + 1], table[r + 1][c])

    # Phase 2: follow the table to collect the subsequence itself
    common = []
    r = c = 0
    while r < rows and c < cols:
        if seq1[r] == seq2[c]:
            # Shared element: keep it and step diagonally
            common.append(seq1[r])
            r += 1
            c += 1
            continue
        # No match here: move towards the neighbour with the larger value.
        # On a tie we advance in seq1 first.
        if table[r + 1][c] >= table[r][c + 1]:
            r += 1
        else:
            c += 1

    return common

In [75]:
LCS_return_seq('abcde','ace')

['a', 'c', 'e']

In [76]:
seq1 = ["I", "enjoys", "listening", "to", "music"]
seq2 = ["I", "am", "enjoys", "listening", "music"]
LCS_return_seq(seq1,seq2)

['I', 'enjoys', 'listening', 'music']

## Aligning 2 sequences

In [5]:
def force_align(seq1,seq2):
    """
    Align two sequences column by column around their LCS.

    Big idea:
    - tokens of the LCS are placed in the same column of both outputs
    - every out-of-LCS token of one sequence is paired with a "Pad" token
      in the other sequence, to mark token-level noise

    Returns a tuple (seq1_aligned, seq2_aligned) of equal-length lists.
    """
    pad = "Pad"
    anchors = LCS_return_seq(seq1, seq2)

    left = []
    right = []
    p = 0
    q = 0

    for anchor in anchors:
        # seq1 tokens in front of the anchor are out-of-LCS:
        # keep them on the left and pad the right
        while seq1[p] != anchor:
            left.append(seq1[p])
            right.append(pad)
            p += 1

        # seq1 now sits on the anchor; seq2 tokens in front of the anchor
        # are out-of-LCS: pad the left and keep them on the right
        while seq2[q] != anchor:
            left.append(pad)
            right.append(seq2[q])
            q += 1

        # Both pointers are on the anchor: emit it in the same column
        left.append(seq1[p])
        right.append(seq2[q])
        p += 1
        q += 1

    # Everything after the last anchor is out-of-LCS: first the rest of
    # seq1 (padding seq2), then the rest of seq2 (padding seq1)
    for idx in range(p, len(seq1)):
        left.append(seq1[idx])
        right.append(pad)

    for idx in range(q, len(seq2)):
        left.append(pad)
        right.append(seq2[idx])

    return left, right


**Testing Case where 1 col doesn't match**

In [6]:
# Case where one col doesn't match
seq1 = ['a','j','c','d','e']
seq2 = ['a','z','b','c','e']
seq1_aligned, seq2_aligned = force_align(seq1,seq2)
print(seq1_aligned)
print(seq2_aligned)

['a', 'j', 'Pad', 'Pad', 'c', 'd', 'e']
['a', 'Pad', 'z', 'b', 'c', 'Pad', 'e']


**Testing 2 Sequences using the given example**

**Given example**

Input:

```
1. a b c d e
2. a b c e
3. a b c d e
4. a b c e
5. a z b c e
```

Output:


```
1. a pad b c d   e
2. a pad b c pad e
3. a pad b c d   e
4. a pad b c pad e
5. a z   b c pad e
```

In [7]:
# 1 and 2
seq1 = ['a','b','c','d','e']
seq2 = ['a','b','c','e']
seq1_aligned, seq2_aligned = force_align(seq1,seq2)
print(seq1_aligned)
print(seq2_aligned)

['a', 'b', 'c', 'd', 'e']
['a', 'b', 'c', 'Pad', 'e']


In [8]:
# 1 and random case
seq1 = ['a','b','c','d','e']
seq2 = ['a','z','b','g','c','e']
seq1_aligned, seq2_aligned = force_align(seq1,seq2)
print(seq1_aligned)
print(seq2_aligned)

['a', 'Pad', 'b', 'Pad', 'c', 'd', 'e']
['a', 'z', 'b', 'g', 'c', 'Pad', 'e']


In [9]:
# 1 and 5
seq1 = ['a','b','c','d','e']
seq2 = ['a','z','b','c','e']
seq1_aligned, seq2_aligned = force_align(seq1,seq2)
print(seq1_aligned)
print(seq2_aligned)

['a', 'Pad', 'b', 'c', 'd', 'e']
['a', 'z', 'b', 'c', 'Pad', 'e']


In [10]:
# 1 (after 5) and 2
seq1 = ['a', 'Pad', 'b', 'c', 'd', 'e']
seq2 = ['a','b','c','e']
seq1_aligned, seq2_aligned = force_align(seq1,seq2)
print(seq1_aligned)
print(seq2_aligned)

['a', 'Pad', 'b', 'c', 'd', 'e']
['a', 'Pad', 'b', 'c', 'Pad', 'e']


<br/>
<br/>
<br/>

## Aligning N hypotheses

**What if we have a case where we have 2 different LCS**

In [11]:
seq1 = ['a','q','c','r','e']
seq2 = ['a','s','c','t','e']
seq3 = ['a','u','g','v','p','b','e']
seq4 = ['a','z','w','g','p','x','e']

In [12]:
seq1_aligned, seq2_aligned = force_align(seq1,seq2)
print(seq1_aligned)
print(seq2_aligned)

['a', 'q', 'Pad', 'c', 'r', 'Pad', 'e']
['a', 'Pad', 's', 'c', 'Pad', 't', 'e']


In [13]:
seq1_aligned, seq2_aligned = force_align(seq1,seq3)
print(seq1_aligned)
print(seq2_aligned)

['a', 'q', 'c', 'r', 'Pad', 'Pad', 'Pad', 'Pad', 'Pad', 'e']
['a', 'Pad', 'Pad', 'Pad', 'u', 'g', 'v', 'p', 'b', 'e']


In [14]:
seq1_aligned, seq2_aligned = force_align(seq3,seq4)
print(seq1_aligned)
print(seq2_aligned)

['a', 'u', 'Pad', 'Pad', 'g', 'v', 'p', 'b', 'Pad', 'e']
['a', 'Pad', 'z', 'w', 'g', 'Pad', 'p', 'Pad', 'x', 'e']


<br/>
<br/>
<br/>

**Approach**:

Step 1:

Align sequence 1 and sequence 2. We get:
```
seq1 = ['a', 'q', 'Pad', 'c', 'r', 'Pad', 'e']
seq2 = ['a', 'Pad', 's', 'c', 'Pad', 't', 'e']
```

Step 2:

Extract the tokens from the aligned sequence. This will help align the new sequence properly to both sequence 1 and sequence 2:
```
seq_intersection = ['a', 'q', 's', 'c', 'r', 't', 'e']
```

Step 3:

Align the new sequence to the seq_intersection of the results. 

This ensures that we align the new sequence with all other sequences in the results:
```
seq_intersection = ['a', 'q', 's', 'c', 'r', 't', 'e']
seq3 = ['a','u','g','v','p','b','e']
```

The result is:
```
seq_intersection = ['a', 'q', 's', 'c', 'r', 't', 'Pad', 'Pad', 'Pad', 'Pad', 'Pad', 'e']
seq3 = ['a', 'Pad', 'Pad', 'Pad', 'Pad', 'Pad', 'u', 'g', 'v', 'p', 'b', 'e']
```

Step 4:

Re-align sequence 1 and sequence 2 by inserting 'Pad' tokens at the positions where they were inserted into the seq_intersection.

This is because those positions are exactly the padding that sequence 1 and sequence 2 need in order to stay aligned with the new sequence.


```
seq1      = ['a', 'q', 'Pad', 'c', 'r', 'Pad',                                       'e'] (index i)
seq2      = ['a', 'Pad', 's', 'c', 'Pad', 't',                                       'e'] (index j)
token_seq = ['a', 'q', 's', 'c', 'r', 't',        'Pad', 'Pad', 'Pad', 'Pad', 'Pad', 'e'] (index k)
```

In [15]:
def align_new_seq(results,new_seq):
    """
    Add one more sequence to a set of already-aligned sequences.

    results is a list of aligned rows (as produced by force_align or by a
    previous call to this function); new_seq is the raw sequence to add.
    Returns a new list with every existing row re-padded plus the aligned
    new sequence as the last row. Progress is printed along the way.
    """
    # Collapse the aligned rows into a single padding-free sequence by taking,
    # for each column, its first non-padding token
    seq_intersection = [
        [tok for tok in column if tok != 'Pad'][0]
        for column in zip(*results)
    ]

    print("Results")
    for row in results:
        print(row)
    print("")


    print("Sequence Intersection: ")
    print(seq_intersection)
    print("")

    print("New Sequence:")
    print(new_seq)
    print("")

    # Aligning the new sequence against the collapsed sequence aligns it
    # against every row of the results at once
    seq_intersection_aligned, new_seq_aligned = force_align(seq_intersection,new_seq)

    print("Sequence Intersection aligned: ")
    print(seq_intersection_aligned)
    print("")

    print("New Sequence Aligned:")
    print(new_seq_aligned)
    print("")

    # Rebuild every existing row, column by column
    aligned_results = [[] for _ in results]
    source_col = 0

    for marker in seq_intersection_aligned:
        if marker == 'Pad':
            # A padding column was opened for the new sequence:
            # every existing row gets padding there as well
            for rebuilt in aligned_results:
                rebuilt.append('Pad')
        else:
            # An existing column: copy it over unchanged and move on to
            # the next column of the old results
            for rebuilt, old_row in zip(aligned_results, results):
                rebuilt.append(old_row[source_col])
            source_col += 1

    aligned_results.append(new_seq_aligned)

    print("Aligned Results")
    for row in aligned_results:
        print(row)

    # Five blank lines separate the output of successive calls
    for _ in range(5):
        print("")

    return aligned_results


def force_align_n_seqs(seqs):
    """
    Progressively align any number of sequences.

    The first two sequences are aligned with force_align; every further
    sequence is then merged into the running result with align_new_seq.

    Parameters:
        seqs: a list of sequences (e.g. lists of tokens).

    Returns:
        A list of aligned rows, one per input sequence, in input order.
        With fewer than two sequences there is nothing to align against,
        so a list of copies of the input sequences is returned.
    """
    # Guard: force_align needs two sequences; seqs[1] would raise IndexError
    if len(seqs) < 2:
        return [list(seq) for seq in seqs]

    # First force align 2 sequences
    first, second = force_align(seqs[0], seqs[1])
    result = [first, second]

    # Next, force align the remaining sequences one at a time
    for seq in seqs[2:]:
        result = align_new_seq(result, seq)

    return result

**Test Case 1: seq1 and seq2 share one LCS, seq3 and seq4 share a different LCS, and all 4 sequences share a shorter common subsequence**

```
seq1 = ['a','q','c','r','e']
seq2 = ['a','s','c','t','e']

seq3 = ['a','u','g','v','p','b','e']
seq4 = ['a','z','w','g','p','x','e']
```

The LCS of seq1 and seq2 is: a c e

The LCS of seq3 and seq4 is: a g p e

The LCS of all four sequences is: a e

In [16]:
seq1 = ['a','q','c','r','e']
seq2 = ['a','s','c','t','e']
seq3 = ['a','u','g','v','p','b','e']
seq4 = ['a','z','w','g','p','x','e']
seqs = [seq1,seq2,seq3,seq4]
results = force_align_n_seqs(seqs)

Results
['a', 'q', 'Pad', 'c', 'r', 'Pad', 'e']
['a', 'Pad', 's', 'c', 'Pad', 't', 'e']

Sequence Intersection: 
['a', 'q', 's', 'c', 'r', 't', 'e']

New Sequence:
['a', 'u', 'g', 'v', 'p', 'b', 'e']

Sequence Intersection aligned: 
['a', 'q', 's', 'c', 'r', 't', 'Pad', 'Pad', 'Pad', 'Pad', 'Pad', 'e']

New Sequence Aligned:
['a', 'Pad', 'Pad', 'Pad', 'Pad', 'Pad', 'u', 'g', 'v', 'p', 'b', 'e']

Aligned Results
['a', 'q', 'Pad', 'c', 'r', 'Pad', 'Pad', 'Pad', 'Pad', 'Pad', 'Pad', 'e']
['a', 'Pad', 's', 'c', 'Pad', 't', 'Pad', 'Pad', 'Pad', 'Pad', 'Pad', 'e']
['a', 'Pad', 'Pad', 'Pad', 'Pad', 'Pad', 'u', 'g', 'v', 'p', 'b', 'e']





Results
['a', 'q', 'Pad', 'c', 'r', 'Pad', 'Pad', 'Pad', 'Pad', 'Pad', 'Pad', 'e']
['a', 'Pad', 's', 'c', 'Pad', 't', 'Pad', 'Pad', 'Pad', 'Pad', 'Pad', 'e']
['a', 'Pad', 'Pad', 'Pad', 'Pad', 'Pad', 'u', 'g', 'v', 'p', 'b', 'e']

Sequence Intersection: 
['a', 'q', 's', 'c', 'r', 't', 'u', 'g', 'v', 'p', 'b', 'e']

New Sequence:
['a', 'z', 'w', 'g', 'p', 'x'

In [17]:
print("Input:")
for seq in seqs:
    print(seq)
print("")
print("Output:")
for seq in results:
    print(seq)

Input:
['a', 'q', 'c', 'r', 'e']
['a', 's', 'c', 't', 'e']
['a', 'u', 'g', 'v', 'p', 'b', 'e']
['a', 'z', 'w', 'g', 'p', 'x', 'e']

Output:
['a', 'q', 'Pad', 'c', 'r', 'Pad', 'Pad', 'Pad', 'Pad', 'Pad', 'Pad', 'Pad', 'Pad', 'Pad', 'e']
['a', 'Pad', 's', 'c', 'Pad', 't', 'Pad', 'Pad', 'Pad', 'Pad', 'Pad', 'Pad', 'Pad', 'Pad', 'e']
['a', 'Pad', 'Pad', 'Pad', 'Pad', 'Pad', 'u', 'Pad', 'Pad', 'g', 'v', 'p', 'b', 'Pad', 'e']
['a', 'Pad', 'Pad', 'Pad', 'Pad', 'Pad', 'Pad', 'z', 'w', 'g', 'Pad', 'p', 'Pad', 'x', 'e']


<br/>

**Test Case 2 using the given example**

**Given example**

Input:

```
1. a b c d e
2. a b c e
3. a b c d e
4. a b c e
5. a z b c e
```

Output:


```
1. a pad b c d   e
2. a pad b c pad e
3. a pad b c d   e
4. a pad b c pad e
5. a z   b c pad e
```

In [18]:
seq1 = ['a','b','c','d','e']
seq2 = ['a','b','c','e']
seq3 = ['a','b','c','d','e']
seq4 = ['a','b','c','e']
seq5 = ['a','z','b','c','e']
seqs = [seq1,seq2,seq3,seq4,seq5]
results = force_align_n_seqs(seqs)

Results
['a', 'b', 'c', 'd', 'e']
['a', 'b', 'c', 'Pad', 'e']

Sequence Intersection: 
['a', 'b', 'c', 'd', 'e']

New Sequence:
['a', 'b', 'c', 'd', 'e']

Sequence Intersection aligned: 
['a', 'b', 'c', 'd', 'e']

New Sequence Aligned:
['a', 'b', 'c', 'd', 'e']

Aligned Results
['a', 'b', 'c', 'd', 'e']
['a', 'b', 'c', 'Pad', 'e']
['a', 'b', 'c', 'd', 'e']





Results
['a', 'b', 'c', 'd', 'e']
['a', 'b', 'c', 'Pad', 'e']
['a', 'b', 'c', 'd', 'e']

Sequence Intersection: 
['a', 'b', 'c', 'd', 'e']

New Sequence:
['a', 'b', 'c', 'e']

Sequence Intersection aligned: 
['a', 'b', 'c', 'd', 'e']

New Sequence Aligned:
['a', 'b', 'c', 'Pad', 'e']

Aligned Results
['a', 'b', 'c', 'd', 'e']
['a', 'b', 'c', 'Pad', 'e']
['a', 'b', 'c', 'd', 'e']
['a', 'b', 'c', 'Pad', 'e']





Results
['a', 'b', 'c', 'd', 'e']
['a', 'b', 'c', 'Pad', 'e']
['a', 'b', 'c', 'd', 'e']
['a', 'b', 'c', 'Pad', 'e']

Sequence Intersection: 
['a', 'b', 'c', 'd', 'e']

New Sequence:
['a', 'z', 'b', 'c', 'e']

Sequence Int

In [19]:
print("Input:")
for seq in seqs:
    print(seq)
print("")
print("Output:")
for seq in results:
    print(seq)

Input:
['a', 'b', 'c', 'd', 'e']
['a', 'b', 'c', 'e']
['a', 'b', 'c', 'd', 'e']
['a', 'b', 'c', 'e']
['a', 'z', 'b', 'c', 'e']

Output:
['a', 'Pad', 'b', 'c', 'd', 'e']
['a', 'Pad', 'b', 'c', 'Pad', 'e']
['a', 'Pad', 'b', 'c', 'd', 'e']
['a', 'Pad', 'b', 'c', 'Pad', 'e']
['a', 'z', 'b', 'c', 'Pad', 'e']


<br/>
<br/>
<br/>

## Lang-Space Noise Embedding: Extract token-level noise embedding

## Doing the actual token level noise embedding

- Replace the characters with token embeddings 
- Replace equality with similarity
- SBERT is used to extract the token embedding

Actual Sequences

```
["I", "enjoys", "listening", "to", "music"]
["I", "enjoy", "listen", "music"]
["I", "join", "listening", "to", "music"]
["I", "enjoy", "listened", "mystic"]
["I", "am", "join", "listening", "music"]
```

In [1]:
import jellyfish
from rapidfuzz import fuzz

def similar_syntax_ver1(w1,w2,l1,threshold):
    """
    Decide whether two words are similar using a weighted mix of two scores:
    the indel similarity between the words themselves (weight l1) and the
    indel similarity between their approximate phonetic (metaphone)
    representations (weight 1-l1).

    Returns True, after printing a short report, when the mixed score is
    strictly greater than threshold; otherwise returns False silently.
    """
    word_score = fuzz.ratio(w1,w2)
    phonetic_score = fuzz.ratio(jellyfish.metaphone(w1),jellyfish.metaphone(w2))
    sim = l1*word_score + (1-l1)*phonetic_score

    # Guard clause: not above the threshold means not similar
    if not sim > threshold:
        return False

    print(f"{w1} and {w2} are similar with a score of {sim}")
    print(f"The indel similarity between words is {word_score}")
    print(f"The indel similarity between their phonetics is {phonetic_score}")
    print("")
    return True

In [2]:
import jellyfish
from rapidfuzz import fuzz

def similar_syntax_ver2(w1,w2,threshold):
    """
    Return the indel similarity score between the two words.

    The value is the raw score from fuzz.ratio, not a boolean. The threshold
    parameter is kept so existing calls keep working, but it is not applied
    here: a caller that needs a yes/no answer must compare the returned
    score against its own threshold.

    (The previous body had a threshold check after an unconditional return;
    that code could never run and has been removed.)
    """
    return fuzz.ratio(w1,w2)

In [3]:
from rapidfuzz.distance import Levenshtein

def similar_syntax_ver3(w1,w2,threshold):
    """
    Return the Levenshtein similarity between the two words, scaled by 100.

    The value is the raw score (Levenshtein.normalized_similarity * 100),
    not a boolean. The threshold parameter is kept so existing calls keep
    working, but it is not applied here: a caller that needs a yes/no answer
    must compare the returned score against its own threshold.

    (The previous body had a threshold check after an unconditional return;
    that code could never run and has been removed.)
    """
    return Levenshtein.normalized_similarity(w1,w2)*100

In [4]:
def LCS_ver1(seq1, seq2, l1,threshold):
    """
    Return a longest common subsequence of seq1 and seq2 where two words
    count as equal when similar_syntax_ver1(word1, word2, l1, threshold)
    says they are similar.

    A table of len(seq1)+1 rows by len(seq2)+1 columns is used; the extra
    row and column are the empty-sequence base case. similar_syntax_ver1 is
    called both while filling the table and while walking it, so any report
    it prints for a matching pair can appear more than once.
    """
    rows, cols = len(seq1), len(seq2)
    table = [[0] * (cols + 1) for _ in range(rows + 1)]

    # Fill bottom-up: table[r][c] is the answer for seq1[r:] and seq2[c:]
    for r in range(rows - 1, -1, -1):
        for c in range(cols - 1, -1, -1):
            if similar_syntax_ver1(seq1[r], seq2[c], l1, threshold):
                # Similar words extend the diagonal subproblem by one
                table[r][c] = table[r + 1][c + 1] + 1
            else:
                # Otherwise keep the better of the right / bottom neighbours
                table[r][c] = max(table[r][c + 1], table[r + 1][c])

    # Walk the table from the top-left corner to recover the subsequence
    common = []
    r = c = 0
    while r < rows and c < cols:
        if similar_syntax_ver1(seq1[r], seq2[c], l1, threshold):
            # The word from sequence 1 is kept (an arbitrary choice): its
            # counterpart in sequence 2 may be spelled differently but is
            # syntactically similar
            common.append(seq1[r])
            r += 1
            c += 1
        elif table[r + 1][c] >= table[r][c + 1]:
            # Larger (or equal) value below: skip a word of seq1
            r += 1
        else:
            # Larger value to the right: skip a word of seq2
            c += 1

    return common

In [5]:
def LCS_ver2(seq1, seq2,threshold):
    """
    Return a longest common subsequence of seq1 and seq2 where two words
    count as equal when their indel similarity (similar_syntax_ver2) is
    strictly greater than threshold.

    A table of len(seq1)+1 rows by len(seq2)+1 columns is used; the extra
    row and column are the empty-sequence base case.

    Parameters:
        seq1, seq2: sequences of words.
        threshold: similarity score a pair must exceed to be a match.

    Returns:
        The list of matched words, taken from seq1.
    """
    def is_match(a, b):
        # similar_syntax_ver2 returns the raw score, so using its result
        # directly as a condition would accept every non-zero similarity.
        # Compare against the threshold here; a boolean verdict, should the
        # helper ever return one, is passed through unchanged.
        verdict = similar_syntax_ver2(a, b, threshold)
        if isinstance(verdict, bool):
            return verdict
        return verdict > threshold

    rows, cols = len(seq1), len(seq2)
    dp = [[0] * (cols + 1) for _ in range(rows + 1)]
    # Match decisions are remembered so the traceback does not recompute them
    matched = [[False] * cols for _ in range(rows)]

    # Fill bottom-up: dp[i][j] is the answer for seq1[i:] and seq2[j:]
    for i in range(rows - 1, -1, -1):
        for j in range(cols - 1, -1, -1):
            matched[i][j] = is_match(seq1[i], seq2[j])
            if matched[i][j]:
                # Similar words extend the diagonal subproblem by one
                dp[i][j] = 1 + dp[i + 1][j + 1]
            else:
                # Otherwise keep the better of the right / bottom neighbours
                dp[i][j] = max(dp[i][j + 1], dp[i + 1][j])

    # Walk the table from the top-left corner to recover the subsequence
    i = 0
    j = 0
    lcs = []
    while i < rows and j < cols:
        if matched[i][j]:
            # The word from sequence 1 is kept (an arbitrary choice): its
            # counterpart in sequence 2 may be spelled differently but is
            # syntactically similar
            lcs.append(seq1[i])
            i += 1
            j += 1
        elif dp[i + 1][j] >= dp[i][j + 1]:
            # Larger (or equal) value below: skip a word of seq1
            i += 1
        else:
            # Larger value to the right: skip a word of seq2
            j += 1

    return lcs

In [6]:
def LCS_ver3(seq1, seq2,threshold):
    """
    Return a longest common subsequence of seq1 and seq2 where two words
    count as equal when their Levenshtein similarity (similar_syntax_ver3)
    is strictly greater than threshold.

    A table of len(seq1)+1 rows by len(seq2)+1 columns is used; the extra
    row and column are the empty-sequence base case.

    Parameters:
        seq1, seq2: sequences of words.
        threshold: similarity score a pair must exceed to be a match.

    Returns:
        The list of matched words, taken from seq1.
    """
    def is_match(a, b):
        # similar_syntax_ver3 returns the raw score, so using its result
        # directly as a condition would accept every non-zero similarity.
        # Compare against the threshold here; a boolean verdict, should the
        # helper ever return one, is passed through unchanged.
        verdict = similar_syntax_ver3(a, b, threshold)
        if isinstance(verdict, bool):
            return verdict
        return verdict > threshold

    rows, cols = len(seq1), len(seq2)
    dp = [[0] * (cols + 1) for _ in range(rows + 1)]
    # Match decisions are remembered so the traceback does not recompute them
    matched = [[False] * cols for _ in range(rows)]

    # Fill bottom-up: dp[i][j] is the answer for seq1[i:] and seq2[j:]
    for i in range(rows - 1, -1, -1):
        for j in range(cols - 1, -1, -1):
            matched[i][j] = is_match(seq1[i], seq2[j])
            if matched[i][j]:
                # Similar words extend the diagonal subproblem by one
                dp[i][j] = 1 + dp[i + 1][j + 1]
            else:
                # Otherwise keep the better of the right / bottom neighbours
                dp[i][j] = max(dp[i][j + 1], dp[i + 1][j])

    # Walk the table from the top-left corner to recover the subsequence
    i = 0
    j = 0
    lcs = []
    while i < rows and j < cols:
        if matched[i][j]:
            # The word from sequence 1 is kept (an arbitrary choice): its
            # counterpart in sequence 2 may be spelled differently but is
            # syntactically similar
            lcs.append(seq1[i])
            i += 1
            j += 1
        elif dp[i + 1][j] >= dp[i][j + 1]:
            # Larger (or equal) value below: skip a word of seq1
            i += 1
        else:
            # Larger value to the right: skip a word of seq2
            j += 1

    return lcs

**Test LCS for sequences**

```
["I", "enjoys", "listening", "to", "music"]
["I", "enjoy", "listen", "music"]
["I", "join", "listening", "to", "music"]
["I", "enjoy", "listened", "mystic"]
["I", "am", "join", "listening", "music"]
```

Note: Although the corresponding word in sequence 2 may be different, it is still syntactically similar.

Our choice of retrieving the common word from sequence 1 is arbitrary

**Testing for weightage with indel similarity between words <u>and</u> indel similarity between phonetic representations, with thresholds of 35 and 30 (on the 0–100 score scale)**

In [7]:
seq1 = ["I", "enjoys", "listening", "to", "music"]
seq2 = ["I", "am", "join", "listening", "music"]
LCS_ver1(seq1,seq2,0.5,35)

music and music are similar with a score of 100.0
The indel similarity between words is 100.0
The indel similarity between their phonetics is 100.0

music and listening are similar with a score of 36.507936507936506
The indel similarity between words is 28.57142857142857
The indel similarity between their phonetics is 44.44444444444444

listening and music are similar with a score of 36.507936507936506
The indel similarity between words is 28.57142857142857
The indel similarity between their phonetics is 44.44444444444444

listening and listening are similar with a score of 100.0
The indel similarity between words is 100.0
The indel similarity between their phonetics is 100.0

enjoys and join are similar with a score of 36.66666666666667
The indel similarity between words is 40.0
The indel similarity between their phonetics is 33.333333333333336

I and I are similar with a score of 100.0
The indel similarity between words is 100.0
The indel similarity between their phonetics is 100.0



['I', 'enjoys', 'listening', 'music']

From this, it seems like ```enjoys and join``` as well as ```listening and music``` have similar levels of similarity

<br/>

In [8]:
seq1 = ["I", "enjoys", "listening", "to", "music"]
seq2 = ["I", "am", "join", "listening", "music"]
LCS_ver1(seq1,seq2,0.5,30)

music and music are similar with a score of 100.0
The indel similarity between words is 100.0
The indel similarity between their phonetics is 100.0

music and listening are similar with a score of 36.507936507936506
The indel similarity between words is 28.57142857142857
The indel similarity between their phonetics is 44.44444444444444

music and am are similar with a score of 34.285714285714285
The indel similarity between words is 28.57142857142857
The indel similarity between their phonetics is 40.0

listening and music are similar with a score of 36.507936507936506
The indel similarity between words is 28.57142857142857
The indel similarity between their phonetics is 44.44444444444444

listening and listening are similar with a score of 100.0
The indel similarity between words is 100.0
The indel similarity between their phonetics is 100.0

enjoys and join are similar with a score of 36.66666666666667
The indel similarity between words is 40.0
The indel similarity between their phon

['I', 'enjoys', 'listening', 'music']

It seems like indel similarity between words is a better choice from the following results

```
music and listening are similar with a score of 36.507936507936506
The indel similarity between words is 28.57142857142857
The indel similarity between their phonetics is 44.44444444444444

music and am are similar with a score of 34.285714285714285
The indel similarity between words is 28.57142857142857
The indel similarity between their phonetics is 40.0

enjoys and join are similar with a score of 36.66666666666667
The indel similarity between words is 40.0
The indel similarity between their phonetics is 33.333333333333336
```

**Testing for weightage with indel similarity between words only**

In [99]:
seq1 = ["I", "enjoys", "listening", "to", "music"]
seq2 = ["I", "am", "join", "listening", "music"]
LCS_ver2(seq1,seq2,35)

music and music are similar with a score of 100.0
The indel similarity between words is 100.0

listening and listening are similar with a score of 100.0
The indel similarity between words is 100.0

enjoys and join are similar with a score of 40.0
The indel similarity between words is 40.0

I and I are similar with a score of 100.0
The indel similarity between words is 100.0

I and I are similar with a score of 100.0
The indel similarity between words is 100.0

enjoys and join are similar with a score of 40.0
The indel similarity between words is 40.0

listening and listening are similar with a score of 100.0
The indel similarity between words is 100.0

music and music are similar with a score of 100.0
The indel similarity between words is 100.0



['I', 'enjoys', 'listening', 'music']

**Testing for weightage with Levenshtein similarity between words only**

In [110]:
seq1 = ["I", "enjoys", "listening", "to", "music"]
seq2 = ["I", "am", "join", "listening", "music"]
LCS_ver3(seq1,seq2,30)

music and music are similar with a score of 100.0

listening and listening are similar with a score of 100.0

enjoys and join are similar with a score of 33.333333333333336

I and I are similar with a score of 100.0

I and I are similar with a score of 100.0

enjoys and join are similar with a score of 33.333333333333336

listening and listening are similar with a score of 100.0

music and music are similar with a score of 100.0



['I', 'enjoys', 'listening', 'music']

**Testing all pairwise similarities for indel similarity vs levenshtein similarity for our example sequences**

In [115]:
# The five example hypotheses used throughout this section
seq1 = ["I", "enjoys", "listening", "to", "music"]
seq2 = ["I", "enjoy", "listen", "music"]
seq3 = ["I", "join", "listening", "to", "music"]
seq4 = ["I", "enjoy", "listened", "mystic"]
seq5 = ["I", "am", "join", "listening", "music"]

# Unique tokens across all five sequences.
# NOTE: a set has no guaranteed iteration order, so the order of the pairs
# built below (and of the printed results) may differ between runs.
all_tokens = set(seq1+seq2+seq3+seq4+seq5)

import itertools
# Every unordered pair of distinct tokens, materialised so it can be reused
pairwise = list(itertools.combinations(all_tokens, 2))


In [119]:
# Print the indel and Levenshtein similarity for every token pair.
# This relies on similar_syntax_ver2 / similar_syntax_ver3 returning the raw
# score rather than a boolean; the threshold argument (0) is not used by them.
for pair in pairwise:
    print(f"Pair: {pair}")
    print(f"Sim for indel: {similar_syntax_ver2(pair[0],pair[1],0)}")
    print(f"Sim for lev: {similar_syntax_ver3(pair[0],pair[1],0)}")
    print("")

Pair: ('I', 'enjoys')
Sim for indel: 0.0
Sim for lev: 0.0

Pair: ('I', 'am')
Sim for indel: 0.0
Sim for lev: 0.0

Pair: ('I', 'mystic')
Sim for indel: 0.0
Sim for lev: 0.0

Pair: ('I', 'listening')
Sim for indel: 0.0
Sim for lev: 0.0

Pair: ('I', 'join')
Sim for indel: 0.0
Sim for lev: 0.0

Pair: ('I', 'music')
Sim for indel: 0.0
Sim for lev: 0.0

Pair: ('I', 'listen')
Sim for indel: 0.0
Sim for lev: 0.0

Pair: ('I', 'listened')
Sim for indel: 0.0
Sim for lev: 0.0

Pair: ('I', 'to')
Sim for indel: 0.0
Sim for lev: 0.0

Pair: ('I', 'enjoy')
Sim for indel: 0.0
Sim for lev: 0.0

Pair: ('enjoys', 'am')
Sim for indel: 0.0
Sim for lev: 0.0

Pair: ('enjoys', 'mystic')
Sim for indel: 33.333333333333336
Sim for lev: 0.0

Pair: ('enjoys', 'listening')
Sim for indel: 26.66666666666667
Sim for lev: 11.111111111111116

Pair: ('enjoys', 'join')
Sim for indel: 40.0
Sim for lev: 33.333333333333336

Pair: ('enjoys', 'music')
Sim for indel: 18.181818181818176
Sim for lev: 0.0

Pair: ('enjoys', 'listen')

**If we use indel: Setting enjoys and join as well as enjoy and join as a baseline**

```
Pair: ('enjoys', 'join')
Sim for indel: 40.0
Sim for lev: 33.333333333333336

Pair: ('join', 'enjoy')
Sim for indel: 44.44444444444444
Sim for lev: 19.999999999999996

Pair: ('mystic', 'listening')
Sim for indel: 40.0
Sim for lev: 33.333333333333336

Pair: ('join', 'listen')
Sim for indel: 40.0
Sim for lev: 16.666666666666664
```

**If we use lev: Setting enjoys and join as well as enjoy and join as a baseline**

```
Pair: ('enjoys', 'join')
Sim for indel: 40.0
Sim for lev: 33.333333333333336

Pair: ('join', 'enjoy')
Sim for indel: 44.44444444444444
Sim for lev: 19.999999999999996

Pair: ('mystic', 'listening')
Sim for indel: 40.0
Sim for lev: 33.333333333333336

Pair: ('mystic', 'listen')
Sim for indel: 33.333333333333336
Sim for lev: 33.333333333333336

Pair: ('listening', 'join')
Sim for indel: 30.76923076923077
Sim for lev: 22.22222222222222

Pair: ('listening', 'music')
Sim for indel: 28.57142857142857
Sim for lev: 22.22222222222222

Pair: ('listening', 'enjoy')
Sim for indel: 28.57142857142857
Sim for lev: 22.22222222222222

Pair: ('join', 'music')
Sim for indel: 22.22222222222222
Sim for lev: 19.999999999999996

Pair: ('join', 'to')
Sim for indel: 33.333333333333336
Sim for lev: 25.0

Pair: ('to', 'enjoy')
Sim for indel: 28.57142857142857
Sim for lev: 19.999999999999996
```

- We might get unwanted alignments if the threshold is set too low
- Ideally we should get a high similarity for the words we want to treat as the same, and a low similarity for the words we want to keep apart
- Realistically, the Nbest transcriptions should not be too far off
- If there is a match, the pointers will move down

**Testing all pairwise similarities for Jaccard similarity for our example sequences**

In [120]:
def jaccard_similarity(w1,w2):
    """
    Return the Jaccard similarity between the elements of w1 and w2 as a percentage

    Each input is turned into a set of its elements (characters, when the inputs are strings),
    so repeated elements and element order are ignored.
    The score is len(intersection) / len(union) * 100, ranging from 0.0 (nothing shared) to 100.0 (same set).
    Two empty inputs are treated as identical and score 100.0, instead of dividing by an empty union.
    """
    elements1 = set(w1)
    elements2 = set(w2)
    combined = elements1 | elements2
    # Both inputs are empty: the union is empty, so the ratio is undefined. Treat them as identical
    if not combined:
        return 100.0
    shared = elements1 & elements2
    return (len(shared)/len(combined))*100

In [121]:
# Report the Jaccard similarity (as a percentage) of every word pair
for pair in pairwise:
    print(f"Pair: {pair}")
    js_score = jaccard_similarity(pair[0], pair[1])
    print(f"Sim for JS: {js_score}")
    print("")

Pair: ('I', 'enjoys')
Sim for JS: 0.0

Pair: ('I', 'am')
Sim for JS: 0.0

Pair: ('I', 'mystic')
Sim for JS: 0.0

Pair: ('I', 'listening')
Sim for JS: 0.0

Pair: ('I', 'join')
Sim for JS: 0.0

Pair: ('I', 'music')
Sim for JS: 0.0

Pair: ('I', 'listen')
Sim for JS: 0.0

Pair: ('I', 'listened')
Sim for JS: 0.0

Pair: ('I', 'to')
Sim for JS: 0.0

Pair: ('I', 'enjoy')
Sim for JS: 0.0

Pair: ('enjoys', 'am')
Sim for JS: 0.0

Pair: ('enjoys', 'mystic')
Sim for JS: 20.0

Pair: ('enjoys', 'listening')
Sim for JS: 30.0

Pair: ('enjoys', 'join')
Sim for JS: 42.857142857142854

Pair: ('enjoys', 'music')
Sim for JS: 10.0

Pair: ('enjoys', 'listen')
Sim for JS: 33.33333333333333

Pair: ('enjoys', 'listened')
Sim for JS: 30.0

Pair: ('enjoys', 'to')
Sim for JS: 14.285714285714285

Pair: ('enjoys', 'enjoy')
Sim for JS: 83.33333333333334

Pair: ('am', 'mystic')
Sim for JS: 14.285714285714285

Pair: ('am', 'listening')
Sim for JS: 0.0

Pair: ('am', 'join')
Sim for JS: 0.0

Pair: ('am', 'music')
Sim for 

**If we use JS: Setting enjoys and join as well as enjoy and join as a baseline**

```
Pair: ('enjoys', 'join')
Sim for JS: 42.857142857142854

Pair: ('join', 'enjoy')
Sim for JS: 50.0
```

Maybe try this simple method first

Acceptability:
- N-best transcriptions shouldn't be too far off hopefully
- 

<br/>
<br/>
<br/>
<br/>
<br/>

## Research on Syntactic Similarity

Example input:

"I enjoys listening to music"

"I enjoy listen music"

"I join listening to music"

"I enjoy listened mystic"

"I am join listening music"

### Token Level Syntactic Similarity/Fuzzy Matching/Approximate string matching

1. Levenshtein Distance/Edit distance. Count the minimum number of single character edits (insertion, deletion, substitutions) required to change one word into another
- simple ratio, partial ratio (order matters), token sort ratio (don't care about order)
- library: thefuzz
- https://medium.com/@alphaiterations/fuzzy-matching-with-fuzzywuzzy-a-comprehensive-guide-04873f07de31
- https://www.datacamp.com/tutorial/fuzzy-string-python
- https://github.com/seatgeek/thefuzz

2. Hamming distance
- Count positions that don't match

3. Soundex: Assigns a 4-character code to each name. The first character is the initial letter of the name, and the next three characters represent the leading (first three) consonant sounds. Useful for matching names with different spellings but similar pronunciations. Designed for the English language. More suitable for names?
- library: jellyfish, phonetics
- good with short names/strings

4. Metaphone: Differences with soundex -> considers entire sound of the string instead of first few sounds
- library: phonetics

5. Double metaphone: Produces 2 encodings: one for the primary (most likely) pronunciation and one for an alternate (less common) pronunciation. Suitable for most English words, not just names. Basis for many spell checkers
- https://moj-analytical-services.github.io/splink/topic_guides/comparisons/phonetic.html

6. Use a combination with weightage
- https://stackabuse.com/phonetic-similarity-of-words-a-vectorized-approach-in-python/

<br/>

### Testing a combination of indel ratio for the original words and indel ratio for the phonetic representations of the words

In [37]:
!pip install rapidfuzz
!pip install jellyfish



In [45]:
import jellyfish
from rapidfuzz import fuzz

def similar_syntax(w1,w2,l1,threshold):
    """
    Decide whether two words are syntactically similar using a weighted sum of two fuzz.ratio scores

    w1, w2: the words to compare
    l1: weight given to the ratio between the raw words; (1-l1) is given to the ratio
        between their jellyfish metaphone encodings
    threshold: the combined score must be strictly greater than this for the words to count as similar

    Prints the combined score and returns True if it exceeds the threshold, otherwise False
    """
    # Similarity of the spellings
    spelling_score = fuzz.ratio(w1, w2)
    # Similarity of the approximate phonetic representations
    phonetic_score = fuzz.ratio(jellyfish.metaphone(w1), jellyfish.metaphone(w2))
    sim = l1*spelling_score + (1-l1)*phonetic_score
    print(f"The similarity is {sim}")
    return bool(sim > threshold)

In [46]:
# Word pairs used to try out the combined spelling/phonetic similarity
test_cases = [
    ('am','enjoy'),
    ('join','enjoy'),
    ('listen','listening'),
    ('listen','listened'),
    ('listened','listening'),
    ('music','mystic'),
]

# Weight both scores equally (l1=0.5) and use 35 as the similarity threshold
for first_word, second_word in test_cases:
    print("Test Case: " + first_word + " " + second_word)
    verdict = (
        "The words are similar syntactically"
        if similar_syntax(first_word, second_word, 0.5, 35)
        else "The words are not similar syntactically"
    )
    print(verdict)
    print("")


Test Case: am enjoy
The similarity is 0.0
The words are not similar syntactically

Test Case: join enjoy
The similarity is 42.22222222222222
The words are similar syntactically

Test Case: listen listening
The similarity is 80.0
The words are similar syntactically

Test Case: listen listened
The similarity is 87.3015873015873
The words are similar syntactically

Test Case: listened listening
The similarity is 71.65775401069519
The words are similar syntactically

Test Case: music mystic
The similarity is 79.22077922077924
The words are similar syntactically



<br/>

<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>

### Testing token level syntactic similarity via levenshtein and indel distance

In [20]:
!pip install thefuzz



In [21]:
# thefuzz's ratio is based on Indel similarity rather than the classic Levenshtein distance
# https://github.com/rapidfuzz/RapidFuzz/blob/main/api_differences.md
# Check out this instead: https://github.com/rapidfuzz/RapidFuzz
# https://github.com/rapidfuzz/RapidFuzz
# https://rapidfuzz.github.io/RapidFuzz/Usage/distance/Levenshtein.html


# Word pairs shared by the similarity experiments below
test_cases = [
    ('am','enjoy'),
    ('join','enjoy'),
    ('listen','listening'),
    ('listen','listened'),
    ('listened','listening'),
    ('music','mystic'),
]

from thefuzz import fuzz as the_fuzz
print("Indel Ratio test from thefuzz library \n")
# Print thefuzz's ratio for each word pair
for first_word, second_word in test_cases:
    print("Test Case: " + first_word + " " + second_word)
    print(the_fuzz.ratio(first_word, second_word))
    print(" ")


Indel Ratio test from thefuzz library 

Test Case: am enjoy
0
 
Test Case: join enjoy
44
 
Test Case: listen listening
80
 
Test Case: listen listened
86
 
Test Case: listened listening
71
 
Test Case: music mystic
73
 


In [22]:
!pip install rapidfuzz



In [23]:
from rapidfuzz import fuzz

# Word pairs to compare
test_cases = [
    ('am','enjoy'),
    ('join','enjoy'),
    ('listen','listening'),
    ('listen','listened'),
    ('listened','listening'),
    ('music','mystic'),
]

print("Indel Ratio test from rapidfuzz library \n")
# Print rapidfuzz's fuzz.ratio for each word pair
for first_word, second_word in test_cases:
    print("Test Case: " + first_word + " " + second_word)
    print(fuzz.ratio(first_word, second_word))
    print(" ")

Indel Ratio test from rapidfuzz library 

Test Case: am enjoy
0.0
 
Test Case: join enjoy
44.44444444444444
 
Test Case: listen listening
80.0
 
Test Case: listen listened
85.71428571428572
 
Test Case: listened listening
70.58823529411764
 
Test Case: music mystic
72.72727272727273
 


In [24]:
from rapidfuzz.distance import Indel

# Word pairs to compare
test_cases = [
    ('am','enjoy'),
    ('join','enjoy'),
    ('listen','listening'),
    ('listen','listened'),
    ('listened','listening'),
    ('music','mystic'),
]

print("Indel Normalized Similarity test from rapidfuzz library \n")
# Scale the normalized similarity by 100 so it can be compared with the ratio scores
for first_word, second_word in test_cases:
    print("Test Case: " + first_word + " " + second_word)
    print(Indel.normalized_similarity(first_word, second_word)*100)
    print(" ")

Indel Normalized Similarity test from rapidfuzz library 

Test Case: am enjoy
0.0
 
Test Case: join enjoy
44.44444444444444
 
Test Case: listen listening
80.0
 
Test Case: listen listened
85.71428571428572
 
Test Case: listened listening
70.58823529411764
 
Test Case: music mystic
72.72727272727273
 


In [25]:
from rapidfuzz.distance import Levenshtein

# Word pairs to compare
test_cases = [
    ('am','enjoy'),
    ('join','enjoy'),
    ('listen','listening'),
    ('listen','listened'),
    ('listened','listening'),
    ('music','mystic'),
]

print("Original Levenshtein Normalized Similarity test from rapidfuzz library \n")
# Scale the normalized similarity by 100 so it can be compared with the ratio scores
for first_word, second_word in test_cases:
    print("Test Case: " + first_word + " " + second_word)
    print(Levenshtein.normalized_similarity(first_word, second_word)*100)
    print(" ")

Original Levenshtein Normalized Similarity test from rapidfuzz library 

Test Case: am enjoy
0.0
 
Test Case: join enjoy
19.999999999999996
 
Test Case: listen listening
66.66666666666667
 
Test Case: listen listened
75.0
 
Test Case: listened listening
66.66666666666667
 
Test Case: music mystic
66.66666666666667
 


In [26]:
print("Original Levenshtein Distance test from pyphonetics library \n")
from pyphonetics.distance_metrics import levenshtein_distance
# Print pyphonetics' levenshtein_distance for each word pair (test_cases comes from an earlier cell)
for first_word, second_word in test_cases:
    print("Test Case: " + first_word + " " + second_word)
    edit_dist = levenshtein_distance(first_word, second_word)
    print(edit_dist)
    print(" ")

Original Levenshtein Distance test from pyphonetics library 

Test Case: am enjoy
5
 
Test Case: join enjoy
4
 
Test Case: listen listening
3
 
Test Case: listen listened
2
 
Test Case: listened listening
3
 
Test Case: music mystic
2
 


In [27]:
!pip install nltk



In [28]:
print("Original Levenshtein Distance test from nltk library \n")
import nltk
# Print nltk's edit_distance for each word pair (test_cases comes from an earlier cell)
for first_word, second_word in test_cases:
    print("Test Case: " + first_word + " " + second_word)
    edit_dist = nltk.edit_distance(first_word, second_word)
    print(edit_dist)
    print(" ")

Original Levenshtein Distance test from nltk library 

Test Case: am enjoy
5
 
Test Case: join enjoy
4
 
Test Case: listen listening
3
 
Test Case: listen listened
2
 
Test Case: listened listening
3
 
Test Case: music mystic
2
 


<br/>
<br/>

### Testing token level syntactic similarity via calculating the original edit and indel distance between phonetic representations

In [29]:
!pip install pyphonetics



In [30]:
print("Soundex distance test from pyphonetics library")
from pyphonetics import Soundex
soundex = Soundex()
# For each word pair, show the Soundex codes, the levenshtein distance reported by pyphonetics,
# and the fuzz.ratio between the two codes
for first_word, second_word in test_cases:
    print(first_word + " " + second_word)
    first_code = soundex.phonetics(first_word)
    second_code = soundex.phonetics(second_word)
    print(first_code + " " + second_code)
    code_distance = soundex.distance(first_word, second_word, metric='levenshtein')
    print(f"Original Distance: {code_distance}")
    code_ratio = fuzz.ratio(first_code, second_code)
    print(f"Indel Similarity Ratio: {code_ratio}")
    print(" ")

Soundex distance test from pyphonetics library
am enjoy
A500 E520
Original Distance: 2
Indel Similarity Ratio: 50.0
 
join enjoy
J500 E520
Original Distance: 2
Indel Similarity Ratio: 50.0
 
listen listening
L235 L235
Original Distance: 0
Indel Similarity Ratio: 100.0
 
listen listened
L235 L235
Original Distance: 0
Indel Similarity Ratio: 100.0
 
listened listening
L235 L235
Original Distance: 0
Indel Similarity Ratio: 100.0
 
music mystic
M220 M232
Original Distance: 2
Indel Similarity Ratio: 75.0
 


In [31]:
print("Metaphone levenshtein distance test from pyphonetics library")
from pyphonetics import Metaphone
metaphone = Metaphone()
# For each word pair, show the Metaphone codes, the levenshtein distance reported by pyphonetics,
# and the fuzz.ratio between the two codes
for first_word, second_word in test_cases:
    print(first_word + " " + second_word)
    first_code = metaphone.phonetics(first_word)
    second_code = metaphone.phonetics(second_word)
    print(first_code + " " + second_code)
    code_distance = metaphone.distance(first_word, second_word, metric='levenshtein')
    print(f"Original Distance: {code_distance}")
    code_ratio = fuzz.ratio(first_code, second_code)
    print(f"Indel Similarity Ratio: {code_ratio}")
    print(" ")

Metaphone levenshtein distance test from pyphonetics library
am enjoy
AM ENJ
Original Distance: 3
Indel Similarity Ratio: 0.0
 
join enjoy
JN ENJ
Original Distance: 2
Indel Similarity Ratio: 40.0
 
listen listening
LSTN LSTNNK
Original Distance: 2
Indel Similarity Ratio: 80.0
 
listen listened
LSTN LSTNT
Original Distance: 1
Indel Similarity Ratio: 88.88888888888889
 
listened listening
LSTNT LSTNNK
Original Distance: 2
Indel Similarity Ratio: 72.72727272727273
 
music mystic
MSK MSTK
Original Distance: 1
Indel Similarity Ratio: 85.71428571428572
 


In [32]:
print("Refined Soundex distance test from pyphonetics library")
from pyphonetics import RefinedSoundex
rs = RefinedSoundex()
# For each word pair, show the Refined Soundex codes, the levenshtein distance reported by pyphonetics,
# and the fuzz.ratio between the two codes
for first_word, second_word in test_cases:
    print(first_word + " " + second_word)
    first_code = rs.phonetics(first_word)
    second_code = rs.phonetics(second_word)
    print(first_code + " " + second_code)
    code_distance = rs.distance(first_word, second_word, metric='levenshtein')
    print(f"Original Distance: {code_distance}")
    code_ratio = fuzz.ratio(first_code, second_code)
    print(f"Indel Similarity Ratio: {code_ratio}")
    print(" ")

Refined Soundex distance test from pyphonetics library
am enjoy
A08 E0840
Original Distance: 3
Indel Similarity Ratio: 50.0
 
join enjoy
J408 E0840
Original Distance: 4
Indel Similarity Ratio: 44.44444444444444
 
listen listening
L703608 L703608084
Original Distance: 3
Indel Similarity Ratio: 82.35294117647058
 
listen listened
L703608 L70360806
Original Distance: 2
Indel Similarity Ratio: 87.5
 
listened listening
L70360806 L703608084
Original Distance: 2
Indel Similarity Ratio: 84.21052631578947
 
music mystic
M80303 M803603
Original Distance: 1
Indel Similarity Ratio: 92.3076923076923
 


<br/>
<br/>
<br/>

### Testing token level syntactic similarity via phonetics

In [33]:
!pip install phonetics
!pip install pyphonetics
!pip install jellyfish



In [34]:
# Word pairs to encode
test_cases = [
    ('am','enjoy'),
    ('join','enjoy'),
    ('listen','listening'),
    ('listen','listened'),
    ('listened','listening'),
    ('music','mystic'),
]

import phonetics
from pyphonetics import Soundex
import jellyfish
soundex = Soundex()
print("Soundex test from phonetics library \n")
# Compare the Soundex encodings produced by the phonetics, pyphonetics and jellyfish libraries
for first_word, second_word in test_cases:
    print("Test Case: " + first_word + " " + second_word)
    print("From phonetics library")
    print(phonetics.soundex(first_word) + " " + phonetics.soundex(second_word))
    print("From pyphonetics library")
    print(soundex.phonetics(first_word) + " " + soundex.phonetics(second_word))
    print("From jellyfish library")
    print(jellyfish.soundex(first_word) + " " + jellyfish.soundex(second_word))
    print(" ")


print("#####################################################################")

from pyphonetics import Metaphone
metaphone = Metaphone()
print("Metaphone test from phonetics library \n")
# Compare the Metaphone encodings produced by the same three libraries
for first_word, second_word in test_cases:
    print("Test Case: " + first_word + " " + second_word)
    print("From phonetics library")
    print(phonetics.metaphone(first_word) + " " + phonetics.metaphone(second_word))
    print("From pyphonetics library")
    print(metaphone.phonetics(first_word) + " " + metaphone.phonetics(second_word))
    print("From jellyfish library")
    print(jellyfish.metaphone(first_word) + " " + jellyfish.metaphone(second_word))
    print(" ")

print("#####################################################################")

Soundex test from phonetics library 

Test Case: am enjoy
From phonetics library
a500 e520
From pyphonetics library
A500 E520
From jellyfish library
A500 E520
 
Test Case: join enjoy
From phonetics library
j050 e520
From pyphonetics library
J500 E520
From jellyfish library
J500 E520
 
Test Case: listen listening
From phonetics library
l02305 l02305052
From pyphonetics library
L235 L235
From jellyfish library
L235 L235
 
Test Case: listen listened
From phonetics library
l02305 l0230503
From pyphonetics library
L235 L235
From jellyfish library
L235 L235
 
Test Case: listened listening
From phonetics library
l0230503 l02305052
From pyphonetics library
L235 L235
From jellyfish library
L235 L235
 
Test Case: music mystic
From phonetics library
m0202 m02302
From pyphonetics library
M220 M232
From jellyfish library
M220 M232
 
#####################################################################
Metaphone test from phonetics library 

Test Case: am enjoy
From phonetics library
AM ANJ
From pyp

<br/>

In [35]:
print("RefinedSoundex test from pyphonetics library")
from pyphonetics import RefinedSoundex
rs = RefinedSoundex()
# For each word pair, show the Refined Soundex codes and the distance reported by pyphonetics
for first_word, second_word in test_cases:
    print(first_word + " " + second_word)
    print("From pyphonetics library")
    print(rs.phonetics(first_word) + " " + rs.phonetics(second_word))
    code_distance = rs.distance(first_word, second_word)
    print(f"Distance: {code_distance}")
    print(" ")

RefinedSoundex test from pyphonetics library
am enjoy
From pyphonetics library
A08 E0840
Distance: 3
 
join enjoy
From pyphonetics library
J408 E0840
Distance: 4
 
listen listening
From pyphonetics library
L703608 L703608084
Distance: 3
 
listen listened
From pyphonetics library
L703608 L70360806
Distance: 2
 
listened listening
From pyphonetics library
L70360806 L703608084
Distance: 2
 
music mystic
From pyphonetics library
M80303 M803603
Distance: 1
 


In [36]:
print("FuzzySoundex test from pyphonetics library")
from pyphonetics import FuzzySoundex
fs = FuzzySoundex()
# For each word pair, show the Fuzzy Soundex codes and the distance reported by pyphonetics
for first_word, second_word in test_cases:
    print(first_word + " " + second_word)
    print("From pyphonetics library")
    print(fs.phonetics(first_word) + " " + fs.phonetics(second_word))
    code_distance = fs.distance(first_word, second_word)
    print(f"Distance: {code_distance}")
    print(" ")

FuzzySoundex test from pyphonetics library
am enjoy
From pyphonetics library
A5 E57
Distance: 2
 
join enjoy
From pyphonetics library
J5 E57
Distance: 2
 
listen listening
From pyphonetics library
L935 L93557
Distance: 2
 
listen listened
From pyphonetics library
L935 L9353
Distance: 1
 
listened listening
From pyphonetics library
L9353 L93557
Distance: 2
 
music mystic
From pyphonetics library
M99 M939
Distance: 1
 


<br/>
<br/>
<br/>

<br/>
<br/>
<br/>

```
Deprecated Code
def align_new_seq(results,new_seq):
    """
    Align one more sequence against a set of already aligned sequences

    results: list of aligned sequences (lists of tokens, using 'Pad' as the padding token)
    new_seq: the sequence to add to the alignment

    Returns a new list holding the re-padded rows of results followed by the aligned new sequence.
    Intermediate values are printed for debugging.
    Relies on force_align, which is defined elsewhere (presumably returns both inputs padded to align - confirm)
    """
    # Collapse every column of the existing alignment to its first non-padding token.
    # This assumes each column has at least one non-padding token and that all of them are the same
    seq_intersection = [
        [token for token in column if token != 'Pad'][0]
        for column in zip(*results)
    ]

    print("Results")
    for row in results:
        print(row)
    print("")


    print("Sequence Intersection: ")
    print(seq_intersection)
    print("")

    print("New Sequence:")
    print(new_seq)
    print("")

    # Aligning the new sequence with the collapsed sequence aligns it with every row of results at once
    seq_intersection_aligned, new_seq_aligned = force_align(seq_intersection,new_seq)

    print("Sequence Intersection aligned: ")
    print(seq_intersection_aligned)
    print("")

    print("New Sequence Aligned:")
    print(new_seq_aligned)
    print("")

    # Index of the next column of results that has not been copied yet
    col_pointer = 0
    # One output row per existing row
    aligned_results = [[] for _ in results]

    for target_token in seq_intersection_aligned:
        # The position is aligned if at least one row holds the target token in the current column,
        # i.e. the aligned intersection still uses a token from the existing alignment here
        column_matches = any(row[col_pointer]==target_token for row in results)

        if column_matches:
            # Copy the current column over and move on to the next one
            for out_row, row in zip(aligned_results, results):
                out_row.append(row[col_pointer])
            col_pointer+=1
        else:
            # New padding was introduced at this position, so every existing row gets a padding token
            for out_row in aligned_results:
                out_row.append('Pad')


    aligned_results.append(new_seq_aligned)

    print("Aligned Results")
    for row in aligned_results:
        print(row)

    # Five blank lines to separate the output of successive calls
    for _ in range(5):
        print("")

    return aligned_results


def force_align_n_seqs(seqs):
    """
    Progressively align a list of sequences

    The first two sequences are aligned with force_align, then every remaining sequence is
    added to the alignment one at a time with align_new_seq.
    Returns the list of aligned sequences.
    """
    # Seed the alignment with the first pair of sequences
    first_aligned, second_aligned = force_align(seqs[0],seqs[1])
    aligned = [first_aligned, second_aligned]

    # Fold each remaining sequence into the running alignment
    for next_seq in seqs[2:]:
        aligned = align_new_seq(aligned,next_seq)

    return aligned
```

<br/>
<br/>
<br/>

<br/>
<br/>
<br/>