In [1]:
import copy
import numpy as np
import pandas as pd

# Week 1: Introduction to Read Mapping

In [2]:
'''
Code Challenge: Solve the Trie Construction Problem.
Input: A collection of strings Patterns.
Output: The adjacency list corresponding to Trie(Patterns), in the following format. If Trie(Patterns) has n nodes, first label the
     root with 0 and then label the remaining nodes with the integers 1 through n - 1 in any order you like. Each edge of the
     adjacency list of Trie(Patterns) will be encoded by a triple: the first two members of the triple must be the integers labeling the
     initial and terminal nodes of the edge, respectively; the third member of the triple must be the symbol labeling the edge.
'''

def Strings_2_Trie(strings):
    """Build Trie(Patterns) as an adjacency dictionary.

    Parameters
    ----------
    strings : str or sequence of str
        The patterns. A single string is split on newlines, one pattern
        per line. Empty patterns contribute nothing.

    Returns
    -------
    dict
        ``{node: {child: symbol}}``. The root is labelled 0 and every other
        node receives the next unused integer in order of creation. Leaves
        have no entry of their own, so ``node not in trie`` identifies a leaf.
    """
    if isinstance(strings, str):
        strings = strings.split('\n')

    trie       = dict()
    next_label = 1

    for string in strings:
        current_node = 0
        for base in string:
            # The entry is only created when a symbol has to leave this node,
            # so no node ever ends up with an empty child dictionary.
            children = trie.setdefault(current_node, {})

            for child, symbol in children.items():
                if symbol == base:
                    # The edge already exists: follow it.
                    current_node = child
                    break
            else:
                # No edge carries this symbol yet: grow a new node. Existing
                # children of current_node are kept untouched.
                children[next_label] = base
                current_node = next_label
                next_label   = next_label + 1

    return(trie)

# Test
# Six sample patterns, one per line; several share prefixes so the trie
# has to branch.
strings = '''ATAGA
ATC
GAT
ATCIO
GATUYEWQ
GAAT'''

# Build the trie and print every edge as "parent->child:symbol".
graph = Strings_2_Trie(strings)
for key1, values1 in graph.items():
    for key2, value2 in values1.items():    
        print(str(key1) + '->' + str(key2) + ':' + value2)

0->1:A
0->7:G
1->2:T
2->3:A
2->6:C
3->4:G
4->5:A
7->8:A
8->9:T
8->17:A
6->10:I
10->11:O
9->12:U
12->13:Y
13->14:E
14->15:W
15->16:Q
17->18:T


In [3]:
'''
Code Challenge: Implement TrieMatching to solve the Multiple Pattern Matching Problem.
Input: A string Text and a collection of strings Patterns.
Output: All starting positions in Text where a string from Patterns appears as a substring.
'''

def Prefix_Trie_Matching(text, trie):
    """Check whether some pattern stored in ``trie`` is a prefix of ``text``.

    Parameters
    ----------
    text : str
        The text to spell down the trie, starting at the root (node 0).
    trie : dict
        ``{node: {child: symbol}}`` where leaves have no entry.

    Returns
    -------
    int
        1 if a leaf is reached (a whole pattern was spelled), otherwise 0.
        Running out of text before reaching a leaf is NOT a match.
    """
    node = 0

    for base in text:
        if node not in trie:
            # Leaf reached: a complete pattern has been spelled.
            return(1)

        for child, symbol in trie[node].items():
            if symbol == base:
                node = child
                break
        else:
            # No outgoing edge carries this symbol.
            return(0)

    # The text is exhausted; it is a match only if we stopped exactly on a leaf.
    if node in trie:
        return(0)
    return(1)

def Trie_Matching(text, patterns):
    """Find every start position in ``text`` where one of ``patterns`` occurs.

    Parameters
    ----------
    text : str
        The text to search.
    patterns : str or sequence of str
        The patterns, in any form accepted by ``Strings_2_Trie``.

    Returns
    -------
    list of int
        0-based start positions in increasing order. Empty when there is
        nothing to search for.
    """
    # Build the trie for both accepted input forms (previously it was only
    # built when `patterns` was a str).
    trie = Strings_2_Trie(patterns)
    if not trie:
        return([])

    positions = []
    # Every start position is tried, including the last character of the text.
    for i in range(len(text)):
        if Prefix_Trie_Matching(text[i:], trie) == 1:
            positions.append(i)

    return(positions)

# Test
text = 'AATCGGGTTCAATCGGGGT'
# Two patterns, one per line.
patterns = '''ATCG
GGGT'''

# Displays the start positions in `text` where either pattern occurs.
Trie_Matching(text, patterns)

[1, 4, 11, 15]

In [4]:
'''
Code Challenge: Solve the Suffix Tree Construction Problem.
Input: A string Text.
Output: The edge labels of SuffixTree(Text). You may return these strings in any order.
'''

def Suffix_Trie_Construction(text):
    """Build the suffix trie of ``text`` by inserting every suffix in turn.

    Returns a tuple ``(trie, positions)``:

    * ``trie`` maps ``node -> {child: [symbol, index]}`` where ``index`` is
      the position in ``text`` of the symbol that created the edge. The root
      is node 0; leaves have no entry.
    * ``positions`` maps each leaf reached at the end of a suffix to the
      start index of that suffix.
    """
    trie      = {}
    positions = {}
    next_free = 1

    for start in range(len(text)):
        node = 0

        for offset in range(start, len(text)):
            symbol   = text[offset]
            children = trie.get(node)

            # Look for an existing edge that carries this symbol.
            matched_child = None
            if children is not None:
                for child, label in children.items():
                    if label[0] == symbol:
                        matched_child = child
                        break

            if matched_child is not None:
                node = matched_child
                continue

            # No such edge: branch off with a fresh node.
            if children is None:
                trie[node] = {next_free: [symbol, offset]}
            else:
                children[next_free] = [symbol, offset]
            node      = next_free
            next_free = next_free + 1

        # Only suffixes that ended on a leaf get a recorded start position.
        if node not in trie:
            positions[node] = start

    return(trie, positions)
                

def CountInOut(graph):
    """Return ``(in_count, out_count)`` for a rooted tree stored as nested dicts.

    ``graph`` maps ``node -> {child: label}``. Every node that appears as a
    child gets an in-count of 1, the root (node 0) is forced to 0, and the
    out-count is the number of children (0 for nodes without an entry).
    """
    out_degree = {node: len(children) for node, children in graph.items()}

    in_degree = {}
    for children in graph.values():
        for child in children:
            in_degree[child] = 1

    # The root has no incoming edge.
    in_degree[0] = 0

    # Nodes that never appear as a parent have no outgoing edges.
    for node in in_degree:
        out_degree.setdefault(node, 0)

    return(in_degree, out_degree)

def Maximal_NonBranching_Paths(graph):
    """Spell the label of every maximal non-branching path in ``graph``.

    ``graph`` maps ``node -> {child: label}``; only ``label[0]`` of each edge
    is used. A path starts at any node that is not 1-in/1-out and has at
    least one outgoing edge, and is extended through 1-in/1-out nodes.
    Returns the list of concatenated labels.
    """
    in_count, out_count = CountInOut(graph)
    paths = []

    for node in in_count.keys():
        passes_through = in_count.get(node) == 1 and out_count.get(node) == 1
        if passes_through or out_count[node] <= 0:
            # Interior chain nodes and leaves never start a path.
            continue

        for child, label in graph[node].items():
            spelled = label[0]
            cursor  = child

            # Keep walking while the current node has exactly one way in and out.
            while in_count[cursor] == 1 and out_count[cursor] == 1:
                following, following_label = next(iter(graph[cursor].items()))
                spelled = spelled + following_label[0]
                cursor  = following

            paths.append(spelled)

    return(paths)
# Test
text = 'ATATCGTTTTATCGTT$'

# Build the suffix trie, then display the labels of its maximal
# non-branching paths.
graph, positions = Suffix_Trie_Construction(text)
Maximal_NonBranching_Paths(graph)

['ATCGTT',
 'CGTT',
 'T',
 '$',
 'ATCGTTTTATCGTT$',
 'CGTT',
 'T',
 'ATCGTT$',
 '$',
 'TTATCGTT$',
 '$',
 'TTATCGTT$',
 '$',
 'TTATCGTT$',
 '$',
 'TTATCGTT$',
 '$',
 'TTATCGTT$',
 '$',
 'TATCGTT$',
 'ATCGTT$',
 'AT',
 'T',
 'CGTT',
 'GTT',
 '$']

In [5]:
def Modified_Suffix_Trie_Construction(text):
    trie      = {}
    positions = {}
    new_node  = 1
    
    for i in range(len(text)):
        current_node = 0
        for j in range(i, len(text)):
            current_symbol = text[j]
            
            check = 0
            if current_node in trie:
                for node, symbol_position in trie[current_node].items():
                    if symbol_position[0] == current_symbol:
                        current_node = node
                        check        = 1
                        break            
            if check == 0:
                if current_node in trie:
                    trie[current_node][new_node] = [current_symbol, j]

                else:
                    trie[current_node] = {new_node : [current_symbol, j]}
                current_node = new_node
                new_node     = new_node + 1
                
        if current_node not in trie:
            positions[current_node] = i
    
    paths = []
    
    in_count, out_count = CountInOut(trie)
    
    for node in in_count.keys():
        if (in_count.get(node) != 1) | (out_count.get(node) != 1):
            if out_count[node] > 0:
                
                for n, base in trie[node].items():
                    non_branching_path = base[0]
                    next_node = n
                    
                    while (in_count[next_node] == out_count[next_node] == 1):
                        non_branching_path = non_branching_path + list(trie[next_node].values())[0][0]
                        next_node_tmp      = list(trie[next_node].keys())[0]
                        length             = list(trie[next_node].values())[0][1]
                        trie.pop(next_node)
                        next_node          = next_node_tmp

                    if n!= next_node:
                        if n in trie:
                            trie[n][next_node] = [non_branching_path, base[1],length + 1 - base[1]]
                        else:
                            trie[n] = {next_node : [non_branching_path, base[1],length + 1 - base[1]]}

                    paths.append(non_branching_path)

    tmp_trie = {}
    
    for node_1, key_1 in trie.items():
        if len(key_1) != 1: 
            tmp_trie[node_1] = copy.deepcopy(key_1)

    for node_1, key_1 in tmp_trie.items():
        for node_2, key_2 in key_1.items():

            if node_2 in trie:
                if len(trie[node_2]) == 1:
                    trie[node_1].pop(node_2)
                    trie[node_1][list(trie[node_2].keys())[0]] = list(trie[node_2].values())[0]
                    trie.pop(node_2)
    
    return(trie)

In [6]:
'''
Longest Repeat Problem: Find the longest repeat in a string.
Input: A string Text.
Output: A longest substring of Text that appears in Text more than once.

Code Challenge: Solve the Longest Repeat Problem. (Multiple solutions may exist, in which case you may return any one.)
'''


def Longest_Repeat_in_String(string):
    """Return a longest substring of ``string`` that occurs more than once.

    The string is terminated with '$', its compressed suffix trie is built
    with ``Modified_Suffix_Trie_Construction``, and the labels on the path
    from the root to every node with at least two children are spelled out.

    Returns the first longest such label, or ``''`` when no substring
    repeats (previously this case raised ``UnboundLocalError``).
    """
    trie = Modified_Suffix_Trie_Construction(string + '$')

    # For every node with at least two children remember its parent and the
    # label of the edge leading to it.
    backward_graph = {}
    for parent, children in trie.items():
        for child, label in children.items():
            if child in trie and len(trie[child]) > 1:
                backward_graph[child] = [parent, label[0]]

    # Backtrack from each such node to the root, collecting the string
    # spelled so far after every step.
    repeat_strings = []
    for node in backward_graph.keys():
        repeat_string = ''
        while node != 0:
            repeat_string = backward_graph[node][1] + repeat_string
            node          = backward_graph[node][0]
            repeat_strings.append(repeat_string)

    # max() keeps the first of several equally long candidates, like the
    # original strict ">" comparison did.
    return(max(repeat_strings, key = len, default = ''))

#Test
string = 'ATATCGTTTTATCGTT'

# Displays a longest substring that appears in `string` more than once.
Longest_Repeat_in_String(string)

'TATCGTT'

In [7]:
'''
Longest Shared Substring Problem: Find the longest substring shared by two strings.
Input: Strings Text1 and Text2.
Output: The longest substring that occurs in both Text1 and Text2.

Code Challenge: Solve the Longest Shared Substring Problem. (Multiple solutions may exist, in which case you may return any one.)
'''

def Longest_Shared_Substring(string_1, string_2):
    """Return a longest substring that occurs in both ``string_1`` and ``string_2``.

    The compressed suffix trie of ``string_1 + '#' + string_2 + '$'`` is built
    and every node is classified by the strings its leaves come from:
    1 = only ``string_1``, 2 = only ``string_2``, 3 = both. The longest
    root-to-node label among class-3 nodes is returned.

    Returns the first longest shared substring, or ``''`` when the strings
    share nothing (previously this case raised ``UnboundLocalError``).
    The inputs are expected not to contain '#' or '$'.
    """
    text = string_1 + '#' + string_2 + '$'
    trie = Modified_Suffix_Trie_Construction(text)
    in_count, _ = CountInOut(trie)
    classification = {} # 1 for string_1, 2 for string_2, 3 for both

    # Classify bottom-up: repeat passes over the trie until every node is labelled.
    while len(in_count) != len(classification):
        for parent, children in trie.items():
            if parent in classification:
                continue

            child_classes = []
            for child, label in children.items():
                if '$' in label[0]:
                    # An edge whose label contains '$' ends the text; if it also
                    # contains '#' the suffix started inside string_1.
                    classification[child] = 1 if '#' in label[0] else 2
                child_classes.append(classification.get(child, -1))

            if -1 in child_classes:
                # Some child is still unclassified; retry on a later pass.
                continue

            if 3 in child_classes or (1 in child_classes and 2 in child_classes):
                classification[parent] = 3
            else:
                classification[parent] = child_classes[0]

    # Backward graph over edges whose two end nodes are both shared (class 3).
    shared_edge_back = {}
    for parent, children in trie.items():
        if classification[parent] == 3:
            for child, label in children.items():
                if classification[child] == 3:
                    shared_edge_back[child] = [parent, label[0]]

    # Spell the root-to-node label of every shared node.
    shared_paths = []
    for key in shared_edge_back.keys():
        shared_path = ''
        node = key
        while node != 0:
            shared_path = shared_edge_back[node][1] + shared_path
            node = shared_edge_back[node][0]
        shared_paths.append(shared_path)

    # max() keeps the first of several equally long candidates, like the
    # original strict ">" comparison did.
    return(max(shared_paths, key = len, default = ''))

# Test
string_1 = 'TCGGTAGATTGCGCCCACTC'
string_2 = 'AGGGGCTCGCAGTGTAAGAA'

# Displays a longest substring common to both strings.
Longest_Shared_Substring(string_1, string_2)

'TCG'

In [8]:
'''
Shortest Non-Shared Substring Problem: Find the shortest substring of one string that does not appear in another string.
Input: Strings Text1 and Text2.
Output: The shortest substring of Text1 that does not appear in Text2.

Code Challenge: Solve the Shortest Non-Shared Substring Problem. (Multiple solutions may exist, in which case you may return any one.)
'''

def Shortest_Non_Shared_Substring(string_1, string_2):
    """Return a shortest substring of ``string_1`` that does not occur in ``string_2``.

    Every suffix of ``string_1`` is spelled down the suffix trie of
    ``string_2``. If the walk gets stuck after ``length`` symbols, the first
    ``length + 1`` symbols of that suffix form a non-shared substring.

    Returns the first shortest such substring, or ``''`` when every substring
    of ``string_1`` also occurs in ``string_2``.
    """
    trie = Suffix_Trie_Construction(string_2)[0]

    min_len           = float('Inf')
    non_shared_string = ''

    for i in range(len(string_1)):
        string = string_1[i:]

        node   = 0
        length = 0

        for base in string:
            next_node = None
            for child, label in trie.get(node, {}).items():
                if label[0] == base:
                    next_node = child
                    # Stop at the first matching edge: continuing would compare
                    # the NEXT symbol against siblings of the node just left.
                    break

            if next_node is None:
                break

            node   = next_node
            length = length + 1

        # The walk stopped before the suffix ended, so string[:length + 1] is
        # not a substring of string_2.
        if length < len(string) and length < min_len:
            min_len           = length
            non_shared_string = string[ : length + 1 ]

    return(non_shared_string)

# Test
string_1 = 'CCAAGCTGCTAGAGG'
string_2 = 'CATGCTGGGCTGGCT'

# Displays a shortest substring of string_1 that is absent from string_2.
Shortest_Non_Shared_Substring(string_1, string_2)

'CC'

# Week 2: The Burrows-Wheeler Transform

In [9]:
'''
Suffix Array Construction Problem: Construct the suffix array of a string.
Input: A string Text.
Output: SuffixArray(Text).

Code Challenge: Solve the Suffix Array Construction Problem.
'''

def Suffix_Array_Construction(string):
    """Return the suffix array of ``string``.

    The result lists the start index of every suffix, ordered by the
    lexicographic order of the suffixes themselves.
    """
    # All suffixes differ in length, so sorting by suffix alone is unambiguous.
    return(sorted(range(len(string)), key = lambda start: string[start:]))

# Test: display the suffix array of a '$'-terminated sample string.
string = 'AACGATAGCGGTAGA$'

Suffix_Array_Construction(string)

[15, 14, 0, 1, 12, 6, 4, 2, 8, 13, 3, 7, 9, 10, 11, 5]

In [10]:
'''
Burrows-Wheeler Transform Construction Problem: Construct the Burrows-Wheeler transform of a string.
Input: A string Text.
Output: BWT(Text).

Code Challenge: Solve the Burrows-Wheeler Transform Construction Problem.
'''

def BWT_Construction(string):
    """Return the Burrows-Wheeler transform of ``string``.

    All cyclic rotations of ``string`` are sorted lexicographically and the
    last symbol of each rotation is collected, in that order.
    """
    rotations = sorted(string[cut:] + string[:cut] for cut in range(len(string)))
    return(''.join(rotation[-1] for rotation in rotations))

# Test
string = 'GCGTGCCTGGTCA$'

# Displays BWT(string).
BWT_Construction(string)

'ACTGGCT$TGCGGC'

In [11]:
'''
Inverse Burrows-Wheeler Transform Problem: Reconstruct a string from its Burrows-Wheeler transform.
Input: A string Transform (with a single "$" symbol).
Output: The string Text such that BWT(Text) = Transform.

Code Challenge: Solve the Inverse Burrows-Wheeler Transform Problem.
'''

def Inverse_BWT(BWT_string):
    """Reconstruct Text from its Burrows-Wheeler transform.

    Parameters
    ----------
    BWT_string : str
        The transform, containing exactly one '$' end marker. Any alphabet
        is accepted (not only A/C/G/T), including digits.

    Returns
    -------
    str
        The original text, ending with '$'.

    Raises
    ------
    ValueError
        If ``BWT_string`` contains no '$'.
    """
    last_col = list(BWT_string)
    size     = len(last_col)

    # A stable sort of the row indices by symbol yields the first column and
    # pairs the k-th occurrence of a symbol in the first column with its k-th
    # occurrence in the last column (the first-last property):
    # first_col[row] == last_col[sorted_rows[row]].
    sorted_rows = sorted(range(size), key = last_col.__getitem__)

    # The row whose last symbol is '$' is the rotation equal to Text itself.
    row = last_col.index('$')

    text = []
    for _ in range(size - 1):
        # Emit the first symbol of the current rotation, then jump to the row
        # where that very occurrence sits in the last column; its rotation
        # starts one text position further on.
        row = sorted_rows[row]
        text.append(last_col[row])

    text.append('$')
    return(''.join(text))

# Test
BWT_string = 'TTCCTAACG$A'

# Displays the text whose Burrows-Wheeler transform is BWT_string.
Inverse_BWT(BWT_string)

'TACATCACGT$'

In [12]:
'''
Code Challenge: Implement BWMatching.
Input: A string BWT(Text), followed by a collection of Patterns.
Output: A list of integers, where the i-th integer corresponds to the number of substring matches of the i-th member of Patterns in Text.
'''

def BW_Matching(BWT_string, patterns):
    """Count the occurrences of each pattern in Text, given only BWT(Text).

    Parameters
    ----------
    BWT_string : str
        The Burrows-Wheeler transform of the text. Any alphabet is accepted
        (not only A/C/G/T).
    patterns : str or sequence of str
        The patterns; a single string is split on single spaces.

    Returns
    -------
    list of int
        The i-th entry is the number of occurrences of the i-th pattern.
    """
    if type(patterns) == str:
        patterns = patterns.split(' ')

    last_col = list(BWT_string)
    size     = len(last_col)

    # Last-to-first mapping: a stable sort of the rows by symbol pairs the
    # k-th occurrence of a symbol in the last column with its k-th occurrence
    # in the (sorted) first column.
    last_to_first = [0] * size
    for first_row, last_row in enumerate(sorted(range(size), key = last_col.__getitem__)):
        last_to_first[last_row] = first_row

    n_matchs = []

    for pattern in patterns:

        top    = 0
        bottom = size - 1

        # Consume the pattern from its last symbol backwards, narrowing the
        # band of rows [top, bottom] whose rotations start with the suffix
        # matched so far.
        for symbol in reversed(pattern):
            hits = [row for row in range(top, bottom + 1) if last_col[row] == symbol]

            if not hits: # if there is no match
                n_matchs.append(0)
                break

            top    = last_to_first[hits[0]]
            bottom = last_to_first[hits[-1]]
        else:
            # The whole pattern was consumed: every row in the band is a match.
            n_matchs.append(bottom - top + 1)

    return(n_matchs)

# Test
BWT_string = 'TCCTCTATGAGATCCTATTCTATGAAACCTTCA$GACCAAAATTCTCCGGC'
# Space-separated patterns to count.
patterns = 'CCT CAC GAG CAG ATC'

# Displays one match count per pattern, in the order given.
BW_Matching(BWT_string, patterns)

[2, 1, 1, 0, 1]

# Week 3: Speeding Up Burrows-Wheeler Read Mapping

In [13]:
def Inverse_BWT_number(BWT_string):
    """Invert a Burrows-Wheeler transform, keeping occurrence numbers.

    Works like ``Inverse_BWT`` but returns the text as a list of labelled
    symbols: every A, C, G or T carries its occurrence number in the sorted
    first column (e.g. ``'A0'``, ``'T2'``); other symbols, including the
    final ``'$'``, are left unlabelled.
    """
    def number_occurrences(column):
        # Append a running per-symbol counter to each A/C/G/T in the column.
        seen     = {}
        numbered = []
        for symbol in column:
            if symbol in ('A', 'C', 'G', 'T'):
                rank = seen.get(symbol, 0)
                seen[symbol] = rank + 1
                numbered.append(symbol + str(rank))
            else:
                numbered.append(symbol)
        return numbered

    last_col  = number_occurrences(BWT_string)
    first_col = number_occurrences(sorted(BWT_string))

    # Start from '$' and repeatedly look the current labelled symbol up in the
    # last column; the first-column symbol of that row comes next in the text.
    symbol = '$'
    walk   = [symbol]
    while len(walk) != len(last_col):
        symbol = first_col[last_col.index(symbol)]
        walk.append(symbol)

    # Move the leading '$' to the end so the list reads as Text + '$'.
    return(walk[1:] + list(walk[0]))

In [14]:
'''
Code Challenge: Solve the Multiple Pattern Matching Problem.
Input: A string Text followed by a collection of strings Patterns.
Output: All starting positions in Text where a string from Patterns appears as a substring.
'''

def BW_Matching_position(string, patterns):
    """Find all start positions of the patterns in ``string`` via its BWT.

    Parameters
    ----------
    string : str
        The text to search, without end marker ('$' is appended here).
        Any alphabet is accepted (not only A/C/G/T).
    patterns : str or sequence of str
        The patterns; a single string is split on single spaces.

    Returns
    -------
    list of int
        0-based start positions, grouped by pattern in the order given; within
        one pattern the positions follow the order of the sorted rotations.
    """
    if type(patterns) == str:
        patterns = patterns.split(' ')

    text = string + '$'
    size = len(text)

    # Row r of the Burrows-Wheeler matrix is the rotation starting at
    # rotation_starts[r]; that start is also the text position to report.
    rotation_starts = sorted(range(size), key = lambda start: text[start:] + text[:start])

    row_of_start = [0] * size
    for row, start in enumerate(rotation_starts):
        row_of_start[start] = row

    # The last symbol of a rotation is the symbol just before its start
    # (index -1 wraps around to the '$'), and the last-to-first mapping leads
    # to the rotation that starts one position earlier.
    last_col      = [text[start - 1] for start in rotation_starts]
    last_to_first = [row_of_start[start - 1] for start in rotation_starts]

    positions = []

    for pattern in patterns:

        top    = 0
        bottom = size - 1

        # Backward search: narrow the band of rows [top, bottom] one pattern
        # symbol at a time, starting from the end of the pattern.
        for symbol in reversed(pattern):
            hits = [row for row in range(top, bottom + 1) if last_col[row] == symbol]

            if not hits: # if there is no match
                break

            top    = last_to_first[hits[0]]
            bottom = last_to_first[hits[-1]]
        else:
            for row in range(top, bottom + 1):
                positions.append(rotation_starts[row])

    return(positions)


# Test
string = 'AATCGGGTTCAATCGGGGT'
# Space-separated patterns to locate.
patterns = 'ATCG GGGT'

# Displays the start positions of all pattern occurrences in `string`.
BW_Matching_position(string, patterns)

[11, 1, 15, 4]

In [15]:
'''
Code Challenge: Solve the Multiple Approximate Pattern Matching Problem.
Input: A string Text, followed by a collection of strings Patterns, and an integer d.
Output: All positions where one of the strings in Patterns appears as a substring of Text with at most d mismatches.
'''

def BW_Matching_d_Mismatch(string, patterns, d):
    """Find all approximate occurrences of the patterns in ``string``.

    Parameters
    ----------
    string : str
        The text to search, without end marker ('$' is appended here).
    patterns : str or sequence of str
        The patterns; a single string is split on single spaces. Empty
        patterns are ignored.
    d : int
        Maximum number of mismatching symbols allowed per occurrence.

    Returns
    -------
    list of int
        0-based start positions where a pattern fits inside ``string`` with at
        most ``d`` mismatches, grouped by pattern in the order given. Within
        one pattern the positions follow the sorted-rotation order of the
        occurrence's last symbol. Windows that would run past the end of the
        text (across the '$') are never reported.
    """
    if type(patterns) == str:
        patterns = patterns.split(' ')

    text = string + '$'
    size = len(text)

    # Row r of the Burrows-Wheeler matrix is the rotation starting at
    # rotation_starts[r].
    rotation_starts = sorted(range(size), key = lambda start: text[start:] + text[:start])

    row_of_start = [0] * size
    for row, start in enumerate(rotation_starts):
        row_of_start[start] = row

    first_col = [text[start] for start in rotation_starts]
    # Last-to-first step: the row whose rotation starts one text position
    # earlier (index -1 wraps around to the '$').
    step_back = [row_of_start[start - 1] for start in rotation_starts]

    positions = []

    for pattern in patterns:

        if len(pattern) == 0:
            continue

        # Every row starts as a candidate for the LAST symbol of the pattern,
        # with no mismatch counted yet.
        candidates   = [(row, 0) for row in range(size)]
        final_offset = len(pattern) - 1

        for offset, symbol in enumerate(reversed(pattern)):
            survivors = []

            for row, n_mistake in candidates:
                if first_col[row] != symbol:
                    n_mistake = n_mistake + 1
                    if n_mistake > d:
                        continue

                if offset == final_offset:
                    # First pattern symbol: this row marks the start position.
                    survivors.append((row, n_mistake))
                else:
                    survivors.append((step_back[row], n_mistake))

            candidates = survivors

        for row, n_mistake in candidates:
            start = rotation_starts[row]
            # Keep only windows that lie completely inside the original text.
            if start + len(pattern) <= len(string):
                positions.append(start)

    return(positions)

# Test
string   = 'ACATGCTACTTT'
# Space-separated patterns to locate approximately.
patterns = 'ATT GCC GCTA TATT'
# Maximum number of mismatches allowed per occurrence.
d = 1

BW_Matching_d_Mismatch(string, patterns, d)

[2, 9, 8, 7, 4, 4, 6]

# Week 4: Introduction to Hidden Markov Models

In [16]:
'''
CODE CHALLENGE: Solve the Probability of a Hidden Path Problem.
Given: A hidden path π followed by the states States and transition matrix Transition of an HMM
     (Σ, States, Transition, Emission).
Return: The probability of this path, Pr(π).

Note: You may assume that transitions from the initial state occur with equal probability.
'''

def Probability_of_Hidden_Path(hidden_path, states, transition_matrix):
    """Compute Pr(path) for a hidden path of an HMM.

    Parameters
    ----------
    hidden_path : str
        The sequence of states, one character per state.
    states : str or sequence of str
        The state names; a single string is split on whitespace.
    transition_matrix : str
        One row per line and whitespace-separated probabilities, with rows
        and columns in the order of ``states``. Blank lines are ignored.

    Returns
    -------
    float
        The probability of the path. The initial state is chosen uniformly,
        i.e. with probability ``1 / len(states)``. An empty path has
        probability 1.0.
    """
    if type(states) == str:
        states = states.split()

    rows = [list(map(float, row.split()))
            for row in transition_matrix.split('\n') if row.strip()]
    transition_matrix = pd.DataFrame(rows, columns = states, index = states)

    if len(hidden_path) == 0:
        return(1.0)

    # Uniform choice of the first state (0.5 in the two-state case).
    probability = 1 / len(states)
    for i in range(len(hidden_path) - 1):
        current_state = hidden_path[i]
        next_state    = hidden_path[i + 1]
        probability   = probability * transition_matrix.at[current_state, next_state]

    return(probability)

# Test
hidden_path = 'ABABBBAAAA'
states = 'A B'
# Tab-separated transition probabilities, one row per state, rows and
# columns in the order of `states`.
transition_matrix = '''0.377	0.623
0.26	0.74'''

Probability_of_Hidden_Path(hidden_path, states, transition_matrix)

0.00038492869175467582

In [17]:
'''
CODE CHALLENGE: Solve the Probability of an Outcome Given a Hidden Path Problem.
Input: A string x, followed by the alphabet from which x was constructed, followed by
     a hidden path π, followed by the states States and emission matrix Emission of an HMM
     (Σ, States, Transition, Emission).
Output: The conditional probability Pr(x|π) that x will be emitted given that the HMM
     follows the hidden path π.

Note: You may assume that transitions from the initial state occur with equal probability.
'''

def Probability_of_x_by_Hidden_Path(string_x, xs, hidden_path, states, states_emission_matrix):
    """Compute Pr(x | path): the probability of emitting ``string_x`` along ``hidden_path``.

    ``xs`` (the alphabet) and ``states`` may be given as space-separated
    strings or as lists. ``states_emission_matrix`` is text with one row per
    state and tab-separated probabilities, columns in the order of ``xs``.
    The result is the product of the emission probabilities of each symbol
    from the state at the same position of the path.
    """
    alphabet    = xs.split(' ') if type(xs) == str else xs
    state_names = states.split(' ') if type(states) == str else states

    rows = [list(map(float, line.split('\t')))
            for line in states_emission_matrix.split('\n')]
    emission = pd.DataFrame(rows, columns = alphabet, index = state_names)

    probability = 1
    for position, symbol in enumerate(string_x):
        # Emission probability of `symbol` from the state at the same position.
        probability = probability * emission.at[hidden_path[position], symbol]

    return(probability)

# Test
string_x = 'zzzyxyyzzx'
xs = 'x y z'
hidden_path = 'BAAAAAAAAA'
states = 'A B'
# Tab-separated emission probabilities: one row per state (in the order of
# `states`), one column per symbol (in the order of `xs`).
states_emission_matrix = '''0.176	0.596	0.228
0.225	0.572	0.203'''

Probability_of_x_by_Hidden_Path(string_x, xs, hidden_path, states, states_emission_matrix)

3.5974895474624624e-06

In [18]:
'''
CODE CHALLENGE: Implement the Viterbi algorithm solving the Decoding Problem.
Input: A string x, followed by the alphabet from which x was constructed,
     followed by the states States, transition matrix Transition, and emission matrix
     Emission of an HMM (Σ, States, Transition, Emission).
Output: A path that maximizes the (unconditional) probability Pr(x, π) over all possible paths π.

Note: You may assume that transitions from the initial state occur with equal probability.
'''

def Viterbi_algorithm(string_x, xs, states, transition_matrix, states_emission_matrix):
    '''
    Find a hidden path π that maximizes Pr(x, π) for the emitted string x
    (Viterbi algorithm), assuming a uniform distribution over the initial state.

    Parameters:
        string_x: emitted string; every character must be a symbol of xs.
        xs: alphabet, either a whitespace-separated string or a sequence of symbols.
        states: state labels, either a whitespace-separated string or a sequence.
        transition_matrix: transition probabilities; row = source state,
            column = target state, both in the order of states.
        states_emission_matrix: emission probabilities; row = state (order of
            states), column = symbol (order of xs).
        Both matrices may be text with one whitespace-separated row per line
        (blank lines are ignored) or an already parsed 2-D array-like.

    Returns:
        The state labels of the most probable path concatenated into one string
        ('' for an empty string_x). On ties the state listed first in states wins.

    Raises:
        ValueError: if a matrix does not have the expected shape.
        KeyError: if string_x contains a symbol that is not in xs.
    '''

    def parse_matrix(matrix, shape, name):
        # Turn text or an array-like into a float array of the expected shape.
        if isinstance(matrix, str):
            matrix = [
                [float(value) for value in line.split()]
                for line in matrix.splitlines()
                if line.strip()
            ]
        matrix = np.asarray(matrix, dtype=float)
        if matrix.shape != shape:
            raise ValueError(
                '{} matrix has shape {}, expected {}'.format(name, matrix.shape, shape)
            )
        return matrix

    # formatting all parameters
    if isinstance(xs, str):
        xs = xs.split()

    if isinstance(states, str):
        states = states.split()

    n_states = len(states)
    transition = parse_matrix(transition_matrix, (n_states, n_states), 'transition')
    emission = parse_matrix(states_emission_matrix, (n_states, len(xs)), 'emission')

    if len(string_x) == 0:
        return ''

    # Column of the emission matrix for every observed symbol.
    symbol_column = {symbol: column for column, symbol in enumerate(xs)}
    observed = [symbol_column[symbol] for symbol in string_x]

    # Work with log-probabilities: raw products underflow to 0.0 on long
    # inputs, which makes every path look equally (im)probable.
    # log(0) = -inf is a valid score here, so silence the divide warning.
    with np.errstate(divide='ignore'):
        log_transition = np.log(transition)
        log_emission = np.log(emission)

    # initialize: uniform start over all states, then emit the first symbol
    score = log_emission[:, observed[0]] - np.log(n_states)

    # dynamic programming; backpointers[t, j] = best predecessor of state j at t
    backpointers = np.zeros((len(observed), n_states), dtype=int)
    for t in range(1, len(observed)):
        # candidates[i, j] = best score ending in i at t-1, then moving i -> j
        candidates = score[:, None] + log_transition
        backpointers[t] = candidates.argmax(axis=0)
        score = candidates.max(axis=0) + log_emission[:, observed[t]]

    # find the best final state, then backtrack through the stored pointers
    state_index = int(score.argmax())
    path = [state_index]
    for t in range(len(observed) - 1, 0, -1):
        state_index = int(backpointers[t, state_index])
        path.append(state_index)
    path.reverse()

    return ''.join(states[index] for index in path)

# Test: sample input for Viterbi_algorithm.
# Each matrix has one line per state (A, B) with tab-separated values:
# transition columns follow the states, emission columns follow the alphabet.
string_x = 'zxxxxyzzxyxyxyzxzzxzzzyzzxxxzxxyyyzxyxzyxyxyzyyyyzzyyyyzzxzxzyzzzzyxzxxxyxxxxyyzyyzyyyxzzzzyzxyzzyyy'
xs = 'x y z'
states = 'A B'

transition_matrix = '''0.634	0.366
0.387	0.613'''

states_emission_matrix = '''0.532	0.226	0.241
0.457	0.192	0.351'''

# Bare last expression so the notebook displays the decoded hidden path.
Viterbi_algorithm(string_x, xs, states, transition_matrix, states_emission_matrix)

'AAAAAAAAAAAAAABBBBBBBBBBBAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABBBBBBBBBBBAAAAAAAAAAAAAAAAAAAAABBBBBBBBBBAAA'

In [19]:
'''
CODE CHALLENGE: Solve the Outcome Likelihood Problem.
Input: A string x, followed by the alphabet from which x was constructed,
     followed by the states States, transition matrix Transition, and emission matrix
     Emission of an HMM (Σ, States, Transition, Emission).
Output: The probability Pr(x) that the HMM emits x.

Note: You may assume that transitions from the initial state occur with equal probability.
'''

def probability_of_HMM_emits(string_x, xs, states, transition_matrix, states_emission_matrix):
    '''
    Compute Pr(x), the probability that the HMM emits string_x, by summing over
    all hidden paths (forward algorithm) with a uniform initial state distribution.

    Parameters:
        string_x: emitted string; every character must be a symbol of xs.
        xs: alphabet, either a whitespace-separated string or a sequence of symbols.
        states: state labels, either a whitespace-separated string or a sequence.
        transition_matrix: transition probabilities; row = source state,
            column = target state, both in the order of states.
        states_emission_matrix: emission probabilities; row = state (order of
            states), column = symbol (order of xs).
        Both matrices may be text with one whitespace-separated row per line
        (blank lines are ignored) or an already parsed 2-D array-like.

    Returns:
        Pr(x) as a float (1.0 for an empty string_x). The value is a plain
        probability, so it can underflow to 0.0 for very long strings.

    Raises:
        ValueError: if a matrix does not have the expected shape.
        KeyError: if string_x contains a symbol that is not in xs.
    '''

    def parse_matrix(matrix, shape, name):
        # Turn text or an array-like into a float array of the expected shape.
        if isinstance(matrix, str):
            matrix = [
                [float(value) for value in line.split()]
                for line in matrix.splitlines()
                if line.strip()
            ]
        matrix = np.asarray(matrix, dtype=float)
        if matrix.shape != shape:
            raise ValueError(
                '{} matrix has shape {}, expected {}'.format(name, matrix.shape, shape)
            )
        return matrix

    # formatting all parameters
    if isinstance(xs, str):
        xs = xs.split()

    if isinstance(states, str):
        states = states.split()

    n_states = len(states)
    transition = parse_matrix(transition_matrix, (n_states, n_states), 'transition')
    emission = parse_matrix(states_emission_matrix, (n_states, len(xs)), 'emission')

    # Column of the emission matrix for every observed symbol.
    symbol_column = {symbol: column for column, symbol in enumerate(xs)}
    observed = [symbol_column[symbol] for symbol in string_x]

    if not observed:
        # The empty string is emitted with certainty.
        return 1.0

    # initialize: every state is equally likely at the start (1 / number of
    # states, not a fixed 0.5), then emits the first symbol
    forward = emission[:, observed[0]] / n_states

    # fill the forward values one emitted symbol at a time:
    # new[j] = sum_i forward[i] * transition[i, j] * emission[j, symbol]
    for column in observed[1:]:
        forward = (forward[:, None] * transition * emission[:, column]).sum(axis=0)

    return float(forward.sum())

# Test: sample input for probability_of_HMM_emits.
# Each matrix has one line per state (A, B) with tab-separated values:
# transition columns follow the states, emission columns follow the alphabet.
string_x = 'zyzzyyyzzzxxzxxzzxxxxxxzyyyyxzzyxzxzxyxyzyyxzxzxxzxzzzzzzzyxxyzzzzzxxzxyxyzxyyyzxxyzxzxzyyyzyxzzxxzz'
xs = 'x y z'
states = 'A B'

transition_matrix = '''0.597	0.403
0.268	0.732'''

states_emission_matrix = '''0.427	0.268	0.305
0.226	0.32	0.454'''

# Bare last expression so the notebook displays the result
# (the recorded output for this input is 7.04227143648133e-48).
probability_of_HMM_emits(string_x, xs, states, transition_matrix, states_emission_matrix)

7.04227143648133e-48