In [49]:
from collections import defaultdict

# Load the problem instance. Expected file layout:
#   line 1: the text to search
#   line 2: whitespace-separated patterns
#   line 3: the maximum number of mismatches allowed per occurrence
# The context manager guarantees the handle is closed even if parsing fails.
with open("dataset_865533_10.txt", "r") as f:
    Text = f.readline().strip('\n')
    Patterns = f.readline().split()
    d = int(f.readline())

def MultipleApproximatePatternMatching(Text, Patterns, d):
    """Find where each pattern occurs in Text with at most d mismatches.

    Seed-and-extend strategy: every pattern is cut into d + 1 seeds, each
    seed is located exactly through the Burrows-Wheeler transform, and every
    seed hit is turned into a candidate start position that is then verified
    by a direct comparison against the text.

    Args:
        Text: the text to search. A trailing '$' is treated as the sentinel;
            one is appended when it is missing.
        Patterns: sequence of pattern strings.
        d: maximum number of mismatching positions allowed per occurrence.

    Returns:
        A defaultdict mapping each pattern that has at least one occurrence
        to the set of 0-based start positions of its occurrences.
    """
    # endswith() also copes with an empty Text, where Text[-1] would raise.
    if not Text.endswith('$'):
        Text = Text + "$"
    # Occurrences must fit inside the text proper, i.e. before the sentinel.
    text_length = len(Text) - 1

    suffix_array = SuffixArray(Text)
    Transform = BWTfromSuffixArray(Text, suffix_array)
    match_candidates = BetterBWMatchingPatterns(Transform, Patterns, d)

    PatternMatchIndices = defaultdict(set)
    for Pattern, seed_hits in zip(Patterns, match_candidates):
        if not seed_hits:
            continue
        pattern_length = len(Pattern)

        for (top, bottom), offset in seed_hits:
            for first_column_index in range(top, bottom + 1):
                # The seed starts `offset` characters into the pattern, so
                # the whole pattern would start that far before the hit.
                text_position = suffix_array[first_column_index] - offset

                # Skip candidates that would start before the text or run
                # into the sentinel / past the end: they cannot be real
                # occurrences, and the truncated window must not be compared.
                if text_position < 0:
                    continue
                if text_position + pattern_length > text_length:
                    continue

                window = Text[text_position:text_position + pattern_length]
                if ApproximateMatch(Pattern, window, d):
                    PatternMatchIndices[Pattern].add(text_position)

    return PatternMatchIndices

def ApproximateMatch(Pattern, TextSubstring, d):
    """Tell whether Pattern matches TextSubstring with at most d mismatches.

    Only the first len(Pattern) characters of TextSubstring are compared.

    Args:
        Pattern: the pattern string.
        TextSubstring: the text window to compare against.
        d: maximum number of mismatching positions tolerated.

    Returns:
        1 if the number of mismatching positions is at most d, otherwise 0.
        A window shorter than the pattern is never a match and yields 0.
    """
    # A truncated window (e.g. one cut off by the end of the text) cannot
    # contain a full occurrence; indexing it would raise IndexError.
    if len(TextSubstring) < len(Pattern):
        return 0

    mismatches = 0
    for expected, actual in zip(Pattern, TextSubstring):
        if expected != actual:
            mismatches += 1
            # Stop as soon as the budget is exceeded.
            if mismatches > d:
                return 0
    return 1
            

def SeededPattern(Pattern, d):
    """Cut Pattern into d + 1 consecutive seeds.

    The first d seeds each have length int(len(Pattern) / (d + 1)); the last
    seed takes whatever remains of the pattern.

    Returns:
        A list of (seed, offset) tuples, where offset is the index in
        Pattern at which the seed starts.
    """
    seed_length = int(len(Pattern) / (d + 1))

    pieces = [
        (Pattern[seed_length * n: seed_length * (n + 1)], seed_length * n)
        for n in range(d)
    ]

    # The final seed absorbs the remainder left by the integer division.
    tail_start = seed_length * d
    pieces.append((Pattern[tail_start:], tail_start))
    return pieces

def SuffixArray(Text):
    """Return the suffix array of Text.

    The result lists the start positions of all suffixes of Text, ordered by
    the lexicographic (Python string comparison) order of those suffixes.
    """
    return sorted(range(len(Text)), key=lambda start: Text[start:])

def BWTfromSuffixArray(Text, suffix_array):
    """Build the Burrows-Wheeler transform of Text from its suffix array.

    For each suffix start position, in suffix-array order, the transform
    takes the character that precedes it in Text. The suffix starting at
    position 0 wraps around to the last character, which is exactly what the
    negative index Text[-1] yields.
    """
    return "".join(Text[start - 1] for start in suffix_array)

def BetterBWMatchingPatterns(Transform, Patterns, d):
    """Locate the seeds of every pattern in the first column of the BWT matrix.

    Args:
        Transform: Burrows-Wheeler transform (last column) of the text.
        Patterns: sequence of pattern strings.
        d: number of allowed mismatches; each pattern is cut into d + 1
            seeds by SeededPattern.

    Returns:
        A list with one entry per pattern, in the same order as Patterns.
        Each entry is a list of ((top, bottom), offset) tuples, one per seed
        that occurs exactly: (top, bottom) is the inclusive first-column row
        range reported by BetterBWMatching for that seed, and offset is the
        position of the seed inside its pattern. Seeds without an exact
        occurrence are omitted, so an entry may be empty.
    """
    alphabet = sorted(set(Transform))

    # Count[s][i] is the number of occurrences of s in Transform[:i], so each
    # list has len(Transform) + 1 entries.
    Count = {symbol: [0] for symbol in alphabet}
    for current in Transform:
        for symbol, running in Count.items():
            running.append(running[-1] + (symbol == current))

    # FirstOccurrence[s] is the first-column row of the first s, i.e. the
    # number of characters that sort strictly before s. No symbol is special
    # cased: '$' gets row 0 only when it really is the smallest symbol.
    FirstOccurrence = {}
    rows_before = 0
    for symbol in alphabet:
        FirstOccurrence[symbol] = rows_before
        rows_before += Count[symbol][-1]

    SubstringMatches = []
    for Pattern in Patterns:
        seed_matches = []
        for seed, offset in SeededPattern(Pattern, d):
            seed_match = BetterBWMatching(FirstOccurrence, Transform, seed, Count)
            # BetterBWMatching returns 0 when the seed does not occur.
            if seed_match:
                seed_matches.append((seed_match, offset))
        SubstringMatches.append(seed_matches)

    return SubstringMatches

def BetterBWMatching(FirstOccurrence, LastColumn, Pattern, Count):
    """Find the first-column row range whose suffixes start with Pattern.

    Walks Pattern from its last symbol to its first, narrowing the row range
    with the last-to-first mapping.

    Args:
        FirstOccurrence: mapping symbol -> row of its first appearance in
            the first (sorted) column.
        LastColumn: the Burrows-Wheeler transform of the text.
        Pattern: the string to locate exactly.
        Count: mapping symbol -> list where entry i is the number of
            occurrences of that symbol in LastColumn[:i]. It must describe
            LastColumn accurately, because it is used instead of scanning
            the column.

    Returns:
        A (top, bottom) tuple of inclusive row indices when Pattern occurs,
        or 0 when it does not. An empty LastColumn always yields 0; an empty
        Pattern on a non-empty column yields the full range.
    """
    top, bottom = 0, len(LastColumn) - 1
    if top > bottom:
        return 0

    for symbol in reversed(Pattern):
        # .get() avoids creating an entry when Count is a defaultdict.
        occurrences = Count.get(symbol)
        # The symbol appears in LastColumn[top..bottom] exactly when its
        # running count grows across that range; no linear scan is needed.
        if not occurrences or occurrences[bottom + 1] == occurrences[top]:
            return 0

        first_row = FirstOccurrence[symbol]
        top, bottom = (
            first_row + occurrences[top],
            first_row + occurrences[bottom + 1] - 1,
        )

    return top, bottom
    
    
# Run the search on the loaded dataset and print one line per matched
# pattern in the form "pattern: p1 p2 ...", positions in ascending order.
Matches = MultipleApproximatePatternMatching(Text, Patterns, d)
for pattern, positions in Matches.items():
    listing = " ".join(str(index) for index in sorted(positions))
    print((pattern + ": " + listing).strip(" "))

GTGTGCGGGTTACCTAGGGCCACTGTTGAGATAGCCGCCAGCGCATGATGCTCGTGAGATCTATACGAGTTA: 8540
CAGCGCCGGTCAAAACGTGCTTAATATGCGTTTTAGAATCGACATGGCGAGCAAGGACAGGGTCCTGTTGTG: 5771
CCGTTTTGAAGCATAGCCAAAGAGTGTGGCTGAGCTGATCTTGACCTCATTTGGTCGTGATGGGACCGATTG: 1367
TCGTGGAGAGATCATGCGGTTGCCACACCAGCCTACCTTGATATCCGCCAATCTAAGGCGGGCCGAGGCCACAT: 3912
CCTCGAACGCTTCCCATTCCAACATACATTCATGCGCGCCGTTTTATTTGAACGCGGCATCTTTCAAGGCA: 4509
AAAAAAGAGGTCTCGATCACTCGATAGGCGTTCAAAACTTTCATATGCTCCGTCTGTATCGTACA: 1883
TTTGCAGGTCCCAGTTCGCGAGCATAACGCCCGCGATATTATTGTCTGTACCATGATACTCTAGTG: 7777
GAGTCCTTCATTTAGAACGCGATAAAGACGATAACGCGCCACTTACTTTCCTAGCGCAGCGG: 6469
CGCTAAGGATATGTTATGCAAGTAATCTAGGCCCACGCAAGTTCTCGTCACATGCCCAATGGAGCGAGCTCATGT: 5520
CTTGCGGAGTGTTTATGACTCCGCGAATATCGCGACTCTTCTCGAGACGGATTATGTGAACAGA: 8330
GGCATCAGTATTTAATACTACCGGTTAAGTTTAGCTAGACGGCGTAATACTTCTATGTGCTT: 6094
GCACCGCGAGCAAGCGGCACAATCTCGCTTGTAAGTGCGTTGGAGGAATTCGGCAGATGTTAGGTGTAGACC: 2820
CTTGCTACAGATCCTACTTTGGCGCATCCATCGAGAGGAGCCGACACCCCCATTTATGACCTGGGTTGGTAGCAC: 6761
AAACATA