In [30]:
from collections import defaultdict
from copy import deepcopy
from random import randint

# Parse the read-pair dataset: the first line holds the integers k and d, the
# second line holds whitespace-separated read pairs written as "first|second".
# The context manager releases the handle even if parsing raises; `f` stays
# bound afterwards, so calling f.close() later is still valid (and a no-op).
with open("dataset_865404_16.txt", "r") as f:
    k, d = [int(x) for x in next(f).split()]
    pair_strings = next(f).split()

# Turn every "first|second" token into a (first, second) tuple.
GappedPatterns = []
for pair in pair_strings:
    strings = pair.split('|')
    tup = (strings[0], strings[1])
    GappedPatterns.append(tup)

def StringReconstructionReadPairs(PairedReads, k, d):
    """Reconstruct a string from a collection of paired reads.

    The reads are turned into a graph by DeBrujinString, the graph is walked
    by EulerianPath, and every "first|second" node label on that walk is split
    back into a (first, second) tuple before being handed to GappedGenomePath.

    Parameters:
        PairedReads: sequence of (first, second) string pairs.
        k, d: integers forwarded unchanged to GappedGenomePath.

    Returns:
        Whatever GappedGenomePath returns for the walked path.
    """
    node_labels = EulerianPath(DeBrujinString(PairedReads))

    # Node labels have the form "first|second"; recover the two halves.
    split_labels = (label.split('|') for label in node_labels)
    node_pairs = [(halves[0], halves[1]) for halves in split_labels]

    return GappedGenomePath(node_pairs, k, d)

def GappedGenomePath(GappedPatterns, k, d):
    """Spell the string described by a path of gapped (paired) patterns.

    The first and second components of the pairs are spelled separately with
    StringFromGenomePath. The second string is expected to equal the first
    one shifted by k + d characters, so every position covered by both is
    compared before the two are merged.

    Parameters:
        GappedPatterns: sequence of (first, second) string pairs forming a path.
        k, d: integers; k + d is the shift between the two spelled strings.

    Returns:
        The merged string, or the literal message
        "there is no string spelled by the gapped patterns" when the two
        spelled strings disagree on an overlapping position.
    """
    PrefixString = StringFromGenomePath([pair[0] for pair in GappedPatterns])
    SuffixString = StringFromGenomePath([pair[1] for pair in GappedPatterns])

    shift = k + d
    # Indices are 0-based, so the overlap begins at index `shift` (which lines
    # up with SuffixString[0]). Starting one later would leave the first
    # shared character unchecked.
    for i in range(shift, len(PrefixString)):
        if PrefixString[i] != SuffixString[i - shift]:
            return "there is no string spelled by the gapped patterns"

    # Append the last `shift` characters of the suffix string, i.e. the part
    # that extends beyond the prefix string.
    return PrefixString + SuffixString[len(SuffixString) - shift:]

def DeBrujinString(Patterns):
    """Build an adjacency-list graph from paired patterns.

    Each pair (first, second) contributes one edge. The edge leaves the node
    labelled "<first without its last char>|<second cut to the same length>"
    and enters the node labelled "<first without its first char>|<second
    without its first char>". Repeated pairs produce repeated edges.

    Parameters:
        Patterns: sequence of (first, second) string pairs.

    Returns:
        defaultdict(list) mapping a node label to the list of its successor
        labels. Source nodes appear in order of first occurrence, and
        identical edges are stored next to each other.
    """
    # First pass: count how often every (source, target) edge occurs while
    # remembering the order in which sources and targets were first seen.
    edge_counts = {}
    for read in Patterns:
        first, second = read[0], read[1]
        cut = len(first) - 1
        source = first[:cut] + "|" + second[:cut]
        target = first[1:] + "|" + second[1:]
        successors = edge_counts.setdefault(source, {})
        successors[target] = successors.get(target, 0) + 1

    # Second pass: expand the counts into plain adjacency lists.
    reduced_graph = defaultdict(list)
    for source, successors in edge_counts.items():
        for target, count in successors.items():
            reduced_graph[source].extend([target] * count)

    return reduced_graph

def StringFromGenomePath(path):
    """Spell the string described by a path of overlapping strings.

    The first element is taken whole; every following element contributes
    only its last character.

    Parameters:
        path: non-empty sequence of non-empty strings.

    Returns:
        The concatenated string.
    """
    head, rest = path[0], path[1:]
    return head + "".join(node[-1] for node in rest)

def EulerianPath(graph):
    """Return the vertices of an Eulerian path (or cycle) through `graph`.

    The end vertex is determined from the degrees: it is the vertex whose
    in-degree exceeds its out-degree by one. If no such vertex exists (every
    vertex is balanced, i.e. the walk is a cycle) the first key of `graph`
    is used. A random walk from an arbitrary vertex is not a reliable way to
    find the end, because it can get stuck back at its own starting vertex.

    The path itself is then produced by FindPath on the reversed graph,
    starting at that end vertex.

    Parameters:
        graph: mapping from a vertex to the list of its successors.

    Returns:
        List of vertices in path order; an empty list for an empty graph.
    """
    if not graph:
        return []

    # surplus[v] = in-degree(v) - out-degree(v)
    surplus = {}
    for node, successors in graph.items():
        surplus[node] = surplus.get(node, 0) - len(successors)
        for successor in successors:
            surplus[successor] = surplus.get(successor, 0) + 1

    end = next((node for node, extra in surplus.items() if extra == 1), None)
    if end is None:
        # All vertices balanced: any vertex can close the cycle.
        end = next(iter(graph))

    return FindPath(ReverseGraph(graph), end)

def FindEnd(graph, start):
    """Walk the graph along randomly chosen unused edges and report where it ends.

    The walk works on a private copy of `graph`, so the argument is not
    modified. Each step removes the edge it follows. When the current vertex
    has no unused edge left, the walk backtracks along the stack.

    Parameters:
        graph: mapping from a vertex to the list of its successors.
        start: vertex at which the walk begins.

    Returns:
        The first visited vertex that is not a key of `graph`; if no such
        vertex is ever reached, the first vertex at which the walk ran out
        of unused edges.
    """
    adjacency = deepcopy(graph)  # edges are consumed from this copy
    unused = {node: len(successors) for node, successors in adjacency.items()}

    stack = [start]
    exhausted = []  # vertices in the order they ran out of unused edges
    current = start

    while stack:
        # A vertex that never appears as a key has no outgoing edges at all.
        if current not in unused:
            return current

        if unused[current] > 0:
            pick = randint(0, len(adjacency[current]) - 1)
            stack.append(current)
            unused[current] -= 1
            current = adjacency[current].pop(pick)
        else:
            exhausted.append(current)
            current = stack.pop()

    return exhausted[0]
        
def FindPath(graph, end):
    """Traverse every edge reachable from `end` and list the vertices visited.

    The traversal works on a private copy of `graph`. It always follows the
    last unused edge of the current vertex and backtracks along a stack when
    none is left, recording each vertex at the moment it is abandoned.

    A vertex that is not a key of `graph` is given one artificial edge back
    to `end` so the traversal can continue. When that has happened, the first
    recorded vertex is dropped from the result.

    Parameters:
        graph: defaultdict(list) mapping a vertex to its successors (the
            artificial edge is inserted through the default factory).
        end: vertex at which the traversal starts.

    Returns:
        List of vertices in the order they were abandoned, without the first
        entry if an artificial edge was added.
    """
    adjacency = deepcopy(graph)  # edges are consumed from this copy
    unused = {node: len(successors) for node, successors in adjacency.items()}

    stack = [end]
    abandoned = []
    current = end
    added_closing_edge = False

    while stack:
        if current not in unused:
            # No outgoing edges: connect this vertex back to `end`.
            added_closing_edge = True
            adjacency[current].append(end)
            unused[current] = 1

        if unused[current] > 0:
            stack.append(current)
            unused[current] -= 1
            current = adjacency[current].pop()
        else:
            abandoned.append(current)
            current = stack.pop()

    if added_closing_edge:
        return abandoned[1:]
    return abandoned
    

def ReverseGraph(graph):
    """Return a copy of `graph` with the direction of every edge flipped.

    Parameters:
        graph: mapping from a vertex to the list of its successors.

    Returns:
        defaultdict(list) mapping each vertex to the vertices that pointed
        at it in `graph`, in the order those edges were encountered.
    """
    flipped = defaultdict(list)
    for source, targets in graph.items():
        for target in targets:
            flipped[target].append(source)

    return flipped

# Reconstruct the string from the parsed read pairs and print it.
print(StringReconstructionReadPairs(GappedPatterns, k, d))
# Close the dataset file handle `f` (a no-op if it is already closed).
f.close()

ACGCGGGGCACCAATGTAACTGGCTCATATTCGTTTTACATACACCATGCGCATGTACGACGCCGGATTCGTTGCACGGTGACGTCCCAGCCACAGGGGAAGCCAGGAAACAGTCTCTTGCGGGAGGCTACCCTCAGCGATAACGTGCCAAGTTCCCCCTCCGAAATCTGGTTGTTCGTGCTGGTGGCTGTCATTTGTAACTAATCTTGGTTGCGGCACTTCACACAGTTAATGGGCATCTGCAGGATACTGCAGCCAACGCCACTATGCGTTATATTGCGGCCCCCTATGGTTGGGTACCGCAAAACGAGGTCCAGGTTCATCTACTGTGGGTGGTCACAACACCGGGTTAAGAACCCTCACTCCACCCCGCAATCTACGTACTACATACTACAACGTTAGGGGTGCTGAAATGCCAGGCCATCTTCCAAAACATTCCCTAGGGCTTGGGCACCTAAATCCTTGTCACGGCGCTCGGCAGCGCGTTGAGTCGCCAGGCGTACGAATTTGCTGATAATAGCTGTGCGCGATCATAGAGTGAGACCAGAAGTAAGTCTCTGGCCGTGGTGTCTCTTTACCCTGCGGCGTTGCACCTCTATTGCGGTCTACAGGAAGGGTCAGACGCGATACTCTATCTCGGTAGGTACAAGAGCGAAAAGGTAGCACGTCAGCCGATAATATCATTATGATCGGAAGTCGGATCGGTAAGCTGACATTAAACTCAGACCGACTCTTCCAAAACATTCCCTAGGGCTTGGGCACCTAAATCCTTGTCACGGCGATATACGAGCAATTTTAGATATGCTTCTTCCAAAACATTCCCTAGGGCTTGGGCACCTAAATCCTTGTCACGGCGTCATTATCTGTCTAGTAGGGTATTTCGATAACACCCGAACATGAAAGGCCGACCCACCCTATCGATCTTGCCTAAGGCGAAATCCTCCACGGTCTTTCGTCGCAGGTCAGCACGTCGTTCGTGGGAAACAGTTCCATAACATCT