In [4]:
# %load ./locate_db_cycles.py
import os 
from collections import defaultdict
import networkx as nx

data_dir = '../output'

# setting a kmer size value 
k = 20 

# instantiating kmer_ids  
kmer_ids = {}
kmer_num = 0

# instantiating the digraph
db_graph = nx.DiGraph()

print('Constructing the de Bruijn graph.')
# adding di bruijn edges to graph 
seqs_fn = os.path.join(data_dir, 'SRR7225827.seqs')
with open(seqs_fn) as f: 
    for line_num, line in enumerate(f):
        line = line.strip()
    
        #if line_num % 1000000 == 0:
        if (line_num + 1) % 10000 == 0:
            print('\tProcessing line number: {}'.format(line_num))
            break
        
        for i in range(len(line) - k):
            
            # get the first kmer
            first_kmer = line[i: i + k]
            
            # get the id of the first kmer 
            if first_kmer not in kmer_ids: 
                kmer_ids[kmer_num] = first_kmer
                kmer_num += 1
                first_id = kmer_ids[first_kmer]
            else:
                first_id = kmer_ids[first_kmer]

            # get the second kmer
            second_kmer = line[i + 1: i + k + 1]

            # get the id of the second kmer 
            if second_kmer not in kmer_ids: 
                kmer_ids[second_kmer] = kmer_num 
                kmer_num += 1
                second_id = kmer_ids[second_kmer]
            else:
                second_id = kmer_ids[second_kmer]

            db_graph.add_edge(first_id, second_id)

Constructing the de Bruijn graph.
	Processing line number: 9999


In [5]:
# locating all cycles in the graph 
print('Locating all cycles in the graph.')
cycles = nx.simple_cycles(db_graph)

Locating all cycles in the graph.


In [6]:
# removing self-loops
print('Removing self-loops.')
cycles = [cycle for cycle in cycles if len(cycle) > 1]

Removing self-loops.


In [7]:
# writing all the cycles in the graph 
print('Writing all the cycles in the graph.')
cycles_fn = os.path.join(data_dir, 'SRR7225827.cycles.txt')
with open(cycles_fn, 'w') as f: 
    
    # writing all other cycles
    for cycle in cycles:

        # adding the very first node of cycle to a reformatted cycle 
        re_cycle = kmer_ids[cycle[0]]
        for node in cycle[1:]: 
            node_seq = kmer_ids[node]
            re_cycle += node_seq[-1]
        
        # calculating the length of the cycle
        cycle_len = len(re_cycle)

        # saving the cycle with its length 
        f.write('{}\t{}\n'.format(re_cycle, cycle_len))

Writing all the cycles in the graph.


KeyError: 130272

In [8]:
kmer_ids

{'GTGTAGAGGGAAGGTTAATG': 0,
 'TGTAGAGGGAAGGTTAATGG': 1,
 'GTAGAGGGAAGGTTAATGGT': 2,
 'TAGAGGGAAGGTTAATGGTT': 3,
 'AGAGGGAAGGTTAATGGTTG': 4,
 'GAGGGAAGGTTAATGGTTGA': 5,
 'AGGGAAGGTTAATGGTTGAT': 6,
 'GGGAAGGTTAATGGTTGATA': 7,
 'GGAAGGTTAATGGTTGATAT': 8,
 'GAAGGTTAATGGTTGATATT': 9,
 'AAGGTTAATGGTTGATATTG': 10,
 'AGGTTAATGGTTGATATTGC': 11,
 'GGTTAATGGTTGATATTGCT': 12,
 'GTTAATGGTTGATATTGCTA': 13,
 'TTAATGGTTGATATTGCTAG': 14,
 'TAATGGTTGATATTGCTAGG': 15,
 'AATGGTTGATATTGCTAGGG': 16,
 'ATGGTTGATATTGCTAGGGT': 17,
 'TGGTTGATATTGCTAGGGTG': 18,
 'GGTTGATATTGCTAGGGTGG': 19,
 'GTTGATATTGCTAGGGTGGC': 20,
 'TTGATATTGCTAGGGTGGCG': 21,
 'TGATATTGCTAGGGTGGCGC': 22,
 'GATATTGCTAGGGTGGCGCT': 23,
 'ATATTGCTAGGGTGGCGCTT': 24,
 'TATTGCTAGGGTGGCGCTTC': 25,
 'ATTGCTAGGGTGGCGCTTCC': 26,
 'TTGCTAGGGTGGCGCTTCCA': 27,
 'TGCTAGGGTGGCGCTTCCAA': 28,
 'GCTAGGGTGGCGCTTCCAAT': 29,
 'CTAGGGTGGCGCTTCCAATT': 30,
 'NCTATAGGCGCTTGTCAGGG': 31,
 'CTATAGGCGCTTGTCAGGGA': 32,
 'TATAGGCGCTTGTCAGGGAG': 33,
 'ATAGGCGCTTGTCAGGGAGG':