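Edit the cell below to set the DNA string, the sequence length, and the k-mer length. Alternatively, you can enter these values interactively by uncommenting the `parse.get_inputs()` lines in `main()`.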

In [7]:
# First argument to generator.create_dna_sequence.
# Presumably the alphabet the sequence is built from — TODO confirm against data_processing.
dna_string = "ACGT"
# Second argument to generator.create_dna_sequence (the requested sequence length).
dna_length = 100
# Passed to generator.generate_kmers as the length of each k-mer.
kmer_length = 7


# K-mer distribution using a BST

In [8]:
from data_processing import generate_kmers as generator
from data_processing import graph_distribution as graph
from data_processing import distribution_parser as parse
from data_structures import binary_search_tree as bst

import matplotlib as plt

import time

In [9]:
def generate_tree(kmer_list):
    """
    Build a binary search tree out of the given k-mers.

    Every k-mer is passed, in list order, to ``bst.add_node`` together with the
    tree object obtained from ``bst.create_tree()``.

    Returns that tree object, or ``None`` when ``kmer_list`` is empty.
    """
    tree_root = None

    if len(kmer_list) != 0:
        tree_root = bst.create_tree()
        for current_kmer in kmer_list:
            # The return value of add_node is intentionally ignored.
            bst.add_node(tree_root, current_kmer)

    return tree_root

In [10]:
def generate_distribution(root):
    """
    Creates a kmer distribution from a binary search tree via inorder traversal.

    The traversal is iterative (explicit stack) rather than recursive, so a very
    deep tree cannot exhaust Python's recursion limit. An unbalanced BST becomes
    that deep when k-mers are inserted in sorted or near-sorted order.

    Parameters:
        root: the root node of the tree, or None for an empty tree. Every node
              must expose ``left``, ``right``, ``value`` and ``counter``.

    Returns:
        A list of ``(value, counter)`` tuples in left-node-right order.
        An empty list is returned when ``root`` is None.
    """
    kmer_distribution = []
    pending = []  # ancestors whose left subtree is currently being visited
    node = root

    while pending or node is not None:
        # Descend as far left as possible, remembering the path taken.
        while node is not None:
            pending.append(node)
            node = node.left

        # Left subtree is done: record this node, then handle its right subtree.
        node = pending.pop()
        kmer_distribution.append((node.value, node.counter))
        node = node.right

    return kmer_distribution

In [11]:
def test_bst_kmer_distribution(dna_string, dna_length, kmer_length):
    """
    Utility function to create and test the binary search tree.

    Steps:
      1. Build a DNA sequence with ``generator.create_dna_sequence`` and split it
         into k-mers with ``generator.generate_kmers``.
      2. Time the insertion of all k-mers into a BST (``generate_tree``).
      3. Time the inorder traversal that yields the distribution
         (``generate_distribution``) and plot it.
      4. Plot the unique k-mers, if ``parse.get_unique_kmers`` returns any.
      5. When the distribution has more than 100 entries, plot the most common
         10% as returned by ``parse.get_most_common_kmers``.

    Parameters:
        dna_string: forwarded to ``generator.create_dna_sequence``.
        dna_length: forwarded to ``generator.create_dna_sequence``.
        kmer_length: forwarded to ``generator.generate_kmers``.

    Returns:
        None. Results are printed and plotted.
    """
    dna_sequence = generator.create_dna_sequence(dna_string, dna_length)
    kmer_list = generator.generate_kmers(dna_sequence, kmer_length)

    # time.perf_counter() is the clock intended for measuring short durations:
    # it is monotonic and uses the highest available resolution, unlike the
    # wall clock returned by time.time().
    start = time.perf_counter()
    root = generate_tree(kmer_list)
    end = time.perf_counter()
    print(f"Time to insert {len(kmer_list)} kmers: {end - start} seconds")

    start = time.perf_counter()
    kmer_distribution = generate_distribution(root)
    end = time.perf_counter()
    print(f"Time to generate distribution via inorder traversal: {end - start} seconds")
    graph.bar_graph(kmer_distribution)

    unique_kmers = parse.get_unique_kmers(kmer_distribution)

    if len(unique_kmers) == 0:
        print("No unique k-mers found")
    else:
        print(f"Unique k-mers: {len(unique_kmers)}")
        graph.bar_graph(unique_kmers)

    # Only show the "top 10%" chart when there are enough distinct entries for
    # that slice to be meaningful.
    if (len(kmer_distribution) > 100):
        print("Most common 10% kmers: ")
        most_common_kmers = parse.get_most_common_kmers(kmer_distribution, int(len(kmer_distribution) * 0.10))
        graph.bar_graph(most_common_kmers)


In [12]:
def main():
    """
    Run the BST k-mer experiment with the notebook-level settings
    ``dna_string``, ``dna_length`` and ``kmer_length``.
    """
    # # uncomment if you would like to insert your own data manually
    # inputs = parse.get_inputs()
    # dna_length = inputs["dna_length"]
    # dna_string = inputs["dna_string"]
    # kmer_length = inputs["kmer_length"]

    test_bst_kmer_distribution(
        dna_string=dna_string,
        dna_length=dna_length,
        kmer_length=kmer_length,
    )

# Run the experiment when this cell is executed.
main()

Time to insert 94 kmers: 0.00018715858459472656 seconds
Time to generate distribution via inorder traversal: 8.082389831542969e-05 seconds


Unique k-mers: 94
