In [63]:
import itertools
def window(seq, n=2):
    "Returns a sliding window (of width n) over data from the iterable"
    "   s -> (s0,s1,...s[n-1]), (s1,s2,...,sn), ...                   "
    it = iter(seq)
    result = tuple(itertools.islice(it, n))
    if len(result) == n:
        yield result    
    for elem in it:
        result = result[1:] + (elem,)
        yield result

In [64]:
def count_kmers(read, k):
    """Count kmer occurrences in a given read.

    Parameters
    ----------
    read : string
        A single DNA sequence.
    k : int
        The value of k for which to count kmers.

    Returns
    -------
    counts : dictionary, {'string': int}
        A dictionary of counts keyed by their individual kmers (strings
        of length k).

    Examples
    --------
    >>> count_kmers("GATGAT", 3)
    {'ATG': 1, 'GAT': 2, 'TGA': 1}
    """
    # Start with an empty dictionary
    counts = {}
    nucleotides = ['A', 'T', 'G', 'C']
    # Add the kmer to the dictionary if it's not there
    for i in itertools.product(nucleotides, repeat=k):
        counts["".join(i)] = 0
    # Calculate how many kmers of length k there are
    num_kmers = len(read) - k + 1
    # Loop over the kmer start positions
    for kmer in window(read, n=k):
        kmer = "".join(kmer)
        if kmer not in counts:
            continue
        # Increment the count for this kmer
        counts[kmer] += 1
    # Return the final counts
    return counts

In [59]:
import concurrent.futures
import Bio.SeqIO as SeqIO
from Bio.Seq import Seq

def find_orfs(seq_record):
    """
    Find ORFs in a DNA sequence record.
    """
    #seq = seq_record 
    seq = seq_record.seq
    
    orfs = []
    for strand, nuc in [(+1, seq), (-1, seq.reverse_complement())]:
        for frame in range(3):
            length = 3 * ((len(seq)-frame) // 3)
            trans = str(nuc[frame:frame+length].translate(table=1))
            for start in range(len(trans)):
                if trans[start] == "M":
                    end = trans.find("*", start)
                    if end != -1:
                        orfs.append((start*3+frame, end*3+frame, strand))
    return orfs

In [62]:
# Read DNA sequence file
filename = "At.fna"
seq_records = list(SeqIO.parse(filename, "fasta"))

# Find ORFs in parallel
n_threads = 4
orf_lists = []
with concurrent.futures.ThreadPoolExecutor(max_workers=n_threads) as executor:
    orf_worker = tuple(executor.map(find_orfs, seq_records))
    for done in concurrent.futures.as_completed(orf_worker):
        result = done.result()
        orf_lists.append(result)

TypeError: unhashable type: 'list'