In [9]:
# Manipulating DNA sequences
from Bio.Seq import Seq
# Demo sequence used for the basic Seq operations below.
my_dna = Seq("AGTACACTGGT")

# One value per output line, in this order:
#   the sequence itself,
#   the count of "ACT" as reported by Seq.count,
#   the count of "AA" in "AAAA" as reported by Seq.count_overlap,
#   the complement, and the reverse complement.
print(
    my_dna,
    my_dna.count("ACT"),
    Seq("AAAA").count_overlap("AA"),
    my_dna.complement(),
    my_dna.reverse_complement(),
    sep="\n",
)

AGTACACTGGT
1
3
TCATGTGACCA
ACCAGTGTACT


In [14]:
# Concatenate two partially overlapping sequences
from Bio import pairwise2
from Bio.pairwise2 import format_alignment
# Two reads to be merged; both are echoed before aligning.
seq1 = Seq('AGTCGGTACAAATTACACAGCATATGCTTTACTGTTAACCACA')
seq2 = Seq('GTTAACCACATGCGTAGCCACTTGCAATGTCTC')
print(seq1, seq2, sep="\n")

# Scoring passed to localms, in the order the call takes them:
# match, mismatch, gap-open, gap-extend.
match_score, mismatch_score, gap_open, gap_extend = 2, -1, -3, -3

# NOTE(review): Bio.pairwise2 is reported as deprecated in recent Biopython
# releases in favour of Bio.Align.PairwiseAligner - confirm before upgrading.
a_good_alignment = pairwise2.align.localms(
    seq1, seq2, match_score, mismatch_score, gap_open, gap_extend
)

# Pretty-print only the first alignment returned.
first_alignment = a_good_alignment[0]
print(format_alignment(*first_alignment))

def make_me_a_consensus(algn, mismatch='N'):
    """Merge the two gapped sequences of the first alignment into one string.

    Only ``algn[0]`` is used; its items 0 and 1 are walked column by column
    (iteration stops at the end of the shorter one, as with ``zip``).

    Per column:
      * both characters equal      -> that character is kept;
      * exactly one side is ``'-'`` -> the character from the other side;
      * two different non-gap characters -> ``mismatch`` is emitted.

    Parameters
    ----------
    algn : sequence
        A sequence of alignments, each indexable so that positions 0 and 1
        hold the two gapped sequences.
    mismatch : str, optional
        Text emitted for a column whose two characters disagree and neither
        is a gap. Defaults to ``'N'``. Pass ``''`` to drop such columns,
        which is what this function did before the parameter existed.

    Returns
    -------
    str
        The merged sequence, or ``''`` when ``algn`` contains no alignment.
    """
    # No alignment was found: return an empty consensus instead of failing
    # with an IndexError on algn[0].
    if len(algn) == 0:
        return ''

    first, second = algn[0][0], algn[0][1]
    consensus = []
    for s1, s2 in zip(first, second):
        if s1 == s2:
            consensus.append(s1)
        elif s1 == '-':
            consensus.append(s2)
        elif s2 == '-':
            consensus.append(s1)
        else:
            # Previously this case fell through and the column vanished,
            # shortening the consensus without any warning.
            consensus.append(mismatch)
    return ''.join(consensus)
# Merge the two reads using the first alignment in a_good_alignment; as the
# last expression of the cell, the resulting string is shown as the cell output.
make_me_a_consensus(a_good_alignment)

AGTCGGTACAAATTACACAGCATATGCTTTACTGTTAACCACA
GTTAACCACATGCGTAGCCACTTGCAATGTCTC
34 GTTAACCACA
   ||||||||||
 1 GTTAACCACA
  Score=20



'AGTCGGTACAAATTACACAGCATATGCTTTACTGTTAACCACATGCGTAGCCACTTGCAATGTCTC'

In [67]:
# Enumerate k-mer sequences
import itertools
# deque is used below; importing it in this cell keeps the cell runnable on a
# fresh kernel instead of relying on an import made by a later cell.
from collections import deque

basic_dna = "ACGT"
k = 4
# One deque of single-character strings per k-mer, in the order produced by
# itertools.product over basic_dna (len(basic_dna) ** k entries in total).
l = [deque(tup) for tup in itertools.product(basic_dna, repeat = k)]
print(l)


[deque(['A', 'A', 'A', 'A']), deque(['A', 'A', 'A', 'C']), deque(['A', 'A', 'A', 'G']), deque(['A', 'A', 'A', 'T']), deque(['A', 'A', 'C', 'A']), deque(['A', 'A', 'C', 'C']), deque(['A', 'A', 'C', 'G']), deque(['A', 'A', 'C', 'T']), deque(['A', 'A', 'G', 'A']), deque(['A', 'A', 'G', 'C']), deque(['A', 'A', 'G', 'G']), deque(['A', 'A', 'G', 'T']), deque(['A', 'A', 'T', 'A']), deque(['A', 'A', 'T', 'C']), deque(['A', 'A', 'T', 'G']), deque(['A', 'A', 'T', 'T']), deque(['A', 'C', 'A', 'A']), deque(['A', 'C', 'A', 'C']), deque(['A', 'C', 'A', 'G']), deque(['A', 'C', 'A', 'T']), deque(['A', 'C', 'C', 'A']), deque(['A', 'C', 'C', 'C']), deque(['A', 'C', 'C', 'G']), deque(['A', 'C', 'C', 'T']), deque(['A', 'C', 'G', 'A']), deque(['A', 'C', 'G', 'C']), deque(['A', 'C', 'G', 'G']), deque(['A', 'C', 'G', 'T']), deque(['A', 'C', 'T', 'A']), deque(['A', 'C', 'T', 'C']), deque(['A', 'C', 'T', 'G']), deque(['A', 'C', 'T', 'T']), deque(['A', 'G', 'A', 'A']), deque(['A', 'G', 'A', 'C']), deque(['A', '

In [74]:
# Generate k-mer edges
# Build a dict mapping each k-mer string to the list of k-mers obtained by
# dropping its first character and appending one of A, C, G, T.
from collections import deque
kmer_dict = {}

# No longer used by this cell; the import is kept in case other cells rely on it.
import copy
for kmer_q in l:
    kmer = "".join(kmer_q)
    # Reuse the existing list if this k-mer was already seen, else start one.
    successors = kmer_dict.setdefault(kmer, [])
    for new_ch in ['A', 'C', 'G', 'T']:
        # Shift left by one character and append the new base; plain string
        # slicing gives the same result as copying the deque and rotating it.
        successors.append(kmer[1:] + new_ch)

# Write one tab-separated "source<TAB>target" line per edge.
# Loop variables deliberately avoid the name `k`, which holds the k-mer length
# set by the enumeration cell and must not be overwritten here.
with open('4-mer.txt', 'w') as f:
    for source_kmer, targets in kmer_dict.items():
        for target_kmer in targets:
            f.write(source_kmer + "\t" + target_kmer + "\n")
# The with-block closes the file on exit, so no explicit close() is needed.