#### A function to compute the GC percentage of a DNA sequence

In [5]:
def gc(dna):
    """Compute the GC percentage of a DNA sequence.

    Counting is case-insensitive. Undefined positions ('n' / 'N') are left
    out of the denominator, so the result is the share of G and C among the
    remaining characters.

    Parameters:
        dna: the DNA sequence as a string.

    Returns:
        The GC content as a float percentage. 0.0 is returned when the
        sequence is empty or consists only of undefined positions, because
        there is nothing to take a percentage of (this case previously
        raised ZeroDivisionError).
    """
    # Ignore undefined positions in the DNA
    nbases = dna.count('n') + dna.count('N')
    defined = len(dna) - nbases
    if defined == 0:
        return 0.0
    gc_count = (dna.count('c') + dna.count('C')
                + dna.count('g') + dna.count('G'))
    # Multiplying by the float 100.0 keeps the division a true division
    # on Python 2 as well.
    return gc_count * 100.0 / defined

In [6]:
# Example: 4 G/C bases among the 10 defined bases (the two N are ignored).
gc('AAAGTNNAGTCC')

40.0

#### Check if a given DNA sequence contains an in-frame stop codon

In [25]:
def has_stop_codon(dna, frame=0):
    """Tell whether a DNA sequence has a stop codon in the given reading frame.

    The sequence is read in steps of three characters starting at index
    `frame` (0 by default); each chunk is lower-cased and compared with the
    stop codons tga, tag and taa. A trailing chunk shorter than three
    characters can never match.

    Returns True as soon as one stop codon is seen, otherwise False.
    """
    stop_codons = ['tga', 'tag', 'taa']
    return any(
        dna[start:start + 3].lower() in stop_codons
        for start in range(frame, len(dna), 3)
    )
    

In [27]:
# Check the same sequence in each of the three forward reading frames.
# print(...) with a single argument prints the same text on Python 2 and
# also runs on Python 3, matching the print(...) form used by the other cells.
print(has_stop_codon('atgagcggccggct'))
print(has_stop_codon('atgagcggccggct', 1))
print(has_stop_codon('atgagcggccggct', 2))

False
True
False


#### Generate the reverse complement strand based on one given DNA strand

In [29]:
def reverse_string(seq):
    """Return `seq` reversed (extended slice with a step of -1)."""
    return seq[::-1]
def complement(dna):
    """Return the complementary sequence string.

    Every character is replaced by its partner base (A<->T, C<->G, N stays N)
    with upper/lower case preserved. A character outside that table raises
    KeyError.
    """
    pairing = {
        'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N',
        'a': 't', 'c': 'g', 'g': 'c', 't': 'a', 'n': 'n',
    }
    return ''.join(pairing[base] for base in dna)
    

In [31]:
def reversecomplement(seq):
    """Return the reverse complement of the dna string.

    The sequence is reversed first and the reversed string is then
    complemented base by base.
    """
    return complement(reverse_string(seq))

In [32]:
# Example call; the returned string is shown as the cell's output.
reversecomplement('CCGGAAGAGCTTACTTAG')

'CTAAGTAAGCTCTTCCGG'

#### Reading a FASTA File

__Exercise__: Build a dictionary containing all sequences from a FASTA file

In [76]:
# Read the FASTA file into `data`, a list with one entry per line.
try:
    with open("myfile.fa") as f:
        # splitlines() does not leave an empty trailing entry when the file
        # ends with a newline, unlike split('\n').
        data = f.read().splitlines()
except IOError:
    print("File does not exist!")
    # Keep `data` defined and empty so that code reading it afterwards
    # neither fails with NameError nor picks up lines from an earlier run.
    data = []

In [77]:
# Build {sequence name: sequence} from the FASTA lines in `data`.
seqs = {}
# No header seen yet; resetting also discards a `name` left by an earlier run.
name = None
for line in data:
    line = line.rstrip()
    if not line:
        # Blank lines carry no data (and line[0] would raise IndexError).
        continue
    # Distinguish header from sequence
    if line.startswith('>'):
        words = line.split()
        name = words[0][1:]  # first word of the header, without the ">" sign
        seqs[name] = ""
    elif name is None:
        raise ValueError(
            "sequence data found before the first FASTA header: %r" % line)
    else:
        # A sequence may span several lines; append each to its record.
        seqs[name] += line

In [78]:
# Display the dictionary of parsed sequences as the cell's output.
seqs

{'id1': 'ATGTGTGTCCGTTGTGTAAAGTGTGTCcccgtgttATggtagatttttga',
 'id2': 'ccccagtggggagtagggcAAAcgtatAA'}

In [79]:
# Print every record as "<name> <sequence>".
# A single formatted string inside print(...) produces the same text on
# Python 2 as `print name, seq` did, and also runs on Python 3.
for name, seq in seqs.items():
    print("%s %s" % (name, seq))

id2 ccccagtggggagtagggcAAAcgtatAA
id1 ATGTGTGTCCGTTGTGTAAAGTGTGTCcccgtgttATggtagatttttga


#### Regular expressions

In [1]:
import re

In [2]:
# re.search(pattern, string) scans the string for the pattern and returns a
# match object, or None when the pattern occurs nowhere in it.
dna = "ATCGCGAATTCAC"
if re.search(r"GAATTC", dna) is not None:
    print("restriction site found!")


restriction site found!


In [3]:
dna = "ATCGCGAATTCAC"
# (A|T) is an alternation: that position may be either A or T.
if re.search(r"GG(A|T)CC", dna) is not None:
    print("restriction site found!")

In [4]:
dna = "ATCGCGAATTCAC"
# The alternation lists all four bases, so the middle position may be any of them.
if re.search(r"GC(A|T|G|C)GC", dna) is not None:
    print("restriction site found!")

In [5]:
# A character group [ATGC] matches exactly one of the listed characters, so
# GC[ATGC]GC matches GCAGC, GCTGC, GCGGC and GCCGC - the same strings as the
# alternation GC(A|T|G|C)GC.
dna = "ATCGCGAATTCAC"
if re.search(r"GC[ATGC]GC", dna) is not None:
    print("restriction site found!")

In [6]:
# A caret ^ at the start of a character group, as in [^XYZ], negates the group:
# it then matches any single character that isn't listed in it.
dna = "ATCGCGAATTCAC"
# [^GC] matches as soon as dna holds any character other than G or C; the
# message is the same text the restriction-site cells print.
if re.search(r"[^GC]", dna) is not None:
    print("restriction site found!")


restriction site found!


Quantifiers:   
?  immediately following a character - the character is optional, it will match __zero or one times__  
\+  immediately following a character - the character must be present, it will match __one or more times__  
\*  immediately following a character - the character is optional, it will match __zero or more times__  
{ }  A __single number__ inside curly brackets will match exactly that number of repeats.  A __pair of numbers__ inside curly brackets separated by a comma specifies an acceptable range for the number of repeats   

In [7]:
# group method: called with no argument it returns the text that was matched
dna = "ATGACGTACGTACGACTG"

# Pattern: GA, then exactly three bases, then AC.
# Store the match object in the variable m (re.search returns None when
# nothing matches, in which case m.group() would fail).
m = re.search(r"GA[ATGC]{3}AC", dna)
print(m.group())

GACGTAC


In [8]:
dna = "ATGACGTACGTACGACTG"

# Each parenthesised part of the pattern is a capturing group, numbered from 1
# in the order the groups open. Store the match object in the variable m.
m = re.search(r"GA([ATGC]{3})AC([ATGC]{2})AC", dna)
# group() with no argument is the whole match; group(1) and group(2) are the
# pieces captured by the first and second pair of parentheses.
print("entire match: " + m.group())
print("first bit: " + m.group(1))
print("second bit: " + m.group(2))

entire match: GACGTACGTAC
first bit: CGT
second bit: GT


In [9]:
dna = "ATGACGTACGTACGACTG"
m = re.search(r"GA([ATGC]{3})AC([ATGC]{2})AC", dna)
# start() is the 0-based index in dna where the match begins; end() is the
# index just past its last character. Passing a group number gives the same
# two positions for that capturing group instead of the whole match.
print("start: " + str(m.start()))
print("end: " + str(m.end()))
print("group one start: " + str(m.start(1)))
print("group one end: " + str(m.end(1)))
print("group two start: " + str(m.start(2)))
print("group two end: " + str(m.end(2)))

start: 2
end: 13
group one start: 4
group one end: 7
group two start: 9
group two end: 11


In [10]:
dna = "ACTGCATTATATCGTACGAAATTATACGCGCG"
# findall returns a list with the text of every non-overlapping match;
# the pattern asks for runs of between 4 and 100 A/T characters.
runs = re.findall(r"[AT]{4,100}", dna)
print(runs)

['ATTATAT', 'AAATTATA']


In [11]:
dna = "ACTGCATTATATCGTACGAAATTATACGCGCG"
# finditer yields one match object per run of between 3 and 100 A/T
# characters, so the position of each run is available, not just its text.
runs = re.finditer(r"[AT]{3,100}", dna)
for match in runs:
    # span() is the (start, end) pair for the match
    run_start, run_end = match.span()
    print("AT rich region from " + str(run_start) + " to " + str(run_end))

AT rich region from 5 to 12
AT rich region from 18 to 26
