## Python for Biologists Chapter 7 Exercises

#### In Chapter Exercises

In [None]:
#Modules in Python
    #You can import modules using 'import'
import re

#Regular Expressions
    #Using functions within a module require the module prefix
re.search(pattern, string)

    #Raw strings: to avoid confunsion, if we put the letter r immediately
    #before the opening quotation mark, then any special characters inside the string
    #are ignored
print(r"\t\n")

    #re.search is a true/false function that determines whether or not a pattern 
    #appears somewhere in a string.
dna = "ATCGCGAATTCAC"
if re.search(r"GAATTC", dna):
    print("restriction site found!")
    
    #Using regular expressions allow for more complex search terms
dna = "ATCGCGAATTCAC"
if re.search(r"GG(A|T)CC", dna):
    print("restriction site found!")
        
    #You can search for character groups using parentheses
dna = "ATCGCGAATTCAC"
if re.search(r"GC[ATGC]GC", dna):
    print("restriction site found!")
    #You can also use a '.' to incude ANY character, or '^' to exclude characters
    
    #Quantifiers allow for searching for a minimum number of appearances
# ? after a character means it appears zero or one times
# + means a character must be present but can appear any number of times
# * means a character is optional, so can be present zero or more times
# {} give a specific number of repeats to look for

    #Positions
# ^ means the beginning of a string
# $ means the end of a string

    #YOU CAN COMBINE ANY/ALL THESE EXPRESSIONS TO SEARCH FOR VERY SPECIFIC THINGS

#Matching a string search
    #re.match will only identify a pattern if it matches the entire string
    #You can store matches in a variable
dna = "ATGACGTACGTACGACTG"
m = re.search(r"GA[ATGC]{3}AC", dna)
print(m.group())

    #Groups allow searching for bits of string
na = "ATGACGTACGTACGACTG"
m = re.search(r"GA([ATGC]{3})AC([ATGC]{2})AC", dna)
print("entire match: " + m.group())
print("first bit: " + m.group(1))
print("second bit: " + m.group(2))
    
    #You can find the position of a match in the string searched
dna = "ATGACGTACGTACGACTG"
m = re.search(r"GA([ATGC]{3})AC([ATGC]{2})AC", dna)
print("start: " + str(m.start()))
print("end: " + str(m.end()))

    #The module also has a split function
dna = "ACTNGCATRGCTACGTYACGATSCGAWTCG"
runs = re.split(r"[^ATGC]", dna)
print(runs)

    #re.findall returns a list of all matches of a pattern in a string
dna = "ACTGCATTATATCGTACGAAATTATACGCGCG"
runs = re.findall(r"[AT]{4,100}", dna)
print(runs)

    #If we want to do anything more complicated than simply extracting the text 
    #of the matches, we need to use the re.finditer method
dna = "ACTGCATTATATCGTACGAAATTATACGCGCG"
runs = re.finditer(r"[AT]{3,100}", dna)
for match in runs:
    run_start = match.start()
    run_end = match.end()
    print("AT rich region from " + str(run_start) + " to " + str(run_end))
    

#### Accession Names

In [34]:
acc_names = ['xkn59438', 'yhdck2', 'eihd39d9', 'chdsye847', 'hedle3455', 'xjhd53e', '45da', 'de37dp']
import re
for name in acc_names:
    if re.search('5', name):
        print(name)

xkn59438
hedle3455
xjhd53e
45da


In [35]:
acc_names = ['xkn59438', 'yhdck2', 'eihd39d9', 'chdsye847', 'hedle3455', 'xjhd53e', '45da', 'de37dp']
import re
for name in acc_names:
    if re.search('[d|e]', name):
        print(name)

yhdck2
eihd39d9
chdsye847
hedle3455
xjhd53e
45da
de37dp


In [37]:
acc_names = ['xkn59438', 'yhdck2', 'eihd39d9', 'chdsye847', 'hedle3455', 'xjhd53e', '45da', 'de37dp']
import re
for name in acc_names:
    if re.search('de', name):
        print(name)

de37dp


In [38]:
acc_names = ['xkn59438', 'yhdck2', 'eihd39d9', 'chdsye847', 'hedle3455', 'xjhd53e', '45da', 'de37dp']
import re
for name in acc_names:
    if re.search('d.e', name):
        print(name)

hedle3455


In [39]:
acc_names = ['xkn59438', 'yhdck2', 'eihd39d9', 'chdsye847', 'hedle3455', 'xjhd53e', '45da', 'de37dp']
import re
for name in acc_names:
    if re.search('d', name) and re.search('e', name):
        print(name)

eihd39d9
chdsye847
hedle3455
xjhd53e
de37dp


In [40]:
acc_names = ['xkn59438', 'yhdck2', 'eihd39d9', 'chdsye847', 'hedle3455', 'xjhd53e', '45da', 'de37dp']
import re
for name in acc_names:
    if re.search('^x', name) or re.search('^y', name):
        print(name)

xkn59438
yhdck2
xjhd53e


In [42]:
acc_names = ['xkn59438', 'yhdck2', 'eihd39d9', 'chdsye847', 'hedle3455', 'xjhd53e', '45da', 'de37dp']
import re
for name in acc_names:
    if re.search('^[x|y]', name) and re.search('e$', name):
        print(name)

xjhd53e


In [44]:
acc_names = ['xkn59438', 'yhdck2', 'eihd39d9', 'chdsye847', 'hedle3455', 'xjhd53e', '45da', 'de37dp']
import re
for name in acc_names:
    if re.search('[0,1,2,3,4,5,6,7,8,9]{3,10}', name):
        print(name)

xkn59438
chdsye847
hedle3455


In [45]:
acc_names = ['xkn59438', 'yhdck2', 'eihd39d9', 'chdsye847', 'hedle3455', 'xjhd53e', '45da', 'de37dp']
import re
for name in acc_names:
    if re.search('d[arp]$', name):
        print(name)

45da
de37dp


#### Double Digest

In [6]:
import re
dna = open("dna.txt").read().rstrip("/n")

#create list of cuts starting at 0

cuts =[0]
#loop to make a list of all cut sites for AbcI
for match in re.finditer("A[ATGC]TAAT", dna):
    cuts.append(match.start() + 3)
#loop to make a list of all cut sites for AbcII
    #R = A or G and W = A or T
for match in re.finditer("GC[AG][AT]TG", dna):
    cuts.append(match.start() + 4)
#add length of seq to end of cut list 
    cuts.append(len(dna))
#sort to get in order
    sort = sorted(cuts)
    #loop across range in list to count fragment size
    for seq in range(1,len(sort)):
        current_cut = sort[seq]
        previous_cut = sort[seq-1]
        fragment_size = current_cut - previous_cut
        print(str(fragment_size))

488
655
485
385
488
655
434
51
385
0
