In [2]:
#Chapter 7: Regular Expressions

#DNA and protein sequences can be thought of as strings
    #within these sequences we can look for patterns
        #protein domains, DNA TF binding motifs, restriction enzyme
            #cut sites, PCR primers, runs of mononucleotides
    
            #search for a fixed set of characters - more flexibility may be required

#Modules in Python
    #module: collection of specialized functions and data types
        #Python does not make modules automatically available in a new program
        #load modules with an "import" statement

#module that uses regular expression tools = "re"
import re
    #place load statement at the top of the program

#to use a tool from this module -- prefix it with the module name
    #regular expression search function
#re.search(pattern, string)

#if you forget to import the module you will receive a "NameError" when 
    #you call the function
    
#Raw Strings
    #putting the letter "r" immediately before the opening quotation mark
        #makes Python leave backslash sequences such as \t and \n exactly as typed

print(r"\t\n")

#r = raw = string where special characters are ignored
    #NOTE: r goes outside of quotation marks and is not part of the string
    #output will be exactly what is written between the quotation marks

\t\n


In [4]:
#Searching for a pattern in a string
#re.search: determines whether or not a pattern appears somewhere in a string
    #takes 2 arguments: pattern you want to search for, string you want to
        #search in
    #returns a match object when the pattern is found and None when it is not,
        #so the result can be used directly as the condition of an "if"

import re

#Ex. search for EcoRI restriction site in DNA sequence
dna = "ATCGCGAATTCAC"
if re.search(r"GAATTC", dna):
    print("restriction site found!")

#Used raw notation for the pattern, but it is not necessary as it does not
    #contain any special characters

#ALTERNATION Ex. AvaII, which recognizes two different motifs (GGACC and GGTCC)
    #this sequence contains neither motif, so nothing is printed

dna = "ATCGCGAATTCAC"
if re.search(r"GGACC", dna) or re.search(r"GGTCC", dna):
    print("restriction site found!")

#OR put the alternatives inside () separated by "|"

if re.search(r"GG(A|T)CC", dna):
    print("restriction site found!")

#CHARACTER GROUPS Ex. BisI restriction enzyme
    #written as an alternation the pattern would be GC(A|T|G|C)GC
    #a character group lists the allowed characters inside [] instead
    #GC[ATGC]GC will match GCAGC, GCTGC, GCGGC, and GCCGC
    #this sequence contains none of them, so nothing is printed

if re.search(r"GC[ATGC]GC", dna):
    print("restriction site found!")

#Note: if you want a character to match any character in the input use a "."
    #GC.GC would match all four possibilities listed above
        #even characters that are not DNA bases (ie. number, special character, etc.)

#Note: to specify a character you don't want to match, put a "^" at the start
    #of a character group to negate it and match any character that isn't in the group
        #Ex. [^XYZ]

restriction site found!


In [5]:
#QUANTIFIERS -- allow us to describe variation in the # of times a section
    #of a pattern is repeated
    
    # "?" following a character = character is optional and can match either
        #zero or one times
        #Ex. "GAT?C" = T is optional --> pattern can match GATC or GAC
    #apply the "?" to a group with ()
        #GGG(AAA)?TTT --> GGGAAATTT or GGGTTT
    
    # "+" following a character group = character/group must be present but 
        #can be repeated any number of times (one or more)
            #Ex. "GGGA+TTT" will match one or more A's between three G's and three T's
            
    # "*" following character group = character/group is optional and can be repeated
        #will match zero or more times
        
    #use {} to specify a specific number of repeats
        #following the character or group with a single number inside {} will
            #match exactly that number of repeats
                #Ex. GA{5}T --> GAAAAAT
        #pair of #'s separated with a comma specifies a range of repeats
            #Ex. GA{2,4}T --> GAAT, GAAAT, or GAAAAT
            
#POSITIONS - represent positions in the input string

# "^" = start of the string
# "$" = end of the string

#Ex. ^AAA will match AAATTT but not GGGAAATTT
#Ex. GGG$ will match AAAGGG but not AAAGGGCCC

#COMBINING - use quantifiers, alternations and character groups together to
    #specify incredibly flexible patterns
    
#Ex. ^ATG[ATGC]{30,1000}A{5,10}$
    #pattern will match
        #an ATG start codon at the beginning of the sequence
        #followed by 30-1000 bases that could be A,T,C, or G
        #followed by a poly-A tail between 5-10 bases at the end
        
#re.search vs. re.match
    #re.search: identify a pattern occurring anywhere in the string
    #re.match: identify a pattern only if it matches at the start of the string
        #(re.fullmatch is the function that requires the entire string to match)

In [6]:
#Extracting the part of the string that matched

#re.search gives back a match object rather than a simple number or string
    #keep that object in a variable so its methods can be called afterwards

#methods to get data out of a match object
    #group - returns the portion of the input string that matched the pattern

dna = "ATGACGTACGTACGACTG"
#GA, then any 3 bases, then AC -- the match object is kept in m
m = re.search(r"GA[ATGC]{3}AC", dna)
matched_text = m.group(0)
print(matched_text)

#group with no argument (or with 0) is the whole stretch of dna that matched

#CAPTURING: wrap the parts of the pattern you want to pull out in ()
    #each () pair becomes a numbered group, counted from 1

dna = "ATGACGTACGTACGACTG"
m = re.search(r"GA([ATGC]{3})AC([ATGC]{2})AC", dna)
first_bit, second_bit = m.groups()
print(f"entire match: {m.group(0)}")
print(f"first bit: {first_bit}")
print(f"second bit: {second_bit}")

#output shows that the 3 bases captured by the first group are CGT and the
    #2 bases captured by the second group are GT

GACGTAC
entire match: GACGTACGTAC
first bit: CGT
second bit: GT


In [7]:
#Getting the position of a match

#a match object knows where the match sits as well as what it contains
    # "start" and "end" methods report where the pattern begins and ends
        #on the sequence ("span" returns both at once as a pair)

dna = "ATGACGTACGTACGACTG"
m = re.search(r"GA([ATGC]{3})AC([ATGC]{2})AC", dna)
whole_start, whole_end = m.span()
print(f"start: {whole_start}")
print(f"end: {whole_end}")

#Remember, positions are counted from zero
#Passing a group number to start / end / span gives the positions of that
    #captured group instead of the whole match

dna = "ATGACGTACGTACGACTG"
m = re.search(r"GA([ATGC]{3})AC([ATGC]{2})AC", dna)
whole_start, whole_end = m.span()
one_start, one_end = m.span(1)
two_start, two_end = m.span(2)
print(f"start: {whole_start}")
print(f"end: {whole_end}")
print(f"group one start: {one_start}")
print(f"group one end: {one_end}")
print(f"group two start: {two_start}")
print(f"group two end: {two_end}")

start: 2
end: 13
start: 2
end: 13
group one start: 4
group one end: 7
group two start: 9
group two end: 11


In [8]:
#Splitting a string using a regular expression

#re.split cuts a string apart wherever a re pattern matches
    #first argument: pattern
    #second argument: string to be split

#Ex. cut the sequence at every character that is not A, T, G, or C:

dna = "ACTNGCATRGCTACGTYACGATSCGAWTCG"
not_a_base = r"[^ATGC]"
runs = re.split(not_a_base, dna)
print(runs)

#Remember: "^" as the first character inside [] negates the character group
    #return value: list of strings (the pieces between the matches)

['ACT', 'GCAT', 'GCTACGT', 'ACGAT', 'CGA', 'TCG']


In [9]:
#Finding multiple matches

# re.findall - collects every match of a pattern in a string into a list
    #first argument: pattern
    #second argument: string

dna = "ACTGCATTATATCGTACGAAATTATACGCGCG"
long_at_run = r"[AT]{4,100}"
runs = re.findall(long_at_run, dna)
print(runs)

#Note: findall returns plain strings, not match objects
    #therefore the positions of the matches cannot be read from its result

# re.finditer - produces match objects one at a time -- consume the
    #return value with a loop

dna = "ACTGCATTATATCGTACGAAATTATACGCGCG"
at_run = r"[AT]{3,100}"
runs = re.finditer(at_run, dna)
for hit in runs:
    first, past_last = hit.span()
    print(f"AT rich region from {first} to {past_last}")

# finditer is the more flexible of the two: each match object carries both
    #the matched text and its position

['ATTATAT', 'AAATTATA']
AT rich region from 5 to 12
AT rich region from 18 to 26
