In [1]:
import re
# The re module supplies Python's regular-expression tools.
# General call shape: re.tool(pattern, string)
# Write patterns as raw strings, r"...", so Python leaves backslashes alone
# and the regex engine receives the pattern exactly as typed.
# re.search is the simplest tool: it returns a match object when the pattern
# occurs anywhere in the string, and None otherwise.
dna = "ATCGCGAATTCAC"
if re.search(r"GAATTC", dna) is not None:
    print("restriction site found!")

restriction site found!


In [2]:
# Look for either variant of the site by trying each pattern in turn;
# any() stops at the first pattern that matches.
if any(re.search(site, dna) for site in (r"GGACC", r"GGTCC")):
    print("restriction site found!")

In [3]:
# A more compact way to state the same condition:
# the | character means "or" between the alternatives on either side of it,
# and the parentheses limit the alternation to that one position.
if re.compile(r"GG(A|T)CC").search(dna):
    print("restriction site found")

In [4]:
# Same idea applied to the BisI enzyme site, which cuts at GCNGC
# (N stands for any of the four bases).
if re.search(pattern=r"GC(A|T|G|C)GC", string=dna):
    print("restriction site found")

In [8]:
# The same site written with a character group instead of alternation:
# [...] matches exactly one character, which may be any of those listed inside.
dna = "ATCGCGAATTCAC"
bisi_hit = re.search(r"GC[ATGC]GC", dna)
if bisi_hit is not None:
    print("restriction site found")

In [9]:
#a ^ placed first inside a character group negates it: [^ATGC] matches any one character that is NOT listed
#quantifiers describe how many times the preceding character (or parenthesised group) may repeat
#? matches the preceding character (or parentheses contents) 0 or 1 time (optional)
#+ matches the preceding character (or parentheses contents) 1 or more times
#* matches the preceding character (or parentheses contents) 0 or more times (optional, no upper limit)
#{n} matches exactly n repeats; {n,m} matches between n and m repeats; {n,} matches n or more
#positions in a string are pinned down with anchors:
#^ matches the start of the string (the pattern that follows must begin there)
#$ matches the end of the string (the pattern that precedes must finish there)


In [11]:
# A successful re.search returns a "match object" that can be inspected.
# Its .group method extracts the whole match or individual captured parts.
# .group() and .group(0) both give the exact matched text, which reveals
# what the variable sections of the pattern actually accepted.
dna = "ATGACGTACGTACGACTG"
m = re.search(r"GA[ATGC]{3}AC", dna)
print(m.group(0))

GACGTAC


In [12]:
# Wrap part of a pattern in parentheses to capture that portion of the match.
# Captured groups are numbered from 1 in order, and read with .group(1), .group(2), ...
m = re.search(r"GA([ATGC]{3})AC([ATCG]{2})AC", dna)
print(f"entire match {m.group()}")
print(f"first bit {m.group(1)}")
print(f"second bit {m.group(2)}")

entire match GACGTACGTAC
first bit CGT
second bit GT


In [13]:
# .start(group_number) and .end(group_number) report where the match, or one
# captured group, sits in the string; with no argument they describe the whole match.
# Both return integers: start is the index of the first matched character and
# end is the index just past the last one.
m = re.search(r"GA([ATGC]{3})AC([ATCG]{2})AC", dna)
print(f"start {m.start()}")
print(f"end {m.end()}")
print(f"group 1 start {m.start(1)}")
print(f"group 1 end {m.end(1)}")
print(f"group 2 start {m.start(2)}")
print(f"group 2 end {m.end(2)}")

start 2
end 13
group 1 start 4
group 1 end 7
group 2 start 9
group 2 end 11


In [14]:
# re.split cuts a string wherever the pattern matches and returns the pieces.
# Here every character that is not A, T, G or C acts as a separator.
dna = "ACTNGCATRGCTACGTYACGATSCGAWTCG"
runs = re.compile(r"[^ATGC]").split(dna)
print(runs)

['ACT', 'GCAT', 'GCTACGT', 'ACGAT', 'CGA', 'TCG']


In [15]:
# re.findall collects every non-overlapping match; it hands back plain
# strings only (no match objects, so no position information).
# Call shape: re.findall("[desired_pattern]{modifiers}", string)
dna = "ACTGCATTATATCGTACGAAATTTATACGCGCG"
runs = re.findall(pattern=r"[AT]{4,}", string=dna)
print(runs)

['ATTATAT', 'AAATTTATA']


In [16]:
# re.finditer yields one match object per hit, so a loop can pull out
# details such as positions for each match.
runs = re.finditer(r"[AT]{3,}", dna)
for match in runs:
    # .span() returns the (start, end) pair in a single call.
    run_start, run_end = match.span()
    print(f"AT rich region from {run_start} to {run_end}")

AT rich region from 5 to 12
AT rich region from 18 to 27
