# Regular expressions (Regex)

In [1]:
import re

### Guía rápida

- Before you can use regular expressions in your program, you must import the library with `import re`.
- Use `re.search()` to check whether a string matches a regular expression, similar to using the `find()` method on strings.
- Use `re.findall()` to extract the portions of a string that match your regular expression, similar to combining `find()` with slicing (e.g. `var[5:10]`).

In [2]:
# Short sample string for the str.find() comparison.
x = "asdf"

In [3]:
# str.find() returns the index of the first occurrence of the substring.
x.find("s")

1

### Búsqueda de patrones

In [4]:
# Two sample sequences used as search targets below.
seq0 = "AAACCCTTTGGG"
seq1 = "AAGCGTTGGG"

In [5]:
# Plain literal pattern: matches the exact substring "GTT".
pat = "GTT"

In [6]:
# re.search() returns a match object, or None when the pattern is not found.
match = re.search(pat, seq0)

In [7]:
# Displays nothing: the search over seq0 returned None (seq0 has no "GTT").
match

In [8]:
# Make the "no match" result visible explicitly.
print(match is None)

True


In [9]:
# Same pattern against seq1, which does contain "GTT".
match = re.search(pat, seq1)

In [10]:
# This time a match object was returned, so the check is False.
print(match is None)

False


In [11]:
# Index of the first matched character.
match.start()

4

In [12]:
# Index just past the last matched character (the end is exclusive).
match.end()

7

In [13]:
# The text that was matched.
match.group()

'GTT'

Cheat sheet: https://www.debuggex.com/cheatsheet/regex/python

In [14]:
# Sample text with two e-mail addresses mixed in among other words.
s = "purple alice-b@google.com max joe leo@uchicago.edu"

In [15]:
# \w excludes "-" and ".", so the match starts after the hyphen and stops
# before ".com". Raw string: "\w" in a normal literal is an invalid escape.
re.search(r"\w+@\w+", s).group()

'b@google'

In [16]:
# Without "+" after the "@", only a single word character is taken on the
# right-hand side. Raw string avoids the invalid "\w" escape warning.
re.search(r"\w+@\w", s).group()

'b@g'

In [17]:
# Here only a single word character is required on the left of the "@".
# Raw string avoids the invalid "\w" escape warning.
re.search(r"\w@\w+", s).group()

'b@google'

In [18]:
# The character classes also admit "-" (and "." after the "@"), so the whole
# address is matched; the trailing \w forces the match to end on a word
# character. Raw string avoids the invalid "\w" escape warning.
re.search(r"[\w-]+@[\w.-]+\w", s).group()

'alice-b@google.com'

### Group matching

In [19]:
# Parentheses define two capture groups: the part before the "@" and the part
# after it. Raw string avoids the invalid "\w" escape warning.
m = re.search(r"([\w-]+)@([\w.-]+\w)", s)

In [20]:
# Tuple containing every captured group.
m.groups()

('alice-b', 'google.com')

In [21]:
# Group 0 is the entire match.
m.group(0)

'alice-b@google.com'

In [22]:
# Group 1 is the first parenthesized group (the text before the "@").
m.group(1)

'alice-b'

In [23]:
# Group 2 is the second parenthesized group (the text after the "@").
m.group(2)

'google.com'

In [24]:
# With more than one capture group, findall() returns a list of tuples,
# one tuple per match. Raw string avoids the invalid "\w" escape warning.
re.findall(r"([\w-]+)@([\w.-]+\w)", s)

[('alice-b', 'google.com'), ('leo', 'uchicago.edu')]

Usualmente lo que queremos es usar "findall".

In [26]:
# Read the whole file into memory. The with-block closes the file handle as
# soon as the read finishes instead of leaving it open until garbage collection.
with open("bible.txt") as bible_file:
    text = bible_file.read()

In [27]:
# Capture the run of letters (any case) immediately before and after " loved ".
pat = "([a-zA-Z]+) loved ([a-zA-Z]+)"

In [28]:
# The pattern has two groups, so each match comes back as a
# (word before, word after) tuple.
re.findall(pat, text)

[('he', 'her'),
 ('Isaac', 'Esau'),
 ('Rebekah', 'Jacob'),
 ('Jacob', 'Rachel'),
 ('he', 'also'),
 ('he', 'the'),
 ('Israel', 'Joseph'),
 ('father', 'him'),
 ('he', 'thy'),
 ('LORD', 'you'),
 ('God', 'thee'),
 ('he', 'the'),
 ('he', 'a'),
 ('he', 'Hannah'),
 ('he', 'him'),
 ('Jonathan', 'him'),
 ('he', 'him'),
 ('Judah', 'David'),
 ('daughter', 'David'),
 ('daughter', 'him'),
 ('he', 'him'),
 ('he', 'him'),
 ('he', 'his'),
 ('LORD', 'him'),
 ('David', 'her'),
 ('had', 'her'),
 ('Solomon', 'the'),
 ('LORD', 'Israel'),
 ('Solomon', 'many'),
 ('hath', 'his'),
 ('God', 'Israel'),
 ('Rehoboam', 'Maachah'),
 ('he', 'husbandry'),
 ('king', 'Esther'),
 ('I', 'are'),
 ('have', 'the'),
 ('he', 'cursing'),
 ('have', 'thee'),
 ('hath', 'him'),
 ('have', 'strangers'),
 ('they', 'to'),
 ('have', 'thee'),
 ('hast', 'a'),
 ('I', 'him'),
 ('have', 'you'),
 ('thou', 'us'),
 ('I', 'Jacob'),
 ('him', 'him'),
 ('she', 'much'),
 ('so', 'the'),
 ('men', 'darkness'),
 ('Jesus', 'Martha'),
 ('he', 'him'),
 ('t

In [29]:
# Stricter variant: both captured words must start with an uppercase letter,
# which filters out lowercase words such as "he" or "him".
pat = "([A-Z][a-zA-Z]*) loved ([A-Z][a-zA-Z]*)"

In [30]:
# Re-run the search with the capitalized-words pattern.
re.findall(pat, text)

[('Isaac', 'Esau'),
 ('Rebekah', 'Jacob'),
 ('Jacob', 'Rachel'),
 ('Israel', 'Joseph'),
 ('Judah', 'David'),
 ('LORD', 'Israel'),
 ('God', 'Israel'),
 ('Rehoboam', 'Maachah'),
 ('I', 'Jacob'),
 ('Jesus', 'Martha')]

---

In [31]:
# [0-9]+ matches each run of one or more consecutive digits.
x = 'My 2 favorite numbers are 19 and 42'
y = re.findall('[0-9]+', x)
y

['2', '19', '42']

In [32]:
# The class [AEIOU] lists uppercase vowels only and matching is case-sensitive,
# so a string with no uppercase vowels yields an empty list.
y = re.findall('[AEIOU]+', x)
y

[]

---

In [33]:
# Greedy: ".+" takes as many characters as it can, so the match extends to
# the LAST ":" in the string.
x = 'From: Using the : character'
y = re.findall('^F.+:', x)
y


['From: Using the :']

In [34]:
# Non-greedy: ".+?" takes as few characters as it can, so the match stops at
# the FIRST ":" in the string.
x = 'From: Using the : character'
y = re.findall('^F.+?:', x)
y


['From:']

"Greedy" vs "Non-greedy"

---

In [35]:
# \S+ matches a run of non-whitespace characters, so this grabs the whole
# token around the "@". Raw string: "\S" in a normal literal is an invalid escape.
x = "From: stephen.marquard@uct.ac.za Sat Jan  5 09:14:16 2008"
y = re.findall(r'\S+@\S+', x)
y

['stephen.marquard@uct.ac.za']

In [36]:
# Only accept the address when the string starts with "From: "; the
# parentheses limit the returned text to the address itself.
# Raw string avoids the invalid "\S" escape warning.
y = re.findall(r'^From: (\S+@\S+)', x)
y

['stephen.marquard@uct.ac.za']

---

In [37]:
# Capture the run of non-space characters that follows each "@".
y = re.findall('@([^ ]+)', x)
y

['uct.ac.za']

In [None]:
# Anchor at the start on "From: ", let the greedy ".*" run up to the last "@",
# and capture the (possibly empty) run of non-space characters after it.
y = re.findall('^From: .*@([^ ]*)', x)
y

---

In [None]:
# "$" normally means end-of-line in a regex, so it is escaped as \$ to match a
# literal dollar sign. Raw string: "\$" in a normal literal is an invalid escape.
x = 'We just received $10.00 for cookies.'
y = re.findall(r'\$[0-9.]+', x)
print(y)

A veces queremos buscar caracteres especiales; para ello los escapamos con una barra invertida (por ejemplo, `\$` busca el signo `$` literal).

---

http://regexr.com/

http://www.regular-expressions.info/

http://en.wikipedia.org/wiki/Regular_expression