# Advanced RegEx

In [1]:
import re

#### Repetition: searching for basic patterns

In [2]:
# Sample sentence searched by the examples that follow.
text = "That dog is very fluffy."

In [5]:
# A plain letter matches itself; findall returns every non-overlapping occurrence as a list.
pattern = 'f'

re.findall(pattern, text)

['f', 'f', 'f']

In [6]:
# re.match only tries to match at the very start of the string.
# The text starts with 'T', not 'f', so the result is None and nothing is displayed.
re.match(pattern, text)

In [8]:
# The previous re.match returned no output (None) because the text doesn't start with that pattern.
# 'T' does match at the start of the text, so re.match returns a Match object and this check is False.
re.match('T', text) is None

False

In [9]:
# The pattern can be longer than one character, as long as the text starts with it.
re.match('That', text)

<re.Match object; span=(0, 4), match='That'>

In [12]:
# What if you want to know whether the text starts with an uppercase letter?
# [A-Z] is a character range: it matches any single letter from A through Z.
re.match('[A-Z]', text)

# This one will probably be useful for web scraping, to identify which specific lines or table rows to keep.

<re.Match object; span=(0, 1), match='T'>

In [13]:
# Without brackets the characters must appear consecutively: 'u' immediately followed by 'f'.
pattern = 'uf'
re.findall(pattern, text)

['uf']

In [14]:
pattern = '[uf]' # with [] this is now the SET of the letter 'u' and the letter 'f', no longer the occurrence of 'uf'.
re.findall(pattern, text)

['f', 'u', 'f', 'f']

### [ ] Match set of characters

In [15]:
text2 = "Is it spelled gray or grey?"

pattern = 'gr[ae]y' # the set [ae] accepts either letter, so both spelling variants are found with one pattern instead of two separate searches.

re.findall(pattern, text2)

['gray', 'grey']

In [18]:
text3 = "This is an A and B conversation, so C your way out of it you D!."

# find just A, B, and C: the range [A-C] matches one uppercase letter from A through C

re.findall('[A-C]', text3)

['A', 'B', 'C']

### . Match any character except newline (\n)

In [20]:
text = "That dog is very fluffy."
pattern = '.' # This dot is a special character! You will get EVERYTHING except the line break.
print(re.findall(pattern, text))

# you will get back every character separately!

['T', 'h', 'a', 't', ' ', 'd', 'o', 'g', ' ', 'i', 's', ' ', 'v', 'e', 'r', 'y', ' ', 'f', 'l', 'u', 'f', 'f', 'y', '.']


In [23]:
text4 = "That dog is very fluffy. \nIt also has floppy ears." # use \n to tell Python you want the rest printed on a new line!
print(text4)

That dog is very fluffy. 
It also has floppy ears.


In [25]:
print(re.findall(pattern, text4)) # note that the '.' WILL NOT match the \n, so no '\n' appears in the result

['T', 'h', 'a', 't', ' ', 'd', 'o', 'g', ' ', 'i', 's', ' ', 'v', 'e', 'r', 'y', ' ', 'f', 'l', 'u', 'f', 'f', 'y', '.', ' ', 'I', 't', ' ', 'a', 'l', 's', 'o', ' ', 'h', 'a', 's', ' ', 'f', 'l', 'o', 'p', 'p', 'y', ' ', 'e', 'a', 'r', 's', '.']


### ^ Match characters not listed if within a set OR match the beginning of the text if outside a set

In [26]:
pattern =  '[^a-z]' # a ^ at the start of a set negates it: match everything EXCEPT what is listed there.
# Here you will find every character EXCEPT the lowercase letters a-z.

print(re.findall(pattern, text))


['T', ' ', ' ', ' ', ' ', '.']


In [30]:
pattern = '[^a-z0-9! ]' # note the space character included in the set, so spaces are excluded from the matches too
text = "That dog is very fluffy.1 2 3 4 \n !!"
print(re.findall(pattern, text))

['T', '.', '\n']


In [32]:
pattern = '^That dog' # what happens if you put the caret outside of a set?
print(re.findall(pattern, text))

# Outside a set, ^ anchors the pattern to the beginning of the text,
# so findall behaves like a de facto MATCH and only looks at the start.

['That dog']


In [33]:
pattern = '^fluffy'
print(re.findall(pattern, text))

# this is empty because 'fluffy' is not at the beginning of the text!

[]


### $ Match end of line

In [34]:
# $ anchors the pattern to the end of the text; 'That dog' is not at the end, so nothing is found.
pattern = 'That dog$'

print(re.findall(pattern, text))

[]


In [37]:
pattern = 'fluffy.$' # the unescaped . matches any single character, here the final '.'
text = "That dog is very fluffy."
print(re.findall(pattern, text))

['fluffy.']


### | Functions as an "OR" operator

In [38]:
pattern = 'fluffy|dog'  # the | is an OR operator: find all occurrences of 'fluffy' and of 'dog'!
print(re.findall(pattern, text))

['dog', 'fluffy']


### * Matches previous character 0 or more times

In [40]:
dogtext = "og dog dddog ddog ddoooooggggg ddogggg daaggggg deg dog"
pattern = 'd*og' # this will match the character before the * 0 or more times.
print(re.findall(pattern, dogtext))

# The * applies only to the 'd': any run of d's (including none, hence 'og') followed by exactly one 'o' and one 'g'.
# Runs of multiple o's or g's are not captured in full because the pattern contains only one 'o' and one 'g'.

['og', 'dog', 'dddog', 'ddog', 'og', 'ddog', 'dog']


### + Matches previous character 1 or more times

In [41]:
pattern = 'do*g' # here the * applies to the 'o': one 'd', zero or more 'o's, then one 'g'
print(re.findall(pattern, dogtext))

# how to require at least one occurrence, instead of also allowing 0 times:

['dog', 'dog', 'dog', 'dooooog', 'dog', 'dog']


In [44]:
pattern = 'd+og'
print(re.findall(pattern, dogtext))

# you will no longer match the bare 'og' because you need at least one d

['dog', 'dddog', 'ddog', 'ddog', 'dog']


### ? Matches previous character 0 or 1 times

In [45]:
pattern = 'd?og' # ? makes the preceding 'd' optional: each match contains at most one 'd'
print(re.findall(pattern, dogtext))

['og', 'dog', 'dog', 'dog', 'og', 'dog', 'dog']


### {num1,num2} Matches previous characters num1-num2 times

In [66]:
pattern = 'd{2,3}og' # the preceding 'd' must occur at least 2 and at most 3 times
print(re.findall(pattern, dogtext))

['dddog', 'ddog', 'ddog']


In [67]:
# the previous pattern returns every match with 2 to 3 d's followed by 'og'
# you could also specify only a maximum: a 'd' at most 3 times (omitting the lower bound means 0)

pattern = 'd{,3}og'
print(re.findall(pattern, dogtext))

['og', 'dog', 'dddog', 'ddog', 'og', 'ddog', 'dog']


In [68]:
# similarly, you can say you want at least 2 d's (omitting the upper bound means no maximum):

pattern = 'd{2,}og'
print(re.findall(pattern, dogtext))

['dddog', 'ddog', 'ddog']


In [None]:
# this is really helpful, for example, to NOT extract words like 'a', 'an', 'the' or other things that make analysis hard.
# you would use {} as a way to set a condition! See the "words of at least 4 letters" example below.

### Challenge: Extract all the words from the following sentence

In [48]:
cattext = "Every cat has nine lives."

# \w: any word character (letters, digits, and the underscore)!!

In [61]:
# str.split() only splits on whitespace, so punctuation stays attached to the word ('lives.').
cattext.split()

['Every', 'cat', 'has', 'nine', 'lives.']

In [63]:
# A single \w matches one word character at a time, so every letter comes back separately.
# Raw string (r'...'): passes the backslash to the regex engine untouched and avoids
# Python's invalid-escape-sequence warning for '\w'.
print(re.findall(r'\w', cattext))

['E', 'v', 'e', 'r', 'y', 'c', 'a', 't', 'h', 'a', 's', 'n', 'i', 'n', 'e', 'l', 'i', 'v', 'e', 's']


In [57]:
# Raw string avoids Python's invalid-escape-sequence warning for '\w'.
print(re.findall(r'\w+', cattext)) # \w+ matches runs of one or more word characters, i.e. whole words

['Every', 'cat', 'has', 'nine', 'lives']


In [62]:
# Raw string avoids Python's invalid-escape-sequence warning for '\w'.
print(re.findall(r'\w*', cattext)) # note the empty strings in the result: * means 0 or more, so zero-length matches count, rather than at least 1

['Every', '', 'cat', '', 'has', '', 'nine', '', 'lives', '', '']


### Now extract only words that have at least 4 letters!

In [70]:
# {4,} requires at least 4 consecutive word characters, so shorter words are skipped.
# Raw string avoids Python's invalid-escape-sequence warning for '\w'.
pattern = r'\w{4,}'

print(re.findall(pattern, cattext))

['Every', 'nine', 'lives']


### Extract all the phone numbers from the following text

In [80]:
# Sample data: one airline per line, each name followed by a phone number.
text = '''
Aeromexico 800-237-6639
Air Canada 888-247-2262
Air Canada Rouge 888-247-2262
Air Creebec 800-567-6566
Air Inuit 800-361-2965
Air North 800-661-0407
'''

In [81]:
# find list with JUST the phone numbers:
# \d matches a digit; {3,} means "at least 3" and {4,} "at least 4" digits in a row.
# Raw string avoids Python's invalid-escape-sequence warning for '\d'.

print(re.findall(r'\d{3,}-\d{3,}-\d{4,}', text))

['800-237-6639', '888-247-2262', '888-247-2262', '800-567-6566', '800-361-2965', '800-661-0407']


In [83]:
# Looser alternative: any run of one or more digits between the dashes.
# Raw string avoids Python's invalid-escape-sequence warning for '\d'.
print(re.findall(r'\d+-\d+-\d+', text))

['800-237-6639', '888-247-2262', '888-247-2262', '800-567-6566', '800-361-2965', '800-661-0407']


In [None]:
# note: for the credit card validator: if you need 4 digits - 4 digits - 4 digits:
# you could do re.findall(r'\d\d\d\d-\d\d\d\d-\d\d\d\d', text)
# or re.findall(r'\d{4,4}-\d{4,4}-\d{4,4}', text), where {4,4} can be shortened to {4}