In [None]:
# Regexes are a foundational technique for data cleaning, 
# and you can think of them as patterns which you give to a regex processor with some source data

In [2]:
import re

In [3]:
# match() checks for a match that is at the beginning of the string and returns a boolean.
# search() checks for a match anywhere in the string and returns a boolean.

# Example
text = "This is a good day."

if re.search("good", text):
    print("Wonderful!")
else:
    print("Alas :(")

Wonderful!


In [4]:
# The findall and split() functions will parse the string for us and return chunks

#Example
text = "Amy works diligently. Amy gets good grades. Our student Amy is succesful"
re.split("Amy", text)

['', ' works diligently. ', ' gets good grades. Our student ', ' is succesful']

In [5]:
re.findall("Amy", text)

['Amy', 'Amy', 'Amy']

In [6]:
# The caret character ^ means start and the $ means end.

# Example
text = "Amy works diligently. Amy gets good grades. Our student Amy is succesful"
re.search("^Amy", text)

# The search() function returns a Match object, which tells us what pattern was matched ("Amy") 
# and the location the match was in (span)

<re.Match object; span=(0, 3), match='Amy'>

# Patterns and Character Classes

In [7]:
grades = "ACAAAABCBCBAA"
re.findall("B", grades)

['B', 'B', 'B']

In [14]:
# We can search for different elements by putting them all inside suare brackets (set operator).
# The set operator always does a character-based matching.
# Example: look for A or B
print(re.findall ("[AB]", grades))
# This is equivalent to use the or operator
print(re.findall("A|B", grades))

['A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'A', 'A']
['A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'A', 'A']


In [12]:
# You can also include a range of characters:
# We can use [a-z] for lower letters
# We can use [A-Z] for capital letters
# We can use [\w] to match any letter, including digits
# We can use [\d] for digits
# We can use . for any single character which is not a newline
# We can use [\s] to match any whitespace character

# Example: look for A followed by a B or a C
re.findall("[A][B-C]", grades)

['AC', 'AB']

In [13]:
# look for A followed by a B and a C
re.findall("[A]BC", grades)

['ABC']

In [15]:
# Square brackets with a ^ are equivalent to the NOT operator
re.findall("[^A]", grades)

['C', 'B', 'C', 'B', 'C', 'B']

# Quantifiers

In [16]:
# Quantifiers ae the number of times you want a pattern to be matched in order to match. 
# The most basic quantifier has the form: character{minimum times, maximum times}. The default value is {1,1}
# If we put just one number, it becomes both the maximum and minimum.
### Note: If you have a space in between the braces, you will get an empty result ###

# Example
re.findall("A{2,10}", grades)

# In this example, we see that there are two streaks, one of four A's and another one of two A's

['AAAA', 'AA']

In [17]:
# We look for two A's followed immediately by two more A's.
re.findall("A{1,1}A{1,1}", grades)

['AA', 'AA', 'AA']

In [None]:
# There are three other quantifiers that are used as short hand:
# An * to match 0 or more times
# A ? to match 1 or more times
# A + to match 1 or more times

# Groups

In [20]:
# Groups allow us to match different patterns at the same time. To group patterns together, we can use ()
re.findall("([A])([B-C])", grades)

[('A', 'C'), ('A', 'B')]

In [27]:
# As findall() returns strings and search() and match() return individual Match objects, 
# finditer() returns lists of Match Objects.
for item in re.finditer("([A])([B-C])", grades):
    print(item.group(0))
    print(item.group(1))
    print(item.group(2))

AC
A
C
AB
A
B


In [34]:
# We can also label each group and build a dictionary. For that e use the syntax(?P<name>)
for item in re.finditer("(?P<first_grade>[A])(?P<second_grade>[B-C])", grades):
    print(item.groupdict()['second_grade'])

C
B


# Look-ahead and Look-behind

In [35]:
# We can look ahead of a determined element with the ?= syntax.
for item in re.finditer("(?P<first_grade>[A])(?=[B-C])", grades):
    print(item)

<re.Match object; span=(0, 1), match='A'>
<re.Match object; span=(5, 6), match='A'>


In [41]:
# We can look behind of a determined element with the =? syntax.
for item in re.finditer("(?P<first_grade>[A])(=?[B-C])", grades):
    print(item)

<re.Match object; span=(0, 2), match='AC'>
<re.Match object; span=(5, 7), match='AB'>


In [38]:
# We can use re.VERBOSE to make it easier to understand large regexes

pattern="""
(?P<first_grade>[A])
(?P<second_grade>[B-C])"""

for item in re.finditer(pattern, grades, re.VERBOSE):
    print(item)

<re.Match object; span=(0, 2), match='AC'>
<re.Match object; span=(5, 7), match='AB'>


In [96]:
def URL_match(string):
    print(re.findall("[\w*].[^[^.][\w*].*]*", string))

In [98]:
URL_match("abc..com")

[]
