# Regular Expressions in Pandas

See Chapter 7 of "Python for Data Analysis" by Wes McKinney

In [1]:
import pandas as pd
import numpy as np
import re

The re module handles three kinds of tasks:
    pattern matching
    substitution
    splitting strings
    
"\s+" is regex coding for one or more white space characters

We did some work with regex in DAT 511, and other regex guides are available; this is just a quick listing of how to get started with regular expressions in Python.

In [2]:
# Split a string on runs of mixed whitespace (several spaces and tabs).
# `re` is already imported in the first cell of the notebook.
text = "foo    bar\t baz  \tqux"

# Use a raw string for the pattern: in a normal string literal "\s" is an
# invalid escape sequence and triggers a warning on current Python versions.
re.split(r'\s+', text)


['foo', 'bar', 'baz', 'qux']

In [3]:
# Compile the pattern once into a reusable regex object; its methods
# (split, findall, ...) can then be applied to any number of strings.
# The raw string keeps "\s" from being read as a Python string escape.
regex = re.compile(r'\s+')

# Same result as re.split(r'\s+', text), but using the compiled object.
regex.split(text)

['foo', 'bar', 'baz', 'qux']

In [8]:
# The compiled whitespace regex works on any string: split a sentence into words.
sentence = "The quick brown fox jumped over the lazy dog"
regex.split(sentence)

['The', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog']

In [9]:
# findall returns a list of every substring of `text` matched by the pattern.
# This relies on `regex` and `text` still holding the whitespace pattern and the
# "foo bar baz qux" string from the cells above, so the matches are the
# whitespace runs themselves (the separators that split() threw away).
regex.findall(text)

['    ', '\t ', '  \t']

In [13]:
# Sample text: one "Name address" pair per line.
# NOTE: this cell rebinds `text` and `regex`, replacing the values used by the
# whitespace-splitting example in the earlier cells -- run the notebook in order.
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""


# Pattern for detecting e-mail addresses, piece by piece:
#   [A-Z0-9._%+-]+   local part (everything before the @)
#   @                literal at-sign
#   [A-Z0-9.-]+      domain name, possibly with sub-domains
#   \.[A-Z]{2,}      a dot followed by a top-level domain of two or more letters
# The top-level domain has no upper length limit, so long TLDs such as
# ".museum" are matched in full rather than being cut off after four letters.

pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}'

# re.IGNORECASE makes the regex case-insensitive, so the A-Z classes above
# also match lowercase letters.
regex = re.compile(pattern, flags=re.IGNORECASE)

In [11]:
# Pull every e-mail address out of the text; findall returns them as a list
# of strings, in the order they appear.
emails_found = regex.findall(text)
emails_found

['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

In [12]:
# sub() returns a new string in which each match of the pattern is replaced
# by the given text; the original `text` is left unchanged.
text_redacted = regex.sub('REDACTED', text)
print(text_redacted)

Dave REDACTED
Steve REDACTED
Rob REDACTED
Ryan REDACTED



In [14]:
# search() scans the text and stops at the first match, returning a match
# object that records where in the text the match was found.
m = regex.search(text)
print(m)

# Slice the original text with the match's (start, end) span to recover
# the matched substring.
start, end = m.span()
text[start:end]

<re.Match object; span=(5, 20), match='dave@google.com'>


'dave@google.com'