# Data Cleansing

***

## Regular Expressions

***

https://docs.python.org/3/library/re.html

https://realpython.com/regex-python/

https://developers.google.com/edu/python/regular-expressions

## Python's re module

***


In [1]:
import re

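```python
# For ASCII-only matching (bytes patterns, or str patterns with re.ASCII):
#   \w  is equivalent to  [a-zA-Z0-9_]
#   \W  is equivalent to  [^a-zA-Z0-9_]
# For ordinary str patterns, \w also matches other Unicode word characters.
```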

In [2]:
# Source text that will be broken into words.
original = 'Words, words, words.'

# Delimiter: one or more consecutive non-word characters.
pattern = r'\W+'

# Compile the delimiter and split the text wherever it matches.
# The final '.' is itself a delimiter, so an empty string follows it.
result = re.compile(pattern).split(original)

# Show the pieces.
print(result)

['Words', 'words', 'words', '']


In [3]:
# Source text that will be broken into words.
original = 'Words, words, words.'

# Same delimiter as before, but wrapped in a capture group.
pattern = r'(\W+)'

# Because the delimiter is captured, the matched separators are
# returned in the result list alongside the words.
result = re.split(pattern=pattern, string=original)

# Show the pieces.
print(result)

['Words', ', ', 'words', ', ', 'words', '.', '']


In [4]:
# maxsplit is passed by keyword: passing it positionally is deprecated since Python 3.13.
# With maxsplit=1 only the first separator is used; the remainder is returned unsplit.
re.split(r'\W+', 'Words, words, words.', maxsplit=1)

['Words', 'words, words.']

In [5]:
# Raw-string pattern, consistent with the other regex examples in this notebook.
# IGNORECASE makes [a-f] match 'B' as well, so both 'a' and 'B' act as separators.
re.split(r'[a-f]+', '0a3B9', flags=re.IGNORECASE)

['0', '3', '9']

## Real Python

***

In [6]:
# Two strings are equal only if they contain the same characters in the same order.
'abccba' == 'abccba'

True

In [7]:
# Same characters but in a different order, so the strings are not equal.
'abccba' == 'cbaabc'

False

In [8]:
# ``in`` tests whether the left string occurs anywhere inside the right one.
'abc' in 'cbaabc'

True

In [9]:
# index() gives the zero-based position of the first occurrence of the substring.
'cbaabc'.index('a')

2

In [10]:
# Indexing at the position found by index() returns that character.
'cbaabc'[2]

'a'

In [11]:
# find() returns the lowest index at which the substring starts (-1 if it is absent).
'cbaaabc'.find('aa')

2

In [12]:
# Text to search.
s = 'foo123bar'

# Raw-string pattern, consistent with the other regex examples in this notebook.
# search() scans the string and returns a Match object for the first occurrence.
re.search(r'123', s)

<re.Match object; span=(3, 6), match='123'>

In [13]:
# Slicing s with the span (3, 6) reported by re.search recovers the matched text.
s[3:6]

'123'

In [14]:
# Text to search.
s = 'foo123bar'
# [0-9] matches any single digit, so three in a row match any three consecutive digits.
re.search(r'[0-9][0-9][0-9]', s)

<re.Match object; span=(3, 6), match='123'>

In [15]:
# The same pattern finds a different three-digit run in a different string.
re.search(r'[0-9][0-9][0-9]', 'foo456bar')

<re.Match object; span=(3, 6), match='456'>

In [16]:
# The digits may sit at the very start of the string (the span starts at 0).
re.search(r'[0-9][0-9][0-9]', '234baz')

<re.Match object; span=(0, 3), match='234'>

In [17]:
# The digits may also sit at the very end of the string.
re.search(r'[0-9][0-9][0-9]', 'qux678')

<re.Match object; span=(3, 6), match='678'>

In [18]:
# No three digits are adjacent here, so search() returns None.
# print() is used so that the None is actually shown.
print(re.search(r'[0-9][0-9][0-9]', '12foo34'))

None


In [19]:
# {3} repeats the preceding [0-9] exactly three times: a shorter spelling of [0-9][0-9][0-9].
re.search(r'[0-9]{3}', 'qux678')

<re.Match object; span=(3, 6), match='678'>

## Google for Education

***

In [20]:
import re

# Use a name that does not shadow the built-in ``str`` type.
text = 'an example word:cat!!'
# 'word:' followed by exactly three word characters.
match = re.search(r'word:\w\w\w', text)
# search() returns None when nothing matches, so test the result before using it.
if match:
    print('found', match.group())  # 'found word:cat'
else:
    print('did not find')


found word:cat


In [21]:
# Text to search.
string = 'aaaabaa'
# a+ matches one or more consecutive 'a' characters.
pattern = r'a+'

# search() reports the first (leftmost) match; the quantifier is greedy, so it takes all four a's.
re.search(pattern, string)

<re.Match object; span=(0, 4), match='aaaa'>

In [22]:
# Text to search.
string = 'aaaabaa'
# a* matches zero or more consecutive 'a' characters.
pattern = r'a*'

# The string starts with a's, so the greedy match at position 0 is the same 'aaaa' as with a+.
re.search(pattern, string)

<re.Match object; span=(0, 4), match='aaaa'>

In [23]:
## i+ = one or more i's, as many as possible (the quantifier is greedy).
## The pattern needs a literal 'p' followed by at least one 'i'.
match = re.search(r'pi+', 'piiig') # found, match.group() == "piii"
## A bare name on the last line displays the Match object as the cell output.
match

<re.Match object; span=(0, 4), match='piii'>

In [24]:
## Finds the first/leftmost solution, and within it drives the +
## as far as possible (aka 'leftmost and largest').
## In this example, note that it does not get to the second set of i's:
## search() returns only the first match, which ends at the 'g'.
match = re.search(r'i+', 'piigiiii') # found, match.group() == "ii"
match

<re.Match object; span=(1, 3), match='ii'>

In [25]:
## \s* = zero or more whitespace chars
## Look for 3 digits that may have any amount of whitespace between them.
three_digits = r'\d\s*\d\s*\d'
## Expected match.group() values, in order: "1 2   3", "12  3", "123"
for sample in ('xx1 2   3xx', 'xx12  3xx', 'xx123xx'):
    match = re.search(three_digits, sample)
    print(match)

<re.Match object; span=(2, 9), match='1 2   3'>
<re.Match object; span=(2, 7), match='12  3'>
<re.Match object; span=(2, 5), match='123'>


In [26]:
## ^ = matches the start of string, so this fails:
## 'foobar' starts with 'f', not 'b'.
match = re.search(r'^b\w+', 'foobar') # not found, match == None
## match is None here, so the cell displays no output.
match

In [27]:
## but without the ^ it succeeds:
## search() is free to start the match anywhere, and finds 'b' at index 3.
match = re.search(r'b\w+', 'foobar') # found, match.group() == "bar"
match

<re.Match object; span=(3, 6), match='bar'>

In [28]:
# Use a name that does not shadow the built-in ``str`` type.
text = 'purple alice-b@google.com monkey dishwasher'
# \w does not match '-' or '.', so the match covers only 'b@google',
# not the whole e-mail address.
match = re.search(r'\w+@\w+', text)
# search() returns None when nothing matches, so test the result before using it.
if match:
    print(match.group())  # 'b@google'


b@google


## Exercise 1

***

*Remember to do these exercises in your own notebook in your assessment repository.*

Write a Python function to remove all non-alphanumeric characters from a string.

<hr style="border-top: 1px solid #001a79;" />

***

## End