In [1]:
# Import the regular expressions library re
import re

In [2]:
# re.match() only looks for the pattern at the very start of the string,
# whereas re.search() scans the whole string for the first occurrence.
# Both return a re.Match object on success and None otherwise, so the
# result can be used directly as a truth value.

# Let's see if it's a good day
text = 'This is a good day'

verdict = 'Wonderful' if re.search('good', text) else "Alas :_("
print(verdict)

Wonderful


In [3]:
# Regex can also cut a string into pieces around a pattern, a process
# known as tokenizing.

# findall() and split() both hand back chunks of the text as a list.
text = (
    "Julia works diligently. "
    "Julia gets good grades. "
    "Our student Julia is succesful."
)

# Split the text at every instance of Julia
separator = "Julia"
re.split(separator, text)

['',
 ' works diligently. ',
 ' gets good grades. Our student ',
 ' is succesful.']

In [4]:
# split() gave back an empty string (the text before the first "Julia")
# followed by the statements about Julia, all as elements of a list.
# To count how many times Julia was mentioned we need findall()
mentions = re.findall('Julia', text)
print(mentions)
len(mentions)

['Julia', 'Julia', 'Julia']


3

In [5]:
# In short, search() looks for a pattern and returns a match object (or None)
# split() uses a pattern to create a list of substrings
# findall() looks for a pattern and extracts all occurrences

In [6]:
# More complex patterns rely on the special characters of the regex
# syntax.

# Anchors pin a pattern to the start and/or end of the string:
# ^ stands for the start and $ for the end.
# '^abc' only matches when the string starts with abc
# 'abc$' only matches when the string ends with abc
text = (
    "Julia works diligently. "
    "Julia gets good grades. "
    "Our student Julia is succesful."
)

# Does it begin with Julia?
starts_with_julia = '^Julia'
re.search(starts_with_julia, text)

<re.Match object; span=(0, 5), match='Julia'>

In [7]:
# The returned object, called a re.Match object, always has a
# boolean value of True, and so it can be used in if-statements

### Patterns and character classes

In [8]:
# A string of grades given across a semester
grades = 'ACAAAABCBCBAA'

# Every B obtained is found by using the literal B as the pattern;
# the length of the resulting list is the number of B's
b_grades = re.findall('B', grades)
print(b_grades)

# To get both the A's and the B's, list the accepted characters
# inside square brackets
a_or_b_grades = re.findall('[AB]', grades)
print(a_or_b_grades)

['B', 'B', 'B']
['A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'A', 'A']


In [9]:
# Square brackets define a character set: the pattern matches any ONE
# of the characters listed inside it.
# A set may also hold a range of consecutive characters,
# e.g. [a-z] is the range of all lowercase letters.
# Ranges are case-sensitive and follow character-code order, so [B-C]
# is exactly B or C, whereas [B-c] would also accept every character
# between 'B' and 'c' (D-Z, some punctuation, a, b and c).

# Extract every case of an A followed directly by a B or a C
re.findall('[A][B-C]', grades)

['AC', 'AB']

In [10]:
# So [AB] denotes a single set of possible characters, A OR B
# (inclusive OR), while [A][B-C] denotes two consecutive sets:
# an A, followed by a B OR a C

# Spelled out by hand with the pipe operator |, which means OR
a_then_b_or_c = 'AB|AC'
print(re.findall(a_then_b_or_c, grades))

# The pipe also combines more complex patterns, like (A OR B) OR (B OR C)
any_of_both_sets = '[AB]|[BC]'
print(re.findall(any_of_both_sets, grades))

['AC', 'AB']
['A', 'C', 'A', 'A', 'A', 'A', 'B', 'C', 'B', 'C', 'B', 'A', 'A']


In [11]:
# A ^ placed first inside [] negates the set

# Get all grades different from A
not_an_a = '[^A]'
re.findall(not_an_a, grades)

['C', 'B', 'C', 'B', 'C', 'B']

In [12]:
# As shown, a symbol like ^ takes on a different meaning inside [].

# [^A] matches any single character that is not an A, while a ^
# outside the brackets anchors the match to the start of the string.
# So ^[^A] asks: "is the first character something other than A?"
# grades starts with an A, so there is no match and findall() returns
# an empty list.
first_is_not_a = '^[^A]'
re.findall(first_is_not_a, grades)

[]

### Quantifiers

In [13]:
# A quantifier sets how many times a pattern has to be matched.
# The most basic one is e{m,n}: "e" is the pattern, m the minimum
# number of repetitions and n the maximum.

# Which streaks of A's (between 2 and 10 long) does Julia have?
a_streak = 'A{2,10}'
re.findall(a_streak, grades)

['AAAA', 'AA']

In [14]:
# The same idea works by quantifying a single match and repeating it

# Find the non-overlapping pairs of consecutive A's
pair_of_as = 'A{1,1}A{1,1}'
re.findall(pair_of_as, grades)

['AA', 'AA', 'AA']

In [15]:
# The e{m,n} syntax must be written exactly as shown: with a space
# added, as in e{m, n}, the pattern no longer behaves as a quantifier
# and, as the result below shows, an empty list comes back
spaced_quantifier = 'A{2, 10}'
re.findall(spaced_quantifier, grades)

[]

In [16]:
# e{1,1} is the same as plain e, i.e. using no quantifier at all,
# and so e{1,1}e{1,1} = ee
plain_pair = re.findall('AA', grades)
print(plain_pair)

# Similarly, e{m} is shorthand for e{m,m}
quantified_pair = re.findall('A{1}A{1}', grades)
print(quantified_pair)

['AA', 'AA', 'AA']
['AA', 'AA', 'AA']


In [17]:
# This lets us, e.g., spot a decreasing trend in grades: a run of A's,
# then a run of B's, then a run of C's
decreasing_trend = 'A{1,10}B{1,10}C{1,10}'
re.findall(decreasing_trend, grades)

['AAAABC']

In [18]:
# The previous pattern is a bit of a hack, since the maximum value
# chosen (10) is arbitrary.
# Shorthand symbols replace {m,n}: * matches 0 or more times,
# ? matches 0 or 1 times, and + matches 1 or more times

# Let's see an example using data scraped from Wikipedia.
# The encoding is passed explicitly so that the text is decoded the same
# way on every platform instead of depending on the locale default.
# NOTE(review): assumes the file is UTF-8 like the other datasets
# loaded in this notebook - confirm.
with open('datasets/ferpa.txt', 'r', encoding='utf8') as file:
    wiki = file.read()

wiki

'Overview[edit]\nFERPA gives parents access to their child\'s education records, an opportunity to seek to have the records amended, and some control over the disclosure of information from the records. With several exceptions, schools must have a student\'s consent prior to the disclosure of education records after that student is 18 years old. The law applies only to educational agencies and institutions that receive funds under a program administered by the U.S. Department of Education.\n\nOther regulations under this act, effective starting January 3, 2012, allow for greater disclosures of personal and directory student identifying information and regulate student IDs and e-mail addresses.[2] For example, schools may provide external companies with a student\'s personally identifiable information without the student\'s consent.[2]\n\nExamples of situations affected by FERPA include school employees divulging information to anyone other than the student about the student\'s grades o

In [19]:
# Notice that every header ends with the word [edit], followed
# by a newline character.

# With this info, we can extract all the headers.

# Extract up to 100 characters, lowercase a-z or uppercase A-Z,
# as long as they're followed by [edit].
# The pattern is a raw string (r'...') so that \[ and \] reach the regex
# engine untouched; in a normal string literal they are invalid escape
# sequences that Python warns about.
re.findall(r'[a-zA-Z]{1,100}\[edit\]', wiki)

['Overview[edit]', 'records[edit]', 'records[edit]']

In [20]:
# This only partially worked: it only got the last word
# of each header, and the pattern is kind of clunky.
# Metacharacters, which stand for whole classes of characters,
# help to improve it, e.g. \s matches any whitespace character.

# \w matches any "word" character (a letter, a digit or an underscore),
# which condenses the pattern. Raw strings (r'...') keep the backslashes
# intact for the regex engine.
print(re.findall(r'\w{1,100}\[edit\]', wiki))

# Condense it even more by using * to match any number of times
print(re.findall(r'\w*\[edit\]', wiki))

# To now improve the result, also allow spaces between the words
print(re.findall(r'[\w ]*\[edit\]', wiki))

['Overview[edit]', 'records[edit]', 'records[edit]']
['Overview[edit]', 'records[edit]', 'records[edit]']
['Overview[edit]', 'Access to public records[edit]', 'Student medical records[edit]']


In [21]:
# To build a list of titles without [edit], iterate over the matches
# and split each one at the opening bracket; the first piece returned
# by split() is the title itself.
# Raw strings keep the backslash escapes intact for the regex engine.
for title in re.findall(r'[\w ]*\[edit\]', wiki):
    print(re.split(r'\[', title)[0])

Overview
Access to public records
Student medical records


### Groups

In [23]:
# So far we've focused on single patterns, but several patterns can be
# grouped together and referred to individually when needed.
# Groups are created with parentheses (); the raw string keeps the
# backslash escapes intact for the regex engine.
re.findall(r'([\w ]*)(\[edit\])', wiki)

[('Overview', '[edit]'),
 ('Access to public records', '[edit]'),
 ('Student medical records', '[edit]')]

In [26]:
# Notice that findall() breaks each result up by group; the groups and
# their matches can be referred to by number.

# But how do we get the match objects themselves? So far findall() only
# returned strings, and search() and match() return a single match.

# finditer() returns an iterator that yields one match object per match.
# The pattern is defined once (as a raw string) and reused by both loops.
header_pattern = r'([\w ]*)(\[edit\])'
for item in re.finditer(header_pattern, wiki):
    print(item.groups())

print()

# group(number) retrieves a specific group: group(0) is the whole match,
# and group(1), group(2), ... are the parenthesised portions
for item in re.finditer(header_pattern, wiki):
    print(item.group(1))

('Overview', '[edit]')
('Access to public records', '[edit]')
('Student medical records', '[edit]')

Overview
Access to public records
Student medical records


In [47]:
# Groups can also be labelled and then referred to by name.
# This is written (?P<name>...): the () mark a group, ?P flags an
# extension to basic regex syntax, and <name> is the label that becomes
# the key when the groups are returned as a dictionary.
# The pattern is defined once (as a raw string) and reused by both loops.
labelled_pattern = r'(?P<header>[\w ]*)(?P<edit_link>\[edit\])'
for item in re.finditer(labelled_pattern, wiki):
    print(item.group('header'))

print()

# groupdict() returns the named groups of a match as a dictionary
for item in re.finditer(labelled_pattern, wiki):
    print(item.groupdict())

Overview
Access to public records
Student medical records

{'header': 'Overview', 'edit_link': '[edit]'}
{'header': 'Access to public records', 'edit_link': '[edit]'}
{'header': 'Student medical records', 'edit_link': '[edit]'}


In [48]:
# Other common metacharacters like \w are:
# a . for any single character which is not a newline
# a \d for any digit
# a \s for any whitespace character, e.g. spaces and tabs

### Look-ahead and Look-behind

In [44]:
# These matching methods let us isolate text by describing what comes
# right before or after it, without including that context in the match.

# In the header example, we wanted the text before [edit] but didn't
# care about [edit] itself, so we filtered it away afterwards.
# A more direct method is a look-ahead, written (?=...): the pattern
# inside it must follow the match but is not made part of it.
# The raw string keeps the backslash escapes intact for the regex engine.
for item in re.finditer(r'(?P<headers>[\w ]+)(?=\[edit\])', wiki):
    print(item)

<re.Match object; span=(0, 8), match='Overview'>
<re.Match object; span=(2715, 2739), match='Access to public records'>
<re.Match object; span=(3692, 3715), match='Student medical records'>


### Example: Wikipedia data

In [62]:
# Next dataset: text about Buddhist universities in the US.
# Read the whole file into wiki and display it.
with open('datasets/buddhist.txt', mode='r', encoding='utf8') as file:
    wiki = file.read()
wiki

'Buddhist universities and colleges in the United States\nFrom Wikipedia, the free encyclopedia\nJump to navigationJump to search\n\nThis article needs additional citations for verification. Please help improve this article by adding citations to reliable sources. Unsourced material may be challenged and removed.\nFind sources: "Buddhist universities and colleges in the United States" – news · newspapers · books · scholar · JSTOR (December 2009) (Learn how and when to remove this template message)\nThere are several Buddhist universities in the United States. Some of these have existed for decades and are accredited. Others are relatively new and are either in the process of being accredited or else have no formal accreditation. The list includes:\n\nDhammakaya Open University – located in Azusa, California, part of the Thai Wat Phra Dhammakaya[1]\nDharmakirti College – located in Tucson, Arizona Now called Awam Tibetan Buddhist Institute (http://awaminstitute.org/)\nDharma Realm Buddh

In [66]:
# Notice the pattern after each mention of a university:
# an en dash '–' followed by 'located in [city], [state]'

# To increase readability and spread a regex over several lines we can
# use the so-called verbose mode. In this mode whitespace in the pattern
# is ignored, so every space that must be matched is written explicitly,
# either escaped with a backslash or as the metacharacter \s.
# It makes writing a regex similar to coding, and # starts a comment.
# The pattern is a raw string so that "\ " and "\w" keep their backslash
# instead of being treated as (invalid) string escape sequences.
pattern = r"""
(?P<title>.*)        #the university title
(–\ located\ in\ )   #an indicator of the location
(?P<city>\w*)        #city the university is in
(,\ )                #separator for the state
(?P<state>\w*)       #the state the city is located in"""

# When calling finditer() we just pass the re.VERBOSE flag
# as the flags argument
for item in re.finditer(pattern, wiki, re.VERBOSE):
    print(item.group())

Dhammakaya Open University – located in Azusa, California
Dharmakirti College – located in Tucson, Arizona
Dharma Realm Buddhist University – located in Ukiah, California
Ewam Buddhist Institute – located in Arlee, Montana
Institute of Buddhist Studies – located in Berkeley, California
Maitripa College – located in Portland, Oregon
University of the West – located in Rosemead, California
Won Institute of Graduate Studies – located in Glenside, Pennsylvania


### New York Times and hashtags

In [85]:
# Dataset of NYT tweets about health related news, taken from the
# UC Irvine machine learning repository
with open('datasets/nytimeshealth.txt', mode='r', encoding='utf8') as file:
    nytimes = file.read()
    

In [96]:
# Let's find the hashtags in this data and count them.
# A hashtag begins with # and continues until a whitespace character:
# \w+ requires at least one word character after the # (\w already
# includes digits), and the look-ahead (?=\s) checks for the whitespace
# without making it part of the match.
pattern = r'#\w+(?=\s)'

# Collect the matches once and reuse them for printing and counting
hashtags = re.findall(pattern, nytimes)
for hashtag in hashtags:
    print(hashtag)

print()

# What is being counted here are hashtags, not tweets
print("There are", len(hashtags), "hashtags")

#askwell
#pregnancy
#Colorado
#VegetarianThanksgiving
#FallPrevention
#Ebola
#Ebola
#ebola
#Ebola
#Ebola
#EbolaHysteria
#AskNYT
#Ebola
#Ebola
#Liberia
#Excalibur
#ebola
#Ebola
#dallas
#nobelprize2014
#ebola
#ebola
#monrovia
#ebola
#nobelprize2014
#ebola
#nobelprize2014
#Medicine
#Ebola
#Monrovia
#Ebola
#smell
#Ebola
#Ebola
#Ebola
#Monrovia
#Ebola
#ebola
#monrovia
#liberia
#benzos
#ClimateChange
#Whole
#Wheat
#Focaccia
#Tomatoes
#Olives
#Recipes
#Health
#Ebola
#Monrovia
#Liberia
#Ebola
#Ebola
#Liberia
#Ebola
#blood
#Ebola
#organtrafficking
#EbolaOutbreak
#SierraLeone
#Freetown
#SierraLeone
#ebolaoutbreak
#kenema
#ebola
#Ebola
#ebola
#ebola
#Ebola
#ASMR
#AIDS2014
#AIDS
#MH17
#benzos

There are 75 tweets
