# Manipulating Text with Regular Expressions

In [1]:
#Now to manipulate text we are going to use this library
import re

In [2]:
# A first example: check whether a word occurs anywhere in a sentence.
text = 'This is a good day'
# re.search returns a Match object (truthy) when the pattern is found
# and None (falsy) otherwise.
print("Thats lit" if re.search('good', text) else "Alas :(")


Thats lit


In [3]:
# Another example: cut the sentence at every occurrence of "Amy".
text = 'Amy works diligently. Amy gets good grades. Our student Amy is succesful.'
# The leading '' in the result appears because the text starts with the separator.
re.split(pattern="Amy", string=text)

['',
 ' works diligently. ',
 ' gets good grades. Our student ',
 ' is succesful.']

In [4]:
# List every occurrence of "Amy" in the text.
[hit.group() for hit in re.finditer("Amy", text)]

['Amy', 'Amy', 'Amy']

In [5]:
# The ^ character anchors a pattern to the beginning of the string:
# "^Amy" only matches when the text starts with "Amy".
re.search(pattern="^Amy", string=text)

<re.Match object; span=(0, 3), match='Amy'>

In [86]:
# Likewise, the $ character anchors a pattern to the end of the string:
# "Amy$" only matches when the text ends with "Amy". When it does not,
# re.search returns None and the notebook displays no output.
re.search(pattern="Amy$", string=text)

## Patterns and Characters

In [7]:
# Sample data: a string with one letter per grade.
grades = "ACAAAABCBCBAA"
# How many B's are in the grade list? Collect each one; the length of
# the resulting list is the count.
[hit.group() for hit in re.finditer("B", grades)]

['B', 'B', 'B']

In [8]:
# A character class [AB] matches a single character that is either A or B,
# so this collects every A and every B in the grade list.
re.compile("[AB]").findall(grades)

['A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'A', 'A']

In [9]:
# Classes can be chained: an A immediately followed by one character in
# the range B-C (that is, a B or a C).
re.findall(pattern="[A][B-C]", string=grades)

['AC', 'AB']

In [10]:
# The same question written with alternation: "AB" or "AC".
[hit.group() for hit in re.finditer("AB|AC", grades)]

['AC', 'AB']

In [11]:
# Inside a character class a leading ^ negates it: [^A] matches any
# single character that is not an A.
re.compile("[^A]").findall(grades)

['C', 'B', 'C', 'B', 'C', 'B']

In [12]:
# Inside a character class $ is an ordinary character, not an anchor:
# [A$] matches a single 'A' or a literal '$'.
re.findall(pattern="[A$]", string=grades)

['A', 'A', 'A', 'A', 'A', 'A', 'A']

In [13]:
# Outside the class ^ is the start-of-string anchor, inside it negates:
# "^[^A]" matches only if the string begins with something other than A.
# The result is an empty list when the string begins with A.
re.compile("^[^A]").findall(grades)

[]

## Quantifiers

In [14]:
# Quantifiers control how many times a pattern may repeat: e{m,n} matches
# the pattern e at least m and at most n times in a row. Here: runs of
# two to ten consecutive A's.
[run.group() for run in re.finditer("A{2,10}", grades)]

['AAAA', 'AA']

In [15]:
# Runs of two or three A's; the quantifier is greedy, so a longer run is
# consumed three characters at a time.
re.compile("A{2,3}").findall(grades)

['AAA', 'AA']

In [16]:
# Exactly one A followed by exactly one A, i.e. the pair "AA".
re.findall(pattern="A{1,1}A{1,1}", string=grades)

['AA', 'AA', 'AA']

In [17]:
# Exactly two A's in a row, written with a single quantifier.
[pair.group() for pair in re.finditer("A{2,2}", grades)]

['AA', 'AA', 'AA']

In [18]:
# The same pair spelled out literally, with no quantifier at all.
re.compile("AA").findall(grades)

['AA', 'AA', 'AA']

In [19]:
# A run of A's (1-10), then a run of B's (1-10), then a run of C's (1-10).
re.findall(pattern="A{1,10}B{1,10}C{1,10}", string=grades)

['AAAABC']

### Reading Text

In [20]:
# Load the whole text file into a single string; the with-statement
# closes the file as soon as the read is done.
with open("Datasets/ferpa.txt", mode="r") as ferpa_file:
    wiki = ferpa_file.read()
# Display the raw text.
wiki

'Overview[edit]\nFERPA gives parents access to their child\'s education records, an opportunity to seek to have the records amended, and some control over the disclosure of information from the records. With several exceptions, schools must have a student\'s consent prior to the disclosure of education records after that student is 18 years old. The law applies only to educational agencies and institutions that receive funds under a program administered by the U.S. Department of Education.\n\nOther regulations under this act, effective starting January 3, 2012, allow for greater disclosures of personal and directory student identifying information and regulate student IDs and e-mail addresses.[2] For example, schools may provide external companies with a student\'s personally identifiable information without the student\'s consent.[2]\n\nExamples of situations affected by FERPA include school employees divulging information to anyone other than the student about the student\'s grades o

In [21]:
# In this text every section header is followed by the marker "[edit]".
# Match 1-100 ASCII letters immediately followed by that literal marker.
# The pattern is a raw string so that \[ and \] reach the regex engine
# unchanged instead of being treated as (invalid) Python string escapes.
re.findall(r"[a-zA-Z]{1,100}\[edit\]", wiki)
# This returns the last word of each header together with the marker.

['Overview[edit]', 'records[edit]', 'records[edit]']

In [22]:
# The previous pattern only accepts letters, not digits. The metacharacter
# \w matches any word character (letters, digits and underscore).
# Raw string: keeps \w, \[ and \] from being read as Python string escapes.
re.findall(r"[\w]{1,100}\[edit\]", wiki)

['Overview[edit]', 'records[edit]', 'records[edit]']

In [23]:
# The pattern can be shortened by replacing the curly-bracket quantifier
# with *. Note that * means "zero or more", so unlike {1,100} it also
# allows the word part to be empty.
# Raw string: keeps \w, \[ and \] from being read as Python string escapes.
re.findall(r"[\w]*\[edit\]", wiki)

['Overview[edit]', 'records[edit]', 'records[edit]']

In [24]:
# To capture the words that precede the last one, add a space to the
# character class so that a match can span several words.
# Raw string: keeps \w, \[ and \] from being read as Python string escapes.
re.findall(r"[\w ]*\[edit\]", wiki)
# This gives the list of section titles, each still ending in "[edit]".

['Overview[edit]',
 'Access to public records[edit]',
 'Student medical records[edit]']

In [25]:
# Clean the titles: split each match at the opening bracket and keep only
# the part in front of it, which drops the "[edit]" marker.
# Both patterns are raw strings so the backslashes are passed to the regex
# engine unchanged instead of being read as Python string escapes.
for title in re.findall(r"[\w ]*\[edit\]", wiki):
    print(re.split(r"\[", title)[0])

Overview
Access to public records
Student medical records


## Groups

In [26]:
# Parentheses create groups. When a pattern contains several groups,
# re.findall returns one tuple per match with one entry per group.
# Raw string: keeps \w, \[ and \] from being read as Python string escapes.
re.findall(r"([\w ]*)(\[edit\])", wiki)

[('Overview', '[edit]'),
 ('Access to public records', '[edit]'),
 ('Student medical records', '[edit]')]

In [27]:
# re.finditer yields Match objects one at a time; .groups() returns the
# tuple of all captured groups for that match.
# Raw string: keeps \w, \[ and \] from being read as Python string escapes.
for item in re.finditer(r"([\w ]*)(\[edit\])", wiki):
    print(item.groups())

('Overview', '[edit]')
('Access to public records', '[edit]')
('Student medical records', '[edit]')


In [28]:
# .group(1) returns only the first captured group, i.e. the header text
# without the "[edit]" marker (group(0) would be the whole match).
# Raw string: keeps \w, \[ and \] from being read as Python string escapes.
for item in re.finditer(r"([\w ]*)(\[edit\])", wiki):
    print(item.group(1))

Overview
Access to public records
Student medical records


## Example: Wikipedia Data

In [29]:
# Load the second text file into a single string (this rebinds `wiki`).
with open("Datasets/buddhist.txt", mode="r") as buddhist_file:
    wiki = buddhist_file.read()
# Display the raw text.
wiki

'Buddhist universities and colleges in the United States\nFrom Wikipedia, the free encyclopedia\nJump to navigationJump to search\n\nThis article needs additional citations for verification. Please help improve this article by adding citations to reliable sources. Unsourced material may be challenged and removed.\nFind sources: "Buddhist universities and colleges in the United States" – news · newspapers · books · scholar · JSTOR (December 2009) (Learn how and when to remove this template message)\nThere are several Buddhist universities in the United States. Some of these have existed for decades and are accredited. Others are relatively new and are either in the process of being accredited or else have no formal accreditation. The list includes:\n\nDhammakaya Open University – located in Azusa, California, part of the Thai Wat Phra Dhammakaya[1]\nDharmakirti College – located in Tucson, Arizona Now called Awam Tibetan Buddhist Institute (http://awaminstitute.org/)\nDharma Realm Buddh

In [30]:
# Verbose mode in Python regexes: with re.VERBOSE, unescaped whitespace in
# the pattern is ignored and everything from '#' to the end of a line is a
# comment, so a long pattern can be laid out and annotated line by line.
# Literal spaces therefore have to be escaped as "\ ".
# The pattern is a raw string so that "\ " and "\w" are passed to the regex
# engine unchanged instead of being read as (invalid) Python string escapes;
# the text of the pattern itself is unchanged.
pattern = r"""
(?P<title>.*)          #the  university title
(–\ located\ in\ )     #an indicator if the location
(?P<city>\w*)          #city the university is in
(,\ )                  #separator for the state
(?P<state>\w*)         #the state the city is located in """

# groupdict() returns only the named groups (title, city, state) per match.
for item in re.finditer(pattern,wiki,re.VERBOSE):
    print(item.groupdict())

{'title': 'Dhammakaya Open University ', 'city': 'Azusa', 'state': 'California'}
{'title': 'Dharmakirti College ', 'city': 'Tucson', 'state': 'Arizona'}
{'title': 'Dharma Realm Buddhist University ', 'city': 'Ukiah', 'state': 'California'}
{'title': 'Ewam Buddhist Institute ', 'city': 'Arlee', 'state': 'Montana'}
{'title': 'Institute of Buddhist Studies ', 'city': 'Berkeley', 'state': 'California'}
{'title': 'Maitripa College ', 'city': 'Portland', 'state': 'Oregon'}
{'title': 'University of the West ', 'city': 'Rosemead', 'state': 'California'}
{'title': 'Won Institute of Graduate Studies ', 'city': 'Glenside', 'state': 'Pennsylvania'}


## Regex Practice Section

In [121]:
# Match whole words that begin with a vowel and end with a consonant.
# \b word boundaries keep the match from starting in the middle of a word
# ("anana" inside "banana") or from stopping before its real end ("appl"
# inside "apple"); the final class lists consonants explicitly so that
# punctuation such as "," can never be taken as the last character.
words = "unicorn, element, banana, apple"
vowel_initial_words = re.findall(r"\b[aeiou][a-z]*[b-df-hj-np-tv-z]\b", words)
vowel_initial_words

['unicorn', 'element']

In [77]:
# Match complete octal (0o...) and hexadecimal (0x...) literals.
# The \b boundaries require the literal to be a whole token, so a malformed
# value such as "0o12f" is rejected instead of being partially matched as
# "0o12". Hex digits are accepted in either case.
unities = "0o112, 0o237, 0o07, 0xf3, 0x1d, 0x072, 1233, 0o12f"
radix_literals = re.findall(r"\b(?:0o[0-7]+|0x[0-9a-fA-F]+)\b", unities)
radix_literals

['0o112', '0o237', '0o07', '0xf3', '0x1d', '0x072']

In [111]:
# Match a capitalised first name and last name at the very beginning of
# the text. Without re.MULTILINE, ^ anchors only at the start of the whole
# string, so at most one match is returned.
names = "Jane Doe is eating breakfast. L Zhang is a great Broadway actor. Birthday Jhon Doe's Today"
re.compile("^[A-Z][a-z]* [A-Z][a-z]*").findall(names)

['Jane Doe']

In [214]:
# Match prices like $3.45, $23.32 or $400, but not $.23 or $400.1.
#   \$\d+          dollar sign followed by at least one digit (rejects $.23)
#   (?:\.\d{2})?   optional cents: a dot and exactly two digits
#   (?!\.?\d)      the price must not continue with another digit or with
#                  ".digit"; this rejects $400.1 as a whole instead of
#                  returning the truncated "$400"
# Raw string: keeps \$, \d and \. from being read as Python string escapes.
price = "$3.45 or $23.32 or $40 or $.23 or $400.1"
valid_prices = re.findall(r"\$\d+(?:\.\d{2})?(?!\.?\d)", price)
valid_prices

['$3.45', '$23.32', '$40']

In [235]:
# Match simple e-mail addresses (a teaching pattern, not a full validator).
#   (?<![\w.])     do not start in the middle of a longer local part
#   \w+(?:\.\w+)*  local part: words separated by single dots, so the
#                  doubled dot in "ab..abc@def.com" is rejected
#   @\w+(?:\.\w+)+ domain: at least one dot-separated suffix, so "abc@def"
#                  and "abc@ def." are rejected
# Raw string: keeps \w and \. from being read as Python string escapes.
emails = "abc@umich.edu, 8ab.c_def9@example.regex.com, abc@ def., ab..abc@def.com, abc@def"
email_addresses = re.findall(r"(?<![\w.])\w+(?:\.\w+)*@\w+(?:\.\w+)+", emails)
email_addresses

['abc@umich.edu', '8ab.c_def9@example.regex.com']

In [241]:
# Match simple host names such as www.aBC.com or abc.com (a teaching
# pattern, not a full URL validator).
# The original "[www\.]*" was a character class (any run of 'w' or '.'),
# not an optional "www." prefix, and the pattern stopped after the first
# dot-separated label. Here a host is one or more word-character labels
# joined by single dots:
#   (?<![\w.])   do not start in the middle of a label or right after a dot
#   \w+          first label (this also covers "www")
#   (?:\.\w+)+   one or more further ".label" parts; "abc" alone and the
#                doubled dot in "abc..com" are rejected
urls = "www.aBC.com, abc.com, ab_c.de8f.com, abc, abc..com"
url_matches = re.findall(r"(?<![\w.])\w+(?:\.\w+)+", urls)
url_matches

['www.aBC.com', 'abc.com', 'ab_c.de8f.com']

In [247]:
# Match hyphenated ISBN-10 numbers: group - publisher - title - check digit.
# The first three parts vary in length, so fixed widths such as \d{3} cut
# off the leading digit of "9971-5-0210-0". Variable-width groups plus \b
# boundaries match each number as a whole. The check digit may be 'X'.
# NOTE: the total digit count and the checksum are not validated here.
isbn = "9971-5-0210-0, 960-425-059-0"
isbn_matches = re.findall(r"\b\d{1,5}-\d{1,7}-\d{1,6}-[\dX]\b", isbn)
isbn_matches

['9971-5-0210-0', '960-425-059-0']

In [256]:
# Match every DOI of the form "doi:10.<registrant>/<suffix>".
# The original pattern started with ^, which anchors at the beginning of
# the string, so only the first DOI could ever be found. \b is used instead
# so that each "doi:" token in the text is matched.
#   10\.\d{4,9}             DOI prefix: "10." plus a 4-9 digit registrant code
#   /[-._;()/:A-Za-z0-9]+   suffix; the class excludes ',' and whitespace, so
#                           a match stops at the separator between entries
doi = "doi:10.1038/nphys1170, doi:10.1002/0470841559.ch1"
doi_matches = re.findall(r"\bdoi:10\.\d{4,9}/[-._;()/:A-Za-z0-9]+", doi)
doi_matches

['doi:10.1038/nphys1170', 'doi:10.1002/0470841559.ch1']

# Quiz 1

In [260]:
# Quiz: which of the words match "b", then an "a" or an "o", then "t"?
string = "bat, bot, bit, bet"
result = re.compile("b[ao]t").findall(string)
print(result)

['bat', 'bot']


In [272]:
# Quiz: how many matches does "A{1,2}" (one or two consecutive A's) produce?
# re.findall returns the list of matches; len() turns it into the count
# that the quiz asks for.
s = 'ACAABAACAAAB'
match_count = len(re.findall("A{1,2}", s))
match_count

5

In [271]:
# Quiz: extract the host name that follows "http://" or "https://".
# The original lookbehind "[https]" was a character class (a single one of
# the letters h, t, p, s), not the scheme. "https?" spells out the scheme
# with an optional "s", and the capturing group makes re.findall return
# only the host part. Raw string: no backslash escapes are needed at all.
domain = 'I refer to https://google.com and I never refer http://www.baidu.com if I have to search anything'
domain_names = re.findall(r"https?://([A-Za-z0-9.]+)", domain)
domain_names

['google.com', 'www.baidu.com']