# Regular Expressions 

In [1]:
# Import regular expression module

import re
import numpy as np
import math

In [None]:
# match() checks for a match only at the beginning of the string; search() checks for a match anywhere in the string.
# Both return an re.Match object (which is truthy) on success and None otherwise, so the result can be used directly in an if statement.

text = "This is just a test"

if re.search("test", text):
    print("Found")
else:
    print("Not found")


In [None]:
# In addition to checking for conditionals, we can segment a string. The work that regex does here is called tokenizing,
# where the string is separated into substrings based on patterns. Tokenizing is a core activity in natural language
# processing.

# The findall() and split() functions will parse the string for us and return chunks.

text = "Today, I am supposed to be studying for my CS361 exam."

re.split("CS361", text)

In [5]:
# If we want to count how many times CS361 was used in the string, we can use findall()

re.findall("CS361",text)



NameError: name 'text' is not defined

In [None]:
#Complex patterns
# The regex specification standard defines a markup language to describe patterns in text. Let's start with anchors.
# Anchors specify the start and/or the end of the string that you're trying to match. 
# The caret character ^ means start and the dollar symbol $ means end. 
# If you put ^ at the start of the pattern, the text the regex processor retrieves must start with the string
# you specify. For the end, you put $ after the string, which means the text
# must end with that string.

# Example
text = "Amy is diligent. Amy gets good grades. Our student amy is successful"

re.search("^Amy",text)

In [None]:
# Notice that re.search() actually returned a new object to us, called an re.Match object. An re.Match object always has a boolean value of True, as something was found, so you can always evaluate it in an if statement as we did earlier. The rendering of the match object also tells you what pattern was matched, in this case Amy, and the location of the match, shown as the span.


# Patterns and Character Classes

In [None]:
# example

grades = "ABCAAAAAABCAACDBB"

# If we want to count how many "B's" in the grade
re.findall("B",grades)

In [None]:
# If we wanted to count the number of A's or B's in the list, we can't use "AB" since this is used to match an A followed by a B. Instead we put the characters A and B inside square brackets.

re.findall("[AB]", grades)

In [None]:
# We can also include a range of characters, which are ordered alphanumerically.
# For instance, if we want to refer to all lower case letters we would use [a-z]. Below, [A][B-C] matches an A followed by a B or a C.

re.findall("[A][B-C]",grades)

In [None]:
# We can also write this by using pipe operator, which means OR
re.findall("AB|AC",grades)

In [None]:
# We can use the caret with the set operator to negate our results. For instance, if we want to parse out only the grades which were not A's
re.findall("[^A]",grades)

In [None]:
re.findall("^[^A]",grades)

# Is an empty list, because regex says that we want to match any value at the beginning of the string which is not an A. Our string though starts with an A, so there is no match found

# Quantifiers

In [None]:
# Quantifiers specify the number of times you want a pattern to be matched in order to count as a match. The most basic quantifier is expressed as e{m,n}, where e is the expression or character we are matching, m is the minimum number of times you want it to be matched, and n is the maximum number of times the item could be matched

re.findall("A{2,10}",grades) # min is 2 and 10 is max

In [None]:
# We can also try and do this by using single values and just repeating patterns

re.findall("A{1,1}A{1,1}",grades)

In [None]:
# It's important to note that the regex quantifier syntax does not allow you to deviate from the {m,n} pattern. In particular, if you have an extra space in between the braces you'll get an empty result.

re.findall("A{1, 2}",grades)

In [None]:
# And if we don't include a quantifier, the default is {1,1}
re.findall("AA",grades)

In [None]:
# And if there is only one number inside the quantifier, it's considered to be both m and n

re.findall("A{2}",grades)

In [None]:
# Using this we could find a decreasing trend in a student's grades

re.findall("A{1,10}B{1,10}C{1,10}",grades)

In [1]:
# There are three other quantifiers that are used as short hand
# an asterisk * to match 0 or more times
# a question mark ? to match 0 or 1 times
# a + sign to match one or more times

with open("txt/ferpa.txt", "r") as file:
    # we'll read that into a variable called wiki
    wiki = file.read()
wiki

'Overview[edit]\nFERPA gives parents access to their child\'s education records, an opportunity to seek to have the records amended, and some control over the disclosure of information from the records. With several exceptions, schools must have a student\'s consent prior to the disclosure of education records after that student is 18 years old. The law applies only to educational agencies and institutions that receive funds under a program administered by the U.S. Department of Education.\n\nOther regulations under this act, effective starting January 3, 2012, allow for greater disclosures of personal and directory student identifying information and regulate student IDs and e-mail addresses.[2] For example, schools may provide external companies with a student\'s personally identifiable information without the student\'s consent.[2]\n\nExamples of situations affected by FERPA include school employees divulging information to anyone other than the student about the student\'s grades o

In [4]:
# Scanning through this document, we notice that every header is followed by the text [edit] and then a newline character. To get a list of all the headers in this file we can use re.findall().
# The pattern is a raw string so that \[ and \] reach the regex engine as escaped literal brackets.

re.findall(r"[a-zA-Z]{1,100}\[edit\]", wiki)

['Overview[edit]', 'records[edit]', 'records[edit]']

In [None]:
# This method is quite clunky. Let's improve this by using \w
# \w is a metacharacter that matches any letter, digit, or underscore
# (the pattern is a raw string so the backslashes are passed to the regex engine untouched)

re.findall(r"[\w]{1,100}\[edit\]",wiki)

In [None]:
# There are actually a number of different metacharacters listed in the documentation. For instance, \s matches any whitespace character
# Using the * quantifier (zero or more repetitions) to shorten the regex, we can write:

re.findall(r"[\w]*\[edit\]", wiki)

In [None]:
# Improve the match by also allowing spaces inside the character class, so multi-word titles are captured

re.findall(r"[\w ]*\[edit\]",wiki)

In [None]:
# So this gets us the list of section titles in the Wikipedia page. We can now print just the titles by iterating over the matches and keeping the text before the "["

for title in re.findall(r"[\w ]*\[edit\]",wiki):
    print(re.split(r"[\[]",title)[0])

# Groups

In [None]:
# This works, but it is a bit of a pain. We can actually match different patterns at the same time, called groups. To group patterns together we use parentheses.

re.findall(r"([\w ]*)(\[edit\])",wiki)

In [None]:
# We can also refer to groups by number using the match objects that are returned. But how do we get a list of match objects?
# We use finditer()

for item in re.finditer(r"([\w ]*)(\[edit\])",wiki):
    print(item.groups())

In [None]:
# We see here that the groups() function returns a tuple of groups. We can get an individual group using group(number), where group(0) is the whole match and each higher number is the corresponding parenthesized group.

for item in re.finditer(r"([\w ]*)(\[edit\])",wiki):
    print(item.group(1)) # group(0) is the whole match, group(1) is the first parenthesized group, and so on

In [None]:
# We can also get matches in dictionary form. For that we use the syntax (?P<name>...), where ?P indicates that this is an extension to basic regexes, and <name> is the dictionary key we want to use, wrapped in <>

for item in re.finditer(r"(?P<Title>[\w ]*)(?P<edit_Link>\[edit\])",wiki):
    print(item.groupdict()['Title'])
# After the loop, `item` still refers to the last match; print its whole dictionary
print(item.groupdict())

# Look-Ahead and Look-Behind

In [None]:
# Look-ahead and look-behind match text that comes before or after the text we want to isolate, without including it in the match.
# The look-ahead syntax is (?=...)

for item in re.finditer(r"(?P<title>[\w ]+)(?=\[edit\])",wiki):
    print(item)

# Example Wikipedia Data

In [2]:
# Data on universities in the US which are buddhist based
with open("txt/buddhist.txt","r",encoding="utf-8", errors='ignore') as file:
    wiki = file.read()
wiki

'Buddhist universities and colleges in the United States\nFrom Wikipedia, the free encyclopedia\nJump to navigationJump to search\n\nThis article needs additional citations for verification. Please help improve this article by adding citations to reliable sources. Unsourced material may be challenged and removed.\nFind sources: "Buddhist universities and colleges in the United States" – news · newspapers · books · scholar · JSTOR (December 2009) (Learn how and when to remove this template message)\nThere are several Buddhist universities in the United States. Some of these have existed for decades and are accredited. Others are relatively new and are either in the process of being accredited or else have no formal accreditation. The list includes:\n\nDhammakaya Open University – located in Azusa, California, part of the Thai Wat Phra Dhammakaya[1]\nDharmakirti College – located in Tucson, Arizona Now called Awam Tibetan Buddhist Institute (http://awaminstitute.org/)\nDharma Realm Buddh

In [None]:
# We can see that each university follows a fairly similar pattern, with the name followed by an en dash (–), then the words "located in", followed by the city and state

# We can use the verbose mode of python regexes. Verbose mode allows us to write multi-line regexes and increases readability. In this mode we have to explicitly indicate all whitespace characters, either by prepending them with a \ or by using the \s special value. However, this means we can write our regex a bit more like code and can even include comments.

# A triple-quoted string (""") lets us write the pattern across multiple lines; the r prefix keeps the backslashes intact for the regex engine

pattern=r"""
(?P<title>.*)       #The University title key
(–\ located\ in\ )  #an indicator of the location (the page text uses an en dash, not two hyphens)
(?P<city>\w*)       #City the university is in
(,\ )               #separator for the state
(?P<state>\w*)      #the state the city is located in"""

#Now when we call finditer() we just pass the re.VERBOSE flag as a parameter; this makes it much easier to understand large regexes

for item in re.finditer(pattern,wiki,re.VERBOSE):
    print(item.groupdict())

# New York Times and Hashtags

In [None]:
with open("txt/nytimeshealth.txt","r", encoding="utf-8") as file:
    health = file.read()
health

In [None]:
# So let's create a pattern. We want to include the hash sign first, then any number of alphanumeric characters, and stop where some whitespace is found (checked with a look-ahead, so the whitespace is not part of the match)
# The pattern is a raw string so that \w, \d and \s are passed to the regex engine untouched

pattern = r'#[\w\d]*(?=\s)'
re.findall(pattern,health)

In [79]:
def l2_dist(a, b):
    """Return the Euclidean (L2) distance between ``a`` and ``b``.

    The difference ``a - b`` is squared element-wise, all squared entries
    are summed, and the square root of that total is returned.

    The inputs must support ``-``, ``*`` and expose a ``.sum()`` method
    (e.g. NumPy arrays). No shape check is performed here.
    """
    diff = a - b
    squared_total = (diff * diff).sum()
    return squared_total ** 0.5
# NOTE(review): `a` and `b` are not defined in any visible cell, so this call raises
# NameError as written. Also note the first argument is reshaped to (400,) and the
# second to (400, 1); subtracting those shapes broadcasts to (400, 400) — confirm
# that is intended rather than an element-wise distance over 400 values.
l2_dist(np.reshape(a,(20*20)), np.reshape(b,(20*20,1)))

NameError: name 'a' is not defined

In [94]:
# Demonstrates aliasing: plain assignment does not copy a NumPy array.
old = np.array([[1, 1, 1], [1, 1, 1]])
# `new` is a second name for the same array object as `old`.
new = old
# Writing through `new` therefore also changes what `old` shows.
new[0, :2] = 0
print(old)

[[0 0 1]
 [1 1 1]]
[[0 0 1]
 [1 1 1]]


In [97]:
# Count the matches of "one or two consecutive A's" in s.
s = 'ACAABAACAAAB'
# findall() returns non-overlapping matches scanning left to right; {1,2} is greedy,
# so a run of three A's is consumed as "AA" followed by "A".
result = re.findall('A{1,2}', s)
L = len(result)
print(L)

5


In [102]:
# Count the parenthesized single-character list markers, e.g. "(a)", in the passage.
text=r'''Everyone has the following fundamental freedoms:
    (a) freedom of conscience and religion;
    (b) freedom of thought, belief, opinion and expression, including freedom of the press and other media of communication;
    (c) freedom of peaceful assembly; and
    (d) freedom of association.'''

# Raw string: \( and \) are regex escapes for literal parentheses, not Python
# string escapes; "." matches the single character between them.
pattern = r'\(.\)'
print(len(re.findall(pattern,text)))


4


In [107]:
# Compare the shapes produced by several NumPy array constructors.
# rand(4): one dimension with 4 random samples.
a1 = np.random.rand(4)
# rand(4, 1): two dimensions, 4 rows and 1 column.
a2 = np.random.rand(4, 1)
# A nested list gives a 2-D array with 1 row and 4 columns.
a3 = np.array([[1, 2, 3, 4]])
# arange excludes the stop value 4, so this holds 3 elements.
a4 = np.arange(1, 4, 1)
# linspace returns 4 evenly spaced values and includes both endpoints.
a5 = np.linspace(1 ,4, 4)
print(a1.shape)
print(a2.shape)
print(a3.shape)
print(a4.shape)
print(a5)


(4,)
(4, 1)
(1, 4)
(3,)
[1. 2. 3. 4.]
