In [1]:
import re

In [2]:
news_article1 = "Europe's biggest airline Ryanair grounded 420 of its flights across Europe as the 24-hour strike affected connections from airports around the continent. Rival budget carrier EasyJet was warning of severe delays and disruption."
news_article1



### `search(pattern, string, flags)`
Returns a match object for the first location where `pattern` matches anywhere in `string`, or `None` if there is no match.

In [3]:
check1 = re.search("airline", news_article1)
check1

<re.Match object; span=(17, 24), match='airline'>

In [4]:
check2 = re.search("singapore", news_article1)
check2

In [5]:
check1.span()

(17, 24)

In [6]:
check3 = re.search("europe", news_article1)
check3

In [7]:
check3 = re.search("Europe", news_article1)
check3

<re.Match object; span=(0, 6), match='Europe'>

In [8]:
span3 = check3.span()
span3

(0, 6)

In [9]:
start_index3 = span3[0]
end_index3 = span3[1]
print("From Index:", start_index3, "\nTo Index:", end_index3)

From Index: 0 
To Index: 6


In [10]:
#raw string so that \d reaches the regex engine instead of being treated as an (invalid) string escape
first_number_match = re.search(r"\d+", news_article1)
first_number_match

<re.Match object; span=(42, 45), match='420'>

In [11]:
first_number_match.span()

(42, 45)

In [12]:
#starting index
first_number_match.span()[0]

42

In [13]:
#ending index
first_number_match.span()[1]

45

In [14]:
news_article1[42:45]

'420'

In [15]:
news_article1[first_number_match.span()[0]:first_number_match.span()[1]]

'420'

In [16]:
first_number_match_span = first_number_match.span()
first_number = news_article1[first_number_match_span[0]: first_number_match_span[1]]
first_number

'420'

Use `flags` to **ignore case** (`re.I`)

In [17]:
re.search("europe", news_article1)

In [18]:
#re.I - case insensitive matching
re.search("europe", news_article1, re.I)

<re.Match object; span=(0, 6), match='Europe'>

### `match(pattern, string, flags)`
Same as `search()`, except that it only tries to match at the beginning of the string. Returns a match object or `None`.

In [19]:
re.match("Europe", news_article1)

<re.Match object; span=(0, 6), match='Europe'>

In [20]:
#no match
re.match("\d+", news_article1)

We can also use `flags` to handle multiline strings.

In [21]:
new_article1_longer = """
CNN — Hundreds of flights have been canceled and thousands of passengers forced to change plans as French air traffic controllers took industrial action over pay on Friday.

Europe's biggest airline Ryanair grounded 420 of its flights across Europe as the 24-hour strike affected connections from airports around the continent. Rival budget carrier EasyJet was warning of severe delays and disruption.

Members of France's Syndicat National des Contrôleurs du Trafic Aérien, or SNCTA, the main union for traffic controllers in the country, began striking from 6 a.m. central European time (00:00 a.m. ET) Friday. Knock-on disruptions are expected to carry on into Monday.

The union said it was calling the action after negotiations stalled with the government over pay rises in line with inflation.

"After several months spent negotiating to get fair and adapted answers, SNCTA... laments the lack of concrete elements and guarantees from public authorities," it said in a statement.
"""
print(new_article1_longer)


CNN — Hundreds of flights have been canceled and thousands of passengers forced to change plans as French air traffic controllers took industrial action over pay on Friday.


Members of France's Syndicat National des Contrôleurs du Trafic Aérien, or SNCTA, the main union for traffic controllers in the country, began striking from 6 a.m. central European time (00:00 a.m. ET) Friday. Knock-on disruptions are expected to carry on into Monday.

The union said it was calling the action after negotiations stalled with the government over pay rises in line with inflation.

"After several months spent negotiating to get fair and adapted answers, SNCTA... laments the lack of concrete elements and guarantees from public authorities," it said in a statement.



### `findall(pattern, string, flags)`
returns all non-overlapping matches of `pattern` as a list of strings or tuples

In [22]:
re.search("\d+", new_article1_longer)

<re.Match object; span=(217, 220), match='420'>

In [23]:
#findall collects every run of digits as a string; raw string avoids the invalid "\d" string escape
all_nums = re.findall(r"\d+", new_article1_longer)
all_nums

['420', '24', '6', '00', '00']

In [24]:
news_article1



In [25]:
#get the first word token of the string (\w matches Unicode letters, digits and underscore, not only a-zA-Z0-9_)
re.findall(r"^\w+", news_article1)

['Europe']

In [26]:
new_article1_longer



In [27]:
re.findall("^\w+", new_article1_longer)

[]

In [28]:
#re.M (multiline) makes ^ match at the start of every line, not only at the start of the whole string
re.findall(r"^\w+", new_article1_longer, re.M)

['CNN', 'Europe', 'Members', 'The']

In [29]:
from_tokens = re.findall("from \w+", new_article1_longer)
from_tokens

['from airports', 'from 6', 'from public']

In [30]:
#with a single capturing group, findall returns only the text captured by that group
from_tokens = re.findall(r"from (\w+)", new_article1_longer)
from_tokens

['airports', '6', 'public']

In [31]:
#with several capturing groups, findall returns one tuple per match, holding the text of each group in order
x_of_y = re.findall(r"(\w+) of (\w+)", new_article1_longer)
x_of_y

[('Hundreds', 'flights'),
 ('thousands', 'passengers'),
 ('420', 'its'),
 ('Members', 'France'),
 ('lack', 'concrete')]

In [32]:
#find the first letter of every word
#need to use raw string (since \b refers to backspace)
first_letters = re.findall(r"\b(\w)", new_article1_longer)
first_letters

['C',
 'H',
 'o',
 'f',
 'h',
 'b',
 'c',
 'a',
 't',
 'o',
 'p',
 'f',
 't',
 'c',
 'p',
 'a',
 'F',
 'a',
 't',
 'c',
 't',
 'i',
 'a',
 'o',
 'p',
 'o',
 'F',
 'E',
 's',
 'b',
 'a',
 'R',
 'g',
 '4',
 'o',
 'i',
 'f',
 'a',
 'E',
 'a',
 't',
 '2',
 'h',
 's',
 'a',
 'c',
 'f',
 'a',
 'a',
 't',
 'c',
 'R',
 'b',
 'c',
 'E',
 'w',
 'w',
 'o',
 's',
 'd',
 'a',
 'd',
 'M',
 'o',
 'F',
 's',
 'S',
 'N',
 'd',
 'C',
 'd',
 'T',
 'A',
 'o',
 'S',
 't',
 'm',
 'u',
 'f',
 't',
 'c',
 'i',
 't',
 'c',
 'b',
 's',
 'f',
 '6',
 'a',
 'm',
 'c',
 'E',
 't',
 '0',
 '0',
 'a',
 'm',
 'E',
 'F',
 'K',
 'o',
 'd',
 'a',
 'e',
 't',
 'c',
 'o',
 'i',
 'M',
 'T',
 'u',
 's',
 'i',
 'w',
 'c',
 't',
 'a',
 'a',
 'n',
 's',
 'w',
 't',
 'g',
 'o',
 'p',
 'r',
 'i',
 'l',
 'w',
 'i',
 'A',
 's',
 'm',
 's',
 'n',
 't',
 'g',
 'f',
 'a',
 'a',
 'a',
 'S',
 'l',
 't',
 'l',
 'o',
 'c',
 'e',
 'a',
 'g',
 'f',
 'p',
 'a',
 'i',
 's',
 'i',
 'a',
 's']

In [33]:
#find the last letter of every word
last_letters = re.findall(r"(\w)\b", new_article1_longer)
last_letters

['N',
 's',
 'f',
 's',
 'e',
 'n',
 'd',
 'd',
 's',
 'f',
 's',
 'd',
 'o',
 'e',
 's',
 's',
 'h',
 'r',
 'c',
 's',
 'k',
 'l',
 'n',
 'r',
 'y',
 'n',
 'y',
 'e',
 's',
 't',
 'e',
 'r',
 'd',
 '0',
 'f',
 's',
 's',
 's',
 'e',
 's',
 'e',
 '4',
 'r',
 'e',
 'd',
 's',
 'm',
 's',
 'd',
 'e',
 't',
 'l',
 't',
 'r',
 't',
 's',
 'g',
 'f',
 'e',
 's',
 'd',
 'n',
 's',
 'f',
 'e',
 's',
 't',
 'l',
 's',
 's',
 'u',
 'c',
 'n',
 'r',
 'A',
 'e',
 'n',
 'n',
 'r',
 'c',
 's',
 'n',
 'e',
 'y',
 'n',
 'g',
 'm',
 '6',
 'a',
 'm',
 'l',
 'n',
 'e',
 '0',
 '0',
 'a',
 'm',
 'T',
 'y',
 'k',
 'n',
 's',
 'e',
 'd',
 'o',
 'y',
 'n',
 'o',
 'y',
 'e',
 'n',
 'd',
 't',
 's',
 'g',
 'e',
 'n',
 'r',
 's',
 'd',
 'h',
 'e',
 't',
 'r',
 'y',
 's',
 'n',
 'e',
 'h',
 'n',
 'r',
 'l',
 's',
 't',
 'g',
 'o',
 't',
 'r',
 'd',
 'd',
 's',
 'A',
 's',
 'e',
 'k',
 'f',
 'e',
 's',
 'd',
 's',
 'm',
 'c',
 's',
 't',
 'd',
 'n',
 'a',
 't']

In [34]:
html_source = "<html><body> <ul><li>johndoe@gmail.com</li><li>janedoe@gmail.com</li><li>Robin</li></ul></body></html>"
html_source

'<html><body> <ul><li>johndoe@gmail.com</li><li>janedoe@gmail.com</li><li>Robin</li></ul></body></html>'

In [35]:
list_item_values = re.findall("<li>(.*?)</li>", html_source)
list_item_values

['johndoe@gmail.com', 'janedoe@gmail.com', 'Robin']

#### ❓Exercise 1. Change the above regex to match only the email addresses in `html_source`.

In [36]:
emails = re.findall("TO_FILL_UP", html_source)
emails

[]

In [37]:
#[^<@]+ and [^<]+ cannot cross a tag, so every match stays inside one <li>...</li>
#(a lazy .*? could start in a non-email item and run on into the next item that contains "@")
emails = re.findall(r"<li>([^<@]+@[^<]+)</li>", html_source)
emails

['johndoe@gmail.com', 'janedoe@gmail.com']

### `finditer(pattern, string, flags)`
If we need the match objects rather than just the strings returned by `findall()`, we can use `finditer()` instead.

In [38]:
#finditer yields one match object per match, giving access to the span, the full match and each group
from_tokens_iter = re.finditer(r"from (\w+)", new_article1_longer)
for t in from_tokens_iter:
    print(t)
    print(t.group()) #string that matches it
    print(t.group(1)) #first capturing group content

<re.Match object; span=(293, 306), match='from airports'>
from airports
airports
<re.Match object; span=(556, 562), match='from 6'>
from 6
6
<re.Match object; span=(937, 948), match='from public'>
from public
public


#### ❓Exercise 2. Write a regex to match all the time tokens in `new_article1_longer`.

In [39]:
new_article1_longer



In [40]:
#need to match: 
#00:00 a.m.
#6 a.m.

In [41]:
time_tokens = re.findall("\d+.*?[ap]\.m\.", new_article1_longer)
time_tokens

['6 a.m.', '00:00 a.m.']

In [42]:
time_tokens = re.findall("\d{1,2}:\d{2} [ap]\.m\.", new_article1_longer)
time_tokens

['00:00 a.m.']

In [43]:
#the loose pattern accepts anything between the digits and a.m./p.m., so malformed times match as well
time_tokens = re.findall(r"\d+.*?[ap]\.m\.", "6::0 a.m..")
time_tokens

['6::0 a.m.']

In [44]:
#two capturing groups -> findall returns a tuple per match (outer group, optional ":mm" group)
#the pattern is not anchored on a word boundary, so it matches the tail of a longer digit run
time_tokens = re.findall(r"(\d{1,2}(:\d{0,2})?\s*?[ap]\.m\.)", "7123123 a.m.")
time_tokens

[('23 a.m.', '')]

In [45]:
#non-capturing group: (?:...) groups without capturing, so findall returns plain strings again
time_tokens = re.findall(r"(\d{1,2}(?::\d{0,2})?\s*?[ap]\.m\.)", "00:00 a.m.")
time_tokens

['00:00 a.m.']

In [46]:
time_tokens = re.findall("TO_FILL_UP", new_article1_longer)
time_tokens

[]

In [47]:
time_tokens = re.findall("TO_FILL_UP", new_article1_longer)
time_tokens

[]

### `sub(pattern, repl, string, flags)`
Returns the string obtained by replacing the occurrences of `pattern` in `string` with the replacement `repl`.

In [48]:
#'|' tries its alternatives from left to right: inside "European" the first alternative "[eE]urope" already matches,
#so only "Europe" is replaced and the trailing "an" is left behind
re.sub("[eE]urope|[eE]uropean|[fF]rance", "PLACE", new_article1_longer)



In [49]:
#the longer alternative "[eE]uropean" is listed before "[eE]urope", so "European" is replaced as a whole
re.sub("[eE]uropean|[eE]urope|[fF]rance", "PLACE", new_article1_longer)



#### ❓Exercise 3. Build a Regex Tokenizer by hand.
*Hint: try using `re.split()` multiple times*

In [50]:
re.split("\s", "hello world")

['hello', 'world']

In [51]:
re.split("#", "hello#world")

['hello', 'world']

In [52]:
re.split("\s", new_article1_longer)

['',
 'CNN',
 '—',
 'Hundreds',
 'of',
 'flights',
 'have',
 'been',
 'canceled',
 'and',
 'thousands',
 'of',
 'passengers',
 'forced',
 'to',
 'change',
 'plans',
 'as',
 'French',
 'air',
 'traffic',
 'controllers',
 'took',
 'industrial',
 'action',
 'over',
 'pay',
 'on',
 'Friday.',
 '',
 "Europe's",
 'biggest',
 'airline',
 'Ryanair',
 'grounded',
 '420',
 'of',
 'its',
 'flights',
 'across',
 'Europe',
 'as',
 'the',
 '24-hour',
 'strike',
 'affected',
 'connections',
 'from',
 'airports',
 'around',
 'the',
 'continent.',
 'Rival',
 'budget',
 'carrier',
 'EasyJet',
 'was',
 'of',
 'severe',
 'delays',
 'and',
 'disruption.',
 '',
 'Members',
 'of',
 "France's",
 'Syndicat',
 'National',
 'des',
 'Contrôleurs',
 'du',
 'Trafic',
 'Aérien,',
 'or',
 'SNCTA,',
 'the',
 'main',
 'union',
 'for',
 'traffic',
 'controllers',
 'in',
 'the',
 'country,',
 'began',
 'striking',
 'from',
 '6',
 'a.m.',
 'central',
 'European',
 'time',
 '(00:00',
 'a.m.',
 'ET)',
 'Friday.',
 'Knock-on

In [53]:
re.split("#", "hello#world")

['hello', 'world']

In [54]:
re.split("\s", new_article1_longer)

['',
 'CNN',
 '—',
 'Hundreds',
 'of',
 'flights',
 'have',
 'been',
 'canceled',
 'and',
 'thousands',
 'of',
 'passengers',
 'forced',
 'to',
 'change',
 'plans',
 'as',
 'French',
 'air',
 'traffic',
 'controllers',
 'took',
 'industrial',
 'action',
 'over',
 'pay',
 'on',
 'Friday.',
 '',
 "Europe's",
 'biggest',
 'airline',
 'Ryanair',
 'grounded',
 '420',
 'of',
 'its',
 'flights',
 'across',
 'Europe',
 'as',
 'the',
 '24-hour',
 'strike',
 'affected',
 'connections',
 'from',
 'airports',
 'around',
 'the',
 'continent.',
 'Rival',
 'budget',
 'carrier',
 'EasyJet',
 'was',
 'of',
 'severe',
 'delays',
 'and',
 'disruption.',
 '',
 'Members',
 'of',
 "France's",
 'Syndicat',
 'National',
 'des',
 'Contrôleurs',
 'du',
 'Trafic',
 'Aérien,',
 'or',
 'SNCTA,',
 'the',
 'main',
 'union',
 'for',
 'traffic',
 'controllers',
 'in',
 'the',
 'country,',
 'began',
 'striking',
 'from',
 '6',
 'a.m.',
 'central',
 'European',
 'time',
 '(00:00',
 'a.m.',
 'ET)',
 'Friday.',
 'Knock-on

In [55]:
re.split("(,|!|\.)", "hello,world.a,b")

['hello', ',', 'world', '.', 'a', ',', 'b']

In [56]:
def regex_tokenize(s):
    """Split a text into word and punctuation tokens using regular expressions.

    The text is first split on every whitespace character. Each chunk is then
    split further around these delimiters, which are kept as separate tokens:
    ``(``, ``)``, ``'s``, ``,``, two or more consecutive periods, and ``"``.
    Finally, a piece made of at least two word characters followed by a
    closing period (e.g. ``Friday.``) is separated into the word and ``.``;
    pieces such as ``a.m.`` do not fit that shape and are kept intact.

    Parameters
    ----------
    s : str
        The text to tokenize.

    Returns
    -------
    list of str
        The tokens in their original order, without empty strings.
    """
    #the capturing group makes re.split keep the delimiters in its result
    delimiter_pattern = r"""(\(|\)|'s|,|\.{2,}|")"""
    #two or more word characters directly before a final period
    sentence_end_pattern = r"\w\w+\.$"

    #list to store the tokens
    final_tokens = []

    #first split by whitespace
    for chunk in re.split(r"\s", s):
        #then split around the punctuation delimiters
        for piece in re.split(delimiter_pattern, chunk):
            #re.split yields "" next to delimiters and for consecutive whitespace
            if piece == "":
                continue

            if re.search(sentence_end_pattern, piece):
                #last word of a sentence: emit the word and the period separately
                final_tokens.append(piece[:-1])
                final_tokens.append(".")
            else:
                final_tokens.append(piece)

    return final_tokens

In [57]:
#it should look like this
regex_tokenize(new_article1_longer)

['CNN',
 '—',
 'Hundreds',
 'of',
 'flights',
 'have',
 'been',
 'canceled',
 'and',
 'thousands',
 'of',
 'passengers',
 'forced',
 'to',
 'change',
 'plans',
 'as',
 'French',
 'air',
 'traffic',
 'controllers',
 'took',
 'industrial',
 'action',
 'over',
 'pay',
 'on',
 'Friday',
 '.',
 'Europe',
 "'s",
 'biggest',
 'airline',
 'Ryanair',
 'grounded',
 '420',
 'of',
 'its',
 'flights',
 'across',
 'Europe',
 'as',
 'the',
 '24-hour',
 'strike',
 'affected',
 'connections',
 'from',
 'airports',
 'around',
 'the',
 'continent',
 '.',
 'Rival',
 'budget',
 'carrier',
 'EasyJet',
 'was',
 'of',
 'severe',
 'delays',
 'and',
 'disruption',
 '.',
 'Members',
 'of',
 'France',
 "'s",
 'Syndicat',
 'National',
 'des',
 'Contrôleurs',
 'du',
 'Trafic',
 'Aérien',
 ',',
 'or',
 'SNCTA',
 ',',
 'the',
 'main',
 'union',
 'for',
 'traffic',
 'controllers',
 'in',
 'the',
 'country',
 ',',
 'began',
 'striking',
 'from',
 '6',
 'a.m.',
 'central',
 'European',
 'time',
 '(',
 '00:00',
 'a.m.',
