# Regular Expressions

[Online tester](https://regexr.com/)

### The Regular Expression Module

In [None]:
import re

[re — Regular expression operations](https://docs.python.org/3/library/re.html)

In [None]:
print('hello\b world')

In [None]:
print('hello\\b world')

In [None]:
print(r'hello\b world')

<table class="docutils align-default">
<colgroup>
<col style="width: 28%">
<col style="width: 72%">
</colgroup>
<thead>
<tr class="row-odd"><th class="head"><p>Method/Attribute</p></th>
<th class="head"><p>Purpose</p></th>
</tr>
</thead>
<tbody>
<tr class="row-even"><td><p><code class="docutils literal notranslate"><span class="pre">match()</span></code></p></td>
<td><p>Determine if the RE matches at the beginning
of the string.</p></td>
</tr>
<tr class="row-odd"><td><p><code class="docutils literal notranslate"><span class="pre">search()</span></code></p></td>
<td><p>Scan through a string, looking for any
location where this RE matches.</p></td>
</tr>
<tr class="row-even"><td><p><code class="docutils literal notranslate"><span class="pre">findall()</span></code></p></td>
<td><p>Find all substrings where the RE matches, and
returns them as a list.</p></td>
</tr>
<tr class="row-odd"><td><p><code class="docutils literal notranslate"><span class="pre">finditer()</span></code></p></td>
<td><p>Find all substrings where the RE matches, and
returns them as an <a class="reference internal" href="../glossary.html#term-iterator"><span class="xref std std-term">iterator</span></a>.</p></td>
</tr>
</tbody>
</table>

- `re.match(pattern, string, flags=0)`

In [None]:
print(re.match(r'dog', 'today is dog day'))

In [None]:
match = re.match(r'dog', 'dog day is today')
print(match)

<table class="docutils align-default">
<colgroup>
<col style="width: 29%">
<col style="width: 71%">
</colgroup>
<thead>
<tr class="row-odd"><th class="head"><p>Method/Attribute</p></th>
<th class="head"><p>Purpose</p></th>
</tr>
</thead>
<tbody>
<tr class="row-even"><td><p><code class="docutils literal notranslate"><span class="pre">group()</span></code></p></td>
<td><p>Return the string matched by the RE</p></td>
</tr>
<tr class="row-odd"><td><p><code class="docutils literal notranslate"><span class="pre">start()</span></code></p></td>
<td><p>Return the starting position of the match</p></td>
</tr>
<tr class="row-even"><td><p><code class="docutils literal notranslate"><span class="pre">end()</span></code></p></td>
<td><p>Return the ending position of the match</p></td>
</tr>
<tr class="row-odd"><td><p><code class="docutils literal notranslate"><span class="pre">span()</span></code></p></td>
<td><p>Return a tuple containing the (start, end)
positions  of the match</p></td>
</tr>
</tbody>
</table>

In [None]:
match.group()

In [None]:
match.start()

In [None]:
match.end()

In [None]:
match.span()

- `re.search(pattern, string, flags=0)`

In [None]:
re.search(r"and", "hand")

In [None]:
# search() scans the whole string, so "and" is found inside "hand".
m = re.search(r"and", "hand")
if m is None:
    print('No match')
else:
    print('Match found: ', m.group())

- `re.findall(r'regex', string)`

In [None]:
re.findall(r"movies", "Love movies! I had fun yesterday going to the movies")

- `re.finditer(pattern, string, flags=0)`

In [None]:
iterator= re.finditer(r"movies", "Love movies! I had fun yesterday going to the movies")

In [None]:
iterator

In [None]:
for match in iterator:
    print(match.span())

- `re.split(r'regex', string)`

In [None]:
re.split(r"!", "Nice Place to eat! I'll come back! Excellent meat!")

- `re.sub(r'regex', new, string)`

In [None]:
re.sub(r"yellow", "nice", "I have a yellow car and a yellow house in a yellow neighborhood")

- `re.fullmatch(pattern, string, flags=0)`

### Character Sets

In [None]:
string_list = ["Julie's favorite color is Blue.",
               "Keli's favorite color is Green.",
               "Craig's favorite colors are blue and red."]

In [3]:
# The character set [Bb] accepts either capitalisation of the first letter.
pattern = r"[Bb]lue"

# Count the strings in which the pattern occurs at least once.
blue_mentions = sum(1 for sentence in string_list if re.search(pattern, sentence))

print(blue_mentions)

NameError: name 'string_list' is not defined

In [7]:
import pandas as pd
import re

# Load the data set and pull the "title" column out as a plain list.
hn = pd.read_csv('data/hacker_news.csv')
titles = hn["title"].tolist()

# [Pp] accepts both "Python" and "python".
pattern = r"[Pp]ython"

# Number of titles in which the pattern occurs at least once.
python_mentions = sum(1 for title in titles if re.search(pattern, title))

In [None]:
python_mentions

### Using Regular Expressions to Select Data

In [None]:
# Keep every title in which `pattern` is found somewhere.
python_titles = [title for title in titles if re.search(pattern, title)]

In [None]:
python_titles[:10]

### Character Classes

    [video] Google Self-Driving SUV Sideswipes Bus
    New Directions in Cryptography by Diffie and Hellman (1976) [pdf]
    Wallace and Gromit  The Great Train Chase (1993) [video]

<img alt="character classes one" src="https://s3.amazonaws.com/dq-content/354/character_classes_v2_1.svg">

<img alt="character classes two" src="https://s3.amazonaws.com/dq-content/354/character_classes_v2_2.svg">

<img alt="negative character classes" src="https://s3.amazonaws.com/dq-content/354/negative_character_classes.svg">

#### Vaja 1: Are they bots?



In [None]:
# Read the entire tweets file into a single string.
with open('data/short_tweets.csv') as tweets_file:
    sentiment_analysis = tweets_file.read()

In [None]:
# Write the regex
regex = r"@robot\d\W"

# Find all matches of regex
print(re.findall(regex, sentiment_analysis))

#### Vaja 2: Find the numbers

In [None]:
# Write a regex to obtain user mentions
print(re.findall(r"User_mentions:\d", sentiment_analysis))

In [None]:
# Write a regex to obtain number of likes
print(re.findall(r"likes:\s\d", sentiment_analysis))

In [None]:
# Write a regex to obtain number of retweets
print(re.findall(r"number\sof\sretweets:\s\d", sentiment_analysis))

#### Vaja 3: Match and split

In [None]:
# Write a regex to match pattern separating sentences
regex_sentence = r"\W\dbreak\W"

#print(re.findall(regex_sentence, sentiment_analysis))

# Replace the regex_sentence with a space
sentiment_sub = re.sub(regex_sentence, ' ', sentiment_analysis)
#print(re.findall(regex_sentence, sentiment_sub))

In [None]:
# Write a regex to match pattern separating words
regex_words = r"\Wnew\w"

# Replace the regex_words and print the result
sentiment_final = re.sub(regex_words, ' ', sentiment_sub)

### Repetitions / Quantifiers

In [None]:
import re
password = "password1234"

In [None]:
re.search(r"\w\w\w\w\w\w\w\w\d\d\d\d", password) # we can see this is not the most elegant approach

<img alt="quantifiers" src="https://s3.amazonaws.com/dq-content/354/quantifiers_numeric.svg">

<img alt="quantifiers" src="https://s3.amazonaws.com/dq-content/354/quantifiers_other.svg">

- `Once or more: +`

In [2]:
import re
text = "Date of start: 4-3. Date of registration: 10-04."

In [3]:
re.findall(r"\d+-\d+", text)

['4-3', '10-04']

- `Zero times or more: *`

In [4]:
my_string = "The concert was amazing! @ameli!a @joh&&n @mary90"
re.findall(r"@\w+\W*\w+", my_string)

['@ameli!a', '@joh&&n', '@mary90']

- `Zero times or once: ?`

In [5]:
text = "The color of this image is amazing. However, the colour blue could be brighter."
re.findall(r"colou?r", text)

['color', 'colour']

- `n times at least, m times at most: {n,m}` (no space inside the braces — `{n, m}` is not treated as a quantifier)

In [6]:
phone_number = "John: 1-966-847-3131 Michelle: 54-908-42-42424"

In [7]:
re.findall(r"\d{1,2}-\d{3}-\d{2,3}-\d{4,}", phone_number)

['1-966-847-3131', '54-908-42-42424']

> A quantifier applies only to what is immediately to its left: in `r"apple+"` the `+` applies to `e` and not to the whole of `apple`.

#### Vaja 4: Everything clean

In [None]:
sentiment_analysis = ['0,1467962897,Mon Apr 06 23:01:04 PDT 2009,NO_QUERY,aleskywalker,@nick_carter Come to the chat  just 15 minutes  please? http://fanclub.backstreetboys.com/chat.php',
'0,1467962938,Mon Apr 06 23:01:04 PDT 2009,NO_QUERY,jess___x,Boredd. Colddd @blueKnight39 Internet keeps stuffing up. Save me! https://www.tellyourstory.com',
'0,1467963418,Mon Apr 06 23:01:14 PDT 2009,NO_QUERY,Zimily,"I had a horrible nightmare last night @anitaLopez98 @MyredHat31 which affected my sleep, now I\'m really tired"',
'0,1467963477,Mon Apr 06 23:01:15 PDT 2009,NO_QUERY,Augustina22,"im lonely  keep me company @YourBestCompany! @foxRadio https://radio.foxnews.com 22 female, new york"',
'0,1467963715,Mon Apr 06 23:01:18 PDT 2009,NO_QUERY,missmadison,@Born_4_Broadway Lost  and it was St. Ignacius Prepatory School. Haha.']

In [None]:
# Import re module
import re

# "http" followed by any run of non-whitespace characters
link_pattern = r"http\S+"
# "@" followed by one or more word characters
mention_pattern = r"@\w+"

for tweet in sentiment_analysis:
    links = re.findall(link_pattern, tweet)
    mentions = re.findall(mention_pattern, tweet)

    # One list of links, then one list of user mentions, per tweet
    print(links)
    print(mentions)

#### Vaja 5: Some time ago

In [None]:
sentiment_analysis = ['I would like to apologize for the repeated Video Games Live related tweets. 32 minutes ago', '@zaydia but i cant figure out how to get there / back / pay for a hotel 1st May 2019', 'FML: So much for seniority, bc of technological ineptness 23rd June 2018 17:54']

In [None]:
# Complete the for loop with a regex to find dates, 27 minutes ago or 4 hours ago
for date in sentiment_analysis:
    print(re.findall(r"\d{1,2}\s\w+\s\w+", date))

In [None]:
# Complete the for loop with a regex to find dates, 23rd june 2018
for date in sentiment_analysis:
    print(re.findall(r"\d{1,2}\w+\s\w+\s\d{4}", date))

In [None]:
# Complete the for loop with a regex to find dates, 1st september 2019 17:25
for date in sentiment_analysis:
    print(re.findall(r"\d{1,2}\w+\s\w+\s\d{4}\s\d{1,2}:\d{2}", date))

#### Vaja 6: Getting tokens

In [None]:
sentiment_analysis = 'ITS NOT ENOUGH TO SAY THAT IMISS U #MissYou #SoMuch #Friendship #Forever'

In [None]:
# Write a regex matching the hashtag pattern
regex = r"#\w+"

In [None]:
# Replace the regex by an empty string
no_hashtag = re.sub(regex, "", sentiment_analysis)

In [None]:
no_hashtag

In [None]:
# Get tokens by splitting text
print(re.split(r"\s+", no_hashtag))

### Regex metacharacters

- `Match any character (except newline): .`

In [None]:
my_links = "Just check out this link: www.amazingpics.com. It has amazing photos!"

In [None]:
re.findall(r"www com", my_links)

<img alt="positional anchors" src="https://s3.amazonaws.com/dq-content/354/positional_anchors.svg">

In [None]:
re.findall(r"www.+com", my_links)

- `Start of the string: ^`

In [None]:
my_string = "the 80s music was much better that the 90s"

In [None]:
re.findall(r"the\s\d+s", my_string)

In [None]:
re.findall(r"^the\s\d+s", my_string)

- `End of the string: $`

In [None]:
my_string = "the 80s music hits were much better that the 90s"

In [None]:
re.findall(r"the\s\d+s$", my_string)

- `Escape special characters: \`

In [None]:
my_string = "I love the music of Mr.Go. However, the sound was too loud."

In [None]:
print(re.split(r".\s", my_string))

In [None]:
print(re.split(r"\.\s", my_string))

- `OR operator: Set of characters: [ ]`
- `OR operator: Character: |`


In [None]:
my_string = "Elephants are the world's largest land animal! I would love to see an elephant one day"

In [None]:
re.findall(r"Elephant|elephant", my_string)

In [None]:
my_string = "Yesterday I spent my afternoon with my friends: MaryJohn2 Clary3"

In [None]:
re.findall(r"[a-zA-Z]+\d", my_string)

In [None]:
my_string = "My&name&is#John Smith. I%live$in#London."

In [None]:
re.sub(r"[#$%&]", " ", my_string)

- `Set of characters: [ ], ^ transforms the expression to negative`

In [None]:
my_links = "Bad website: www.99.com. Favorite site: www.hola.com"
re.findall(r"www[^0-9]+com", my_links)

#### Vaja 7: Finding files

In [None]:
sentiment_analysis = ['AIshadowhunters.txt aaaaand back to my literature review. At least i have a friendly cup of coffee to keep me company',
 "ouMYTAXES.txt I am worried that I won't get my $900 even though I paid tax last year"]

In [None]:
# Write a regex to match text file name
regex = r"^[aeiouAEIOU]{2,3}.+txt"

for text in sentiment_analysis:
    # Find all matches of the regex
    print(re.findall(regex, text))

    # Replace all matches with empty string
    print(re.sub(regex, '', text))

#### Vaja 8: Give me your email

In [None]:
emails = ['n.john.smith@gmail.com', '87victory@hotmail.com', '!#mary-=@msca.net']

In [None]:
# Regex for a valid address: a local part made of letters, digits, "-" and
# the listed symbols, then "@", a word-character domain name and ".com".
# The "-" sits last in the class so it is unambiguously a literal hyphen
# rather than looking like part of a range.
regex = r"[A-Za-z0-9!#%&*$~.-]+@\w+\.com"

for example in emails:
    # fullmatch() requires the WHOLE string to fit the pattern; match() only
    # anchors the start and would accept trailing junk such as "a@b.comXYZ".
    if re.fullmatch(regex, example):
        print(f"The email {example} is a valid email")
    else:
        print(f"The email {example} is invalid")

#### Vaja 9: Invalid password

In [None]:
passwords = ['Apple34!rose', 'My87hou#4$', 'abc123']

In [None]:
# A valid password consists solely of the listed characters and is between
# 8 and 20 characters long.
regex = r"[a-zA-Z0-9!#%&*$~\.]{8,20}"

for example in passwords:
    # fullmatch() checks the entire password. search() would report success
    # as soon as ANY 8 consecutive legal characters appear, so passwords that
    # are longer than 20 characters or contain illegal characters elsewhere
    # would wrongly be accepted.
    if re.fullmatch(regex, example):
        print(f"The password {example} is a valid password")
    else:
        print(f"The password {example} is invalid")

### Using Flags to Modify Regex Patterns

In [None]:
import re
email_tests = ['email', 'Email', 'e Mail', 'e mail', 'E-mail', 'e-mail', 'eMail', 'E-Mail', 'EMAIL']

In [None]:
pattern = r"e[\-\s]?mail"

In [None]:
# without flag
for email in email_tests:
    print(re.match(pattern, email))

In [None]:
for email in email_tests:
    print(re.match(pattern, email, flags=re.I))

### Capturing groups


In [None]:
text = "Clary has 2 friends who she spends a lot time with. Susan has 3 brothers while John has 4 sisters."

In [None]:
# Raw string: \s, \w and \d must reach the regex engine untouched; in a
# normal string literal they are invalid escape sequences and trigger a warning.
# Without capture groups, findall() returns the full matched substrings.
re.findall(r'[A-Za-z]+\s\w+\s\d+\s\w+', text)

In [None]:
# Raw string so the backslash classes are not parsed as (invalid) string escapes.
# With a single capture group, findall() returns only that group's text.
re.findall(r'([A-Za-z]+)\s\w+\s\d+\s\w+', text)

In [None]:
# Raw string so the backslash classes are not parsed as (invalid) string escapes.
# With several capture groups, findall() returns one tuple per match.
re.findall(r'([A-Za-z]+)\s\w+\s(\d+)\s(\w+)', text)

In [None]:
# Raw string keeps \s, \w and \d intact for the regex engine (a plain string
# literal treats them as invalid escape sequences and warns).
# Each match yields a tuple of the three captured groups: the leading word,
# the number and the word that follows the number.
pets = re.findall(r'([A-Za-z]+)\s\w+\s(\d+)\s(\w+)', "Clary has 2 dogs but John has 3 cats")
pets

In [None]:
pets[0][0]

In [None]:
re.search(r"(\d[A-Za-z])+", "My user name is 3e4r5fg")

In [None]:
my_string = "My lucky numbers are 8755 and 33"
re.findall(r"(\d)+", my_string)

In [None]:
re.findall(r"(\d+)", my_string)

#### Vaja 13: Try another name

In [None]:
sentiment_analysis = ['Just got ur newsletter, those fares really are unbelievable. Write to statravelAU@gmail.com or statravelpo@hotmail.com. They have amazing prices', 'I should have paid more attention when we covered photoshop in my webpage design class in undergrad. Contact me Hollywoodheat34@msn.net.', 'hey missed ya at the meeting. Read your email! msdrama098@hotmail.com']

In [None]:
# Write a regex that matches email
regex_email = r"([A-Za-z0-9]+)@\S+"

for tweet in sentiment_analysis:
    # Find all matches of regex in each tweet
    email_matched = re.findall(regex_email, tweet)

    # Complete the format method to print the results
    print(f"Lists of users found in this tweet: {email_matched}")

#### Vaja 14: Flying home

In [None]:
text = "Subject: You are now ready to fly. Here you have your boarding pass IB3723 AMS-MAD 06OCT"

In [None]:
# Import re
import re

# Capture groups, in order: airline code, flight number, departure airport,
# destination airport and the date (two digits plus a three-letter month).
regex = r"([A-Z]{2})(\d{4})\s([A-Z]{3})-([A-Z]{3})\s(\d{2}[A-Z]{3})"

# One tuple of captured groups per flight found in the text
flight_matches = re.findall(regex, text)

print(flight_matches)

# Unpack the first flight found and report its details
airline, flight_number, departure, destination, flight_date = flight_matches[0]
print(f"Airline: {airline} Flight number: {flight_number}")
print(f"Departure: {departure} Destination: {destination}")
print(f"Date: {flight_date}")

#### Vaja 15: Extracting URL Parts Using Multiple Capture Groups

In [None]:
test_urls = [
 'https://www.amazon.com/Technology-Ventures-Enterprise-Thomas-Byers/dp/0073523429',
 'http://www.interactivedynamicvideo.com/',
 'http://www.nytimes.com/2007/11/07/movies/07stein.html?_r=0',
 'http://evonomics.com/advertising-cannot-maintain-internet-heres-solution/',
 'HTTPS://github.com/keppel/pinn',
 'Http://phys.org/news/2015-09-scale-solar-youve.html',
 'https://iot.seeed.cc',
 'http://www.bfilipek.com/2016/04/custom-deleters-for-c-smart-pointers.html',
 'http://beta.crowdfireapp.com/?beta=agnipath',
 'https://www.valid.ly?param'
]

In [None]:
# Groups: (1) protocol, (2) domain, (3) everything after the optional "/".
# - The protocol is limited to word characters: a greedy ".+" would run up to
#   the LAST "://" and mis-split URLs whose path contains another URL.
# - The domain class also allows "-", which is legal in host names.
pattern = r"(\w+)://([\w.\-]+)/?(.*)"
results = []


for url in test_urls:
    # findall() returns a list with one (protocol, domain, path) tuple per match
    comp = re.findall(pattern, url)
    results.append(comp)
    print(comp)

In [None]:
results

In [None]:
from collections import Counter

In [None]:
# count protocol
Counter([result[0][0].lower() for result in results]).most_common()

### Numbered and named groups 

- `Numbered groups`

In [None]:
text = "Python 3.0 was released on 12-03-2008."

In [None]:
# Raw string so \d is passed to the regex engine instead of being parsed as an
# (invalid) string escape. The three numbered groups capture the three
# dash-separated parts of the date.
information = re.search(r'(\d{1,2})-(\d{2})-(\d{4})', text)

In [None]:
information

In [None]:
information.group(3)

In [None]:
information.group(1)

In [None]:
information.group(0)

- `Named groups`

    (?P<name>regex)

In [None]:
text = "Austin, 78701"

# "city" is the first run of letters; "zipcode" is the first five-digit run
# after it (the lazy ".*?" skips whatever lies in between).
city_zip_pattern = r"(?P<city>[A-Za-z]+).*?(?P<zipcode>\d{5})"
cities = re.search(city_zip_pattern, text)

# Named groups can be read by name, either with group() or by indexing.
cities["city"]

In [None]:
cities.group("zipcode")

#### Vaja 16: Parsing PDF files

In [None]:
contract = 'Provider will invoice Client for Services performed within 30 days of performance.  Client will pay Provider as set forth in each Statement of Work within 30 days of receipt and acceptance of such invoice. It is understood that payments to Provider for services rendered shall be made in full as agreed, without any deductions for taxes of any kind whatsoever, in conformity with Provider’s status as an independent contractor. Signed on 03/25/2001.'

In [None]:
# Write regex and scan contract to capture the dates described
regex_dates = r"Signed\son\s(\d{2})/(\d{2})/(\d{4})"
dates = re.search(regex_dates, contract)

In [None]:
# Assign to each key the corresponding match
signature = {
    "day": dates.group(2),
    "month": dates.group(1),
    "year": dates.group(3)
}

In [None]:
# Complete the format method to print-out
print(f"Our first contract is dated back to {signature['year']}. Particularly, the day {signature['day']} of the month {signature['month']}.")

#### Vaja 17: Extracting URL Parts Using Named Groups

In [None]:
# Named groups: protocol, domain and path (everything after the optional "/").
# - The protocol is limited to word characters: a greedy ".+" would run up to
#   the LAST "://" and mis-split URLs whose path contains another URL.
# - The domain class also allows "-", which is legal in host names.
pattern = r"(?P<protocol>\w+)://(?P<domain>[\w.\-]+)/?(?P<path>.*)"

for url in test_urls:
    comp = re.search(pattern, url)
    # Print each named part on its own line, followed by a separator
    print(comp.group('protocol'))
    print(comp.group('domain'))
    print(comp.group('path'))
    print('------------------')