# <span style="color:#FEC260">Regular Expressions in Python</span>

In [1]:
import re

In [2]:
# Sample text that every pattern in this notebook is run against.
# It is a *raw* string so that the lone backslash in the metacharacter line is
# kept literally; in a normal string "\ " is an invalid escape sequence and
# newer Python versions warn about it. The resulting text is unchanged.
text_to_search = r'''
abcdefghijklmnopqrstuvwxyz
ABCDEFGHIJKLMNOPQRSTUVWXYZ
1234567890

Ha HaHa Ha Hii Haa

MetaCharacters :
. ^ $ * + ? { } [ ] \ | ( )

Google.com

123-456-789
123.456.789
123*555*1234

877-500-1234
980-555-1234
930-234-3455

Mr. Oliver
Mr Smith
Ms Peter
Mrs. parker
Mr. Galactus
Mrs. U
'''

# A single-line sample for shorter experiments.
sentence = "This is a sample sentence to use for regex matching"

In [None]:
# Search the sample text for the literal character sequence "abcd".
pattern = re.compile(r'abcd')
hits = pattern.finditer(text_to_search)

# Each printed match object reports its span, i.e. the offsets where the
# match begins and ends, together with the matched text.
for found in hits:
    print(found)

In [None]:
# Metacharacters carry a special meaning in a regex (an unescaped '.' matches
# any character except a newline), so a literal dot must be written as '\.'.
p2 = re.compile(r'\.')

# Typical use case: URLs, where the dot has to be taken literally.
p3 = re.compile(r'Google\.com')

m1 = p2.finditer(text_to_search)
m2 = p3.finditer(text_to_search)

# Print all literal-dot matches first, then the URL matches.
for matches in (m1, m2):
    for found in matches:
        print(found)

### Regular expression syntax

* `.` - any character except newline
* `\d` - digit (0-9)
* `\D` - not a digit
* `\w` - word character (a-z, A-Z, 0-9, _)
* `\W` - not a word character
* `\s` - whitespace (space, tab, newline)
* `\S` - not whitespace

**Anchors**
* `\b` - word boundary
* `\B` - not a word boundary
* `^` - beginning of a string
* `$` - end of a string

**Character Classes**
* `[]` - matches any one character listed in the brackets
* `[^ ]` - matches any one character NOT listed in the brackets
* `|` - either or
* `()` - group
* `[1-4]` - character range: matches any one digit from 1 to 4

**Quantifiers**
* `*` - 0 or more
* `+` - 1 or more
* `?` - 0 or 1
* `{3}` - exactly 3 repetitions
* `{3,4}` - between 3 and 4 repetitions (minimum, maximum)




In [None]:
# Character sets applied to the phone numbers: match a prefix whose first
# digit is 8 or 9 and whose second digit is 7 or 8, followed by any digit and
# then a '-' or '.' separator. Only this 4-character prefix is matched, not
# the whole number.
ph_num = re.compile(r'[89][78]\d[-.]')
valid_numbers = ph_num.finditer(text_to_search)

for prefix in valid_numbers:
    print(prefix)

In [8]:
# Negated character set: [^a-zA-Z\d] matches every single character that is
# neither a letter (lower or upper case) nor a digit, i.e. the whitespace,
# punctuation and symbols in the sample text.
p5 = re.compile(r'[^a-zA-Z\d]')
no_upper = p5.finditer(text_to_search)

for symbol in no_upper:
    print(symbol)

<re.Match object; span=(0, 1), match='\n'>
<re.Match object; span=(27, 28), match='\n'>
<re.Match object; span=(54, 55), match='\n'>
<re.Match object; span=(65, 66), match='\n'>
<re.Match object; span=(66, 67), match='\n'>
<re.Match object; span=(69, 70), match=' '>
<re.Match object; span=(74, 75), match=' '>
<re.Match object; span=(77, 78), match=' '>
<re.Match object; span=(81, 82), match=' '>
<re.Match object; span=(85, 86), match='\n'>
<re.Match object; span=(86, 87), match='\n'>
<re.Match object; span=(101, 102), match=' '>
<re.Match object; span=(102, 103), match=':'>
<re.Match object; span=(103, 104), match='\n'>
<re.Match object; span=(104, 105), match='.'>
<re.Match object; span=(105, 106), match=' '>
<re.Match object; span=(106, 107), match='^'>
<re.Match object; span=(107, 108), match=' '>
<re.Match object; span=(108, 109), match='$'>
<re.Match object; span=(109, 110), match=' '>
<re.Match object; span=(110, 111), match='*'>
<re.Match object; span=(111, 112), match=' '>
<re.

In [14]:
# Four short words, one per line, to try a negated character set on.
string2 = """
pun
bun
one
won
"""

# [^p]un: any single character other than 'p', immediately followed by "un".
p6 = re.compile(r'[^p]un')
something_weird = p6.finditer(string2)

# Only "bun" qualifies: "pun" starts with the excluded 'p', and neither "one"
# nor "won" contains "un".
for found in something_weird:
    print(found)

<re.Match object; span=(5, 8), match='bun'>


In [17]:
# Quantifiers: {n} repeats the preceding token exactly n times, so the pattern
# reads "3 digits, any character, 3 digits, any character, 4 digits".
# The unescaped '.' accepts any separator, which is why '123*555*1234' matches
# alongside the dash-separated numbers.
ph_num = re.compile(r'\d{3}.\d{3}.\d{4}')
valid_numbers = ph_num.finditer(text_to_search)

for number in valid_numbers:
    print(number)

<re.Match object; span=(169, 181), match='123*555*1234'>
<re.Match object; span=(183, 195), match='877-500-1234'>
<re.Match object; span=(196, 208), match='980-555-1234'>
<re.Match object; span=(209, 221), match='930-234-3455'>


In [6]:
# Names with a title: "Mr", "Mrs" or "Ms", an optional '.', one whitespace
# character, then an uppercase letter followed by any further word characters.
# A name that starts with a lowercase letter (e.g. "Mrs. parker") is skipped.
pattern = re.compile(r'(Mr|Mrs|Ms)\.?\s[A-Z]\w*')
result = pattern.finditer(text_to_search)

for titled_name in result:
    print(titled_name)

<re.Match object; span=(223, 233), match='Mr. Oliver'>
<re.Match object; span=(234, 242), match='Mr Smith'>
<re.Match object; span=(243, 251), match='Ms Peter'>
<re.Match object; span=(264, 276), match='Mr. Galactus'>
<re.Match object; span=(277, 283), match='Mrs. U'>


**Questions**

1. **Replace substrings in a string** - Assume that a mailing list contains names that sometimes include a title (Mr., Mrs., Miss, or Ms.) along with a first and last name. Suppose you don't want to include the titles when you generate envelope labels from the list. Write a program that uses a regular expression to remove the titles from names. The program should ask the user to enter a name in the format described previously and then display the corresponding name without the title.

2. **Identify duplicated words** - Accidentally duplicating words is a common error that writers make. Use a regular expression to identify duplicated words in a string. For example, if the string is "`This is is a test string`", then the program should print out the word `is`.

- Let us build a simple e-mail validator

In [None]:
# Simple, deliberately permissive e-mail shape check:
#   <local part> @ <domain label> . <rest of the domain>
# The hyphen is placed last in the final character set so that it is plainly
# a literal '-' and cannot be mistaken for a range.
pattern = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9.-]+")
email = "a@a.a"

# fullmatch() succeeds only if the *entire* string matches. The previous
# search() with '^...$' also accepted input ending in a newline, because '$'
# matches just before a trailing "\n".
validator = pattern.fullmatch(email)

if validator:
    print("success")
else:
    print("Enter a valid email..")