# Regular Expression in Python

## import Library

In [1]:
import re

import pandas as pd

The following are valid written phone number formats:
- +1-555-555-31221
- 1-555-555-3121
- 555-555-3121
- +1(555)-555-3121
- +15555553121 

In [2]:
# Sample number in the bare 3-3-4 digit format (no country code).
my_phone_no = "555-555-3121"

In [3]:
# \d matches exactly one digit character.
pattern = r"\d"

In [4]:
re.findall(pattern, my_phone_no)
# findall returns every non-overlapping match, so each digit comes back as its own string

['5', '5', '5', '5', '5', '5', '3', '1', '2', '1']

In [5]:
# + means "one or more", so each run of consecutive digits becomes a single match
pattern = r"\d+"
re.findall(pattern, my_phone_no)

['555', '555', '3121']

In [6]:
my_other_phone_no = "Hi there, my home number is 555-567-5309 and my cell number is +1-555-555-0007."
pattern = r"\d+"
# \d+ finds the digit runs but cannot keep a phone number together: '-' and '+' are not digits
re.findall(pattern, my_other_phone_no)

['555', '567', '5309', '1', '555', '555', '0007']

In [7]:
meeting_str = "Hey, give me a call at 8:30 on my cell +1-555-555-0007."
# Reuses the \d+ pattern from the previous cell; the digits of the time "8:30" are picked up too
re.findall(pattern, meeting_str)

['8', '30', '1', '555', '555', '0007']

In [8]:
meeting_str = "Hey, give me a call at 8:30 on my cell +1-555-555-0007."
# \+ is a literal plus sign; \d{n} means exactly n digits.
# Spelling out the full +D-DDD-DDD-DDDD shape keeps the number whole and skips "8:30".
pattern = r"\+\d{1}-\d{3}-\d{3}-\d{4}"
re.findall(pattern, meeting_str)

['+1-555-555-0007']

In [9]:
# No match: this pattern insists on the leading "+<digit>-", which my_phone_no does not have
re.findall(pattern, my_phone_no)

[]

In [10]:
# ? makes the preceding token optional, so the '+' and each '-' may now be absent

pattern = r"\+?\d{1}-?\d{3}-?\d{3}-?\d{4}"
re.findall(pattern, my_other_phone_no)

['+1-555-555-0007']

In [11]:
# Still no match: the pattern needs 11 digits (country code included) and my_phone_no has only 10
re.findall(pattern, my_phone_no)

[]

In [12]:
# \( and \) are literal parentheses; adding ? lets the area code be written with or without them

pattern = r"\+?\d{1}-?\(?\d{3}\)?-?\d{3}-?\d{4}"
re.findall(pattern, my_other_phone_no)

['+1-555-555-0007']

In [13]:
# The country-code digit is still required, so the 10-digit my_phone_no is not matched
re.findall(pattern, my_phone_no)

[]

In [14]:
phone_numbers = "My phone numbers are +1-555-555-31221, 1-555-555-3121,  555-555-3121, +1(555)-555-3121, +15555553121"
# Two things to notice in the result: the extra trailing digit of the first number is left out,
# and "555-555-3121" (no country code) is not matched at all.
re.findall(pattern, phone_numbers)

['+1-555-555-3122', '1-555-555-3121', '+1(555)-555-3121', '+15555553121']

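- `\d` -> `[0-9]`

Both represent a single digit in a regular expression. (For `str` patterns, Python's `\d` also matches other Unicode decimal digits unless the `re.ASCII` flag is set.)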

In [15]:
# Same pattern with [0-9] written out in place of \d for the last three parts; the result is unchanged
pattern = r"\+?\d{1}-?\(?[0-9]{3}\)?-?[0-9]{3}-?[0-9]{4}"
re.findall(pattern, phone_numbers)

['+1-555-555-3122', '1-555-555-3121', '+1(555)-555-3121', '+15555553121']

## Groups

`<country-code>-<area-code>-<exchange-code>-<line-number>`

This represents:
1-212-555-5123
- 1 is the country code
- 212 is the area code
- 555 is the exchange code
- 5123 is the line number 

In [16]:
# Parentheses () create a capturing group in a regular expression.
# The fragments are raw strings (r"...") so that backslashes such as \+ and \d
# reach the regex engine untouched; in a normal string literal they are
# invalid escape sequences and make Python emit a warning.

group_1 = r"(\+?\d{1}-?)"  # country code: optional '+', one digit, optional '-'
group_2 = r"([0-9]{3}-?)"  # area code: three digits, optional '-'
group_3 = r"([0-9]{3}-?)"  # exchange code: three digits, optional '-'
group_4 = r"([0-9]{4})"    # line number: four digits

example = '+1-212-555-5123'
# Compile in one step so `grouped_pattern` is only ever a compiled pattern.
grouped_pattern = re.compile(f"{group_1}{group_2}{group_3}{group_4}")

# match() only succeeds if the pattern matches at the start of the string.
matched = grouped_pattern.match(example)
if matched:
    print('group', matched.group())    # the whole matched text
    print('groups', matched.groups())  # one entry per capturing group

# With capturing groups, findall() returns one tuple of groups per match.
print("example", grouped_pattern.findall(phone_numbers))

group +1-212-555-5123
groups ('+1-', '212-', '555-', '5123')
example [('+1-', '555-', '555-', '3122'), ('1-', '555-', '555-', '3121'), ('+1', '555', '555', '3121')]


In [17]:
# Indexing a match with 0 returns the whole matched text
matched[0]

'+1-212-555-5123'

In [18]:
# Indexes 1..4 return the individual capturing groups, in pattern order
matched[1], matched[2], matched[3], matched[4]

('+1-', '212-', '555-', '5123')

In [19]:
# group(0) is the method form of matched[0]: the whole match
matched.group(0)

'+1-212-555-5123'

In [20]:
 # group(n) returns the n-th capturing group, like matched[n]
 matched.group(1), matched.group(2)

('+1-', '212-')

## Named Groups

In [21]:
# Named groups use (?P<name>...), so each part can be read back by name.
# The separators ('+', '-', '(' and ')') sit outside the groups, so only the
# digits are captured. Raw strings (r"...") keep \+, \d, \( and \) intact;
# in a normal string literal they are invalid escape sequences.
group_1 = r"\+?-?(?P<country_code>\d{1})-?"
group_2 = r"\(?(?P<region_code>[0-9]{3})\)?-?"
group_3 = r"(?P<exchange_code>[0-9]{3})-?"
group_4 = r"(?P<line_number>[0-9]{4})"

example = '+1-(212)-555-5123'
# group_1 is required as written; it would have to be made optional for the
# pattern to also match numbers that have no country code.
grouped_pattern = re.compile(f"{group_1}{group_2}{group_3}{group_4}")

# match() only succeeds if the pattern matches at the start of the string.
matched = grouped_pattern.match(example)
if matched:
    print('group', matched.group())    # the whole matched text
    print('groups', matched.groups())  # captured digits only, without separators

# findall() returns one tuple of captured groups per match.
print("example", grouped_pattern.findall(phone_numbers))

group +1-(212)-555-5123
groups ('1', '212', '555', '5123')
example [('1', '555', '555', '3122'), ('1', '555', '555', '3121'), ('1', '555', '555', '3121'), ('1', '555', '555', '3121')]


In [22]:
# groupdict() maps each group name to the text it captured
matched.groupdict()

{'country_code': '1',
 'region_code': '212',
 'exchange_code': '555',
 'line_number': '5123'}

In [23]:
# A named group can be read by position or by name; both give the same text
matched[1], matched['country_code']

('1', '1')

In [24]:
# Group 2 is the group named 'region_code'
matched[2], matched['region_code']

('212', '212')

In [25]:
# finditer() yields a match object per match, so the named groups stay available
# (findall() only returns plain tuples)
for m in grouped_pattern.finditer(phone_numbers):
    print(m.groupdict())

{'country_code': '1', 'region_code': '555', 'exchange_code': '555', 'line_number': '3122'}
{'country_code': '1', 'region_code': '555', 'exchange_code': '555', 'line_number': '3121'}
{'country_code': '1', 'region_code': '555', 'exchange_code': '555', 'line_number': '3121'}
{'country_code': '1', 'region_code': '555', 'exchange_code': '555', 'line_number': '3121'}


In [26]:
# One record per match: the named groups plus the full matched text.
dataset = [
    {**found.groupdict(), 'phone_number': found.group(0)}
    for found in grouped_pattern.finditer(phone_numbers)
]
dataset

[{'country_code': '1',
  'region_code': '555',
  'exchange_code': '555',
  'line_number': '3122',
  'phone_number': '+1-555-555-3122'},
 {'country_code': '1',
  'region_code': '555',
  'exchange_code': '555',
  'line_number': '3121',
  'phone_number': '1-555-555-3121'},
 {'country_code': '1',
  'region_code': '555',
  'exchange_code': '555',
  'line_number': '3121',
  'phone_number': '+1(555)-555-3121'},
 {'country_code': '1',
  'region_code': '555',
  'exchange_code': '555',
  'line_number': '3121',
  'phone_number': '+15555553121'}]

In [27]:
# Build a DataFrame with one row per matched phone number; the dict keys in
# `dataset` become the column names.
# pandas (`pd`) is imported in the imports cell at the top of the notebook.
df = pd.DataFrame(dataset)
df

Unnamed: 0,country_code,region_code,exchange_code,line_number,phone_number
0,1,555,555,3122,+1-555-555-3122
1,1,555,555,3121,1-555-555-3121
2,1,555,555,3121,+1(555)-555-3121
3,1,555,555,3121,+15555553121


## What about letters

In [28]:
my_text = "Hello World, I have score of 10/10. How cool is that?"

# [a-z] matches one lowercase letter a-z; capitals such as 'H' and 'W' are skipped
pattern = r"[a-z]"
re.findall(pattern, my_text)

['e',
 'l',
 'l',
 'o',
 'o',
 'r',
 'l',
 'd',
 'h',
 'a',
 'v',
 'e',
 's',
 'c',
 'o',
 'r',
 'e',
 'o',
 'f',
 'o',
 'w',
 'c',
 'o',
 'o',
 'l',
 'i',
 's',
 't',
 'h',
 'a',
 't']

In [29]:
# Adding A-Z to the character class picks up the uppercase letters as well
pattern = r"[a-zA-Z]"
re.findall(pattern, my_text)

['H',
 'e',
 'l',
 'l',
 'o',
 'W',
 'o',
 'r',
 'l',
 'd',
 'I',
 'h',
 'a',
 'v',
 'e',
 's',
 'c',
 'o',
 'r',
 'e',
 'o',
 'f',
 'H',
 'o',
 'w',
 'c',
 'o',
 'o',
 'l',
 'i',
 's',
 't',
 'h',
 'a',
 't']

In [30]:
# + groups consecutive letters, which turns the single characters into whole words
pattern = r"[a-zA-Z]+"
re.findall(pattern, my_text)

['Hello', 'World', 'I', 'have', 'score', 'of', 'How', 'cool', 'is', 'that']

In [31]:
# \w also matches digits (and the underscore), so the two "10"s now appear in the result
pattern = r"\w+"
re.findall(pattern, my_text)

['Hello',
 'World',
 'I',
 'have',
 'score',
 'of',
 '10',
 '10',
 'How',
 'cool',
 'is',
 'that']

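`\w` -> `[0-9a-zA-Z_]`

They are almost the same: `\w` matches letters, digits **and the underscore** (and, for `str` patterns, other Unicode word characters unless `re.ASCII` is set).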

In [32]:
# A space and a dot inside [...] are literal characters, so matches can span several words;
# ',' and '/' are not in the class, which is where the text gets split
pattern = r"[0-9a-zA-Z .]+"
re.findall(pattern, my_text)

['Hello World', ' I have score of 10', '10. How cool is that']

## Metacharacters

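- `^` matches the start of the string
- `[^0-9]` matches any character except `0-9`; inside `[...]` a leading `^` negates the set
- `$` matches the end of the string
- `+` one or more repetitions of the preceding item
- `*` zero or more repetitions of the preceding item
- `?` makes the preceding item optional (zero or one occurrence)
- `|` or operator (alternation)
- `\d` -> `[0-9]` -> digits
- `\D` -> `[^0-9]` -> not digits
- `\s` whitespace
- `\S` not whitespace
- `\w` -> `[0-9a-zA-Z_]` -> word characters (letters, digits and underscore)
- `\W` -> `[^0-9a-zA-Z_]` -> not word characters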
