## Finding patterns of text without regular expressions

In [11]:
def isPhoneNumber(text):
    """Return True if ``text`` has the exact form ``###-###-####``.

    The string must be exactly 12 characters long: decimal digits everywhere
    except positions 3 and 7, which must be dashes.

    Args:
        text: The candidate string to test.

    Returns:
        True when every position passes its check, otherwise False.
    """
    if len(text) != 12:  # 10 digits plus 2 dashes
        return False
    for position, char in enumerate(text):
        if position in (3, 7):
            # Separator positions must hold a dash.
            if char != '-':
                return False
        elif not char.isdecimal():
            return False
    return True

In [15]:
# Test the same number that the prompt announces.
print('Is 415-666-4242 a phone number?')
print(isPhoneNumber('415-666-4242'))

Is 415-666-4242 a phone number?
True


In [20]:
message = 'Call me at 702-222-3312 tomorrow, 702-333-2323 is my office bruh.'
# Slide a 12-character window over the message and test each position.
for start in range(len(message)):
    candidate = message[start:start + 12]
    if not isPhoneNumber(candidate):
        continue
    print(f"phone number is {candidate}")
print("LIT")

phone number is 702-222-3312
phone number is 702-333-2323
LIT


## Use regex

In [24]:
import re

In [32]:
# Compile the pattern once, then look for the first ###-###-#### run in the text.
phoneNumMatch = re.compile(r'\d{3}-\d{3}-\d{4}')
mo = phoneNumMatch.search('My number is: 702-777-7777')
print('Phone number fund is: ' + mo.group(0))  # group(0) is the whole match

Phone number fund is: 702-777-7777


#### Grouping and parentheses

In [42]:
phoneNumRex = re.compile(r'(\d{3})-(\d{3}-\d{4})')
mo = phoneNumRex.search('My number is: 702-777-7777')
# Group 1 captures the area code, group 2 the rest of the number.
areaCode = mo.group(1)
mainNum = mo.group(2)
mainNum

'777-7777'

#### Matching literal parentheses (escape them as \( and \))

In [3]:
import re
# \( and \) match literal parentheses in the text; the unescaped ( ) still
# create capture groups.
phoneNumParen = re.compile(r'(\(\d{3}\)) (\d{3}-\d{4})')
mo = phoneNumParen.search('My number is: (702) 777-7777')
print(f'the number has been found is: {mo.group()}')

the number has been found is: (702) 777-7777


#### pipe |

In [4]:
# Whitespace around | is part of the pattern, so the alternatives are written
# without padding. search() returns whichever alternative occurs first in the text.
nameRexgex = re.compile(r'David|Tina Fey')
mo = nameRexgex.search('Tina Fey and David lol')
mo.group()

'Tina Fey'

In [8]:
# Parentheses scope the alternation: the literal prefix 'Bat' must be followed
# by exactly one of the listed suffixes.
nameRegex1 = re.compile(r'Bat(man|mobile|lol|haha|eww)')
mo = nameRegex1.search('Ich libBateww')
mo.group()

'Bateww'

#### Optional matching with ?: the group in parentheses before ? may appear zero or one time

In [57]:
# (wo)? makes the 'wo' group optional, so both 'batman' and 'batwoman' match.
batRegex = re.compile(r'bat(wo)?man')
mo = batRegex.search('batwoman')
print(f'the text has been found is: {mo.group()}')

the text has been found is: batwoman


##### Another example

In [59]:
# The area-code prefix (\d{3}-) is optional: numbers with or without it match.
phoneRegex = re.compile(r'(\d{3}-)?\d{3}-\d{4}')
mo = phoneRegex.search('My number is 702-417-2422')
print(f'the text has been found is: {mo.group()}')

the text has been found is: 702-417-2422


#### Zero or more with *: the group in () may be absent or repeated any number of times

In [12]:
# (\d{3}-)* lets the three-digit prefix group repeat zero or more times.
phoneRegex = re.compile(r'(\d{3}-)*\d{3}-\d{4}')
mo = phoneRegex.search('My number is 702-417-2422')
print(f'the text has been found is: {mo.group()}')

the text has been found is: 702-417-2422


## "*" can be used where "?" was: it matches zero or more repetitions

In [67]:
# (wo)* accepts zero or more 'wo' groups: 'batman', 'batwoman', 'batwowoman', ...
batRegex = re.compile(r'bat(wo)*man')
mo = batRegex.search('batwoman')
print(f'the text has been found is: {mo.group()}')

the text has been found is: batwoman


#### plus sign + means match ONE or more.

In [70]:
# (wo)+ requires at least one 'wo', so plain 'batman' would not match.
batRegex = re.compile(r'bat(wo)+man')
mo = batRegex.search('batwowowoman')
print(f'the text has been found is: {mo.group()}')

the text has been found is: batwowowoman


#### Matching specific repetitions | Greedy and non-greedy

In [72]:
# {3,5} matches three to five repetitions of the group. Quantifiers are greedy
# by default, so the longest available run is taken.
greedyHaRegex = re.compile(r'(Ha){3,5}')
mo = greedyHaRegex.search("HaHaHaHaHa")
mo.group()

'HaHaHaHaHa'

In [75]:
# A ? directly after a quantifier makes it non-greedy (shortest run wins). This
# is unrelated to the ? that marks a group as optional.
nongreedyHaRegex = re.compile(r'(Ha){3,5}?')  # trailing ? declares a non-greedy match
mo = nongreedyHaRegex.search("HaHaHaHaHa")
mo.group()

'HaHaHa'

## findall() returns every match of the pattern, not just the first one

# The pattern has no groups, so findall() returns a list of the matched strings.
# Note that `mo` is therefore a list here, not a Match object.
phoneRegex = re.compile(r'\d{3}-\d{3}-\d{4}')
mo = phoneRegex.findall("My number is 702-417-2422, his is 626-333-8888")
mo

#### Character Class

In [80]:
# [aeiouAEIOU] is a character class: it matches any single vowel, in either case.
vowelRegex = re.compile(r'[aeiouAEIOU]')
vowelRegex.findall('Robo hjkhgjai ieuushe')

['o', 'o', 'a', 'i', 'i', 'e', 'u', 'u', 'e']

#### A negative character class matches any character that is NOT in the class

In [2]:
import re
# A leading ^ inside the brackets negates the class: every character that is
# not a vowel matches, including spaces.
vowelRegex1 = re.compile(r'[^aeiouAEIOU]')
vowelRegex1.findall('Robo Golf is some good shiet')

['R',
 'b',
 ' ',
 'G',
 'l',
 'f',
 ' ',
 's',
 ' ',
 's',
 'm',
 ' ',
 'g',
 'd',
 ' ',
 's',
 'h',
 't']

### Caret sign and Dollar sign

##### A caret inside brackets, as in [^aeiou], means "any character except a, e, i, o, u"

##### A caret OUTSIDE brackets, at the start of the regex, means the match must begin at the start of the string

In [3]:
# ^ anchors the pattern to the start of the string.
beginWithHello = re.compile(r'^hello')
beginWithHello.search("hello there")

<re.Match object; span=(0, 5), match='hello'>

##### $ at the end of a regex means the string must end with the pattern

##### Using ^ and $ together means the entire string must match the regex

In [10]:
# ^ and $ together force the whole string to be exactly one digit.
# Use `is None` (identity) rather than `== None` to test for "no match".
endWithNumber = re.compile(r'^\d$')
endWithNumber.search("4") is None

False

##### r'^\d+$' matches strings that consist entirely of one or more numeric characters

In [12]:
# \d+ between ^ and $ accepts one or more digits and nothing else.
# Use `is None` (identity) rather than `== None` to test for "no match".
endWithNumber = re.compile(r'^\d+$')  # can be one or more numbers
endWithNumber.search("fdsf1") is None

True

### Wildcard character

##### . (dot) is the wildcard: it matches any single character except a newline

In [14]:
# .at matches any single non-newline character followed by 'at'. The dot takes
# exactly one character, so 'flat' yields 'lat' and ' ats' yields ' at'.
atRegex = re.compile(r'.at')
atRegex.findall('rat bat sss ats boom mat flat')

['rat', 'bat', ' at', 'mat', 'lat']

##### The dot matches exactly one character. To match a literal dot, escape it with a backslash: \.

##### .* means "anything": . matches any character and * means zero or more of the preceding item

In [32]:
import re
# Each (.*) captures whatever text sits in that slot; group(2) is the second capture.
nameRegex = re.compile(r'First Name: (.*) Last Name: (.*)')  # the literal text around each (.*) must match the input exactly
mo = nameRegex.search('First Name: Al Last Name: Sweigart')
mo.group(2)

'Sweigart'

In [40]:
# .*? is non-greedy, so the match stops at the first '>'.
nongreedyRegex = re.compile(r'<.*?>')   # without the ?, the greedy .* would run on to the last '>'
mo = nongreedyRegex.search('<To serve man> for dinner>')
mo.group()

'<To serve man>'

### Matching newline

In [48]:
# re.DOTALL lets . match newline characters too, so .* can span several lines.
newlineRegex = re.compile('.*', re.DOTALL)

In [49]:
# With DOTALL the match covers the whole string, embedded newlines included.
newlineRegex.search('Serfdfsdf, fdfasfadfadfd\nfdfasffdaf\nwiueonlsm').group()

'Serfdfsdf, fdfasfadfadfd\nfdfasffdaf\nwiueonlsm'

# Project

##### phone number

In [104]:
import re
# Phone-number pattern in verbose mode. Group numbers as seen by Match.group();
# findall() tuples hold the same groups shifted down by one (index 0 = group 1).
# The extension keyword is written as ext\.?|x so the dot is a literal period;
# an unescaped "ext." would accept any character after "ext".
phoneRegex = re.compile(r'''(
                        (\d{3}|\(\d{3}\))?              # group 2: optional area code, bare or in parentheses
                        (\s|-|\.)?                      # group 3: optional separator
                        (\d{3})                         # group 4: first three digits
                        (\s|-|\.)                       # group 5: separator
                        (\d{4})                         # group 6: last four digits
                        (\s*(ext\.?|x)\s*(\d{2,5}))?    # groups 7-9: optional extension, its keyword, its digits
                        )''', re.VERBOSE)
                        
                        

In [105]:
# Group 8 is the extension keyword and group 9 would be the extension digits.
# This number has no extension, so the value is None and the cell shows no output.
phoneRegex.search('My number is: (702) 777-7777').group(8)

In [106]:
# Email pattern in verbose mode. Group 1 is the whole address, group 2 the
# top-level domain. The TLD length has no upper bound so that long TLDs such as
# .technology are matched in full instead of being cut off after four letters.
emailRegex = re.compile(r'''([a-zA-Z0-9._%+-]+      # user name
                        @
                        [a-zA-Z0-9.-]+              # domain name
                        (\.[a-zA-Z]{2,})            # dot plus top-level domain
                        )''',re.VERBOSE)

In [107]:
# group() with no argument returns the entire matched address.
emailRegex.search('My number is: sdhskjd1234@hotmail.com').group()

'sdhskjd1234@hotmail.com'

In [108]:
# Use the explicit %pip magic so the package is installed into the environment
# of the running kernel, without relying on IPython's automagic setting.
%pip install pyperclip

Note: you may need to restart the kernel to use updated packages.


In [109]:
import pyperclip
# Read the current clipboard contents as the text to scan.
# NOTE(review): str() looks like a defensive cast; paste() presumably returns text already. Confirm.
text = str(pyperclip.paste())

In [112]:
# Collect every phone number and email address found in the clipboard text.
matches = []
for groups in phoneRegex.findall(text):
    # findall() yields one tuple per match, indexed from 0:
    # [1] area code, [3] first three digits, [5] last four digits,
    # [8] extension digits. The area code is optional, so empty parts are
    # skipped to avoid a leading '-' on seven-digit numbers.
    phoneNum = '-'.join(part for part in (groups[1], groups[3], groups[5]) if part)
    if groups[8] != '':
        phoneNum += ' x' + groups[8]
    matches.append(phoneNum)
for groups in emailRegex.findall(text):
    # Index 0 of each tuple is the full address.
    matches.append(groups[0])

In [113]:
# Put the results on the clipboard and echo them, or report that nothing was found.
if not matches:
    print('No phone numbers or email found')
else:
    joined = '\n'.join(matches)
    pyperclip.copy(joined)
    print('Copied to clipboard: ')
    print(joined)

Copied to clipboard: 
800-420-7240
415-863-9900
415-863-9950
info@nostarch.com
media@nostarch.com
academic@nostarch.com
conferences@nostarch.com
info@nostarch.com
