### finding patterns of text without regular expressions

In [9]:
# phone number in a string
# xxx-xxx-xxxx valid number

def isPhoneNumber(text):
    """Return True if ``text`` has exactly the form ``ddd-ddd-dddd``.

    Each ``d`` position must satisfy ``str.isdecimal()`` and the two
    separators must be literal dashes. Anything that is not exactly
    12 characters long is rejected immediately.
    """
    # 'd' marks a slot that must hold a decimal digit; any other
    # character in the layout must appear literally at that slot.
    layout = 'ddd-ddd-dddd'

    if len(text) != len(layout):
        return False

    # Walk the text and the layout in lockstep, left to right.
    for char, expected in zip(text, layout):
        if expected == 'd':
            if not char.isdecimal():
                return False
        elif char != expected:
            return False

    return True

In [10]:
# Each entry pairs the question to display with the string to validate.
checks = (
    ('415-555-4242 is a phone number?', '415-555-4242'),
    ('454-333-333 is a phone number?', '454-333-333'),
)

for position, (question, candidate) in enumerate(checks):
    # Separate consecutive checks with the same blank gap as before.
    if position:
        print('\n')
    print(question)
    print(isPhoneNumber(candidate))

415-555-4242 is a phone number?
True


454-333-333 is a phone number?
False


In [11]:
# Scan a longer piece of text for embedded phone numbers.
message = 'Call me at 415-555-4242 tommoron. 415-444-2222 is my office'

# Slide a 12-character window over the message, one start index at a
# time, and report every window that validates as a phone number.
window = 12
for start in range(len(message)):
    number = message[start:start + window]
    if not isPhoneNumber(number):
        continue
    print('Mob number found: ' + number)

print('All Ok!')

Mob number found: 415-555-4242
Mob number found: 415-444-2222
All Ok!


## finding patterns of text with regular expressions

In [12]:
# RegEx :
#     \d  : any digit character (0, 1, 2, 3, 4, 5, 6, 7, 8, 9)
#     {n} : repeat the preceding item exactly n times, e.g. \d{3}-\d{3}-\d{4}

#### creating regex objects

In [13]:
import re

In [14]:
# Pattern: three digits, a dash, three digits, a dash, four digits.
phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')

# search() scans the string and yields a match object for the first hit.
sample_text = 'My number is 415-234-4567.'
mo = phoneNumRegex.search(sample_text)

# group() with no argument is the full matched text.
matched_number = mo.group()
print('phone number found: ' + matched_number)

phone number found: 415-234-4567


In [15]:
# Three blocks of four digits separated by single spaces.
aadharRegex = re.compile(r'\d{4} \d{4} \d{4}')

# Text to scan for the number.
aadharNum = 'aadhar format look like this -> 4444 3333 6666'

# Match object for the first occurrence of the pattern.
mo = aadharRegex.search(aadharNum)

matched_text = mo.group()
print('search found.', matched_text)

search found. 4444 3333 6666


## more pattern matching with regex

### grouping with parentheses


In [16]:
# Group 1 captures the area code, group 2 the rest of the number.
phoneNumRegex = re.compile(r'(\d\d\d)-(\d\d\d-\d\d\d\d)')
mo = phoneNumRegex.search('my number is 415-555-5555')

# Show each group, the whole match (group(0) and group() are the same),
# and finally the tuple of all captured groups -- one item per line.
print(
    mo.group(1),
    mo.group(2),
    mo.group(0),
    mo.group(),
    mo.groups(),
    sep='\n',
)

# groups() unpacks directly into one name per capture group.
areaCode, mainNumber = mo.groups()
print(areaCode, mainNumber, sep='\n')

415
555-5555
415-555-5555
415-555-5555
('415', '555-5555')
415
555-5555


In [17]:
# \( and \) match literal parentheses; the unescaped ones create groups.
phoneNumRegex = re.compile(r'(\(\d\d\d\)) (\d\d\d-\d\d\d\d)')
mo = phoneNumRegex.search('my phone number is (415) 555-4442.')

# group(1, 2) returns both captures at once; print one per line.
print(*mo.group(1, 2), sep='\n')

(415)
555-4442


### matching multiple groups with pipes

    - | -> pipe

In [18]:
# The pipe matches either alternative; search() reports whichever
# occurs first in the searched text.
heroRegex = re.compile(r'Batman|Tina Fey')

mo1, mo2 = (
    heroRegex.search(sentence)
    for sentence in ('Batman and Tina Fey', 'Tina Fey and Batman')
)

for found in (mo1, mo2):
    print(found.group())

Batman
Tina Fey


In [19]:
# A shared 'Bat' prefix followed by one of several alternative suffixes.
batRegex = re.compile(r'Bat(man|mobile|copter|bat)')

mo = batRegex.search('Batmobile and Batman')

# Group 0 is the whole match; group 1 is just the suffix that matched.
whole_match, suffix = mo.group(0, 1)
print(whole_match)
print(suffix)

Batmobile
mobile


### optional matching with question marks
        - ? -> optional

In [20]:
# '(wo)?' makes the 'wo' group optional: zero or one occurrence.
batRegex = re.compile(r'Bat(wo)?man')

mo1 = batRegex.search('The adventure of Batman')
mo2 = batRegex.search('The adventure of Batwoman')

for found in (mo1, mo2):
    print(found.group())

Batman
Batwoman


In [21]:
# The leading '(\d{3}-)?' makes the area code optional. Writing each
# \d{n} out as repeated \d would be an equivalent, longer spelling.
phoneNumRegex = re.compile(r'(\d{3}-)?\d{3}-\d{4}')

mo1, mo2 = (
    phoneNumRegex.search(line)
    for line in ('phone number : 111-555-3333', 'phone number : 444-2222')
)

print(mo1.group(), mo2.group(), sep='\n')

111-555-3333
444-2222


### matching zero or more with the star *
        - * -> zero or more

In [22]:
# '(wo)*' allows the 'wo' group to repeat any number of times, including zero.
batRegex = re.compile(r'Bat(wo)*man')

mo1, mo2, mo3 = (
    batRegex.search(title)
    for title in (
        'The Adventure of Batman',
        'The adventure of Batwoman',
        'The adventure of Batwowowowoman',
    )
)

for found in (mo1, mo2, mo3):
    print(found.group())

Batman
Batwoman
Batwowowowoman


### matching one or more with the plus
        - + -> one or more

In [23]:
# '(wo)+' requires the 'wo' group to appear one or more times.
batRegex = re.compile(r'Bat(wo)+man')

mo1 = batRegex.search('the adventure of Batwoman')
print(mo1.group())

mo2 = batRegex.search('the adventure of Batwowowowoman')
print(mo2.group())

# At least one 'wo' is required, so plain 'Batman' does not match and
# search() returns None instead of a match object.
mo3 = batRegex.search('the adventure of Batman')

# Test for None by identity, not equality (PEP 8).
print(mo3 is None)

Batwoman
Batwowowowoman
True


In [24]:
# Escaping the plus (\+) makes it match a literal '+' character
# instead of acting as the one-or-more quantifier.
batRegex = re.compile(r'Bat(wo)\+man')

sample_text = 'Batwo+man string'
mo4 = batRegex.search(sample_text)

matched_text = mo4.group()
print(matched_text)

Batwo+man


### Matching Specific Repetitions with Curly Brackets
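        - (Ha){3}   -> exactly 3 repetitions
        - (Ha){3,5} -> between 3 and 5 repetitions
        - (Ha){3,}  -> 3 or more repetitions
        - (Ha){,5}  -> at most 5 repetitions (zero to 5)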

In [32]:
# '(Ha){3}' matches the group 'Ha' repeated exactly three times.
haRegex = re.compile(r'(Ha){3}')

mo1 = haRegex.search('HaHaHa')

print(mo1.group())

# A single 'Ha' is too short, so search() returns None.
mo2 = haRegex.search('Ha')

# Test for None by identity, not equality (PEP 8).
print(mo2 is None)

HaHaHa
True


In [38]:
# '(Ha){3,5}' matches between three and five repetitions of 'Ha'.
haRegex = re.compile(r'(Ha){3,5}')

mo1, mo2, mo3 = (
    haRegex.search(laugh)
    for laugh in ('HaHaHa', 'HaHaHaHa', 'HaHaHaHaHa')
)

for found in (mo1, mo2, mo3):
    print(found.group())

HaHaHa
HaHaHaHa
HaHaHaHaHa


In [43]:
# '(Ha){3,}' matches three or more repetitions of 'Ha' (no upper limit).
haRegex = re.compile(r'(Ha){3,}')

mo1 = haRegex.search('HaHaHa')
mo2 = haRegex.search('HaHaHaHa')
mo3 = haRegex.search('HaHaHaHaHaHaHa')

print(mo1.group(), mo2.group(), mo3.group(), sep='\n')

HaHaHa
HaHaHaHa
HaHaHaHaHaHaHa


In [48]:
# '(Ha){,5}' matches zero to five repetitions of 'Ha', so it matches
# the empty string and stops after five repetitions in a longer run.
haRegex = re.compile(r'(Ha){,5}')

mo1 = haRegex.search('')
mo2 = haRegex.search('HaHaHa')
mo3 = haRegex.search('HaHaHaHaHaHa')

print(mo1.group(), mo2.group(), mo3.group(), sep='\n')


HaHaHa
HaHaHaHaHa
