### finding patterns of text without regular expressions

In [9]:
# phone number in a string
# xxx-xxx-xxxx valid number

def isPhoneNumber(text):
    """Check whether ``text`` is shaped exactly like ``xxx-xxx-xxxx``.

    The value must be 12 characters long, with a hyphen at index 3 and at
    index 7 and a decimal digit (as judged by ``str.isdecimal``) at every
    other index.

    Returns:
        True when every position matches that layout, otherwise False.
    """
    if len(text) != 12:
        return False

    # Walk the characters left to right; only indexes 3 and 7 hold hyphens.
    for position, char in enumerate(text):
        if position in (3, 7):
            if char != '-':
                return False
        elif not char.isdecimal():
            return False

    return True

In [10]:
# Try isPhoneNumber() on one well-formed number and one that is too short.
print('415-555-4242 is a phone number?') 
print(isPhoneNumber('415-555-4242'))
print('\n')  # print() adds its own newline too, so this leaves two blank lines
print('454-333-333 is a phone number?') 
print(isPhoneNumber('454-333-333'))  # only 11 characters, so the length check fails

415-555-4242 is a phone number?
True


454-333-333 is a phone number?
False


In [11]:
# Scan a longer text: test the 12-character window starting at every index.
message = 'Call me at 415-555-4242 tommoron. 415-444-2222 is my office'

for i in range(len(message)):
    number = message[i:i + 12]  # windows near the end are shorter than 12
    if not isPhoneNumber(number):
        continue
    print(f'Mob number found: {number}')

print('All Ok!')

Mob number found: 415-555-4242
Mob number found: 415-444-2222
All Ok!


## finding patterns of text with regular expressions

In [12]:
RegEx : 
    \d  : any digit char (0,1,2,3,4,5,6,7,8,9)
    {n} : repeat the preceding item exactly n times, e.g. \d{3}-\d{3}-\d{4}

SyntaxError: invalid syntax (<ipython-input-12-d137d6a28068>, line 1)

#### creating regex objects

In [13]:
import re

In [14]:
# Compile the pattern into a regex object: three digits, hyphen, three digits,
# hyphen, four digits.
phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')

# matching regex objects: search() returns a match object for the first hit
mo = phoneNumRegex.search('My number is 415-234-4567.')

# group() with no argument returns the whole matched text
print('phone number found: '+ mo.group())

phone number found: 415-234-4567


In [15]:
aadharNum = 'aadhar format look like this -> 4444 3333 6666'

# regex object: three runs of four digits separated by single spaces
aadharRegex = re.compile(r'\d{4} \d{4} \d{4}')

# match object for the first occurrence inside aadharNum
mo = aadharRegex.search(aadharNum)

print('search found.', mo.group())

search found. 4444 3333 6666


## more pattern matching with regex

### grouping with parentheses


In [16]:
# Each pair of parentheses creates a numbered group: (area code)-(main number)
phoneNumRegex = re.compile(r'(\d\d\d)-(\d\d\d-\d\d\d\d)')
mo = phoneNumRegex.search('my number is 415-555-5555')

print(mo.group(1))  # text of the first group
print(mo.group(2))  # text of the second group
print(mo.group(0))  # group 0 is the entire match
print(mo.group())   # same as group(0)
print(mo.groups())  # tuple holding every group's text

# groups() returns a tuple, so it can be unpacked into separate names
areaCode, mainNumber = mo.groups()
print(areaCode)
print(mainNumber)

415
555-5555
415-555-5555
415-555-5555
('415', '555-5555')
415
555-5555


In [17]:
# \( and \) match literal parentheses in the text; the unescaped parentheses
# still define the groups.
phoneNumRegex = re.compile(r'(\(\d\d\d\)) (\d\d\d-\d\d\d\d)')
mo = phoneNumRegex.search('my phone number is (415) 555-4442.')

print(mo.group(1))
print(mo.group(2))

(415)
555-4442


### matching multiple groups with pipes

    - | -> pipe (matches either the expression on its left or the one on its right)

In [18]:
# Either alternative may match; search() returns whichever occurs first in the text.
heroRegex = re.compile(r'Batman|Tina Fey')

mo1 = heroRegex.search('Batman and Tina Fey')
print(mo1.group())

mo2 = heroRegex.search('Tina Fey and Batman')
print(mo2.group())

Batman
Tina Fey


In [19]:
# The shared prefix 'Bat' is written once; the group lists the possible endings.
batRegex = re.compile(r'Bat(man|mobile|copter|bat)')

mo = batRegex.search('Batmobile and Batman')

print(mo.group())   # full matched text
print(mo.group(1))  # only the ending that matched inside the parentheses

Batmobile
mobile


### optional matching with question marks
        - ? -> optional

In [20]:
# (wo)? makes the 'wo' group optional: it may appear zero times or once.
batRegex = re.compile(r'Bat(wo)?man')

mo1 = batRegex.search('The adventure of Batman')

print(mo1.group())

mo2 = batRegex.search('The adventure of Batwoman')
print(mo2.group())

Batman
Batwoman


In [21]:
# The leading area code (three digits plus hyphen) is optional.
phoneNumRegex = re.compile(r'(\d{3}-)?\d{3}-\d{4}')
# equivalent pattern without the {n} shorthand:
# phoneNumRegex = re.compile(r'(\d\d\d-)?\d\d\d-\d\d\d\d')

mo1 = phoneNumRegex.search('phone number : 111-555-3333')
print(mo1.group())

mo2 = phoneNumRegex.search('phone number : 444-2222')
print(mo2.group())

111-555-3333
444-2222


### matching zero or more with the star *
        - * -> zero or more

In [22]:
# (wo)* allows the 'wo' group any number of times, including none at all.
batRegex = re.compile(r'Bat(wo)*man')

mo1 = batRegex.search('The Adventure of Batman')

print(mo1.group())

mo2 = batRegex.search('The adventure of Batwoman')
print(mo2.group())

mo3 = batRegex.search('The adventure of Batwowowowoman')
print(mo3.group())

Batman
Batwoman
Batwowowowoman


### matching one or more with the plus
        - + -> one or more

In [23]:
# (wo)+ requires the 'wo' group to appear one or more times.
batRegex = re.compile(r'Bat(wo)+man')

mo1 = batRegex.search('the adventure of Batwoman')
print(mo1.group())

mo2 = batRegex.search('the adventure of Batwowowowoman')
print(mo2.group())

# requires at least one 'wo', so plain 'Batman' does not match and
# search() returns None
mo3 = batRegex.search('the adventure of Batman')

# None is a singleton: compare by identity, not equality
print(mo3 is None)

Batwoman
Batwowowowoman
True


In [24]:
# \+ matches a literal plus sign instead of acting as the "one or more" operator.
batRegex = re.compile(r'Bat(wo)\+man')

mo4 = batRegex.search('Batwo+man string')
print(mo4.group())

Batwo+man


### Matching Specific Repetitions with Curly Brackets
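        - (Ha){3}   -> exactly 3 repetitions
        - (Ha){3,5} -> 3 to 5 repetitions
        - (Ha){3,}  -> 3 or more repetitions
        - (Ha){,5}  -> 0 to 5 repetitions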

In [32]:
# (Ha){3} matches exactly three consecutive repetitions of 'Ha'.
haRegex = re.compile(r'(Ha){3}')

mo1 = haRegex.search('HaHaHa')

print(mo1.group())

# A single 'Ha' is too short, so search() returns None
mo2 = haRegex.search('Ha')

# None is a singleton: compare by identity, not equality
print(mo2 is None)

HaHaHa
True


In [38]:
# (Ha){3,5} matches between three and five repetitions of 'Ha'.
haRegex = re.compile(r'(Ha){3,5}')

mo1 = haRegex.search('HaHaHa')
print(mo1.group())

mo2 = haRegex.search('HaHaHaHa')
print(mo2.group())

mo3 = haRegex.search('HaHaHaHaHa')
print(mo3.group())

HaHaHa
HaHaHaHa
HaHaHaHaHa


In [43]:
# (Ha){3,} has no upper bound: three or more repetitions of 'Ha'.
haRegex = re.compile(r'(Ha){3,}')

mo1 = haRegex.search('HaHaHa')
print(mo1.group())

mo2 = haRegex.search('HaHaHaHa')
print(mo2.group())

mo3 = haRegex.search('HaHaHaHaHaHaHa')
print(mo3.group())

HaHaHa
HaHaHaHa
HaHaHaHaHaHaHa


In [48]:
# (Ha){,5} has no lower bound: zero to five repetitions of 'Ha'.
haRegex = re.compile(r'(Ha){,5}')

# Zero repetitions is allowed, so even the empty string matches (prints a blank line)
mo1 = haRegex.search('')
print(mo1.group())

mo2 = haRegex.search('HaHaHa')
print(mo2.group())

# Six 'Ha' in the text, but the match stops after the allowed maximum of five
mo3 = haRegex.search('HaHaHaHaHaHa')
print(mo3.group())


HaHaHa
HaHaHaHaHa


### greedy and nongreedy matching

In [52]:
# Greedy (default): with a 3-5 range the longest possible match is taken.
greedyRegex = re.compile(r'(Ha){3,5}')

mo1 = greedyRegex.search('HaHaHaHaHa')
print(mo1.group())



# Non-greedy: a ? after the braces makes the shortest possible match win.
nongreedyRegex = re.compile(r'(Ha){3,5}?')
mo2 = nongreedyRegex.search('HaHaHaHaHa')
print(mo2.group())

HaHaHaHaHa
HaHaHa


### findall() method
        - findall() returns every match in the searched string, not just the first one
        - findall() does not return a match object; it returns a list
        - if the regex has no groups, it is a list of the matched strings
        - if the regex has several groups, it is a list of tuples (one string per group)

In [67]:
# search: returns a match object for the first match only

phoneNumRegex = re.compile(r'\d{3}-\d{3}-\d{4}')
mo1 = phoneNumRegex.search('work:415-234-3321,home:343-432-4433')
print(mo1.group())


# optional area code: the first match here is the 7-digit work number
phoneNumRegex1 = re.compile(r'(\d{3}-)?\d{3}-\d{4}')
mo2 = phoneNumRegex1.search('work:234-3321,home:343-432-4433')
print(mo2.group())

#-----------------------------------------------------------------
# findall: returns a list with every match (no groups in this pattern,
# so the list holds plain strings)

phoneNumRegex = re.compile(r'\d{3}-\d{3}-\d{4}')
print(phoneNumRegex.findall('work:415-234-3321,home:343-432-4433'))

415-234-3321
234-3321
['415-234-3321', '343-432-4433']


In [65]:
# With groups in the pattern, findall() returns one tuple of group texts per match.
phoneNumRegex = re.compile(r'(\d{3})-(\d{3})-(\d{4})')
print(phoneNumRegex.findall('work:415-234-3321,home:343-432-4433'))

[('415', '234', '3321'), ('343', '432', '4433')]


### character classes
        - \d - any numeric digit from 0-9
        - \D - any char that is not a numeric digit from 0-9
        - \w - any letter, numeric digit, or the underscore char (think of matching "word" chars)
        - \W - any char that is not a letter, numeric digit, or the underscore char
        - \s - any space, tab or newline
        - \S - any char that is not a space, tab or newline

In [1]:
import re

In [2]:
xmasRegex = re.compile(r'\d+\s\w+')
# '\d+\s\w+' = one or more digits, then one whitespace char, then one or more
# word chars (letters, digits or underscore)

# Items without a leading number ('pipers', 'maids') do not match the pattern.
print(xmasRegex.findall('12 drummer, pipers, 10 lords, \
9 ladies, maids, 7 swan, 6 geese, 5 rings, 4 birds, \
3 hens, 2 doves, 1 partidge'))

['12 drummer', '10 lords', '9 ladies', '7 swan', '6 geese', '5 rings', '4 birds', '3 hens', '2 doves', '1 partidge']


### making your own character classes
        - negative character class - [^...] matches any char that is NOT listed inside the brackets

In [3]:
# [aeiouAEIOU] matches any single vowel, lower or upper case.
vowelRegex = re.compile(r'[aeiouAEIOU]')
print(vowelRegex.findall('RoboCop eats baby food. Would YOU?'))

['o', 'o', 'o', 'e', 'a', 'a', 'o', 'o', 'o', 'u', 'O', 'U']


In [4]:
# Hyphens inside the brackets define ranges: any single letter or digit.
match = re.compile(r'[a-zA-Z0-9]')
print(match.findall('Hello2'))

['H', 'e', 'l', 'l', 'o', '2']


In [19]:
# A leading ^ inside the brackets negates the class: every char that is not a
# vowel matches, which includes spaces and punctuation, not just consonants.
consonantRegex = re.compile(r'[^aeiouAEIOU]')
print(consonantRegex.findall('RoboCop eats baby food. WOULD YOU?'))

['R', 'b', 'C', 'p', ' ', 't', 's', ' ', 'b', 'b', 'y', ' ', 'f', 'd', '.', ' ', 'W', 'L', 'D', ' ', 'Y', '?']


## the ^ and $ signs
    - ^ at the start of a regex indicates that the match must occur at the beginning of the searched text.
    
    - $ at the end of a regex indicates that the string must end with this regex pattern.

In [5]:
# ^ anchors the pattern to the very start of the searched text.
beginsWithHello = re.compile(r'^Hello')
print(beginsWithHello.search('Hello World'))


# 'Hello' is present but not at the start, so search() returns None.
# None is a singleton: compare by identity, not equality.
print(beginsWithHello.search('He said Hello.') is None)



<re.Match object; span=(0, 5), match='Hello'>
True


In [6]:
# \d$ matches a single digit that sits at the very end of the text.
endsWithNumber = re.compile(r'\d$')
print(endsWithNumber.search('Your Room is 594'))    # only the final '4' is matched
print(endsWithNumber.search('You are twenty-one'))  # no trailing digit -> None

<re.Match object; span=(15, 16), match='4'>
None


In [12]:
wholeStringIsNum = re.compile(r'^\d$')
# '^\d$' : the whole string must be exactly one digit
print(wholeStringIsNum.search('2'))

wholeStringIsNum = re.compile(r'^\d+$')
# '^\d+$' : the whole string must consist of one or more digits and nothing else
print(wholeStringIsNum.search('233333'))

<re.Match object; span=(0, 1), match='2'>
<re.Match object; span=(0, 6), match='233333'>


In [15]:
# wholeStringIsNum comes from an earlier cell; presumably it still holds the
# '^\d+$' pattern assigned last there, so any non-digit char prevents a match.
print(wholeStringIsNum.search('1234xhdhd4646'))

print(wholeStringIsNum.search('123 4444'))

None
None


### The wildcard character
        - (.) will match any character except newline
        - (.) will match only one character

In [3]:
import re

In [4]:
# The dot stands for exactly one char, so 'flat' only yields 'lat'.
atRegex = re.compile(r'.at')
print(atRegex.findall('the cat in the hat sat on the flat mate.'))

['cat', 'hat', 'sat', 'lat', 'mat']


### matching everything with .*

        - anything or everything will match

In [27]:
# (.*) captures any run of chars (except newline) into a group.
nameRegex = re.compile(r'FirstName: (.*) LastName: (.*)')
mo = nameRegex.findall('FirstName: K r i s h na LastName: K um ar')

# findall() was used, so mo is a list of tuples here, not a match object
print(mo)

# these only work when mo comes from search(), which returns a match object:
# print(mo.group(1))
# print(mo.group(2))

[('K r i s h na', 'K um ar')]


In [29]:
# Non-greedy .*? stops at the first '>' it can reach.
nongreedyRegex = re.compile(r'<.*?>')
mo = nongreedyRegex.search('<To serve man> for dinner.>')
print(mo.group())

# Greedy .* runs on to the last '>' in the text.
greedyRegex = re.compile(r'<.*>')
mo = greedyRegex.search('<To serve man> for dinner.>')
print(mo.group())

<To serve man>
<To serve man> for dinner.>


### Matching Newline with the dot character
            - passing re.DOTALL as the second argument to re.compile makes the dot match newlines as well

In [30]:
# Without re.DOTALL the dot does not match '\n', so the match ends at the first newline.
noNewLineRegex = re.compile(r'.*')
mo = noNewLineRegex.search('first line. \nSecond line. \nThird Line.')
print(mo.group())

first line. 


In [31]:
# With re.DOTALL the dot also matches '\n', so the match spans all three lines.
newLineRegex = re.compile(r'.*', re.DOTALL)
mo = newLineRegex.search('first line. \nSecond line. \nThird line.')
print(mo.group())

first line. 
Second line. 
Third line.
