# Regular Expression

#### Used for patter matching in text

In [1]:
import re

In [2]:
patterns = ['term1','term2']

In [3]:
text = 'This is a text containing term1,and not the other term'

In [7]:
re.search('hello','hello world')

<re.Match object; span=(0, 5), match='hello'>

In [15]:
for pattern in patterns:
    print('Searching for %s in "%s"' %(pattern,text))
    
    if re.search(pattern,text):
        print('\nPattern found\n\n')
    else:
        print('\nNo pattern found\n\n')

Searching for term1 in "This is a text containing term1,and not the other term"

Pattern found


Searching for term2 in "This is a text containing term1,and not the other term"

No pattern found




In [16]:
match = re.search(patterns[0],text)

In [17]:
type(match)

re.Match

In [20]:
match.start()

26

In [21]:
match.end()

31

In [22]:
split_term = '@'

phrase = "Hello what is your name and email is it hello@email.com or xyz"

In [23]:
re.split(split_term,phrase)

['Hello what is your name and email is it hello', 'email.com or xyz']

In [35]:
def multi_re_find(patterns,phrases):
    '''
    Takes in a list of regex patterns
    prints a list of all matches
    '''
    for pattern in patterns:
        print('Searching the phrase using recheck %r' %pattern)
        print(re.findall(pattern,phrase))

In [25]:
test_phrase = 'sdsd..sssddd...sdddsddd...dsds...dsssss...sdddd'

test_patterns = [ 'sd*',     # s followed by zero or more d's
                'sd+',          # s followed by one or more d's
                'sd?',          # s followed by zero or one d's
                'sd{3}',        # s followed by three d's
                'sd{2,3}',      # s followed by two to three d's
                ]

multi_re_find(test_patterns,test_phrase)

Searching the phrase using recheck 'sd*'
['s', 's']
Searching the phrase using recheck 'sd+'
[]
Searching the phrase using recheck 'sd?'
['s', 's']
Searching the phrase using recheck 'sd{3}'
[]
Searching the phrase using recheck 'sd{2,3}'
[]


In [26]:
test_phrase = 'sdsd..sssddd...sdddsddd...dsds...dsssss...sdddd'

test_patterns = ['[sd]',    # either s or d
                's[sd]+']   # s followed by one or more s or d

multi_re_find(test_patterns,test_phrase)

Searching the phrase using recheck '[sd]'
['s', 'd', 's']
Searching the phrase using recheck 's[sd]+'
[]


In [27]:
test_phrase = 'This is a string! But it has punctuation. How can we remove it?'

In [28]:
re.findall('[^!.? ]+',test_phrase)

['This',
 'is',
 'a',
 'string',
 'But',
 'it',
 'has',
 'punctuation',
 'How',
 'can',
 'we',
 'remove',
 'it']

In [30]:
test_phrase = 'This is an example sentence. Lets see if we can find some letters.'

test_patterns=['[a-z]+',      # sequences of lower case letters
               '[A-Z]+',      # sequences of upper case letters
               '[a-zA-Z]+',   # sequences of lower or upper case letters
               '[A-Z][a-z]+'] # one upper case letter followed by lower case letters
                
multi_re_find(test_patterns,test_phrase)

Searching the phrase using recheck '[a-z]+'
['ello', 'what', 'is', 'your', 'name', 'and', 'email', 'is', 'it', 'hello', 'email', 'com', 'or', 'xyz']
Searching the phrase using recheck '[A-Z]+'
['H']
Searching the phrase using recheck '[a-zA-Z]+'
['Hello', 'what', 'is', 'your', 'name', 'and', 'email', 'is', 'it', 'hello', 'email', 'com', 'or', 'xyz']
Searching the phrase using recheck '[A-Z][a-z]+'
['Hello']


In [36]:
test_phrases = 'This is a string with some numbers 1233 and a symbol #hashtag'

test_patterns=[ r'\d+', # sequence of digits
                r'\D+', # sequence of non-digits
                r'\s+', # sequence of whitespace
                r'\S+', # sequence of non-whitespace
                r'\w+', # alphanumeric characters
                r'\W+', # non-alphanumeric
                ]

multi_re_find(test_patterns,test_phrases)

Searching the phrase using recheck '\\d+'
[]
Searching the phrase using recheck '\\D+'
['Hello what is your name and email is it hello@email.com or xyz']
Searching the phrase using recheck '\\s+'
[' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ']
Searching the phrase using recheck '\\S+'
['Hello', 'what', 'is', 'your', 'name', 'and', 'email', 'is', 'it', 'hello@email.com', 'or', 'xyz']
Searching the phrase using recheck '\\w+'
['Hello', 'what', 'is', 'your', 'name', 'and', 'email', 'is', 'it', 'hello', 'email', 'com', 'or', 'xyz']
Searching the phrase using recheck '\\W+'
[' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '@', '.', ' ', ' ']
