In [1]:
# Check whether one string occurs inside another with the plain `in` operator
# (no regex is needed for a simple literal substring test).
text = 'my phone number is 408-555-2341. Call soon!'
print('phone' in text)

True


In [2]:
# Find a piece of text inside another text with the re library.
# re.search scans the string and returns a Match object for the very first
# occurrence of the pattern (or None when nothing matches).
import re
pattern = '408-555-2341'
re.search(pattern, text)

<re.Match object; span=(19, 31), match='408-555-2341'>

In [3]:
# Look at the details of the match: span() is the (start, end) index pair,
# start() and end() are its two components. The end index is exclusive.
my_match = re.search(pattern, text)
my_match.span(), my_match.start(), my_match.end()

((19, 31), 19, 31)

In [4]:
# Count the matches in a text: findall returns a list with every
# non-overlapping match, so its length is the number of occurrences.
text2 = 'my phone is a new phone'
all_matches = re.findall('phone', text2)
len(all_matches)

2

In [5]:
# Iterate over the individual matches: finditer yields one Match object per
# occurrence, so the position of each hit can be inspected.
for match in re.finditer('phone', text2):
    print(match.span())

(3, 8)
(18, 23)


### identifying the patterns

Patterns are usually written as raw strings, e.g. r'mypattern'. Backslashes are used a lot in patterns, and the 'r' placed in front of the string tells Python not to treat them as string escape sequences, so they reach the regex engine unchanged.

The most common character identifiers are listed below.

<table ><tr><th>Character</th><th>Description</th><th>Example Pattern Code</th><th >Example Match</th></tr>

<tr ><td><span >\d</span></td><td>A digit</td><td>file_\d\d</td><td>file_25</td></tr>

<tr ><td><span >\w</span></td><td>Alphanumeric</td><td>\w-\w\w\w</td><td>A-b_1</td></tr>



<tr ><td><span >\s</span></td><td>White space</td><td>a\sb\sc</td><td>a b c</td></tr>



<tr ><td><span >\D</span></td><td>A non digit</td><td>\D\D\D</td><td>ABC</td></tr>

<tr ><td><span >\W</span></td><td>Non-alphanumeric</td><td>\W\W\W\W\W</td><td>*-+=)</td></tr>

<tr ><td><span >\S</span></td><td>Non-whitespace</td><td>\S\S\S\S</td><td>Yoyo</td></tr></table>

In [6]:
# Define a pattern by shape instead of by literal digits and search for it:
# three digits, a hyphen, three digits, a hyphen, four digits.
# NOTE: this rebinds `pattern`, which previously held the literal phone number.
pattern = r'\d\d\d-\d\d\d-\d\d\d\d'
re.search(pattern, text)

<re.Match object; span=(19, 31), match='408-555-2341'>

In [7]:
# Search for the pattern and extract the matched text.
# `phone` is the Match object; group() returns the substring that was matched.
phone = re.search(pattern, text)
phone.group()

'408-555-2341'

In [8]:
# Find multiple results: findall returns every non-overlapping match as a list.
# NOTE: this rebinds `text`, so any cell that searches `text` sees this string
# once this cell has run.
text = 'phone numbers: 221-123-2234, 331-225-6512, 444-121-6577'
re.findall(pattern, text)

['221-123-2234', '331-225-6512', '444-121-6577']

#### Quantifiers

Now that we know the special character designations, we can use them along with quantifiers to define how many we expect.

<table ><tr><th>Character</th><th>Description</th><th>Example Pattern Code</th><th >Example Match</th></tr>

<tr ><td><span >+</span></td><td>Occurs one or more times</td><td>Version \w-\w+</td><td>Version A-b1_1</td></tr>

<tr ><td><span >{3}</span></td><td>Occurs exactly 3 times</td><td>\D{3}</td><td>abc</td></tr>



<tr ><td><span >{2,4}</span></td><td>Occurs 2 to 4 times</td><td>\d{2,4}</td><td>123</td></tr>



<tr ><td><span >{3,}</span></td><td>Occurs 3 or more</td><td>\w{3,}</td><td>anycharacters</td></tr>

<tr ><td><span >\*</span></td><td>Occurs zero or more times</td><td>A\*B\*C*</td><td>AAACC</td></tr>

<tr ><td><span >?</span></td><td>Once or none</td><td>plurals?</td><td>plural</td></tr></table>

In [9]:
# Find multiple results with a quantified pattern: \d{3} is the compact form
# of \d\d\d, so this matches the same phone numbers as the longer pattern.
new_pattern = r'\d{3}-\d{3}-\d{4}'
re.findall(new_pattern, text)

['221-123-2234', '331-225-6512', '444-121-6577']

In [10]:
# Grouping the pattern: each pair of parentheses defines a capture group.
# Groups are numbered from 1, so group(1) is the first group (here the area code).
# search only returns the first phone number in the text.
new_pattern2 = r'(\d{3})-(\d{3})-(\d{4})'
re.search(new_pattern2, text).group(1)

'221'

In [11]:
# Or operator: `|` matches either the pattern on its left or the one on its right.
re.search(r'man|woman','there is a man over there')

<re.Match object; span=(11, 14), match='man'>

In [12]:
# Using the wildcard character to find word pieces: `.` stands for any single
# character, so '.at' matches any three characters ending in 'at'.
# Note that 'splat' only contributes its last three characters, 'lat'.
re.findall(r'.at', 'this cat has a big hat after he sat splat')

['cat', 'hat', 'sat', 'lat']

In [13]:
# Finding patterns anchored to the start or end of the text:
# ^ means "starts with", $ means "ends with".
# Here \d$ matches a digit only when it is the last character of the string.
re.findall(r'\d$', 'this ends with number 2') 

['2']

In [14]:
# Excluding characters with square brackets and a caret: [^\d] matches any
# character that is NOT a digit.
# The '+' keeps consecutive non-digit characters together in one match;
# without it every character would be listed separately.
re.findall(r'[^\d]+','there are 3 diff 45 numbers in 1 line')

['there are ', ' diff ', ' numbers in ', ' line']

In [15]:
# Removing punctuation from a text: collect every run of characters that is
# not '.', '!', '?' or a space.
# Because the space is inside the square brackets the result is split into words;
# if the space is removed, the text is split only at the punctuation marks.
re.findall(r'[^.!? ]+','there are some punc in this sentence! how to remove? that.')

['there',
 'are',
 'some',
 'punc',
 'in',
 'this',
 'sentence',
 'how',
 'to',
 'remove',
 'that']

In [16]:
# Joining the result: put the extracted words back together with single spaces
# to get the phrase without its punctuation.
phrase = 'there are some punc in this sentence! how to remove? that.'
' '.join(re.findall(r'[^.!? ]+', phrase))

'there are some punc in this sentence how to remove that'

In [17]:
# Finding the hyphenated words: one or more word characters, a hyphen,
# then one or more word characters.
re.findall(r'[\w]+-[\w]+', 'only find the hyphen-words which is like long-ish')

['hyphen-words', 'long-ish']

In [18]:
# Finding words that start with a fixed prefix and end with one of several options:
# 'cat' must be followed by 'fish', 'nap' or 'claw'.

text = 'Hello, would you like some catfish?'
texttwo = "Hello, would you like to take a catnap?"
textthree = "Hello, have you seen this caterpillar?"

# 'caterpillar' has none of the listed endings, so the last search prints None.
for sentence in (text, texttwo, textthree):
    print(re.search(r'cat(fish|nap|claw)', sentence))

<re.Match object; span=(27, 34), match='catfish'>
<re.Match object; span=(32, 38), match='catnap'>
None


For full information on all possible patterns, check out: https://docs.python.org/3/howto/regex.html