## 通过一个小实例来了解正则表达式的作用

In [1]:
import re

In [2]:
# Sample text mixing digit runs and the literal "abc".
s = '123abc456eabc789'
# r'abc' finds every literal occurrence; r'[0-9]+' finds every run of digits.
for pattern in (r'abc', r'[0-9]+'):
    print(re.findall(pattern, s))


['abc', 'abc']
['123', '456', '789']


In [8]:
# Compare how different quantifiers on the trailing digit class change the matches.
s = 'Chapter1 Chapter2 Chapter10 Chapter99 fheh'
quantifier_patterns = (
    'Chapter[1-9][0-9]*',      # zero or more trailing digits
    'Chapter[1-9][0-9]+',      # at least one trailing digit
    'Chapter[1-9][0-9]{0,1}',  # at most one trailing digit
    'Chapter[1-9][0-9]{1,2}',  # one or two trailing digits
)
for pattern in quantifier_patterns:
    print(re.findall(pattern, s))

['Chapter1', 'Chapter2', 'Chapter10', 'Chapter99']
['Chapter10', 'Chapter99']
['Chapter1', 'Chapter2', 'Chapter10', 'Chapter99']
['Chapter10', 'Chapter99']


In [10]:
s = '<H1>Chapter 1 – Introduction to Regular Expressions</H1>'
# Greedy mode: '.*' consumes as much as possible, so the whole line is one match.
greedy_tag = re.compile('<.*>')
print(greedy_tag.findall(s))
# Non-greedy mode: '.*?' stops at the first '>', so each tag is matched separately.
lazy_tag = re.compile('<.*?>')
lazy_tag.findall(s)

['<H1>Chapter 1 – Introduction to Regular Expressions</H1>']


['<H1>', '</H1>']

In [11]:
# Anchors (^, $) and word boundaries (\b, \B), one (pattern, text) pair per case.
s = 'Chapter1 Chapter2 Chapter11 Chapter99'
boundary_cases = [
    # '^' only allows a match at the start of the text.
    ('^Chapter[1-9][0-9]{0,1}', s),              # result: ['Chapter1']
    # '^...$' requires the whole text to match.
    ('^Chapter[1-9][0-9]{0,1}$', 'Chapter99'),   # result: ['Chapter99']
    # '\b' matches at a word boundary (here: between the space and 'C').
    (r'\bCha', ' Chapter'),                      # result: ['Cha']
    # '\b' also matches at the end of a word.
    (r'ter\b', ' Chapter'),                      # result: ['ter']
    # '\B' matches only where there is NO word boundary (inside a word).
    (r'\Bapt', 'Chapter'),                       # result: ['apt']
    # 'apt' sits at the start of the word here, so '\B' rejects it.
    (r'\Bapt', 'aptitude'),                      # result: []
]
for pattern, text in boundary_cases:
    print(re.findall(pattern, text))

['Chapter1']
['Chapter99']
['Cha']
['ter']
['apt']
[]


In [34]:
# With one capturing group, findall returns only the group's text;
# without groups it returns the whole match.
s = 'aaa111aaa , bbb222 , 333ccc'
group_patterns = (
    r'[a-z]+(\d+)[a-z]',   # only the captured digits are returned
    r'[a-z]+\d+[a-z]',     # whole match, a single trailing letter
    r'[a-z]+\d+[a-z]+',    # whole match, all trailing letters
)
for pattern in group_patterns:
    print(re.findall(pattern, s))

['111']
['aaa111a']
['aaa111aaa']


In [39]:
# Numbered backreferences: \1 and \2 must repeat exactly what groups 1 and 2 captured.
s = '111aaa222aaa111 , 333bbb444bb33'
# digits, letters, digits, then the same letters (\2) followed by the same digits (\1)
mirrored = re.compile(r'(\d+)([a-z]+)(\d+)(\2)(\1)')
for text in (s, '333bbb444bb33', '333bbb444bbb333'):
    print(mirrored.findall(text))
# Swapping the backreferences asks for the digits first, which this text does not provide.
print(re.findall(r'(\d+)([a-z]+)(\d+)(\1)(\2)', '333bbb444bbb333'))

[('111', 'aaa', '222', 'aaa', '111')]
[]
[('333', 'bbb', '444', 'bbb', '333')]
[]


In [40]:
# (?:...) groups the alternatives without capturing, so findall returns whole matches.
s = 'industry is industries lala industyyy industiii'
industry_forms = re.compile(r'industr(?:y|ies)')
print(industry_forms.findall(s))

['industry', 'industries']


In [43]:
# Positive lookahead (?=...): matches the "Windows" in "Windows2000",
# but not the "Windows" in "Windows3.1". The looked-ahead text is not consumed.
s = 'Windows2000 Windows3.1'
followed_by_known_version = re.compile(r'Windows(?=95|98|NT|2000)')
followed_by_known_version.findall(s)

['Windows']

In [44]:
# Negative lookahead (?!...): matches the "Windows" in "Windows3.1",
# but not the "Windows" in "Windows2000".
s = 'Windows2000 Windows3.1'
not_followed_by_known_version = re.compile(r'Windows(?!95|98|NT|2000)')
not_followed_by_known_version.findall(s)

['Windows']

In [45]:
# Capturing groups, named groups (?P<name>...) and backreferences ((?P=name), \1).
s = 'aaa111aaa,bbb222,333ccc,444ddd444,555eee666,fff777ggg'
group_cases = [
    # Two groups: findall returns a tuple per match.
    (r'([a-z]+)\d+([a-z]+)', s),
    # result:[('aaa', 'aaa'), ('fff', 'ggg')]
    # Named backreference: the letters after the digits must equal group g1.
    (r'(?P<g1>[a-z]+)\d+(?P=g1)', s),
    # result:['aaa']
    # Same pattern on a text whose last item repeats its letters.
    (r'(?P<g1>[a-z]+)\d+(?P=g1)', 'aaa111aaa,bbb222,333ccc,444ddd444,555eee666,fff777fff'),
    # result:['aaa', 'fff']
    (r'[a-z]+(\d+)([a-z]+)', s),
    # result: [('111', 'aaa'), ('777', 'ggg')]
    # One group: findall returns just that group's text.
    (r'([a-z]+)\d+', s),
    # result:['aaa', 'bbb', 'ddd', 'eee', 'fff']
    # Numbered backreference, equivalent to the named form above.
    (r'([a-z]+)\d+\1', s),
    # result:['aaa']
]
for pattern, text in group_cases:
    print(re.findall(pattern, text))

[('aaa', 'aaa'), ('fff', 'ggg')]
['aaa']
['aaa', 'fff']
[('111', 'aaa'), ('777', 'ggg')]
['aaa', 'bbb', 'ddd', 'eee', 'fff']
['aaa']


In [46]:
# Scope of '|': inside (?:...) it chooses between "dog" and "cat";
# without the group it splits the whole pattern into "I have a dog" or "cat".
s = 'I have a dog , I have a cat'
grouped_choice = re.compile(r'I have a (?:dog|cat)')
print(grouped_choice.findall(s))
# result: ['I have a dog', 'I have a cat']
ungrouped_choice = re.compile(r'I have a dog|cat')
ungrouped_choice.findall(s)
# result: ['I have a dog', 'cat']

['I have a dog', 'I have a cat']


['I have a dog', 'cat']

In [47]:
# Repeating a group: the non-capturing form returns the whole match, while the
# capturing form makes findall return only the group's last repetition.
s = 'ababab abbabb aabaab abbbbbab'
whole_word_of_ab = re.compile(r'\b(?:ab)+\b')
print(whole_word_of_ab.findall(s))
# result: ['ababab']
captured_ab = re.compile(r'\b(ab)+\b')
captured_ab.findall(s)
# result: ['ab']

['ababab']


['ab']

In [20]:
# Lookbehind/lookahead versus consuming the context.
s = 'da12bka3434bdca4343bdca234bm'
context_patterns = (
    # Digits preceded by "<not c>a" and followed by "bd"; the context is not returned.
    r'(?<=[^c]a)\d*(?=bd)',
    # Same context, but consumed, so it appears in the result.
    r'[^c]a\d*bd',
)
for pattern in context_patterns:
    print(re.findall(pattern, s))

['3434']
['ka3434bd']


In [9]:
# "56" must be preceded by a character other than "4" and followed by "9".
s = '5569'
guarded_56 = re.compile(r'(?<=[^4])56(?=9)')
print(guarded_56.findall(s))

['56']


In [48]:
# '^' anchors at the start of the text and '$' at the end.
s1 = "once upon a time"
s2 = "There once was a man from NewYork"
anchor_cases = [
    (r'^once', s1),       # result: ['once']
    (r'^once', s2),       # result: []  ("once" is not at the start)
    (r'time$', s1),       # result: ['time']
    (r'times$', s1),      # result: []
    (r'^time$', s1),      # result: []  (the text is longer than "time")
    (r'^time$', 'time'),  # result: ['time']
]
for pattern, text in anchor_cases:
    print(re.findall(pattern, text))

['once']
[]
['time']
[]
[]
['time']


In [49]:
# Compile the pattern once, then call findall on the compiled object.
rule = r'\b\d+\b'  # runs of digits that form a whole word
compiled_rule = re.compile(rule)
s = '111,222,aaa,bbb,ccc333,444ddd'
standalone_numbers = compiled_rule.findall(s)
print(standalone_numbers)

['111', '222']


In [50]:
# re.match only tries the pattern at the beginning of the string.
hit_at_start = re.match('www', 'www.runoob.com')  # matches at the start position
print(hit_at_start.span())
# result: (0, 3)
miss_elsewhere = re.match('com', 'www.runoob.com')  # "com" is not at the start position
print(miss_elsewhere)
# result: None

(0, 3)
None


In [3]:
# Demonstrate match groups: group() is the whole match, group(1)/group(2) the captures.
line = "Cats are smarter than dogs"
# (.*) is greedy and takes everything before " are "; (.*?) is lazy and stops
# at the next space. re.match only succeeds if the pattern matches from the start.
matchObj = re.match(r'(.*) are (.*?) .*', line, re.I | re.M)
# Show the match object itself; reuse the result above instead of running the
# same regex a second time with different flags.
print(matchObj)
if matchObj:
    print("matchObj.group() : ", matchObj.group())
    print("matchObj.group(1) : ", matchObj.group(1))
    print("matchObj.group(2) : ", matchObj.group(2))
else:
    print("No match!!")

<re.Match object; span=(0, 26), match='Cats are smarter than dogs'>
matchObj.group() :  Cats are smarter than dogs
matchObj.group(1) :  Cats
matchObj.group(2) :  smarter


In [55]:
# re.search scans the whole string, so the pattern may match at any position.
hit_at_start = re.search('www', 'www.runoob.com')  # matches at the start position
print(hit_at_start.span())
# result: (0, 3)
hit_at_end = re.search('com', 'www.runoob.com')  # matches away from the start position
print(hit_at_end.span())
# result: (11, 14)

(0, 3)
(11, 14)


In [57]:
# Same group demonstration as with re.match, this time using re.search.
line = "Cats are smarter than dogs"
searchObj = re.search(r'(.*) are (.*?) .*', line, re.M | re.I)

if searchObj is None:
    print("Nothing found!!")
else:
    # group() is the whole match; group(1) and group(2) are the two captures.
    print("searchObj.group() : ", searchObj.group())
    # result: searchObj.group() :  Cats are smarter than dogs
    print("searchObj.group(1) : ", searchObj.group(1))
    # result: searchObj.group(1) :  Cats
    print("searchObj.group(2) : ", searchObj.group(2))
    # result: searchObj.group(2) :  smarter

searchObj.group() :  Cats are smarter than dogs
searchObj.group(1) :  Cats
searchObj.group(2) :  smarter


In [58]:
# re.sub replaces every match of the pattern with the replacement string.
phone = "2004-959-559 # This is Phone Number"

# Drop the trailing "# ..." comment. The space before '#' is not part of the
# match, so the result keeps one trailing space.
trailing_comment = re.compile(r'#.*$')
num = trailing_comment.sub("", phone)
print("Phone Num : ", num)
# result: Phone Num :  2004-959-559

# Remove every non-digit character, leaving only the digits.
non_digit = re.compile(r'\D')
num = non_digit.sub("", phone)
print("Phone Num : ", num)
# result: Phone Num :  2004959559

Phone Num :  2004-959-559 
Phone Num :  2004959559


In [59]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

# Download the demo page and parse it. The context manager closes the HTTP
# response as soon as BeautifulSoup has been given the document.
with urlopen("http://www.pythonscraping.com/pages/page3.html") as html:
    bsObj = BeautifulSoup(html, 'lxml')
# Uncomment to inspect the parsed document:
# print(bsObj.prettify())

# Select <img> tags whose src matches ../img/gifts/img<anything>.jpg.
# The pattern is a raw string so the regex escapes (\.) reach the re module
# unchanged; in a normal string literal they are invalid escape sequences.
# '/' needs no escaping in a Python regex.
images = bsObj.find_all("img", {"src": re.compile(r"\.\./img/gifts/img.*\.jpg")})
for image in images:
    print(image["src"])

../img/gifts/img1.jpg
../img/gifts/img2.jpg
../img/gifts/img3.jpg
../img/gifts/img4.jpg
../img/gifts/img6.jpg
