# PyQuery

In [1]:
html = """
<div id="container" class="box">
    <ul type="disc" class="list" id="list-1">
        <li class="element-0">苹果</li>
        <li class="element-1">香蕉</li>
        <li class="element">柠檬</li>
    </ul>  
    <ul type="circle" class="list list-small" id="list-2">
        <li class="element-0 active" id="apple-2">苹</li>
        <li class="element-1 active"><a href="http://www.baidu.com">香</a></li>
        <li class="element">柠</li>
    </ul>
</div>
"""
from pyquery import PyQuery as pq

# 字符串初始化
doc = pq(html)
# print(doc('li'))

# url初始化
print("--------------======== url初始化 ========-------------")
docUrl = pq(url="http://www.baidu.com")
print(docUrl('head'))

# 文件初始化 -> 报错...
# doc = pq(filename="D:\TempFiles\test.html",encoding="UTF-8")
# print(doc('li'))

# 基本CSS选择器
print("--------------======== 基本CSS选择器 ========-------------")
print(doc('#container .list li')) # 等效于 print(doc('li'))


# 查找元素
# 子元素
print("--------------======== 子元素 ========-------------")
items = doc('.list')
print(type(items))
print(items)
print("--------------======== find('li') ========-------------")
lis = items.find('li')
print(lis)
print("--------------======== find('#apple-2') ========-------------")
myid = items.find('#apple-2')
print(myid)
print(myid.text())
print("--------------======== children('.active') ========-------------")
items = doc('.list')
children = items.children('.active')
print(children)

# 父元素
print("--------------======== 父元素 ========-------------")
print("--------------======== parent ========-------------")
items = doc('#apple-2')
parent = items.parent()
print(parent)
print("--------------======== parents ========-------------")
parents = items.parents()
print(parents)
print("--------------======== parents('.list') ========-------------")
parent = items.parents('.list')
print(parent)

# 兄弟元素 注: 后面两个class属性相连接,中间没有空格
print("--------------======== 兄弟元素 ========-------------")
brothers = doc('.list .element-0.active')
print(brothers.siblings())
print(brothers.siblings('.active'))

# 遍历
print("--------------======== 遍历 ========-------------")
items = doc('li').items()
# print(items)
for li in items:
    print(li)

# 获取属性
# 🍎 如何获取 class ???
print("--------------======== 获取属性 ========-------------")
a = doc('.element-1.active a')
print(a)
print(a.attr['href'])
print(a.attr.href)
print("--------------======== 获取文本 ========-------------")
print(a.text())
print("--------------======== 获取HTML ========-------------")
a = doc('.element-1.active')
print(a)
print(a.html())


<head><meta http-equiv="content-type" content="text/html;charset=utf-8"/><meta http-equiv="X-UA-Compatible" content="IE=Edge"/><meta content="always" name="referrer"/><link rel="stylesheet" type="text/css" href="http://s1.bdstatic.com/r/www/cache/bdorz/baidu.min.css"/><title>ç¾åº¦ä¸ä¸ï¼ä½ å°±ç¥é</title></head> 
<li class="element-0">苹果</li>
        <li class="element-1">香蕉</li>
        <li class="element">柠檬</li>
    <li class="element-0 active" id="apple-2">苹</li>
        <li class="element-1 active"><a href="http://www.baidu.com">香</a></li>
        <li class="element">柠</li>
    
<class 'pyquery.pyquery.PyQuery'>
<ul type="disc" class="list" id="list-1">
        <li class="element-0">苹果</li>
        <li class="element-1">香蕉</li>
        <li class="element">柠檬</li>
    </ul>  
    <ul type="circle" class="list list-small" id="list-2">
        <li class="element-0 active" id="apple-2">苹</li>
        <li class="element-1 active"><a href="http://www.baidu.com">香</a></li>
        <

## DOM操作

In [123]:
html = """
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
        </ul>
    </div>
</div>
"""
from pyquery import PyQuery as pq

doc = pq(html)

# addClass & removeClass
print("--------------======== addClass & removeClass ========-------------")
li = doc('.item-0.active')
print(li)
li.removeClass('active')
print(li)
li.addClass('active')
print(li)

# attr & css
print("--------------======== attr & css ========-------------")
li.attr('name','link')
print(li)
li.css('font-size','16px')
print(li)

# remove
print("--------------======== remove ========-------------")
html = """
<div class="wrap">
    Hello, World
    <p>This is a paragraph.</p>
</div>
"""
from pyquery import PyQuery as pq

doc = pq(html)
wrap = doc('.wrap')
print(wrap.text())
# print(wrap.html())
wrap.find('p').remove()
print("--------------================-------------")
print(wrap.text())


<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             
<li class="item-0"><a href="link3.html"><span class="bold">third item</span></a></li>
             
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             
<li class="item-0 active" name="link"><a href="link3.html"><span class="bold">third item</span></a></li>
             
<li class="item-0 active" name="link" style="font-size: 16px"><a href="link3.html"><span class="bold">third item</span></a></li>
             
Hello, World
This is a paragraph.
Hello, World


### 更多DOM方法
https://pyquery.readthedocs.io/en/latest/api.html

## 伪类选择器

In [2]:
html = """
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
        </ul>
    </div>
</div>
"""
from pyquery import PyQuery as pq

doc = pq(html)
li = doc('li:first-child')
print(li)
li = doc('li:last-child')
print(li)
li = doc('li:nth-child(2)')
print(li)
li = doc('li:gt(2)')
print(li)
li = doc('li:nth-child(2n)') # 2n 指2的倍数; 3n 指3的倍数 -> 从1开始计数
print(li)
li = doc('li:contains(fourth)')
print(li)

<li class="item-0">first item</li>
             
<li class="item-0"><a href="link5.html">fifth item</a></li>
        
<li class="item-1"><a href="link2.html">second item</a></li>
             
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
        
<li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
             


In [39]:
%pwd

'C:\\fastwork\\Python\\JupyterNotebook'

In [38]:
import re
from pyquery import PyQuery as pq

html = """
<div class="p-name p-name-type-2">
							<a target="_blank" title="网红零食大礼包大牌直降；买一箱送一箱；日式小圆饼单包4.95；嗨吃家酸辣粉等份量满满，戳我立抢！！杉城吃货节" href="//item.jd.com/100012076422.html" onclick="searchlog(1, '100012076422','1','1','','flagsClk=2097626');">
								<em><span class="p-tag" style="background-color:#c81623">京东超市</span>	
杉城 圣诞节礼物 肉类休闲零食大礼包一整箱送女友女生儿童礼盒<font class="skcolor_ljg">美食</font>品超市好吃的组合装1800g</em>
								<i class="promo-words" id="J_AD_100012076422">网红零食大礼包大牌直降；买一箱送一箱；日式小圆饼单包4.95；嗨吃家酸辣粉等份量满满，戳我立抢！！杉城吃货节</i>
							</a>
						</div>
"""
# r = re.search('</span>.*?(.*?).*?</em>', html, re.S)[0]
# print(r)
# print('----------------')
# print(re.sub('<[^>]*>', '', r_))

doc = pq(html)

item = doc('.p-name em')
item.find('span').remove()

product = {
    'title': item.text().replace('\n', '')
}

print(product)


{'title': '杉城 圣诞节礼物 肉类休闲零食大礼包一整箱送女友女生儿童礼盒美食品超市好吃的组合装1800g'}


In [6]:
html = """
<div id="container" class="box">
    <ul type="disc" class="list" id="list-1">
        <li class="element-0">苹果</li>
        <li class="element-1">香蕉</li>
        <li class="element">柠檬</li>
    </ul>  
    <ul type="circle" class="aaa" id="list-2">
        <li class="element-0 active" id="apple-2">苹</li>
        <a target="_blank" 
        title="零食大礼包休闲食品卤味肉类充饥80包 买一箱香辣））" href="//item.jd.com/10030146281552.html" onclick="searchlog(1, '10030146281552','1','1','','flagsClk=2097165');">
							</a>
        <a title="另外一个a标签"></a>
        <li class="element-1 active"><a href="http://www.baidu.com">香</a></li>
        <li class="element">柠</li>
    </ul>
</div>
"""
from pyquery import PyQuery as pq

doc = pq(html)

items = doc('#container').items()
for item in items:
    title = item.find('.aaa a').attr('title')
    print(title)
    


另外一个a标签


### 更多CSS选择器👉https://www.w3school.com.cn/css/index.asp

## 官方文档
https://pyquery.readthedocs.io/

## 总结

```python
doc = pq(html)
doc('id/class/tag').items/find/children/parent/parents/siblings/attr/text/html/addClass/removeClass/remove
```
