### xpath解析
    - 通用性比较强
    - 环境的安装: pip install lxml
    - 解析原理:
        - 1. 实例化一个etree对象，且将解析的页面源码加载到该对象中
        - 2. 使用该对象中的xpath方法结合着xpath表达式进行标签定位和数据解析提取
    - etree对象的实例化:
        - 本地加载: 
            tree = etree.parse("filePath")
        - 网络加载: 
            tree = etree.HTML(page_text)

#### 常用的xpath表达式: 基于标签的层级实现定位,返回的永远是一个列表
    - /: 从根标签开始实现层级定位(放在表达式最左侧表示从根节点开始, 放在中间表示一个层级)
    - //: 从任意位置实现标签的定位
    - 属性定位: tag[@attrName="attrValue"]
    - 索引定位: //div[@class="tang"]/ul/li[5] 注意索引值是从1开始
    - 取文本:
        - 取直系文本内容: /text()
        - 取所有文本内容: //text()
    - 取属性: /@attrName

In [11]:
from lxml import etree
# Load the local HTML file into an element tree.
tree = etree.parse("./test_page.html")
# Select, at any depth, every <div> whose class attribute is exactly "tang".
# xpath() always returns a list, even for a single match.
tree.xpath('//div[@class="tang"]')

[<Element div at 0x218f0c6a708>]

In [8]:
from lxml import etree
# Load the local HTML file into an element tree.
tree = etree.parse("./test_page.html")
# Alternative using an absolute path from the root: tree.xpath("/html/body")
# "//" matches <div> elements anywhere in the document.
tree.xpath("//div")

[<Element li at 0x207eba718c8>,
 <Element li at 0x207eba8d4c8>,
 <Element li at 0x207eba8d708>,
 <Element li at 0x207eba8d908>,
 <Element li at 0x207eba8d948>,
 <Element li at 0x207eba8db08>,
 <Element li at 0x207eba8db48>,
 <Element li at 0x207eba8db88>]

In [12]:
# Attribute-based lookup: every <div> whose class attribute is exactly "song".
tree.xpath("//div[@class='song']")

[<Element div at 0x218f1b53048>,
 <Element div at 0x218f0c9f708>,
 <Element div at 0x218f0c6a708>]

In [14]:
# Index-based lookup: the third <li> under the "tang" div's <ul>
# (XPath positions start at 1, not 0).
tree.xpath("//div[@class='tang']/ul/li[3]")

[<Element li at 0x218f0ca4388>]

In [19]:
# "/text()" yields only the direct text of the fourth <p> in the "song" div;
# [0] unwraps the string from the list that xpath() returns.
tree.xpath('//div[@class="song"]/p[4]/text()')[0]

'柳宗元'

In [21]:
# "//text()" collects every text node beneath the "song" div, including text
# inside nested tags, as a list of strings.
text_list = tree.xpath('//div[@class="song"]//text()')
# Concatenate the fragments without a separator; the whitespace that appears
# in the output comes from the text nodes themselves.
text_str = "".join(text_list)
print(text_str)


        李清照
        王安石
        苏轼
        柳宗元
        
            this is span
        宋朝是最强大的王朝，不是军队的强大，而是经济很强大，国民都很有钱
        总为浮云能蔽日,长安不见使人愁
        
    


In [23]:
# "/@href" extracts an attribute value: the link target of the <a> inside the
# first <li> of the "tang" div; [0] unwraps it from the result list.
tree.xpath('//div[@class="tang"]/ul/li[1]/a/@href')[0]

'http://www.baidu.com'

In [24]:
# Attribute extraction without indexing: a list of the src values of the <img>
# elements that are direct children of the "song" div.
tree.xpath('//div[@class="song"]/img/@src')

['http://www.baidu.com/meinv.jpg']

In [27]:
# Scrape second-hand housing listings from 58.com: for every listing on the
# index page collect its title and price, then follow the link to the detail
# page and collect the "general situation" overview text.
import requests
from lxml import etree
from urllib.parse import urljoin

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36"
}
url = "https://huizhou.58.com/ershoufang/?PGTID=0d100000-002d-27b3-b281-6cc1a61855f7&ClickID=2"
page_text = requests.get(url=url, headers=headers).text

# Parse the index page; each <li> under the house list is one listing.
tree = etree.HTML(page_text)
li_list = tree.xpath('//ul[@class="house-list-wrap"]/li')

all_data_list = []
for li in li_list:
    # A leading "./" makes the expression relative to the current <li>.
    title_nodes = li.xpath('./div[2]/h2/a/text()')
    href_nodes = li.xpath('./div[2]/h2/a/@href')
    if not title_nodes or not href_nodes:
        # xpath() returns an empty list when nothing matches; skip <li> items
        # that lack the expected markup instead of raising IndexError.
        continue
    title = title_nodes[0]
    price = "".join(li.xpath('./div[3]//text()'))

    # Resolve the link against the index URL: scheme-relative ("//host/...")
    # and relative hrefs become absolute, absolute hrefs are kept unchanged.
    detail_url = urljoin(url, href_nodes[0])

    # Fetch the detail page and extract the overview text. A separate name is
    # used so the index-page tree is not overwritten inside the loop.
    detail_page_text = requests.get(url=detail_url, headers=headers).text
    detail_tree = etree.HTML(detail_page_text)
    desc = "".join(detail_tree.xpath('//div[@id="generalSituation"]//text()'))

    dic = {
        "title": title,
        "price": price,
        "desc": desc
    }
    all_data_list.append(dic)

print(len(all_data_list), all_data_list)

132 [{'title': '送学.位 北站旁 中洲大盘自带12年公立学.校 \xa0', 'price': '\n                            71万\n                            7855元/㎡\n                        ', 'desc': '\n            概况\n            \n                \n                                            \n                            房屋总价\n                            \n                                    齤閏万(单价齤龥龒龒元/㎡)\n                                \n                        \n                                                                \n                            房屋户型\n                            3室2厅2卫\n                        \n                                                                \n                            房本面积\n                            90.38㎡\n                        \n                                                                \n                            房屋朝向\n                            南\n                        \n                                    \n                \n                          

In [28]:
# Goal: collect every city name shown on https://www.aqistudy.cn/historydata/
url = "https://www.aqistudy.cn/historydata/"
tree = etree.HTML(requests.get(url=url, headers=headers).text)

# The page lists cities in two places, each with its own path.
hot_city_xpath = '//div[@class="bottom"]/ul/li/a/text()'          # hot cities
full_city_xpath = '//div[@class="bottom"]/ul/div[2]/li/a/text()'  # all cities

# Key point: "|" is the XPath union operator. Nodes matched by either
# expression end up in one result list, so a single xpath() call covers both
# sections of the page.
all_city_names = tree.xpath(hot_city_xpath + " | " + full_city_xpath)

print(len(all_city_names), all_city_names)


394 ['北京', '上海', '广州', '深圳', '杭州', '天津', '成都', '南京', '西安', '武汉', '阿坝州', '安康', '阿克苏地区', '阿里地区', '阿拉善盟', '阿勒泰地区', '安庆', '安顺', '鞍山', '克孜勒苏州', '安阳', '蚌埠', '白城', '保定', '北海', '宝鸡', '北京', '毕节', '博州', '白山', '百色', '保山', '白沙', '包头', '保亭', '本溪', '巴彦淖尔', '白银', '巴中', '滨州', '亳州', '长春', '昌都', '常德', '成都', '承德', '赤峰', '昌吉州', '五家渠', '昌江', '澄迈', '重庆', '长沙', '常熟', '楚雄州', '朝阳', '沧州', '长治', '常州', '潮州', '郴州', '池州', '崇左', '滁州', '定安', '丹东', '东方', '东莞', '德宏州', '大理州', '大连', '大庆', '大同', '定西', '大兴安岭地区', '德阳', '东营', '黔南州', '达州', '德州', '儋州', '鄂尔多斯', '恩施州', '鄂州', '防城港', '佛山', '抚顺', '阜新', '阜阳', '富阳', '抚州', '福州', '广安', '贵港', '桂林', '果洛州', '甘南州', '固原', '广元', '贵阳', '甘孜州', '赣州', '广州', '淮安', '海北州', '鹤壁', '淮北', '河池', '海东地区', '邯郸', '哈尔滨', '合肥', '鹤岗', '黄冈', '黑河', '红河州', '怀化', '呼和浩特', '海口', '呼伦贝尔', '葫芦岛', '哈密地区', '海门', '海南州', '淮南', '黄南州', '衡水', '黄山', '黄石', '和田地区', '海西州', '河源', '衡阳', '汉中', '杭州', '菏泽', '贺州', '湖州', '惠州', '吉安', '金昌', '晋城', '景德镇', '金华', '西双版纳州', '九江', '吉林', '即墨', '江门', '荆门', '佳木斯', '济南', '济宁', '胶南', '酒泉', '句容', '湘西州

In [27]:
# 处理中文乱码
import requests
from lxml import etree

In [33]:
# Fetch the 4K car wallpaper index page and print each image URL with its
# caption, decoding the Chinese captions correctly.
url = "http://pic.netbian.com/4kqiche/"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36"
}
response = requests.get(url=url, headers=headers)
# Declare the page encoding once, before reading .text, so the whole document
# is decoded correctly. The page is GBK-encoded; "utf-8" does not fix it.
# Re-encoding individual strings with encode("iso-8859-1").decode("gbk") only
# works when requests happened to fall back to ISO-8859-1, and it has to be
# repeated for every extracted field.
response.encoding = "gbk"
page_text = response.text

tree = etree.HTML(page_text)
li_list = tree.xpath('//div[@class="slist"]/ul/li')
for li in li_list:
    # The src attribute is site-relative, so prefix it with the host.
    img_src = "http://pic.netbian.com" + li.xpath('./a/img/@src')[0]
    img_name = li.xpath('./a/b/text()')[0]

    print(img_src, img_name)

http://pic.netbian.com/uploads/allimg/190402/151941-155418958155f4.jpg 2019年法拉利Portofino跑
http://pic.netbian.com/uploads/allimg/190318/234605-1552923965b21b.jpg 2019年劳斯莱斯幽灵黑徽
http://pic.netbian.com/uploads/allimg/190318/234319-155292379969c4.jpg 2019 McLaren Senna GTR
http://pic.netbian.com/uploads/allimg/190318/233711-15529234313669.jpg 2019年劳斯莱斯幽灵黑徽
http://pic.netbian.com/uploads/allimg/190305/213253-15517927738828.jpg 迈凯伦McLaren 600LT Sp
http://pic.netbian.com/uploads/allimg/190228/115955-15513263954ed8.jpg 白色劳斯莱斯5k图片
http://pic.netbian.com/uploads/allimg/190123/203717-1548247037a480.jpg 《迈凯伦720S GT3》4k壁
http://pic.netbian.com/uploads/allimg/181203/221810-154384669040bd.jpg 兰博基尼Lamborghini Ur
http://pic.netbian.com/uploads/allimg/181203/221741-154384666105fa.jpg 保时捷Porsche 911 Carr
http://pic.netbian.com/uploads/allimg/181103/202531-1541247931e4a5.jpg 兰博基尼LP580橙色跑车4
http://pic.netbian.com/uploads/allimg/180912/223122-153676268234b8.jpg 奔驰银箭Mercedes-Benz 
http://pic.netbian.com/uploa