# Hidden corner.py
'''
requests + xpath爬取豆瓣《隐秘的角落》评价
'''
import os

import pandas as pd
import requests
from lxml import etree
# URL of the first comments page to crawl.
douban_url = 'https://movie.douban.com/subject/33404425/comments?status=P'
# Request headers. The value must be the bare user-agent string: the header
# name is already supplied by the dictionary key, so repeating "User-Agent: "
# inside the value would send a malformed user-agent to the server.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) Gecko/20100101 Firefox/78.0'
}
def reponse_data(url, headers, timeout=10):
    """Download *url* and return the page parsed as an lxml HTML tree.

    Args:
        url: Address of the page to fetch.
        headers: Mapping of HTTP request headers handed to ``requests.get``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The root element produced by ``etree.HTML`` from the response text.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within *timeout*.
    """
    # Without an explicit timeout requests can block forever on a stalled
    # connection.
    get_reponse = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on an error status instead of parsing an error page as if
    # it were the comments page.
    get_reponse.raise_for_status()
    # Parse the decoded response body into an element tree.
    return etree.HTML(get_reponse.text)
# First page of comments; in this script it is only used to read the
# "watched" counter.
first_page = reponse_data(douban_url, headers)
# XPath reference for the page layout this script relies on:
#   commenter:     /html/body/div[3]/div[1]/div/div[1]/div[4]/div/div[2]/h3/span[2]/a
#   comment text:  /html/body/div[3]/div[1]/div/div[1]/div[4]/div/div[2]/p/span
#   watched count: /html/body/div[3]/div[1]/div/div[1]/div[1]/ul/li[1]/span
# Number of people who have watched the show.
comment_counts = first_page.xpath(
    '/html/body/div[3]/div[1]/div/div[1]/div[1]/ul/li[1]/span/text()')
if not comment_counts:
    # An empty match means the page layout differs from the XPath above
    # (or the request was served a different page); say so explicitly
    # instead of failing with a bare IndexError.
    raise RuntimeError('watched-count element not found on the first page')
# The text looks like "看过(12345)": drop surrounding whitespace first, then
# the label characters and parentheses, leaving only the digits.
comment_counts = int(comment_counts[0].strip().strip("看过()"))
# Total number of pages at 20 comments per page, rounded up so that a final
# partially filled page is counted too.
pages = -(-comment_counts // 20)
# Ideally every comment would be crawled, but in testing fewer than 280
# records could be fetched, so for simplicity only 260 records (13 pages of
# 20 comments) are collected.
# NOTE(review): the '.cvs' extension looks like a typo for '.csv'; the path is
# left unchanged so anything that already reads this file keeps working.
output_path = r'./豆瓣影评/comment_content.cvs'
# to_csv does not create missing parent directories, so make sure the output
# directory exists before the first write.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
name = ['用户', '评论内容']
for page_index in range(13):
    # URL of the current page.
    page_url = 'https://movie.douban.com/subject/33404425/comments?start={0}&limit=20&sort=new_score&status=P'.format(
        page_index * 20)
    tree = reponse_data(page_url, headers)
    # Read the commenter and the comment text from each comment node, so the
    # two columns stay aligned even if a node is missing one of them.
    content = []
    for item in tree.xpath(
            '/html/body/div[3]/div[1]/div/div[1]/div[4]/div/div[2]'):
        commentator = item.xpath('h3/span[2]/a/text()')
        if not commentator:
            # Not a comment entry (no commenter link): skip it.
            continue
        # Join the text nodes and trim the surrounding newlines and spaces.
        comment_text = ''.join(item.xpath('p/span/text()')).strip()
        content.append([commentator[0], comment_text])
    file = pd.DataFrame(columns=name, data=content)
    # The first page starts a fresh file with the header row; later pages are
    # appended without repeating the header.
    is_first_page = page_index == 0
    file.to_csv(output_path,
                mode='w' if is_first_page else 'a',
                header=is_first_page,
                encoding='utf-8', index=False)