-
Notifications
You must be signed in to change notification settings - Fork 2
/
电子书txt爬取(urllib or urllib2)
114 lines (109 loc) · 3.97 KB
/
电子书txt爬取(urllib or urllib2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#!/usr/bin/env python
#coding=utf-8
import urllib
import urllib2
import re
from bs4 import BeautifulSoup
import string
import time
#进行每一章的爬取和保存
def fun(url,filename):
headers = {'User-agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:22.0) Gecko/20100101 Firefox/22.0'}
req = urllib2.Request(url,headers=headers)
# 单次连接会出现错误,错误后休息1s后继续尝试连接
while True:
try:
rep = urllib2.urlopen(req)
break
except BaseException:
time.sleep(1)
html = rep.read()
soup = BeautifulSoup(html,'html.parser')
name = soup.find(class_="h1title").h1.get_text()
text = soup.find(class_="contentbox").get_text()
text = text.strip().split('\n')[0]
text = text.split()
filename.writelines('************'+name.encode('gbk','ignore')+'**************'+'\n\n')
for i in text:
filename.writelines(' '+ i.encode('gbk','ignore')+'\n\n')
print '%s is ok' % name
time.sleep(0.05)
# Resolve the novel's chapter-index page from a search-result soup and
# download every chapter into the open output file.
def search_data(soup, filename):
    """Walk from a search-result page to the chapter index and fetch all
    chapters.

    *soup*     -- BeautifulSoup of the search-result page; the first
                  'result-game-item-title-link' element is taken as the book.
    *filename* -- open file object passed through to fun() for writing.
    """
    headers = {'User-agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:22.0) Gecko/20100101 Firefox/22.0'}
    wenzhang_base_url = soup.find(class_="result-game-item-title-link")
    # Compile the href-extraction pattern once; it is reused in the loop
    # below (previously an identical pattern was recompiled per chapter).
    href_pattern = re.compile(r'href="(.+?)"')
    # Base URL of the chapter-index page for this book.
    wenzhang_base_url = re.findall(href_pattern, str(wenzhang_base_url))[0]
    # Fetch the chapter-index page; site serves GBK, re-encode to UTF-8
    # so the regex/BeautifulSoup work on consistent bytes.
    req = urllib2.Request(wenzhang_base_url, headers=headers)
    html = urllib2.urlopen(req).read().decode('gbk').encode('utf-8')
    soup1 = BeautifulSoup(html, 'html.parser')
    little_list = soup1.find('ul', class_='mulu_list')
    little_list = re.findall(re.compile(r'<li>(.+?)</li>'), str(little_list))
    for i in little_list:
        # Chapter hrefs are relative; join them onto the index base URL.
        href = re.findall(href_pattern, i)
        wenzhang_url = wenzhang_base_url + href[0]
        fun(wenzhang_url, filename)
#模拟登陆,搜索是否有着本书
def sousuo(url):
base_url = 'http://so.ybdu.com/cse/search?'
name = raw_input('请输入你要寻找的电子书完整名称:')
headers = {'User-agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:22.0) Gecko/20100101 Firefox/22.0'}
data = urllib.urlencode({
'q':name,
's':'14402670595036768243',
'entry':'1'
})
real_url = base_url + data
req = urllib2.Request(real_url,headers=headers)
time.sleep(1)
#单次连接会出现错误,错误后休息1s后继续尝试连接
while True:
try:
rep = urllib2.urlopen(req)
break
except BaseException:
time.sleep(0.05)
html = rep.read()
soup = BeautifulSoup(html,'html.parser')
try:
sousuoname = soup.find(class_="result-game-item-title-link").get_text().strip().strip('\n')
except BaseException:
print '搜索失败'
return
if cmp(sousuoname.encode('utf-8'),name)==0:
print '搜索到 《%s》' % sousuoname.encode('utf-8')
print '是否进行下载? (y/n)'
choice = raw_input()
if choice == 'y':
name = (name+'.txt').decode('utf-8')
filename = open(name,'a')
search_data(soup,filename)
filename.close()
print '谢谢使用'
else :
print '谢谢使用'
else:
print '抱歉,没有找到您搜索的内容'
print '根据您搜索内容给您推荐《%s》' % sousuoname.encode('utf-8')
print '是否进行下载? (y/n)'
choice =raw_input()
if choice == 'y':
sousuoname = sousuoname +'.txt'
#print sousuoname
filename = open(sousuoname, 'a')
search_data(soup, filename)
filename.close()
print '谢谢使用'
else:
print '谢谢使用'
# Program entry point.
def main():
    """Kick off the interactive search-and-download flow against ybdu.com."""
    sousuo('http://www.ybdu.com/')


if __name__ == '__main__':
    main()