wxhub.py
# -*- coding: utf-8 -*-
from selenium import webdriver
import time
from bs4 import BeautifulSoup
import requests
import re
import shutil
import os
import json
import argparse
import traceback
import random
import math
import codecs
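# Overall flow: log in to mp.weixin.qq.com manually through Selenium/Chrome, reuse the
# session cookies plus the `token` query parameter for the backend search APIs, resolve
# the account name (-biz) to a fakeid, page through its article list, and crawl every
# article with the method selected via -method (all_images / baidu_pan_links /
# whole_page / pipe).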
class Input:
    fake_name = ""  # "影想"
    out_dir = "output"
    '''
    all_images
    baidu_pan_links
    whole_page
    pipe
    '''
    crawl_method = "all_images"
    url_cache = {}
    arti_cache = {}
    page_sleep = 1
    page_limit = -1
    args = {}
    custom_pipe = []


class Session:
    token = ''
    cookies = []
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}


class Urls:
    index = 'https://mp.weixin.qq.com'
    editor = 'https://mp.weixin.qq.com/cgi-bin/appmsg?t=media/appmsg_edit&action=edit&type=10&isMul=1&isNew=1&share=1&lang=zh_CN&token={token}'
    query_biz = 'https://mp.weixin.qq.com/cgi-bin/searchbiz?action=search_biz&token={token}&lang=zh_CN&f=json&ajax=1&random={random}&query={query}&begin={begin}&count={count}'
    query_arti = 'https://mp.weixin.qq.com/cgi-bin/appmsg?token={token}&lang=zh_CN&f=json&%E2%80%A65&action=list_ex&begin={begin}&count={count}&query={query}&fakeid={fakeid}&type=9'
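# The {token}, {random}, {query}, {begin}, {count} and {fakeid} placeholders above are
# filled with str.format() right before each request; the token and cookies come from
# the logged-in Selenium session set up in main().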
class BaseResp:
    def __init__(self, sjson):
        self.data = json.loads(sjson)
        self.base_resp = self.data['base_resp']

    @property
    def ret(self):
        return self.base_resp['ret']

    @property
    def err_msg(self):
        return self.base_resp['err_msg']

    @property
    def is_ok(self):
        return self.base_resp['ret'] == 0


class FakesResp(BaseResp):
    def __init__(self, sjson):
        super(FakesResp, self).__init__(sjson)
        self.list = self.data['list']
        self.total = self.data['total']

    @property
    def count(self):
        return len(self.list)


class ArtisResp(BaseResp):
    def __init__(self, sjson):
        super(ArtisResp, self).__init__(sjson)
        self.list = self.data['app_msg_list'] if self.is_ok else []
        self.total = self.data['app_msg_cnt'] if self.is_ok else 0

    @property
    def count(self):
        return len(self.list)
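# The response wrappers above expect JSON shaped roughly like this (only the fields
# actually accessed are listed; everything else the API returns is ignored):
#   searchbiz -> FakesResp: {"base_resp": {"ret": 0, "err_msg": "ok"}, "list": [...], "total": N}
#   list_ex   -> ArtisResp: {"base_resp": {"ret": 0, "err_msg": "ok"}, "app_msg_list": [...], "app_msg_cnt": N}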
def execute_times(driver, times):
    for i in range(times + 1):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(3)


def login(driver):
    pass


def read_url_set():
    ret = {}
    fn = os.path.join('output', '__urls.json')
    if os.path.isdir('output') and os.path.isfile(fn):
        with open(fn, 'rt') as f:
            ret = json.load(f)
    return ret


def write_url_set(urls):
    fn = os.path.join('output', '__urls.json')
    if not os.path.isdir('output'):
        shutil.os.makedirs('output', exist_ok=True)
    with open(fn, 'wb') as f:
        f.write(json.dumps(urls).encode('utf-8'))


def set_cookies(driver, cookies):
    Session.cookies = {}
    for item in cookies:
        driver.add_cookie(item)
        Session.cookies[item['name']] = item['value']


def download(url, sname):
    for i in range(0, 3):
        result = requests.get(url, headers=Session.headers, stream=True)
        if result.status_code == 200:
            with open(sname, 'wb') as f:
                for chunk in result.iter_content(1024):
                    f.write(chunk)
            return True
        else:
            continue
    print(f"Error download:{url}")
    return False
def pipe_fakes(fake_name):
    begin = 0
    count = 5
    while True:
        rep = requests.get(Urls.query_biz.format(random=random.random(), token=Session.token, query=fake_name, begin=begin, count=count), cookies=Session.cookies, headers=Session.headers)
        fakes = FakesResp(rep.text)
        if not fakes.is_ok:
            break
        i = 0
        for it in fakes.list:
            print(f"{i}) {it['nickname']}")
            i = i + 1
        while True:
            ic = input("输入数字, 选择序号;或者输入n翻页:")
            try:
                if ic == 'n' or 0 <= int(ic) < len(fakes.list):
                    break
            except ValueError:
                print("输入错误, 请重新输入!")
                continue
        if ic == 'n' or ic == 'N':
            begin = begin + fakes.count
            continue
        return fakes.list[int(ic)]
def pipe_articles(fakeid, query=''):
    TIME_SLEEP = Input.page_sleep
    todo = load_todo_list(Input.fake_name)
    if not todo:
        todo['data'] = {}
    data = todo['data']
    mask = list(todo['__mask'] if '__mask' in todo else '')
    last_total = todo['__total_cnt'] if '__total_cnt' in todo else 0
    begin = 0
    pagesize = 5
    total = 0
    total_page = 0
    last_total_page = math.ceil(last_total / pagesize)
    page_limit = Input.page_limit
    rep = requests.get(Urls.query_arti.format(token=Session.token, fakeid=fakeid, begin=begin, count=pagesize, query=query), cookies=Session.cookies, headers=Session.headers)
    artis = ArtisResp(rep.text)
    if not artis.ret and page_limit:
        total = artis.total
        total_page = math.ceil(total / pagesize)
        if total_page > last_total_page:
            # new pages appeared since the last run: prepend unprocessed slots
            mask = (total_page - last_total_page) * ['0'] + mask
        if artis.list[0]['link'] in data:
            mask[0] = '0'  # has new arti. reset first page.
        print(f"正在获取全部链接, 共发现 {artis.total} 条文章, 需要翻页 {total_page} 次, 请稍后 ...")
        # when the current page is marked '0', the next page must be checked as well
        for i in range(0, len(mask)):
            if not page_limit:
                break
            if mask[i] == '1':
                continue
            print(f"正在处理第{i}页...")
            time.sleep(TIME_SLEEP)
            rep = requests.get(Urls.query_arti.format(token=Session.token, fakeid=fakeid, begin=i * pagesize, count=pagesize, query=query), cookies=Session.cookies, headers=Session.headers)
            artis = ArtisResp(rep.text)
            if artis.ret:
                break
            flag = True
            for it in artis.list:
                link = it['link']
                if link in data:
                    continue
                flag = False
                data[link] = it
            mask[i] = '1'
            # force check next page.
            if not flag and i < len(mask) - 1:
                mask[i + 1] = '0'
            # count toward the page limit
            page_limit -= 1
    else:
        print(f"调用搜索, 报错:{artis.ret} {artis.err_msg}")
    curr_searched = sum(map(lambda x: 1 if x == '1' else 0, mask))
    # if not total:
    #     raise Exception('搜索不到文章, 或者接口被反爬, 请删除cookies.json文件 等几分钟再试, 或换个账号试试.')
    print(f"本次搜索到:{total_page} 页文章, 已处理:{curr_searched}页, 共在 todo.list 中包含 {len(data)} 条文章链接 ...")
    todo['__total_cnt'] = total
    todo['__mask'] = ''.join(mask)
    save_todo_list(Input.fake_name, todo)
    cnt = 0
    for url, arti_info in data.items():
        if url in Input.arti_cache:
            continue
        print(f"{arti_info['title']} --> {url}")
        if pipe_crawl_articles(arti_info):
            cnt += 1
            append_arti_cache(url)
    print(f" 本次共处理了 {cnt} 条文章链接!")
def verfy_arti_content(html):
    if not html:
        return False, "从服务器获取失败"
    pat = re.compile(r'<div class="page_msg')
    if not pat.search(html):
        return True, ""
    pat = re.compile(r'<div class="global_error_msg.*?">(.*?)</div', re.MULTILINE | re.DOTALL)
    ms = pat.findall(html)
    if ms:
        return False, ms[0].strip()
    return False, "服务器返回未知错误"


def crawl_all_images(url, sdir, url_cache, html=None):
    pat = re.compile(r'src="(https://.*?)"')
    pat2 = re.compile(r'wx_fmt=(.*)')
    urls = []
    try:
        if not html:
            rep = requests.get(url, cookies=Session.cookies, headers=Session.headers)
            html = rep.text
        mats = pat.findall(html, pos=0)
        idx = 0
        for m in mats:
            if m in url_cache:
                continue
            pps = pat2.findall(m)
            if pps:
                postfix = pps[0]
            else:
                postfix = 'jpg'
            download(m, os.path.join(sdir, f"{idx}.{postfix}"))
            urls.append(m)
            idx += 1
        append_url_cache(urls)
        return True
    except:
        print(f"failed crawl images from url:{url}")
        sg = traceback.format_exc()
        print(sg)
        return False
def crawl_baidu_pan_link(url, sdir, url_cache):
    pat = re.compile(r'链接\s*[:|:]\s*(https://pan\.baidu\.com/.*?)提取码\s*[:|:]\s*(....)')
    try:
        urls = []
        rep = requests.get(url, cookies=Session.cookies, headers=Session.headers)
        html = rep.text
        mats = pat.findall(html, pos=0)
        if not mats:
            return False
        with open("baidu.pan.links.txt", "a") as myfile:
            for uus in mats:
                uu = uus[0]
                if not uu or uu in Input.url_cache:
                    continue
                pwd = uus[1]
                myfile.write(f"{uu} => {pwd}\n")
                Input.url_cache[uu] = True
                urls.append(uu)
        append_url_cache(urls)
        return True
    except:
        print(f"failed crawl links from url:{url}")
        sg = traceback.format_exc()
        print(sg)
        return False
def crawl_whole_page(url, sdir, url_cache):
    try:
        rep = requests.get(url, cookies=Session.cookies, headers=Session.headers)
        if rep.status_code != 200:
            return False
        html = rep.text
        valid, msg = verfy_arti_content(html)
        if not valid:
            raise Exception(f"保存网页失败: {msg}")
        os.makedirs(sdir, exist_ok=True)
        with codecs.open(os.path.join(sdir, 'index.html'), "w", 'utf-8') as f:
            f.write(html)
            f.flush()
        return crawl_all_images(url, sdir, Input.url_cache, html=html)
    except:
        print(f"failed crawl page from url:{url}")
        sg = traceback.format_exc()
        print(sg)
        return False
def crawl_by_custom_pipe(url, sdir, url_cache):
    if not Input.custom_pipe:
        sps = (Input.args.pipe if Input.args.pipe else '').split(',')
        for sp in sps:
            Input.custom_pipe.append(__import__(sp.strip()))
    for p in Input.custom_pipe:
        urls = p.crawl(url, sdir)
        for url in urls:
            url_cache[url] = True
        return not not urls
    return False
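# A custom pipe (-method pipe, -pipe "module_name") only has to be an importable module
# exposing crawl(url, sdir) and returning the list of URLs it handled. A minimal sketch
# of such a module (the module name and output filename here are made up):
#
#   # pipe_example.py
#   import os
#   import requests
#
#   def crawl(url, sdir):
#       """Save the raw article HTML into sdir and report the URL as handled."""
#       rep = requests.get(url)
#       if rep.status_code != 200:
#           return []
#       os.makedirs(sdir, exist_ok=True)
#       with open(os.path.join(sdir, 'raw.html'), 'w', encoding='utf-8') as f:
#           f.write(rep.text)
#       return [url]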
def pipe_crawl_articles(arti_info):
    title_4_dir = arti_info['title'].replace(':', '').replace(' ', '').replace(':', '').replace('/', '').replace('|', '').replace('<', '').replace('>', '').replace('?', '').replace('"', '')
    sdir = os.path.join(Input.out_dir, Input.fake_name, title_4_dir)
    if not os.path.exists(sdir):
        os.makedirs(sdir, exist_ok=True)
    if Input.crawl_method == 'all_images':
        return crawl_all_images(arti_info['link'], sdir, Input.url_cache)
    elif Input.crawl_method == 'baidu_pan_links':
        return crawl_baidu_pan_link(arti_info['link'], sdir, Input.url_cache)
    elif Input.crawl_method == 'whole_page':
        return crawl_whole_page(arti_info['link'], sdir, Input.url_cache)
    elif Input.crawl_method == 'pipe':
        return crawl_by_custom_pipe(arti_info['link'], sdir, Input.url_cache)
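# Per-article output goes to output/<biz>/<sanitized title>/: all_images saves pictures
# as <index>.<wx_fmt>, whole_page additionally writes index.html, and baidu_pan_links
# appends the extracted links to baidu.pan.links.txt in the working directory.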
def pipe():
    '''query fakes'''
    fake_info = pipe_fakes(Input.fake_name)
    if not fake_info:
        raise Exception(f"Can not query fakes with input:{Input.fake_name}")
    '''query arti'''
    fakeid = fake_info['fakeid']
    pipe_articles(fakeid)
    input("pipe continue:")
def process_input():
    Input.arti_cache = {}
    ac = os.path.join('arti.cache.list')
    if os.path.isfile(ac):
        with open(ac, 'rt') as fi:
            line = fi.readline()
            while line:
                Input.arti_cache[line.strip()] = True
                line = fi.readline()
    uc = os.path.join('url.cache.list')
    if os.path.isfile(uc):
        with open(uc, 'rt') as fi:
            line = fi.readline()
            while line:
                Input.url_cache[line.strip()] = True
                line = fi.readline()
def append_arti_cache(arti_link):
    arti_link = arti_link.strip()
    if not arti_link:
        return
    ac = os.path.join('arti.cache.list')
    with open(ac, "a") as myfile:
        myfile.write(f"{arti_link}\n")
    Input.arti_cache[arti_link] = True


def append_url_cache(urls):
    ac = os.path.join('url.cache.list')
    with open(ac, "a") as myfile:
        for url in urls:
            url = url.strip()
            if not url:
                continue
            myfile.write(f"{url}\n")
            Input.url_cache[url] = True


def load_todo_list(key):
    fn = os.path.join('output', key, "todo.list")
    if os.path.isfile(fn):
        with open(fn, 'rb') as fi:
            return json.load(fi)
    return {}


def save_todo_list(key, dic):
    if not dic:
        return
    fn = os.path.join('output', key, "todo.list")
    os.makedirs(os.path.dirname(fn), exist_ok=True)
    open(fn, 'wb').write(json.dumps(dic).encode('utf-8'))
def main(chrome):
    # the token expires; after logging in again it has to be fetched anew
    if not chrome:
        if os.path.isfile('chromedriver'):
            chrome = 'chromedriver'
        else:
            chrome = input('输入webchrome:').strip()
    driver = webdriver.Chrome(executable_path=chrome)
    cookies = json.load(open('cookies.json', 'rb')) if os.path.isfile('cookies.json') else []
    driver.get(Urls.index)
    if not cookies:
        input("请先手动登录, 完成后按回车继续:")
        cookies = driver.get_cookies()
        open('cookies.json', 'wb').write(json.dumps(cookies).encode('utf-8'))
    set_cookies(driver, cookies)
    driver.get(Urls.index)
    url = driver.current_url
    if 'token' not in url:
        raise Exception("获取网页失败!")
    Session.token = re.findall(r'token=(\w+)', url)[0]
    process_input()
    pipe()
# def test():
#     Input.fake_name = '大J小D'
#     Input.crawl_method = 'baidu_pan_links'
#     main(None)


if __name__ == '__main__':
    # test()
    description = u"公众号文章全搞定"
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('-biz', dest='biz', type=str, help='必填:公众号名字', required=True)
    parser.add_argument('-chrome', dest='chrome', type=str, help='可选:web chrome 路径, 默认使用脚本同级目录下的chromedriver')
    parser.add_argument('-arti', dest='arti', type=str, help='可选:文章名字, 默认处理全部文章')
    parser.add_argument('-method', dest='method', type=str, help='可选, 处理方法: all_images, baidu_pan_links, whole_page')
    parser.add_argument('-sleep', dest='sleep', type=str, help='翻页休眠时间, 默认为1即 1秒每页.')
    parser.add_argument('-pipe', dest='pipe', type=str, help='在method指定为pipe时, 该参数指定pipe处理流程. 例如:"pipe_example, pipe_example1, pipe_example2, pipe_example3"')
    parser.add_argument('-pl', dest='page_limit', type=str, help='指定最大翻页次数, 每次同一个公众号, 翻页太多次会被ban, 0:不翻页 只处理todo.list, 默认<0:无限制 >0:翻页次数')
    Input.args = parser.parse_args()
    Input.fake_name = Input.args.biz
    Input.crawl_method = Input.args.method if Input.args.method else 'all_images'
    Input.page_sleep = int(Input.args.sleep) if Input.args.sleep else 1
    Input.page_limit = int(Input.args.page_limit) if Input.args.page_limit else -1
    main(Input.args.chrome)
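# Example invocation (the account name and the numbers are placeholders):
#   python wxhub.py -biz 公众号名字 -method whole_page -sleep 2 -pl 10 -chrome ./chromedriver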