-
Notifications
You must be signed in to change notification settings - Fork 3
/
98push.py
458 lines (409 loc) · 17.1 KB
/
98push.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
import re
import datetime
import json
import os
import random
import sqlite3
from bs4 import BeautifulSoup
import pymysql,psycopg2,itertools
import requests
from lxml import etree
from playwright.sync_api import sync_playwright
import time
# 获取综合区等的内容
def get_content(page, web_url, url_type=1):
try:
page.goto(web_url)
s = page.content()
soup = BeautifulSoup(s, 'lxml')
if url_type == 2:
hostloc_content = soup.find_all("div", class_="pcb")[0]
else:
hostloc_content = soup.find_all("td", class_="t_f")[0]
for tip in hostloc_content.find_all("div", class_="tip_4"):
tip.decompose()
img_list = hostloc_content.find_all("img", class_="zoom")
# 遍历页面中的所有图片元素,将图片替换为 Markdown 格式
for img in img_list:
# 获取图片的 URL 地址
if 'zoomfile' in img.attrs:
img_url = img['zoomfile']
elif 'file' in img.attrs:
img_url = img['file']
elif 'src' in img.attrs:
img_url = img['src']
else:
img_url = ""
# 将 img 元素替换为 Markdown 的图片语法
markdown_img = "[图片](" + img_url + ")"
img.replace_with(markdown_img)
# 遍历页面中所有的附件元素
# for link in hostloc_content.find_all('a'):
# # print(link.get_text())
# if link.name == 'a' and (link.get_text().endswith('.rar') or link.get_text().endswith('.7z') or link.get_text().endswith('.zip')):
# # 获取附件的 URL 地址
# link_url = mianfan_url + "/"+ link['href']
# # 将 a 元素替换为 Markdown 的附件语法
# markdown_link = "[" + link.get_text() + "](" + link_url + ")"
# # 将 a 元素替换为 Markdown 的附件语法
# link.replace_with(markdown_link)
for em in hostloc_content.find_all("em", class_="xg1"):
em.decompose()
contests = hostloc_content.text
contests = contests.replace("\r\n", '').replace('\n', '').replace('\xa0', '').replace('\u200b', '')
findal = re.findall(r'本帖最后由.*编辑', contests)
if findal:
contests = contests.replace(findal[0], "").lstrip()
pattern = r"\[图片\]\(.*?\)"
# 用一个占位符替换[图片](url)
placeholder = "#图"
# 用一个列表来存储所有的[图片](url)
images = re.findall(pattern, contests)
# 用一个循环来替换所有的[图片](url)
contests2 = contests
for image in images:
contests = contests.replace(image, placeholder, 1)
contests3 = contests[0:100]
if contests3.endswith("#"):
contests3 = contests3 + "图"
contests3 = mark_down(contests3)
contests3 = contests3.replace("""\#图""", "#图 ")
for image in images:
contests3 = contests3.replace(placeholder, image, 1)
return contests3, contests2
except Exception as e:
print(e)
print("网络原因,无法访问,请稍后再试...")
return "因权限或网络等原因,内容无法预览,请手动登陆查看!", "因权限或网络等原因,内容无法预览,请手动登陆查看!"
def get_content(page, web_url, url_type=1):
try:
page.goto(web_url)
s = page.content()
soup = BeautifulSoup(s, 'lxml')
hostloc_content = soup.find_all("div", class_="pcb")[0] if url_type == 2 else soup.find_all("td", class_="t_f")[0]
for tip in hostloc_content.find_all("div", class_="tip_4"):
tip.decompose()
img_list = hostloc_content.find_all("img", class_="zoom")
for img in img_list:
img_url = img.attrs.get('zoomfile') or img.attrs.get('file') or img.attrs.get('src') or ""
markdown_img = f"[图片]({img_url})"
img.replace_with(markdown_img)
for em in hostloc_content.find_all("em", class_="xg1"):
em.decompose()
contests = re.sub(r'本帖最后由.*编辑', '', hostloc_content.text)
contests = re.sub(r'[\r\n\xa0\u200b]+', '', contests)
pattern = r"\[图片\]\(.*?\)"
placeholder = "#图"
images = re.findall(pattern, contests)
for image in images:
contests = contests.replace(image, placeholder, 1)
contests3 = mark_down(contests[:100].rstrip() + "图", ["#"])
contests3 = contests3.replace(f"{placeholder} ", "").replace(placeholder, images.pop(0), 1) if images else contests3
for image in images:
contests3 = contests3.replace(placeholder, image, 1)
return contests3, contests
except Exception as e:
print(e)
return "因权限或网络等原因,内容无法预览,请手动登陆查看!", "因权限或网络等原因,内容无法预览,请手动登陆查看!"
# ai区和综合区格式有差异
def mark_down(content):
# 删除特殊符号,防止发生错误parse
sign = ['&', '<', ".", '>', '?', '#', '%', '!', '@', '$', '^', '*', '(', ')', '-', '_', '+', '=',
'~', '/', ',', ':', '’', '‘', '^', '{', '}', '[', ']', '`', "'", "|"]
content = content.replace("\n", "")
content = content.strip()
for k in sign:
content = content.replace(k, "\\" + k)
return content
def mark_down2(content):
content = content.strip().replace("\n", "").replace('"', "'")
return content
# 是否登陆,在某些网站xpaths会发生变化
# 如果和我用的网站不同,请根据实际情况自行修改
def master(r, page, xpaths, url_type=1,tietype="综合区"):
global mianfan_url, mianfan_url2
xml_content = etree.HTML(r)
posts = xml_content.xpath('/html/body/div[{}]/div[6]/div/div/div[4]/div[2]/form/table/tbody/tr'.format(xpaths))
for post in posts:
link = post.xpath('./th/a[2]')
if not link:
continue
href_list = link[0].xpath('@href')
if not href_list:
continue
href_id = href_list[0].split("tid=", )[-1].split("&", )[0]
if not re.match(r'^\d+$', href_id):
continue
name = link[0].xpath('string()').replace("\r\n", "")
if name == "隐藏置顶帖":
continue
author = post.xpath('./td[2]/cite/a')
author_url = author[0].xpath('@href')
uid = author_url[0].split("=")[-1]
url_author = url_1 + "{}".format(author_url[0])
url_list = url_1 + "thread-{}-1-1.html".format(str(href_id))
content_2, content_3 = get_content(page, url_list, url_type)
mian_url = url_list.replace("https://www.sehuatang.net", mianfan_url)
mian_url2 = url_list.replace("https://www.sehuatang.net", mianfan_url2)
text = '[ 主题 ]:***{}***\n[{}] [{}]({})\n[ 地址 ]:[{}]({}) [免翻地址]({}) [免翻地址2]({})\n[ 内容 ]:[ {} ]'.format(
mark_down(name), mark_down("#U" + uid), mark_down(author[0].xpath('string()')), url_author, str(href_id),
url_list, mian_url, mian_url2, content_2)
try:
insert_db2(mark_down2(author[0].xpath('string()')), url_list, mark_down2(name), mark_down2(content_3),
href_id)
except:
pass
# post either to pid or pid2, depending on url_type
post(pid2 if url_type == 2 else pid, text)
time.sleep(random.randint(5, 8))
# 发送到tg
def post(chat_id: str, text: str, silent: bool = False, num=0):
try:
with requests.post(
url=f'https://api.telegram.org/bot{bottoken}/sendMessage',
headers={
'Content-Type': 'application/json',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'
},
data=json.dumps({
'chat_id': chat_id,
'text': text,
'parse_mode': 'MarkdownV2',
'disable_notification': silent,
'disable_web_page_preview': True,
})) as r:
r.raise_for_status()
return r.json()
except Exception as e:
print(e)
print(text)
print("推送失败!")
if num > 10:
return
else:
num += 1
time.sleep(3)
post(chat_id, text, num=num)
# 定时更新
t1 = 0
# 从json文件获取免翻
def getmian(page):
global t1
if not os.path.exists("./mianfan.json") or (t1 != 0 and time.time() - t1 >= 86400):
file = open('./mianfan.json', 'w')
mianfans = {}
mianfans["t1"] = time.time()
mianfans["mian_url1"], mianfans["mian_url2"] = get_mianfan(page)
file.write(json.dumps(mianfans))
file.close()
else:
f = open("./mianfan.json", encoding="utf-8")
res = f.read()
f.close()
mianfans = json.loads(res)
t1 = mianfans["t1"]
return mianfans["mian_url1"], mianfans["mian_url2"]
# 由于某些原因,改为固定
def get_mianfan(page, mian_num=0):
mian_url1 = "https://wpzo.app"
mian_url2 = "https://xj4sds.com"
return mian_url1,mian_url2
try:
link = "https://nux4n.cn/config.js" # 这个目前好像不安全
page.goto(link)
pattern = r"home_url\s*=\s*'([^']+)'"
match = re.search(pattern, page.content())
if match:
mian_url1 = match.group(1)
mian_url1 = mian_url1.rstrip('/')
else:
mian_url1 = "https://wpzo.app"
except Exception as e:
print(e)
if mian_num > 10:
mian_url1 = "https://wpzo.app"
mian_url2 = "https://xj4sds.com"
return mian_url1, mian_url2
else:
mian_num += 1
time.sleep(3)
get_mianfan(page, mian_num)
return mian_url1, mian_url2
# 获取配置
def get_con():
# Check if the config file exists
if not os.path.exists("./98.json"):
print("缺少配置文件")
exit()
# Read and parse the config file
with open("./98.json", encoding="utf-8") as f:
config = json.load(f)
# Check if required config values are present
if config["bottoken"] == "机器人token" or config["pid"] == "-100后面跟你的频道id":
print("请填写配置文件")
exit()
# Sanitize and validate config values
config["times"] = int(config.get("times", 20))
config["timed"] = int(config.get("timed", 40))
if config["times"] >= config["timed"]:
config["times"] = 20
config["timed"] = 40
# Return relevant config values
return str(config["bottoken"]), str(config["pid"]), str(config["pid2"]), config["times"], config["timed"], str(config["my_usename"]), str(config["my_pass"])
# 新增存到sqlite3
def get_db3():
if not os.path.exists("sehua2.db"):
con = sqlite3.connect("sehua2.db")
cur = con.cursor()
sql = """CREATE TABLE IF NOT EXISTS `sehua_new` (
`id` INTEGER PRIMARY KEY AUTOINCREMENT,
`uname` varchar(255) NULL DEFAULT NULL,
`surl` varchar(255) NULL DEFAULT NULL,
`title` varchar(255) NULL DEFAULT NULL,
`tid` varchar(255) NULL DEFAULT NULL,
`cont` text NULL,
`create_at` varchar(255) NULL DEFAULT NULL
)"""
cur.execute(sql)
cur.close()
con.close()
con2 = sqlite3.connect("sehua2.db")
return con2
def insert_db2(uname, surl, title, cont, tid):
create_at = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
insert_sql = 'INSERT INTO sehua_new (uname, surl, title, cont, create_at, tid) VALUES (%s, %s, %s, %s, %s, %s)'
execute_insert(insert_sql, (uname, surl, title, cont, create_at, tid), '数据添加成功')
def insert_db(uname, surl, title, cont, tietype, tid):
create_at = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
insert_sql = 'INSERT INTO omega_sehua_new (uname, surl, title, cont, create_at, tietype, tid) VALUES (%s, %s, %s, %s, %s, %s, %s)'
execute_insert(insert_sql, (uname, surl, title, cont, create_at, tietype, tid), '数据添加成功')
def execute_insert(insert_sql, args, success_message):
db = get_db3()
cursor = db.cursor()
cursor.execute(insert_sql, args)
db.commit()
cursor.close()
db.close()
print(success_message)
def get_isset(tid):
select_from_toutiao_sql = 'SELECT tid FROM sehua_new WHERE tid = %s'
results = execute_select(select_from_toutiao_sql, (tid,))
if results:
return '123'
else:
return '456'
def execute_select(select_sql, args):
db = get_db3()
cursor = db.cursor()
cursor.execute(select_sql, args)
results = cursor.fetchall()
cursor.close()
db.close()
return results
def insert_db2(uname, surl, title, cont,tid):
db = get_db3()
cursor = db.cursor()
create_at = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
insert_sql = """insert into sehua_new(uname, surl, title, cont, create_at,tid) VALUES ("{0}","{1}","{2}","{3}","{4}","{5}")""".format(
uname, surl, title, cont, create_at,tid)
cursor.execute(insert_sql)
db.commit()
cursor.close()
db.close()
print("数据添加成功")
def get_isset(tid):
db1 = get_db3()
cursor1 = db1.cursor()
select_from_toutiao_sql = """select tid from sehua_new where tid = "{0}" """.format(tid)
cursor1.execute(select_from_toutiao_sql)
ras = cursor1.fetchall()
cursor1.close()
db1.close()
# 更新表中使用不了的数据
if len(ras) > 0: # 表中已经存在
return '123'
else:
return "456"
# 全局配置
# 分别为机器人token,新帖频道,ai新帖频道,最短更新时间,最长更新时间,账号,密码
bottoken, pid, pid2, times, timed, my_usename, my_pass = get_con()
headers = {
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
}
# 可以改为其他镜像网站
# url_1 = "https://1uc82.com/"
url_1 = "https://www.sehuatang.net/"
mianfan_url, mianfan_url2 = "", ""
def main():
global mianfan_url
global mianfan_url2
with sync_playwright() as p:
browser = p.chromium.launch(headless=True)
context = browser.new_context(
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36")
# Open new page
page = context.new_page()
mianfan_url, mianfan_url2 = getmian(page)
time.sleep(3)
page.goto(url_1)
time.sleep(5)
# 网站的验证
page.click('xpath=/html/body/a[1]')
time.sleep(5)
xpaths = "6"
if my_usename != "" and my_pass != "":
print("登陆")
page.fill('xpath=//*[@id="ls_username"]',my_usename)
page.fill('xpath=//*[@id="ls_password"]', my_pass)
time.sleep(5)
page.click('xpath=//*[@id="lsform"]/div/div/table/tbody/tr[2]/td[3]/button')
time.sleep(5)
xpaths = "7"
# form_type = 3
qulist = ["95","141","142","166","97"] ## 综合区 95 转帖 142 AI 166 原创141 资源出售区 97
## 用以无限循环
qulist_cycle = itertools.cycle(qulist)
while True:
try:
result = next(qulist_cycle)
url_type = 1
if result == "95":
tietype = "综合区"
elif result == "141":
tietype = "原创区"
elif result == "166":
tietype = "AI区"
url_type = 2
elif result == "97":
tietype = "资源出售区"
url_type = 3
else:
tietype = "转帖交流区"
url_type = 3
print(tietype)
## 改为获取前两页,2改为其他数字就是前x页,注意不要大于1000
for i in range(2, 0, -1):
print("当前页码为"+str(i))
url_sehua = url_1 + "forum.php?mod=forumdisplay&fid="+str(result)+"&filter=author&orderby=dateline&page="+str(i)
print("js验证")
# 网址
page.goto(url_sehua)
master(page.content(), page, xpaths, url_type,tietype)
time.sleep(random.randint(times, timed))
# 关闭多余标签页
print("关闭多余标签页")
for page1 in context.pages:
if page1 != page:
page1.close()
except Exception as e:
print(e)
print("网络错误,请稍后重试")
time.sleep(random.randint(60, 90))
if __name__ == "__main__":
main()