-
Notifications
You must be signed in to change notification settings - Fork 7
/
newsparser.py
120 lines (102 loc) · 3.85 KB
/
newsparser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import html
from html.parser import HTMLParser
import os
import datetime
import traceback
import re
def isPlusOne(msg):
    """Tell whether *msg* is a short "+1" style reply.

    A message qualifies when it begins with the literal text ``+1`` and is
    fewer than 10 characters long.
    """
    if not msg.startswith("+1"):
        return False
    return len(msg) < 10
class articleParser(HTMLParser):
    """Flatten an HTML fragment into HTML-escaped plain text.

    Text nodes are collected in document order. Every ``<img>`` tag
    contributes the value of its ``src`` attribute in place of the image.
    All other tags are dropped. Call :meth:`feed` with the markup and then
    :meth:`get_data` to obtain the result.
    """

    def __init__(self):
        # HTMLParser.__init__ already calls reset(), so no explicit reset
        # is needed here.
        super().__init__()
        # Collected text pieces and image URLs, in document order.
        self.fed = []

    def handle_starttag(self, tag, attrs):
        """Record the ``src`` of ``<img>`` tags; ignore every other tag."""
        if tag != "img":
            return
        # A valueless attribute (``<img src>``) is reported with a value of
        # None; skipping it keeps the later ''.join() from raising TypeError.
        self.fed.extend(
            value for name, value in attrs
            if name == "src" and value is not None
        )

    def handle_data(self, d):
        """Record a run of text content."""
        self.fed.append(d)

    def get_data(self):
        """Return everything collected so far, joined and HTML-escaped."""
        return html.escape(''.join(self.fed))
class newsArticle:
    """A single news post together with its lazily rendered HTML form.

    The rendered form is a ``<code>`` header block (author, newsgroup,
    subject, date, +1 flag) followed by the text extracted from the post's
    HTML body.
    """

    # Markdown image syntax: ![name](target)
    _IMAGE_MD_RE = re.compile(r"!\[([^\]]*)\]\(([^\)]*)\)")
    # Opening of an image tag, up to and including the quote before the URL.
    _IMG_TAG = '<img src=\"'

    def __init__(self, author, topic, subject, date, raw_msg, raw_html, mention_manager):
        """Store the post's fields.

        author          -- indexable pair: ``author[0]`` is the username,
                           ``author[1]`` the display name.
        topic           -- newsgroup name shown in the header.
        subject         -- subject line shown in the header.
        date            -- value shown in the header's ``Date`` line.
        raw_msg         -- markdown source of the post.
        raw_html        -- HTML rendering of the post.
        mention_manager -- object whose ``parseMentions(content, topic)`` is
                           called while the article is parsed.
        """
        self.author_username = author[0]
        self.author_displayname = author[1]
        self.topic = topic
        self.subject = subject
        self.date = date
        self.raw_msg = raw_msg
        self.raw_html = raw_html
        self.mention_manager = mention_manager
        # Set once parsing has failed, so it is not attempted again.
        self.broken = False
        # Caches, filled on first use.
        self.content = None
        self.is_plus_one = None

    def isPlusOne(self):
        """Return whether the raw message is a short "+1" reply (cached)."""
        if self.is_plus_one is None:
            self.is_plus_one = isPlusOne(self.raw_msg)
        return self.is_plus_one

    def getAsHtml(self):
        """Return the rendered article, parsing it on first access.

        Returns None when parsing failed (the article is marked broken).
        """
        if self.content is None:
            self.parseMessage()
        return self.content

    # TODO: getAttachments:
    # Extract the images and files in the message

    def makeHeader(self):
        """Build the HTML-escaped header block wrapped in ``<code>`` tags."""
        hdr = f"From: {self.author_username}({self.author_displayname})\r\n"\
              f"Newsgroup: {self.topic}\r\n"\
              f"Subject: {self.subject}\r\n"\
              f"Date: {self.date}\r\n"\
              f"is_plus_one: {self.isPlusOne()}"
        return "<code>"+html.escape(hdr)+"</code>\n\n"

    def parseLinks(self, msg):
        """Rewrite markdown image references in *msg* as plain links.

        In markdown, discourse uses the following format for images and
        image links:

            ![image_name](upload://<file_name_hash>)

        In the HTML version of the message, image links are regular http(s)
        links. Each markdown image is therefore replaced by
        ``[name](url)``, where ``url`` is taken from the n-th
        ``<img src="...">`` of ``self.raw_html``. Images without a name are
        called ``Link <n>``, with ``n`` the 1-based position of the image in
        the message.

        ``self.raw_html`` is only read, never modified. When the HTML holds
        fewer image tags than the markdown has images (or a tag's URL is
        unterminated), the markdown's own target is kept for that link.
        """
        rendered = self.raw_html
        img_tag = self._IMG_TAG
        pieces = []
        html_pos = 0  # where the search for the next <img> tag resumes
        last_pos = 0  # end of the previous markdown match

        for link_counter, m in enumerate(self._IMAGE_MD_RE.finditer(msg), start=1):
            link_name = m.group(1) or f"Link {link_counter}"
            link_target = m.group(2)

            # Look up the next unused image URL in the rendered HTML.
            tag_pos = rendered.find(img_tag, html_pos)
            if tag_pos != -1:
                url_start = tag_pos + len(img_tag)
                url_end = rendered.find('"', url_start)
                if url_end != -1:
                    link_target = rendered[url_start:url_end]
                    # Everything up to this URL has been consumed.
                    html_pos = url_end

            pieces.append(msg[last_pos:m.start()])
            pieces.append(f"[{link_name}]({link_target})")
            last_pos = m.end()

        pieces.append(msg[last_pos:])
        return "".join(pieces)

    def parseMessage(self):
        """Render the article into ``self.content``.

        The content is the header followed by the text extracted from
        ``self.raw_html``; the extracted text is also handed to the mention
        manager together with the topic. Does nothing if a previous attempt
        failed. On any exception the error and traceback are printed and the
        article is marked broken, leaving ``self.content`` untouched.
        """
        if self.broken:
            return
        try:
            hdr = self.makeHeader()
            p = articleParser()
            p.feed(self.raw_html)
            content = p.get_data()
            # TODO: Parse markup (e.g. run the content through parseLinks)
            self.mention_manager.parseMentions(content, self.topic)
            self.content = hdr + content
        except Exception as e:
            # Best effort: report the failure and remember not to retry.
            print(e, datetime.datetime.now())
            traceback.print_exc()
            self.broken = True