/
text.py
136 lines (112 loc) · 4.03 KB
/
text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import re
from typing import Set, Tuple
from urllib.parse import urlparse
import bleach
from bleach import callbacks
# Characters that disqualify a word from being a tag: ``test_tag`` rejects any
# candidate containing one of them. Besides ASCII punctuation the set includes
# typographic quotes, the em dash and the non-breaking space (``\xa0``).
ILLEGAL_TAG_CHARS = "!#$%^&*+.,@£/()=?`'\\{[]}~;:\"’”—\xa0"
def decode_if_bytes(text):
    """Return ``text`` decoded as UTF-8 when it offers ``decode``, else unchanged.

    Anything that raises ``AttributeError`` on ``.decode("utf-8")`` (for
    example a ``str`` or ``None``) is handed back as-is.
    """
    try:
        decoded = text.decode("utf-8")
    except AttributeError:
        # No ``decode`` attribute: nothing to convert.
        decoded = text
    return decoded
def encode_if_text(text):
    """Return ``text`` encoded to UTF-8 bytes when it is a string, else unchanged.

    Values for which ``bytes(text, encoding="utf-8")`` raises ``TypeError``
    (for example ``bytes`` or ``None``) are handed back as-is.
    """
    try:
        encoded = bytes(text, encoding="utf-8")
    except TypeError:
        # Not a str: ``bytes`` refuses an encoding for non-string arguments.
        encoded = text
    return encoded
def find_tags(text: str, replacer: callable = None) -> Tuple[Set, str]:
    """Find tags in text.

    Tries to ignore tags inside code blocks: lines between ``` fences and
    lines indented with four spaces are passed through untouched.

    Optionally, if passed a "replacer", will also replace the tag word with the result
    of the replacer function called with the tag word.

    :param text: Text to scan, processed line by line.
    :param replacer: Optional callable taking the tag (without the leading ``#``,
        original casing) and returning the replacement string for ``#tag``.
    :returns: A tuple of (set of lowercased tags, original or replaced text).
    """
    found_tags = set()
    final_lines = []
    code_block = False
    # Check each line separately; line endings are kept so that joining the
    # processed lines reproduces the original layout.
    for line in text.splitlines(keepends=True):
        if line.startswith("```"):
            # A fence line toggles the "inside fenced code block" state
            code_block = not code_block
        if "#" not in line or line.startswith("    ") or code_block:
            # No tag candidates, an indented code line or inside a fenced
            # block: just add the whole line
            final_lines.append(line)
            continue
        # Check each word separately
        final_words = []
        for word in line.split(" "):
            if "#" not in word:
                final_words.append(word)
                continue
            candidate = word.strip().strip("([]),.!?:*_%/")
            if "<" in candidate or ">" in candidate:
                # Strip html
                candidate = bleach.clean(word, strip=True)
            # Now split with slashes, so "#foo/#bar" yields two candidates
            to_replace = []
            for part in candidate.split("/"):
                if not part.startswith("#"):
                    continue
                tag = part.strip("#")
                if test_tag(tag.lower()):
                    found_tags.add(tag.lower())
                    # Keep the original casing so it can be located in the word
                    to_replace.append(tag)
            if not replacer:
                final_words.append(word)
                continue
            tag_word = word
            try:
                for replacee in to_replace:
                    tag_word = tag_word.replace(f"#{replacee}", replacer(replacee))
            except Exception:
                # Deliberate best-effort: a failing replacer must not break
                # text processing; replacements made so far are kept.
                pass
            final_words.append(tag_word)
        final_lines.append(" ".join(final_words))
    final_text = "".join(final_lines) if replacer else None
    return found_tags, final_text or text
def get_path_from_url(url: str) -> str:
    """
    Extract the path component of an URL, dropping scheme, host, query and fragment.
    """
    return urlparse(url).path
def process_text_links(text):
    """Process links in text, adding some attributes and linkifying textual links.

    The text is run through ``bleach.linkify`` with email parsing disabled and
    ``<code>`` contents skipped. Links whose href starts with "/" (internal
    links) are left untouched; all other links go through bleach's standard
    ``nofollow`` and ``target_blank`` callbacks.

    :param text: Text (possibly containing HTML) to process.
    :returns: The result of ``bleach.linkify``.
    """
    link_callbacks = [callbacks.nofollow, callbacks.target_blank]
    href_key = (None, "href")

    def link_attributes(attrs, new=False):
        """Run standard callbacks except for internal links."""
        # Anchors without an href (e.g. ``<a name="x">``) have no such key;
        # treat them as having an empty href instead of failing on ``None``.
        href = attrs.get(href_key) or ""
        if href.startswith("/"):
            return attrs
        # Run the standard callbacks
        for callback in link_callbacks:
            attrs = callback(attrs, new)
        return attrs

    return bleach.linkify(
        text,
        callbacks=[link_attributes],
        parse_email=False,
        skip_tags=["code"],
    )
def test_tag(tag: str) -> bool:
    """Decide whether a word qualifies as a tag.

    Empty words are rejected, as is any word containing at least one of the
    characters listed in ``ILLEGAL_TAG_CHARS``.
    """
    if not tag:
        return False
    return not any(illegal in tag for illegal in ILLEGAL_TAG_CHARS)
def validate_handle(handle):
    """
    Very basic handle validation as per
    https://diaspora.github.io/diaspora_federation/federation/types.html#diaspora-id

    The whole string must have the shape ``user@host.tld``: a user part made of
    letters, digits, ``-``, ``_`` and ``.``, followed by ``@`` and a host part
    containing at least one dot and no ``@`` or ``/``. Matching is
    case-insensitive.

    :param handle: Handle string to check.
    :returns: True if the handle looks valid, False otherwise.
    """
    # fullmatch (not match) so trailing content such as "/path" or a second
    # "@..." after an otherwise valid prefix is rejected as well.
    return re.fullmatch(r"[a-z0-9\-_.]+@[^@/]+\.[^@/]+", handle, flags=re.IGNORECASE) is not None
def with_slash(url):
    """Return ``url`` guaranteed to end with a single appended "/" if it lacked one."""
    return url if url.endswith("/") else f"{url}/"