/
horrifying_hacks.py
101 lines (89 loc) · 1.74 KB
/
horrifying_hacks.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import re
# Phonetic respelling of single characters, keyed by the lowercase character.
# _acronym() looks each character of a token up here to spell it out.
# All ten decimal digits are covered so that tokens containing any digit
# (not only 1-4, as in "mp3"/"mp4") can be spelled without a missing key.
ENGLISH_LETTER_PRONOUNCIATIONS = {
    # Letters
    "a": "eh",
    "b": "bee",
    "c": "see",
    "d": "dee",
    "e": "ee",
    "f": "eff",
    "g": "jee",
    "h": "ehch",
    "i": "eye",
    "j": "jay",
    "k": "kay",
    "l": "el",
    "m": "em",
    "n": "en",
    "o": "oh",
    "p": "pee",
    "q": "cue",
    "r": "are",
    "s": "ess",
    "t": "tee",
    "u": "you",
    "v": "vee",
    "w": "double you",
    "x": "ecks",
    "y": "why",
    "z": "zee",
    # Digits
    "0": "zero",
    "1": "one",
    "2": "two",
    "3": "three",
    "4": "four",
    "5": "five",
    "6": "six",
    "7": "seven",
    "8": "eight",
    "9": "nine",
}
def _acronym(acronym):
    """Spell ``acronym`` out character by character.

    Each character is lowercased and replaced by its entry in
    ENGLISH_LETTER_PRONOUNCIATIONS; the pieces are joined with single
    spaces. A character that has no entry is passed through unchanged
    instead of raising KeyError, so one unexpected character cannot abort
    the caller's whole substitution pass.

    Returns an empty string for an empty ``acronym``.
    """
    spoken = [
        ENGLISH_LETTER_PRONOUNCIATIONS.get(char, char) for char in acronym.lower()
    ]
    return " ".join(spoken)
# Whole-token respellings, keyed by the lowercase token. _replace() returns
# the mapped text verbatim when a matched token is found here.
MISPRONOUNCED_TOKENS = dict(
    chatgpt="Chat jee pee tee",
    openai="open eh eye",
    strachan="strohn",
    emacs="eemacs",
    nodejs="node jay ess",
    filename="file name",
    openjdk="open jay dee kay",
    xu="shoo",
    cfar="see far",
)
# Lowercase tokens that _replace() spells out one character at a time via
# _acronym(). Kept as a plain set so membership tests are constant-time.
ACRONYMS = set(
    (
        "gpt ai api tts ssh http url amd cpu tldr lts ip "
        "html mp3 mp4 ogg ogv ssl ml sdk cljs ui"
    ).split()
)
# Non-word symbols and the words _replace() substitutes for them.
UNICODE = {
    "🤗": "hugging face",
    "🦄": "unicorn",
}
# Single pattern that finds everything _replace() knows how to rewrite:
#   * any ACRONYMS / MISPRONOUNCED_TOKENS key as a whole word (\b ... \b),
#   * any UNICODE key anywhere (symbols have no word boundaries to anchor on).
# Keys are passed through re.escape so a future key containing a regex
# metacharacter is matched literally, and sorted (longest first, then
# alphabetically) so the compiled pattern does not depend on set iteration
# order. The former '{u"\U0001F600-\U0001F64F"}' alternative was dropped: it
# only matched that literal brace/quote sequence, which _replace() returned
# unchanged anyway.
RE = re.compile(
    r"(?:\b(?:%s)\b)|(?:%s)"
    % (
        "|".join(
            re.escape(token)
            for token in sorted(
                ACRONYMS.union(MISPRONOUNCED_TOKENS.keys()),
                key=lambda token: (-len(token), token),
            )
        ),
        "|".join(
            re.escape(symbol)
            for symbol in sorted(
                UNICODE.keys(), key=lambda symbol: (-len(symbol), symbol)
            )
        ),
    ),
    flags=re.IGNORECASE | re.UNICODE,
)
def _replace(m):
    """Return the replacement text for regex match ``m``.

    The matched text is lowercased and looked up, in order, in ACRONYMS
    (spelled out with _acronym), MISPRONOUNCED_TOKENS and UNICODE. When
    none of them contains it, the matched text is returned unchanged.
    """
    matched = m.group(0)
    key = matched.lower()

    # Acronyms are generated on the fly rather than stored.
    if key in ACRONYMS:
        return _acronym(key)

    # The remaining tables hold ready-made replacements; first hit wins.
    for table in (MISPRONOUNCED_TOKENS, UNICODE):
        if key in table:
            return table[key]

    return matched
def apply(string):
    """Return ``string`` with every match of RE rewritten by _replace."""
    # subn yields (new_text, substitution_count); only the text is returned.
    rewritten, _count = RE.subn(_replace, string)
    return rewritten