-
Notifications
You must be signed in to change notification settings - Fork 0
/
isEnglish
41 lines (35 loc) · 2.69 KB
/
isEnglish
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# Common lower-case English words of two or three letters. isEnglish() compares
# the short words found in a text against this set. The article 'a' is left out
# on purpose, since isEnglish() discards it before counting.
English_3s = {
    'age', 'air', 'all', 'am', 'an', 'and', 'any', 'are', 'as', 'at',
    'be', 'bet', 'bid', 'big', 'box', 'but', 'by',
    'can', 'cut',
    'day', 'did', 'do',
    'end', 'eye',
    'fan', 'few', 'fix', 'for',
    'get', 'go', 'got',
    'had', 'has', 'he', 'her', 'his', 'how',
    'if', 'in', 'is', 'it', 'its',
    'job',
    'key',
    'led', 'lot',
    'may',
    'net', 'new', 'no', 'not', 'now',
    'of', 'off', 'oil', 'on', 'one', 'or', 'our', 'out', 'own',
    'par', 'pay', 'per', 'plc',
    'run',
    'say', 'see', 'set', 'so',
    'tax', 'the', 'to', 'too', 'top', 'try', 'two',
    'up', 'us', 'use',
    'vs',
    'was', 'way', 'we', 'who',
    'you',
}
def isEnglish(scripture):
    """Guess whether *scripture* is English text using two cheap heuristics.

    1. Code points: if the mean ``ord()`` value of all characters is above
       500 the text is rejected. Plain ASCII is cheap (``ord('$') == 36``),
       whereas e.g. ``ord('€') == 8364`` and ``ord('字') == 23383``.
    2. Small words: take the distinct whitespace-separated tokens (ignoring
       'a', which is common in many languages) and count those that are
       shorter than four characters, purely alphabetic and entirely
       lower-case. Requiring lower-case skips capitalised short names such
       as 'Dan' or 'Ann'. If there is more than one such token and fewer
       than 52% of them are found in ``English_3s``, the text is rejected.

    Args:
        scripture: The text to classify.

    Returns:
        False when either heuristic rejects the text, True otherwise. Empty
        and whitespace-only input give no evidence against English and
        therefore return True.
    """
    # Guard the average below: an empty string used to raise
    # ZeroDivisionError. Answer the same way whitespace-only input does.
    if not scripture:
        return True

    average_code_point = sum(map(ord, scripture)) / len(scripture)
    if average_code_point > 500.0:
        return False

    # A set, so a repeated word is only counted once.
    words = set(scripture.split()) - {'a'}
    small_words = sum(
        1
        for word in words
        if len(word) < 4 and word.isalpha() and word.islower()
    )

    # Zero or one small word is too little evidence to reject the text.
    if small_words <= 1:
        return True

    english_small_words = English_3s.intersection(words)
    return len(english_small_words) / small_words >= 0.52
# Demonstration: classify a sample sentence and print the verdict.
this = isEnglish("Your scripture.")
message = "That is {} scripture.".format(this)
print(message)