-
Notifications
You must be signed in to change notification settings - Fork 0
/
postagger.py
executable file
·111 lines (99 loc) · 3.62 KB
/
postagger.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
#!python3
import lemmaV
import os
import nltk
"""
PRS: Present,
FUT: Future,
PRG: Progressive,
IMF: Immediate Future,
PST: Past,
PRP: Present Perfect
"""
simpleTag = {} # provide a mapping from the detailed parts of speech in English to smaller, simpler set
for vb in ['VB', 'VBP', 'VBZ', 'VBD', 'VBN', 'VBG', 'MD']:
simpleTag[vb] = "VERB"
for nn in ['NN', 'NNS', 'NNP', 'NNPS']:
simpleTag[nn] = "NOUN"
for prp in ['PRP', 'PRP$', 'WP', 'WP$']:
simpleTag[prp] = "PRNN"
for jj in ['JJ', 'JJR', 'JJS']:
simpleTag[jj] = "ADJ"
for rb in ['RB', 'RBR', 'RBS', 'WRB', "RP"]:
simpleTag[rb] = "ADV"
simpleTag["CD"] = "NUM"
simpleTag["CC"] = "CONJ"
simpleTag["UH"] = "INTR"
for dt in ['DT', 'WDT']:
simpleTag[dt] = "DET"
for to in ['TO', 'IN']:
simpleTag[to] = "PREP"
def tag(word: str) -> list[str]:
""" tagging for the purpose of sending a list of possibilities to the evaluator """
possibilities = []
lemmas = lemmaV.lemmatizeVerb(word)
# print(lemmas)
for i in lemmas: # list of possible tenses if a verb
translation = i.get("English", i['Root'])
tags = tagPCNLP(translation)
# tags = tagPNLTK(translation)
# tags = tagJCNLP(translation)
potentialTags = [simpleTag.get(tag, "UNK") for tag in tags]
if not potentialTags:
potentialTags = ["UNK"]
# if tense == prs, it is either an actual present verb or different part of speech entirely
# some inflection is performed, likely a verb
if i["Tense"] != "PRS" and \
get_highest_ranked(potentialTags) != 'VERB': # if modified but not a verb
continue # ignore, false lemmatization
# print(word, "({})".format(i["Root"]), potentialTags, i["English"])
possibilities += potentialTags
return possibilities if possibilities else ["UNK"]
def get_highest_ranked(listOfParts:list[str]) -> str:
"""Given an list of parts of speech , return the 'strongest'"""
if "VERB" in listOfParts:
return "VERB"
if "NOUN" in listOfParts:
return "NOUN"
if "PRNN" in listOfParts:
return "PRNN"
if "ADV" in listOfParts:
return "ADV"
if "ADJ" in listOfParts:
return "ADJ"
if "NUM" in listOfParts:
return "NUM"
if "DET" in listOfParts:
return "DET"
if "CONJ" in listOfParts:
return "CONJ"
if "PREP" in listOfParts:
return "PREP"
if "INTR" in listOfParts:
return "INTR"
else:
print(listOfParts)
return "UNK"
pos_tagger = nltk.parse.CoreNLPParser(url='http://localhost:9000', tagtype='pos')
def tagPCNLP(words: str):
""" Use the Stanford CoreNLPServer"""
return [x[1] for x in pos_tagger.tag(nltk.word_tokenize(words))]
def tagPNLTK(words: str):
""" Use the NLTK POS Tagger """
return [x[1] for x in nltk.pos_tag(words.split())]
def tagJCNLP(words: str):
""" Create a file as input for and execute the Stanford Part of Speech Tagger.
Return an exit code of false if something fails, as well as the output of the tagger"""
with open('in.tag', 'w') as file:
file.write(words)
exitCode = os.system("cd stanford-postagger-full-2020-11-17; \
./stanford-postagger.sh models/english-left3words-distsim.tagger ../in.tag > ../out.tag 2>/dev/null")
if exitCode != 0:
return []
with open("out.tag") as file:
output = file.read().split()
return [word[word.rindex("_")+1:].strip() for word in output]
if __name__ == '__main__':
# print(tag("Enjoying good food on the holidays, especially with family, is the best feeling to have."))
while 1:
print(tag(input()))