Lemmatizer.py
'''
This class takes in a text and transforms it into a lemmatized/stemmed form.
Author: Justin A. Middleton
'''
from nltk import word_tokenize, pos_tag, map_tag, WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
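
# Note: word_tokenize, pos_tag, map_tag, and WordNetLemmatizer rely on NLTK
# data packages (presumably "punkt", "averaged_perceptron_tagger",
# "universal_tagset", and "wordnet"), which need to be fetched once via
# nltk.download() before this class can be used.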
class Lemmatizer():
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self.stemmer = SnowballStemmer("english", ignore_stopwords=True)
    def lemmatize(self, sentence):
        '''
        Tokenizes a sentence, lemmatizes each token, and joins the lemmas
        back into a single string.
        sentence: str
        '''
        tokens = word_tokenize(sentence)
        lemmas = self.lemmatizeTokens(tokens)
        return " ".join(lemmas)
    def lemmatizeTokens(self, tokens):
        '''
        Turns phrase tokens into lemmatized tokens, i.e. a standard form
        as determined by the NLTK lemmatizer: "dogs" to "dog", "went" to
        "go", etc.
        tokens: list of str
        '''
        tokens_tagged = pos_tag(tokens)
        # Map the Penn Treebank tags to the simpler universal tagset.
        tokens_simpleTags = [(word, map_tag('en-ptb', 'universal', tag))
            for word, tag in tokens_tagged]
        # Lemmatize each token, passing the WordNet POS matching its tag.
        lemmas = []
        for token, tag in tokens_simpleTags:
            if tag == "VERB":
                lemmatized = self.lemmatizer.lemmatize(token, pos='v')
            elif tag == "ADJ":
                lemmatized = self.lemmatizer.lemmatize(token, pos='a')
            elif tag == "ADV":
                lemmatized = self.lemmatizer.lemmatize(token, pos='r')
            else:
                lemmatized = self.lemmatizer.lemmatize(token) # pos defaults to 'n'
            lemmas.append(lemmatized)
        return lemmas
    def stem(self, tokens):
        '''
        Reduces each word to its most basic form by removing suffixes and
        common endings to find the "root" or "stem" of the word.
        Example: "response", "responsive", and "responsivity" all stem to
        "respons" or something similar.
        tokens: list of str
        '''
        stemmed = []
        for token in tokens:
            stemmed.append(self.stemmer.stem(token))
        return stemmed
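
# Example usage: a minimal sketch, assuming the NLTK data packages listed
# above are installed. The sample sentence and expected outputs are
# illustrative only.
if __name__ == "__main__":
    lem = Lemmatizer()
    print(lem.lemmatize("The dogs went running through the parks"))
    # -> roughly "The dog go run through the park"
    print(lem.stem(word_tokenize("responses responsive responsivity")))
    # -> stems such as "respons" for each token (see the stem docstring)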