-
Notifications
You must be signed in to change notification settings - Fork 0
/
text_nltk.py
59 lines (51 loc) · 1.93 KB
/
text_nltk.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# -*- coding: utf-8 -*-
"""
The text module extracts words from text using the Punkt sentence tokenizer and a word tokenizer,
rejects common stopwords, and lemmatizes each word.
Example:
>>> from text_nltk import lemma_tokenize
>>> lemma_tokenize('100% of your donation funds medical care for patients around the world.')
['100', '%', 'donation', 'fund', 'medical', 'care', 'patient', 'around', 'world']
Author: Dirk Neumann
"""
import nltk
import nltk.tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize.api import StringTokenizer
def lemma_tokenize(paragraph):
    """Tokenize *paragraph* and return a flat list of lowercased lemmas.

    Sentences come from :func:`tokenize` (which also drops stopwords);
    each remaining word is lemmatized with WordNet and lowercased.
    If the WordNet corpus is not installed, it is downloaded once and
    the tokenization is retried.

    :param paragraph: text to process (str)
    :return: list of lemma strings
    """
    lmtzr = WordNetLemmatizer()

    def _lemmas():
        # Single definition of the lemmatize-and-lower pass; previously
        # this comprehension was duplicated in both the try and retry paths.
        return [lmtzr.lemmatize(word).lower()
                for sentence in tokenize(paragraph)
                for word in sentence]

    try:
        return _lemmas()
    except LookupError:
        # WordNet data missing on this machine: fetch it and retry once.
        nltk.download('wordnet')
        return _lemmas()
def tokenize(paragraph):
    """Split *paragraph* into sentences, then into words, dropping stopwords.

    Returns a list of sentences, where each sentence is a list of word
    tokens that survived the stopword filter. The Punkt sentence detector
    is loaded lazily, downloaded on demand, and cached on the function
    object so subsequent calls reuse it.

    :param paragraph: text to tokenize (str)
    :return: list of lists of word tokens
    """
    detector = getattr(tokenize, 'detector', None)
    if detector is None:
        try:
            detector = nltk.data.load('tokenizers/punkt/english.pickle')
        except LookupError:
            # Punkt model not installed yet: fetch it and load again.
            nltk.download('punkt')
            detector = nltk.data.load('tokenizers/punkt/english.pickle')
        tokenize.detector = detector

    # stopwords() caches and always returns the same list, so hoist it.
    stop_words = stopwords()
    sentences = []
    for sentence in detector.tokenize(paragraph.strip()):
        kept = [token
                for token in nltk.tokenize.word_tokenize(sentence)
                if token not in stop_words]
        sentences.append(kept)
    return sentences
def stopwords():
    """Return the cached stopword list: NLTK English stopwords plus extra
    punctuation and Twitter-style tokens.

    The list is built on first call (downloading the 'stopwords' corpus
    if necessary) and cached on the function object.

    :return: list of stopword strings
    """
    try:
        stop_words = stopwords.stop_words
    except AttributeError:
        try:
            stop_words = nltk.corpus.stopwords.words('english')
        except LookupError:
            # Stopword corpus missing: fetch it and load again.
            nltk.download('stopwords')
            stop_words = nltk.corpus.stopwords.words('english')
        # Extra punctuation and Twitter artifacts to discard.
        # BUG FIX: the last entry was written as 'I''m' — two adjacent
        # string literals that concatenate to 'Im' — so the contraction
        # "I'm" was never actually filtered. Use the intended token.
        stop_words.extend(['-', ':', '.', '\'', '\',', ',', '#', '/', '@',
                           '.,', '(', ')', 'RT', 'I', "I'm"])
        stopwords.stop_words = stop_words
    return stop_words