-
Notifications
You must be signed in to change notification settings - Fork 1
/
feature_extraction.py
43 lines (41 loc) · 1.4 KB
/
feature_extraction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import nltk
import re
def _words_following(sentence, phrase):
    """Yield the word that directly follows each occurrence of *phrase*.

    Both arguments are lists of word tokens. Occurrences that end at the
    last word of *sentence* are skipped because nothing follows them.
    """
    width = len(phrase)
    # Stop one short of the end so that sentence[start + width] always exists.
    for start in range(len(sentence) - width):
        if sentence[start:start + width] == phrase:
            yield sentence[start + width]


def extract_feature(text, syn_list, reg=None):
    """Return the word that follows the first matching phrase from *syn_list*.

    *text* is lower-cased, split into sentences and then into word tokens
    with NLTK.  English stop words and the tokens ``-``, ``:`` and ``.`` are
    removed from every sentence, except that ``from`` and ``to`` are kept.

    Phrases are tried in the order given; for each phrase the sentences are
    scanned in order and every occurrence inside a sentence is considered.
    The phrases are compared as given against the lower-cased, filtered
    tokens, so they should be lower-case and must not rely on removed stop
    words.

    Args:
        text: The raw text to search.
        syn_list: Iterable of phrases (whitespace-separated words).  Empty
            or whitespace-only entries are ignored.
        reg: Optional regular expression (string or compiled pattern).  When
            given, the following word must match it at its start with a
            non-empty match, and the matched portion is returned instead of
            the whole word.

    Returns:
        The following word (or its matched portion when *reg* is given), or
        ``False`` when no phrase yields a result.
    """
    sentences = [nltk.word_tokenize(s) for s in nltk.sent_tokenize(text.lower())]

    # Build the filter set once instead of once per word.
    ignored = set(nltk.corpus.stopwords.words('english')) | {'-', ':', '.'}
    ignored -= {'from', 'to'}
    simp_words = [[w for w in sentence if w not in ignored]
                  for sentence in sentences]

    pattern = re.compile(reg) if reg else None

    for token in syn_list:
        phrase = token.split()
        if not phrase:
            # An empty phrase can never be located; skip it rather than crash.
            continue
        for sentence in simp_words:
            for candidate in _words_following(sentence, phrase):
                if pattern is None:
                    return candidate
                match = pattern.match(candidate)
                # A missing or empty match means this occurrence is unusable;
                # keep looking at later occurrences and sentences.
                if match and match.group():
                    return match.group()
    return False