/
word_preprocessing.py
38 lines (31 loc) · 1.42 KB
/
word_preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import contractions
import nltk
import re
import unicodedata
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
def preprocess_list(list_of_summaries):
    """Clean every summary string and return the cleaned strings as a list.

    Each summary goes through these steps, in order:

    1. lower-casing and removal of the literal ``amp;amp;`` residue;
    2. contraction expansion via ``contractions.fix`` (done while the
       apostrophes are still present in the text);
    3. NFKD folding to ASCII, so accented letters keep their base letter
       instead of being dropped by the character filter;
    4. removal of hyphens, then replacement of every run of characters
       other than ASCII letters, digits and ``&`` by a single space;
    5. whitespace tokenisation, dropping English stop words and tokens
       made only of digits;
    6. verb lemmatisation of tokens ending in ``ed`` (and of ``wrote``),
       then noun lemmatisation of tokens ending in ``s``.

    Args:
        list_of_summaries: iterable of ``str`` summaries.

    Returns:
        A list with one cleaned string per input summary, in input order.
        Tokens are joined by single spaces; a summary with no surviving
        tokens yields ``''``.

    NOTE(review): presumably needs the NLTK ``stopwords`` and ``wordnet``
    data to be installed beforehand -- confirm in the deployment setup.
    """
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    # Loop-invariant: everything that is not a letter, digit, '&' or '-'.
    non_token_chars = re.compile(r'[^A-Za-z0-9&\-]+')

    list_preprocessed_summary_text = []
    for summary_text in list_of_summaries:
        summary_text = summary_text.lower()
        summary_text = summary_text.replace('amp;amp;', '')

        # Contractions must be expanded BEFORE punctuation is stripped:
        # once "don't" has become "don t" there is nothing left to expand.
        # Lower-case again in case an expansion introduces capitals.
        summary_text = contractions.fix(summary_text).lower()

        # Fold accents BEFORE the ASCII-only filter below; afterwards the
        # accented characters would already have been replaced by spaces.
        summary_text = (
            unicodedata.normalize('NFKD', summary_text)
            .encode('ascii', 'ignore')
            .decode('ascii')
        )

        summary_text = summary_text.replace('-', '')
        summary_text = non_token_chars.sub(' ', summary_text)

        preprocessed_summary_tokens = []
        # split() without an argument never yields empty tokens, so the
        # joined result has no leading, trailing or doubled spaces.
        for token in summary_text.split():
            if token in stop_words or token.isdigit():
                continue
            if token.endswith('ed') or token == 'wrote':
                token = lemmatizer.lemmatize(token, 'v')
            if token.endswith('s'):
                token = lemmatizer.lemmatize(token)
            preprocessed_summary_tokens.append(token)

        list_preprocessed_summary_text.append(
            ' '.join(preprocessed_summary_tokens)
        )
    return list_preprocessed_summary_text