# text_feature_extraction.py
'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
RMDL: Random Multimodel Deep Learning for Classification
* Copyright (C) 2018 Kamran Kowsari <kk7nc@virginia.edu>
* Last Update: 04/25/2018
* This file is part of RMDL project, University of Virginia.
* Free to use, change, share and distribute source code of RMDL
* Referenced paper: RMDL: Random Multimodel Deep Learning for Classification
* Referenced paper: An Improvement of Data Classification using Random Multimodel Deep Learning (RMDL)
* Comments and errors: email kk7nc@virginia.edu
'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np
import re
import nltk
# word_tokenize needs the "punkt" models and WordNetLemmatizer needs
# "wordnet", so download them alongside "stopwords".
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from RMDL import Global as G

cachedStopWords = stopwords.words("english")

def transliterate(line):
    # Map a handful of accented characters to their plain Latin equivalents;
    # everything not in the table passes through unchanged.
    cedilla2latin = [[u'Á', u'A'], [u'á', u'a'], [u'Č', u'C'], [u'č', u'c'],
                     [u'Š', u'S'], [u'š', u's']]
    tr = dict(cedilla2latin)
    new_line = ""
    for letter in line:
        if letter in tr:
            new_line += tr[letter]
        else:
            new_line += letter
    return new_line
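
# A minimal sketch of transliterate on a made-up input: characters in the
# cedilla2latin table are mapped, everything else passes through unchanged.
#
#   >>> transliterate(u'Škola')
#   'Skola'
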
def text_cleaner(text,
                 deep_clean=False,
                 stem=True,
                 stop_words=True,
                 translite_rate=True):
    rules = [
        {r'>\s+': u'>'},  # remove spaces after a tag opens or closes
        {r'\s+': u' '},  # replace consecutive spaces
        {r'\s*<br\s*/?>\s*': u'\n'},  # newline after a <br>
        {r'</(div)\s*>\s*': u'\n'},  # newline after </div>
        {r'</(p|h\d)\s*>\s*': u'\n\n'},  # newline after </p> and <h1>..<h6>
        {r'<head>.*<\s*(/head|body)[^>]*>': u''},  # remove <head> to </head>
        {r'<a\s+href="([^"]+)"[^>]*>.*</a>': r'\1'},  # show links instead of texts
        {r'[ \t]*<[^<]*?/?>': u''},  # remove remaining tags
        {r'^\s+': u''}  # remove spaces at the beginning
    ]

    if deep_clean:
        # Drop periods and double quotes; turn other punctuation into spaces.
        for char, repl in [('.', ''), ('"', ''), ('[', ' '), (',', ' '),
                           (']', ' '), ('(', ' '), (')', ' '), ('-', ' '),
                           ('=', ' '), ('?', ' '), ('!', ' ')]:
            text = text.replace(char, repl)
        for rule in rules:
            for (k, v) in rule.items():
                text = re.compile(k).sub(v, text)
        text = text.strip()
        text = text.replace('+', ' ').replace('.', ' ').replace(',', ' ').replace(':', ' ')
        text = re.sub(r"(^|\W)\d+($|\W)", " ", text)  # drop standalone numbers
        if translite_rate:
            text = transliterate(text)
        if stem:
            # Stem and lemmatize token by token; applying the stemmer to the
            # whole string would only ever touch its last word.
            stemmer = PorterStemmer()
            lemmatizer = WordNetLemmatizer()
            text = ' '.join(lemmatizer.lemmatize(stemmer.stem(w))
                            for w in word_tokenize(text))
        if stop_words:
            # Use a local name so the boolean parameter is not shadowed.
            stop_word_set = set(stopwords.words('english'))
            word_tokens = word_tokenize(text)
            text = ' '.join(w for w in word_tokens if w not in stop_word_set)
    else:
        for rule in rules:
            for (k, v) in rule.items():
                text = re.compile(k).sub(v, text)
        text = text.strip()
    return text.lower()
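
# A usage sketch for text_cleaner on a made-up HTML snippet. With
# deep_clean=True the tags and digits are stripped, the tokens are stemmed
# and lemmatized, and stop words are removed:
#
#   >>> text_cleaner(u'<p>The 2 cats were running!</p>', deep_clean=True)
#   'cat run'
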
def loadData_Tokenizer(X_train, X_test, GloVe_DIR, MAX_NB_WORDS,
                       MAX_SEQUENCE_LENGTH, EMBEDDING_DIM):
    # EMBEDDING_DIM is unused here; it is kept so callers can pass the same
    # argument list they use when building the embedding layer downstream.
    np.random.seed(7)
    text = np.concatenate((X_train, X_test), axis=0)
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(text)
    sequences = tokenizer.texts_to_sequences(text)
    word_index = tokenizer.word_index
    text = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    print('Found %s unique tokens.' % len(word_index))
    indices = np.arange(text.shape[0])
    # np.random.shuffle(indices)
    text = text[indices]
    print(text.shape)
    # The first len(X_train) padded rows correspond to the training samples.
    n_train = len(X_train)
    X_train = text[:n_train]
    X_test = text[n_train:]
    # Parse the GloVe file: each line is a word followed by its vector.
    embeddings_index = {}
    with open(GloVe_DIR, encoding="utf8") as f:
        for line in f:
            values = line.split()
            word = values[0]
            try:
                coefs = np.asarray(values[1:], dtype='float32')
            except ValueError:
                # Skip malformed lines rather than storing a stale vector.
                continue
            embeddings_index[word] = coefs
    print('Total %s word vectors.' % len(embeddings_index))
    return (X_train, X_test, word_index, embeddings_index)
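
# A usage sketch for loadData_Tokenizer. The file name and parameter values
# below are hypothetical; GloVe_DIR must point to a plain-text GloVe file
# whose vector size matches EMBEDDING_DIM.
#
#   X_train_seq, X_test_seq, word_index, embeddings_index = loadData_Tokenizer(
#       X_train, X_test,
#       GloVe_DIR="glove.6B.50d.txt",
#       MAX_NB_WORDS=75000,
#       MAX_SEQUENCE_LENGTH=500,
#       EMBEDDING_DIM=50)
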
def loadData(X_train, X_test):
    # Fit TF-IDF on the training set only, then project the test set onto
    # the same vocabulary.
    vectorizer_x = TfidfVectorizer()
    X_train = vectorizer_x.fit_transform(X_train).toarray()
    X_test = vectorizer_x.transform(X_test).toarray()
    return (X_train, X_test)
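
# A minimal, self-contained check of loadData on toy sentences (made up for
# illustration); each returned row is a TF-IDF vector over the vocabulary
# learned from the training texts.
if __name__ == "__main__":
    toy_train = ["the cat sat on the mat", "dogs chase cats"]
    toy_test = ["the dog sat"]
    train_vec, test_vec = loadData(toy_train, toy_test)
    print(train_vec.shape, test_vec.shape)  # (2, 8) and (1, 8): 8 unique terms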