# -*- coding: utf-8 -*-
"""
Created on Sun Jun 3 13:24:01 2018
@author: jaydeep thik
"""

import os

import numpy as np

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import layers, models, optimizers

def load_data():
    """Read the raw IMDB reviews from the aclImdb train split and return
    the texts with their sentiment labels (1 = positive, 0 = negative)."""
    main_dir = "F:/machine learning/code/NLP_word_embedding/imdb_SA_embeddings/aclImdb"
    train_dir = os.path.join(main_dir, 'train')

    labels = []
    text = []

    for label in ['pos', 'neg']:
        working_dir = os.path.join(train_dir, label)
        for text_file in os.listdir(working_dir):
            with open(os.path.join(working_dir, text_file), encoding='utf8') as f:
                text.append(f.read())
            labels.append(1 if label == 'pos' else 0)
    return text, labels

## loading GloVe
def load_glove(dim):
    """Parse the pretrained GloVe vectors and build the embedding matrix.

    The file name is hard-coded to the 50-dimensional vectors, so `dim`
    should be 50. Uses the module-level `max_words` and `word_index`.
    """
    glove_dir = "F:/machine learning/code/NLP_word_embedding/data"

    # map each word to its pretrained vector
    embedding = {}
    with open(os.path.join(glove_dir, 'glove.6B.50d.txt'), encoding='utf8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coeff = np.asarray(values[1:], dtype='float32')
            embedding[word] = coeff

    # creating the embedding matrix: row idx holds the GloVe vector of the
    # word with tokenizer index idx; words missing from GloVe stay all-zero
    embedding_dim = dim
    embedding_matrix = np.zeros((max_words, embedding_dim))
    for word, idx in word_index.items():
        if idx < max_words:
            embedding_vector = embedding.get(word)
            if embedding_vector is not None:
                embedding_matrix[idx] = embedding_vector
    return embedding_matrix

maxlen = 500                # pad/truncate every review to 500 tokens
training_samples = 10000
validation_samples = 5000
max_words = 10000           # keep only the 10,000 most frequent words
# tokenizing
tokenizer = Tokenizer(num_words=max_words)
text, labels = load_data()
tokenizer.fit_on_texts(text)
sequences = tokenizer.texts_to_sequences(text)

# word indices of all the unique words in the text
word_index = tokenizer.word_index

# by default pad_sequences pads and truncates at the front, so each review
# keeps its last maxlen tokens
data = pad_sequences(sequences, maxlen=maxlen)
labels = np.asarray(labels)
print(data.shape)
print(labels.shape)
# shuffle before splitting: load_data() returns all positive reviews first,
# so without this the training set would contain only positive samples
indices = np.arange(labels.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

X_train = data[:training_samples]
y_train = labels[:training_samples]
X_val = data[training_samples:training_samples + validation_samples]
y_val = labels[training_samples:training_samples + validation_samples]
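
# Optional check (not in the original script): the aclImdb train split has
# 25,000 reviews, so 10,000 samples remain unused after this split.
print(X_train.shape, y_train.shape)   # expected: (10000, 500) (10000,)
print(X_val.shape, y_val.shape)       # expected: (5000, 500) (5000,)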

embedding_dim = 50

# model: frozen GloVe embeddings -> bidirectional LSTM -> LSTM -> sigmoid
model = models.Sequential()
model.add(layers.Embedding(max_words, embedding_dim, input_length=maxlen))
#model.add(layers.Flatten())
#model.add(layers.Dense(32, activation='relu'))
model.add(layers.Bidirectional(layers.LSTM(16, return_sequences=True)))
model.add(layers.LSTM(32, return_sequences=False))
model.add(layers.Dense(1, activation='sigmoid'))
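
# Optional step (not in the original script): print the layer stack to check
# the output shapes before loading the pretrained weights.
model.summary()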

# load the pretrained GloVe vectors into the embedding layer and freeze it
# so training does not overwrite them
embedding_matrix = load_glove(embedding_dim)
model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable = False
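
# Optional sanity check (not in the original script): count how many of the
# top-max_words tokens received a pretrained vector; all-zero rows are words
# missing from GloVe (plus the reserved index 0).
covered = int(np.count_nonzero(embedding_matrix.any(axis=1)))
print('GloVe coverage: %d / %d words' % (covered, max_words))
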
model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy', metrics=['acc'])
history = model.fit(X_train, y_train, batch_size=128, epochs=30, validation_data=(X_val, y_val))
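
# A minimal follow-up sketch (not in the original script) for inspecting the
# run: plot training vs. validation accuracy from the History object. Assumes
# matplotlib is installed; with metrics=['acc'] the history keys are
# 'acc'/'val_acc' (newer Keras versions name them 'accuracy'/'val_accuracy').
import matplotlib.pyplot as plt

epochs_range = range(1, len(history.history['acc']) + 1)
plt.plot(epochs_range, history.history['acc'], 'bo', label='training acc')
plt.plot(epochs_range, history.history['val_acc'], 'b', label='validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('epoch')
plt.legend()
plt.show()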