In [1]:
from keras.layers import Input, Dense, LSTM, Embedding
import numpy as np
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences


In [3]:
def _read_raw_documents(path):
    """Read a raw 20news split file into a list of 3-tuples.

    Every line of the file is split on the '<fff>' separator and its first
    three fields are kept, in order, as one tuple. Downstream code unpacks
    these tuples as (label, doc_id, words).

    Raises:
        IndexError: if a line has fewer than three '<fff>'-separated fields.
    """
    with open(path) as f:
        lines = f.read().splitlines()

    documents = []
    for line in lines:
        # Split each line once instead of once per extracted field.
        fields = line.split('<fff>')
        documents.append((fields[0], fields[1], fields[2]))
    return documents


contents_train = _read_raw_documents('../Exercises/datasets/W2V/20news-train-raw.txt')
contents_test = _read_raw_documents('../Exercises/datasets/W2V/20news-test-raw.txt')

# Each line of the vocabulary file becomes one entry of `vocab`.
with open('../Exercises/datasets/W2V/vocab-raw.txt') as f:
    vocab = f.read().splitlines()

In [4]:
# Vocabulary words receive consecutive ids starting at 2; the 'PAD' entry
# is then assigned id 1.
word2id = {word: position for position, word in enumerate(vocab, start=2)}
word2id['PAD'] = 1

# Reverse lookup from id back to its word.
id2word = dict((index, word) for word, index in word2id.items())

# Size of the mapping (vocabulary plus 'PAD') next to the raw vocabulary size.
print(len(word2id))
print(len(vocab))

18989
18988


In [5]:
def get_pad_data(contents, sent_length=None):
    """Encode raw documents as fixed-length id sequences plus integer labels.

    Args:
        contents: iterable of (label, doc_id, words) tuples, where `words` is
            a whitespace-separated string and `label` is convertible to int.
        sent_length: number of tokens kept per document and the width of the
            padded output. Defaults to the module-level SENT_LENGTH, which
            must be defined before the function is called.

    Returns:
        (pad_data, data_labels): `pad_data` is the result of `pad_sequences`
        with `maxlen=sent_length` and `padding='post'`; `data_labels` is a
        NumPy array holding one int label per document.
    """
    if sent_length is None:
        sent_length = SENT_LENGTH

    # Words missing from `word2id` are encoded as 1, the same id the notebook
    # gives to 'PAD'.
    # NOTE(review): pad_sequences is called without a `value` argument, so the
    # fill value is its default rather than word2id['PAD'] -- confirm intended.
    unknown_id = 1

    encoded_data = [
        [word2id.get(token, unknown_id) for token in words.split()[:sent_length]]
        for _label, _doc_id, words in contents
    ]

    pad_data = pad_sequences(encoded_data, maxlen=sent_length, padding='post')
    data_labels = np.array([int(label) for label, _doc_id, _words in contents])
    return pad_data, data_labels

# Number of tokens kept per document and width of the padded sequences.
SENT_LENGTH = 500

# Encode the train and test splits with the same function.
train_padded_data, train_data_labels = get_pad_data(contents_train)
test_padded_data, test_data_labels = get_pad_data(contents_test)


In [7]:
def _load_encoded_split(path):
    """Load an already-encoded split file.

    Every line is split on '<fff>'. Field 0 is parsed as the integer label
    and field 3 as a whitespace-separated list of integer ids.

    Returns:
        (lines, sequences, labels): the raw lines, one list of int ids per
        line, and one int label per line.

    Raises:
        IndexError: if a line has fewer than four '<fff>'-separated fields.
        ValueError: if a label or id is not a valid integer.
    """
    with open(path) as f:
        lines = f.read().splitlines()

    sequences = []
    labels = []
    for line in lines:
        # Split each record once and reuse the fields for label and ids.
        fields = line.split('<fff>')
        labels.append(int(fields[0]))
        sequences.append([int(ind) for ind in fields[3].split()])
    return lines, sequences, labels


with open('../Exercises/datasets/W2V/vocab-raw.txt') as f:
    vocab = f.read().splitlines()

train_data, train_padded_data, train_data_labels = _load_encoded_split(
    '../Exercises/datasets/W2V/20news-train-encoded.txt')
test_data, test_padded_data, test_data_labels = _load_encoded_split(
    '../Exercises/datasets/W2V/20news-test-encoded.txt')

SENT_LENGTH = 500
# Peek at the first 100 ids of the first training document.
print(train_padded_data[0][:100])

[7541, 10894, 10894, 10784, 4333, 17490, 16285, 2063, 2552, 7027, 2553, 14428, 16365, 3284, 1788, 11609, 2246, 14239, 17043, 2552, 9860, 7027, 2552, 3284, 11609, 7161, 1788, 4754, 6868, 16966, 572, 2326, 352, 122, 991, 276, 7886, 5909, 18566, 12432, 10784, 4739, 3690, 17490, 16406, 1, 10784, 4333, 17490, 10384, 573, 2354, 11685, 2552, 14428, 2063, 2552, 2354, 11685, 14428, 10127, 11370, 122, 5394, 351, 17947, 90, 3, 2553, 14428, 1788, 12244, 2553, 12434, 17715, 7501, 7541, 14267, 7453, 5307, 7247, 3527, 16155, 2164, 2510, 12483, 2553, 1, 2358, 2671, 7541, 16859, 7501, 7541, 14267, 7453, 8964, 16859, 17714, 18612]


In [8]:
# Sanity check: each split should have as many sequences as labels.
for split in (train_padded_data, train_data_labels,
              test_padded_data, test_data_labels):
    print(len(split))

11314
11314
7532
7532


In [9]:
# Hyperparameters for the classifier.
vocab_size = len(vocab)
embedding_size = 300
LSTM_size = 50
batch_size = 50
NUM_CLASSES = 20

# Embedding -> LSTM -> softmax classifier over NUM_CLASSES labels.
model = Sequential()
model.add(Input(shape=[SENT_LENGTH], name='Input'))
# The embedding table has vocab_size + 2 rows: the word ids built in this
# notebook start at 2, so the largest id is vocab_size + 1.
# mask_zero=True makes the downstream LSTM skip positions whose id is 0.
# NOTE(review): confirm that 0 is the padding id used in the encoded files.
model.add(Embedding(vocab_size+2, embedding_size, input_length=SENT_LENGTH, mask_zero=True))
model.add(LSTM(LSTM_size))
model.add(Dense(NUM_CLASSES, activation='softmax'))
# Labels are plain integers, hence the sparse variant of cross-entropy.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# Convert explicitly to NumPy arrays rather than handing nested Python lists
# to fit() and relying on implicit conversion.
x_train = np.asarray(train_padded_data)
y_train = np.asarray(train_data_labels)
x_test = np.asarray(test_padded_data)
y_test = np.asarray(test_data_labels)

# NOTE(review): the test split doubles as validation data here, so the
# reported validation metrics are not an independent held-out estimate.
# NOTE(review): the recorded run was interrupted by hand at epoch 23 of 40;
# consider fewer epochs or an early-stopping callback.
model.fit(x_train, y_train, epochs=40, batch_size=batch_size,
          validation_data=(x_test, y_test), verbose=1)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 500, 300)          5697000   
                                                                 
 lstm_1 (LSTM)               (None, 50)                70200     
                                                                 
 dense_1 (Dense)             (None, 20)                1020      
                                                                 
Total params: 5,768,220
Trainable params: 5,768,220
Non-trainable params: 0
_________________________________________________________________
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40

KeyboardInterrupt: 