In [2]:
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from gensim.models import Word2Vec
from keras import Sequential
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import numpy as np

In [3]:
# Load the train/test splits and force the two columns used below to `str`.
# Paths use forward slashes so they resolve on Windows, Linux and macOS alike
# (a backslash-separated path is only valid on Windows). They are relative to
# the directory the notebook is run from.
TRAIN_CSV = '../../data/train/data-train-lstm.csv'
TEST_CSV = '../../data/test/data-test-lstm.csv'

# Note: astype(str) turns missing values into the literal string 'nan'.
TEXT_DTYPES = {'abstract': str, 'journal': str}

data_train = pd.read_csv(TRAIN_CSV).astype(TEXT_DTYPES)
data_test = pd.read_csv(TEST_CSV).astype(TEXT_DTYPES)

In [3]:
# data_train = data_train.sample(frac=0.2)
# data_test = data_test.sample(frac=0.2)

In [4]:
# Model input is the abstract text; the target is the journal name.
x_train, y_train = data_train['abstract'], data_train['journal']
x_test, y_test = data_test['abstract'], data_test['journal']

Construcción de vocabulario

In [5]:
def split_words(df):
    """Flatten an iterable of strings into a single list of tokens.

    Each string is split on runs of whitespace (``str.split()`` with no
    arguments) and the resulting tokens are concatenated in input order.

    Args:
        df: Iterable of strings (e.g. a pandas Series of texts).

    Returns:
        A flat ``list`` with every whitespace-separated token of every string.
    """
    return [token for document in df for token in document.split()]

# Build the word -> integer index vocabulary from the training abstracts only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_train)

# Flat list of every whitespace-separated token in the training abstracts.
docs = split_words(x_train)

In [6]:
# Number of token ids kept per abstract once sequences are padded.
MAXLEN = 400
# One extra slot on top of the fitted vocabulary: the Keras Tokenizer never
# assigns index 0 to a word, so valid ids run from 1 to len(word_index).
VOCAB_SIZE = 1 + len(tokenizer.word_index)


In [8]:
# Display the vocabulary size computed from the fitted tokenizer.
VOCAB_SIZE

390053

Padding de artículos

In [9]:
def _encode_and_pad(texts):
    """Convert texts to integer id sequences and pad them at the end to MAXLEN."""
    return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=MAXLEN, padding="post")


x_train_padded = _encode_and_pad(x_train)
x_test_padded = _encode_and_pad(x_test)

In [10]:
# Sanity check: expect (number of training abstracts, MAXLEN).
x_train_padded.shape

(146385, 400)

Asignación de pesos

In [None]:
# PubMed word2vec vectors, downloaded from http://evexdb.org/pmresources/vec-space-models/
# The file is read as word2vec text format: a header line whose second field is
# the vector dimension, followed by one "<word> <v1> ... <vN>" line per word.
EMBEDDINGS_PATH = 'PubMed-w2v.txt'

embeddings_index = {}

# `with` closes the file even if parsing fails part-way through, and the
# explicit encoding avoids depending on the platform's default codec.
with open(EMBEDDINGS_PATH, encoding='utf-8') as f:
    cabecera = f.readline()  # header line
    EMBED_SIZE = int(cabecera.split()[1])
    print(EMBED_SIZE)

    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines (e.g. a trailing newline at the end of the file).
            continue
        embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')


# Row i holds the pre-trained vector of the word whose tokenizer index is i.
# Words without a pre-trained vector (and the unused row 0) stay all-zero.
embedding_matrix = np.zeros((VOCAB_SIZE, EMBED_SIZE))
for word, index in tokenizer.word_index.items():
    if index >= VOCAB_SIZE:
        # Only reachable if VOCAB_SIZE is capped below the full vocabulary;
        # `continue` keeps this correct regardless of the dict's ordering.
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [None]:
# word2vec = Word2Vec(vector_size=EMBED_SIZE, workers=-1) #son muchos 1000 dimensiones
# word2vec.build_vocab(docs)
# word2vec.train(docs, total_examples=len(docs), epochs=32)

In [None]:
# EMBED_SIZE = 300
# embedding_matrix = np.zeros((VOCAB_SIZE, EMBED_SIZE))

# for word, i in tokenizer.word_index.items():
#     if word in word2vec.wv:
#         embedding_matrix[i] = word2vec.wv[word]

Indexación de clases

In [10]:
# Distinct journal names present in the training labels.
classes = set(y_train.unique())
# Enumerate the classes in sorted order: iterating a set of strings directly
# can give a different order on every interpreter start, which would change
# the class -> index mapping between kernel sessions.
class_to_index = {c: i for i, c in enumerate(sorted(classes))}
# index_to_class = dict((v,k) for k, v in class_to_index.items())


def names_to_ids(labels):
    """Map an iterable of class names to an integer array of class indices.

    Args:
        labels: Iterable of class names.

    Returns:
        ``np.ndarray`` with one index from ``class_to_index`` per label.

    Raises:
        ValueError: If any label is missing from ``class_to_index``. Failing
            here avoids silently producing an object array containing ``None``.
    """
    labels = list(labels)
    unknown = sorted({x for x in labels if x not in class_to_index})
    if unknown:
        raise ValueError(
            f"{len(unknown)} label(s) not seen in training, e.g. {unknown[:5]}"
        )
    return np.array([class_to_index[x] for x in labels])


train_labels = names_to_ids(y_train)
test_labels = names_to_ids(y_test)

Construcción y compilación del modelo

In [None]:
from keras.layers import Bidirectional
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import LSTM
from keras.metrics import SparseTopKCategoricalAccuracy

In [None]:
# # Prueba 1
# model = Sequential([
#     Embedding(input_dim=VOCAB_SIZE, output_dim=EMBED_SIZE, trainable=False,
#     weights=[embedding_matrix], input_length=MAXLEN),
#     LSTM(128, return_sequences=True, dropout=0.2),
#     LSTM(64, dropout=0.2),
#     Dense(32, activation="relu"),
#     Dense(650, activation="softmax")
# ])

In [None]:
# # Prueba 2
# model = Sequential([
#     Embedding(input_dim=VOCAB_SIZE, output_dim=EMBED_SIZE, trainable=False,
#     weights=[embedding_matrix], input_length=MAXLEN),
#     Bidirectional(LSTM(128, return_sequences=True, dropout=0.2)),
#     Bidirectional(LSTM(64, dropout=0.2)),
#     Dense(32, activation="relu"),
#     Dense(650, activation="softmax")
# ])

In [None]:
# # Prueba 3
# model = Sequential([
#     Embedding(input_dim=VOCAB_SIZE, output_dim=EMBED_SIZE, trainable=True,
#     weights=[embedding_matrix], input_length=MAXLEN),
#     Bidirectional(LSTM(128, return_sequences=True, dropout=0.2)),
#     Bidirectional(LSTM(64, dropout=0.2)),
#     Dense(650, activation="softmax")
# ])

In [None]:
# Experiment 4: embedding layer initialised from the pre-trained matrix and
# fine-tuned (trainable=True), followed by two stacked bidirectional LSTMs and
# a softmax classifier over the journal classes.

# Size the output layer from the label mapping instead of a hard-coded count,
# so it always matches the indices produced by `names_to_ids`.
NUM_CLASSES = len(class_to_index)

model = Sequential([
    Embedding(input_dim=VOCAB_SIZE, output_dim=EMBED_SIZE, trainable=True,
              weights=[embedding_matrix], input_length=MAXLEN),
    # return_sequences=True so the second LSTM receives the full sequence.
    Bidirectional(LSTM(128, return_sequences=True, dropout=0.2)),
    Bidirectional(LSTM(64, dropout=0.2)),
    Dense(NUM_CLASSES, activation="softmax"),
])

In [None]:
# Labels are integer class ids, hence the sparse loss and the sparse top-k
# metric (a prediction counts as correct if the true class is in the top 10).
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=[SparseTopKCategoricalAccuracy(k=10)],
)

# Call the method: a bare `model.summary` only shows the bound-method repr.
model.summary()

In [None]:
# validation_data is passed as an (inputs, targets) tuple, the form Keras
# documents; a list is not guaranteed to be unpacked the same way.
# NOTE(review): the test split doubles as the validation set here.
history = model.fit(
    x_train_padded,
    train_labels,
    validation_data=(x_test_padded, test_labels),
    epochs=1,
    batch_size=256,
    verbose=1,
)

In [None]:
# Top-1 accuracy on the test split (top-k accuracy with k=1).
pred = model.predict(x_test_padded)
m = SparseTopKCategoricalAccuracy(k=1)
# Flat float32 copy of the integer class ids; this is the array handed to the
# metric (it was previously computed but never used).
y_true = np.array(test_labels, np.float32).ravel()
m.update_state(y_true, pred)
m.result().numpy()