In [None]:
import os
import sys
import pandas as pd
import numpy as np
import tensorflow as tf
import fasttext.util

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)


from src.features.preprocessing import ICDDescriptionPreprocessor
from src.features.knowledge import DescriptionKnowledge
from src.features.sequences import SequenceHandler

import string


In [None]:
# Translation table that maps every punctuation character to a single space.
punctuation_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

text_descriptions = ICDDescriptionPreprocessor('../data/D_ICD_DIAGNOSES.csv').load_descriptions()

# Collect the distinct, lower-cased, non-empty tokens over all descriptions.
words = set()
for raw_description in text_descriptions['description']:
    without_punctuation = raw_description.translate(punctuation_to_space)
    normalized_tokens = (
        str(token).lower().strip() for token in without_punctuation.split(' ')
    )
    words.update(token for token in normalized_tokens if token)

len(words)

In [None]:
# Toy input: one entry per sequence, each sequence is a list of visits,
# and each visit is a list of labels.
toy_sequences = [
    [['a', 'b'], ['a', 'c']],             # sequence1: visit1, visit2
    [['a', 'b', 'c'], ['a'], ['d']],      # sequence2
    [['a', 'b'], ['a', 'd']],             # sequence3
]
sequence_df = pd.DataFrame(data={'sequence': toy_sequences})

# Raw free-text description for every label used above.
toy_labels = ['a', 'b', 'c', 'd']
raw_descriptions = [
    'THIS! is apple.....',
    'this is: Banana',
    'this is COOL* curly fries',
    'this iS another,description',
]
description_df = pd.DataFrame(data={
    'label': list(toy_labels),
    'description': raw_descriptions,
})

# Tokenised, lower-cased, punctuation-free counterpart of `description_df`.
tokenised_descriptions = [
    ['this', 'is', 'apple'],
    ['this', 'is', 'banana'],
    ['this', 'is', 'cool', 'curly', 'fries'],
    ['this', 'is', 'another', 'description'],
]
result_description_df = pd.DataFrame(data={
    'label': list(toy_labels),
    'description': tokenised_descriptions,
})

handler = SequenceHandler(flatten=True)
split = handler.transform_train_test_split(sequence_df, 'sequence')

# Re-join train and test parts so the whole toy dataset can be inspected at once.
combined_x = tf.concat([split.train_x, split.test_x], axis=0)
combined_y = tf.concat([split.train_y, split.test_y], axis=0)

print(combined_x.shape) # (dataset_size, max_length, feature_size)
print(combined_y.shape) # (dataset_size, 1, feature_size)

In [None]:
# Build the description knowledge for the toy data from the label/description
# frame and the vocabulary produced by the train/test split.
desc_knowledge = DescriptionKnowledge()
desc_knowledge.build_knowledge_from_df(description_df, split.vocab)
# Display the parsed descriptions; later cells iterate this as a mapping
# from index to the list of words of that description.
desc_knowledge.descriptions

In [None]:
# Fetch the English fastText model (`if_exists='ignore'` is passed so an
# existing download is left alone) and load it from the working directory.
fasttext.util.download_model('en', if_exists='ignore')
model = fasttext.load_model('cc.en.300.bin')
# Show the dimensionality of the loaded word vectors.
model.get_dimension()
# Dimensionality reduction is disabled, so the vectors keep the size reported above.
#fasttext.util.reduce_model(model, 100)

In [None]:
# Number of description tokens that do not appear in the fastText model's word list.
len(words - set(model.words))

In [None]:
# Longest description (in words); shorter descriptions are padded up to this length later.
max_description_length = max(map(len, desc_knowledge.descriptions.values()))
max_description_length

In [None]:
embeddings = {}
concatenated_embeddings = {}

# The padding vector must have the same width as the word vectors, otherwise
# tf.stack fails on mixed shapes. Take the width from the loaded model instead
# of hard-coding it (the model is not reduced to 100 dimensions).
embedding_dim = model.get_dimension()
pad_vector = tf.zeros(shape=(embedding_dim,))

for idx, description_words in desc_knowledge.descriptions.items():
    word_vectors = [
        tf.constant(model.get_word_vector(word)) for word in description_words
    ]
    # Right-pad every description with zero vectors up to the longest description.
    padding = [pad_vector] * (max_description_length - len(description_words))
    embeddings[idx] = word_vectors + padding

    concatenated_embeddings[idx] = tf.stack(embeddings[idx], axis=0)

concatenated_embeddings[1].shape # (max_words, embedding_dim)

In [None]:
filter_dim = 16
kernel_dim = 2
stacked = tf.stack([concatenated_embeddings[i] for i in range(len(desc_knowledge.descriptions))], axis=0) # shape: (num_variables, max_words, embedding_dim)

# Use the real width of the stacked word vectors instead of a hard-coded 100.
embedding_dim = stacked.shape[-1]
layer = tf.keras.layers.Conv1D(filter_dim, kernel_dim, activation='relu', input_shape=(max_description_length, embedding_dim))
layer2 = tf.keras.layers.MaxPooling1D(pool_size=2, strides=2)

# Run the convolution once and reuse its output for printing and pooling.
conv_output = layer(stacked) # shape: (num_variables, max_words - kernel_dim + 1, filter_dim)
print(conv_output.shape)
res = layer2(conv_output) # shape: (num_variables, layers, filter_dim), layers = floor((max_words - kernel_dim + 1 - pool_size) / strides) + 1
print(res.shape)

embedding_matrix = tf.keras.layers.Flatten()(res) # shape: (num_variables, layers*filter_dim)
# Project the multi-hot inputs onto the description-based embedding matrix.
tf.linalg.matmul(combined_x, embedding_matrix)


In [None]:
class MyEmbedding(tf.keras.Model):
    """Embed multi-hot feature vectors via a CNN over their textual descriptions.

    Every description in ``desc_knowledge.descriptions`` is turned into a
    zero-padded sequence of word vectors. A Conv1D + MaxPooling1D + Flatten
    stack turns each padded sequence into one row of an embedding matrix;
    inputs are multiplied with that matrix and passed through a dense layer.

    Args:
        desc_knowledge: object whose ``descriptions`` attribute maps an index
            to the list of words of that description. The indices are read as
            ``0 .. len(descriptions) - 1`` when the rows are stacked.
        model: word-vector model providing ``get_word_vector(word)`` and
            ``get_dimension()``.
        max_description_length: number of words every description is padded to.
    """

    def __init__(self, desc_knowledge, model, max_description_length):
        super().__init__()
        self.embeddings = {}
        self.concatenated_embeddings = {}
        self.desc_knowledge = desc_knowledge

        # The padding vector must match the word-vector width, so read it from
        # the model rather than hard-coding it.
        embedding_dim = model.get_dimension()
        pad_vector = tf.zeros(shape=(embedding_dim,))
        for idx, description_words in desc_knowledge.descriptions.items():
            word_vectors = [
                tf.constant(model.get_word_vector(word)) for word in description_words
            ]
            padding = [pad_vector] * (max_description_length - len(description_words))
            padded_vectors = word_vectors + padding

            self.embeddings[idx] = padded_vectors
            # Stack this instance's own vectors (not a notebook-level global).
            self.concatenated_embeddings[idx] = tf.stack(padded_vectors, axis=0)

        filter_dim = 16
        kernel_dim = 2
        self.stacked = tf.stack([self.concatenated_embeddings[i] for i in range(len(self.desc_knowledge.descriptions))], axis=0) # shape: (num_variables, max_words, embedding_dim)

        self.conv_layer = tf.keras.layers.Conv1D(filter_dim, kernel_dim, activation='relu', input_shape=(max_description_length, embedding_dim))
        self.pool_layer = tf.keras.layers.MaxPooling1D(pool_size=2, strides=2)
        self.flatten_layer = tf.keras.layers.Flatten()

        self.final_layer = tf.keras.layers.Dense(16)

    @property
    def embedding_matrix(self):
        """Description-based embedding matrix, shape (num_variables, layers*filter_dim).

        Recomputed from the current convolution weights on every access, so
        the convolution takes part in the forward pass and can be trained.
        """
        return self.flatten_layer(
            self.pool_layer(
                self.conv_layer(self.stacked)))

    def call(self, values): # values shape: (dataset_size, max_sequence_length, num_used_nodes)
        # Weighted sum of description embeddings for the features set in each input row.
        embedding_representation = tf.linalg.matmul(values, self.embedding_matrix) # shape: (dataset_size, max_sequence_length, layers*filter_dim)
        return self.final_layer(embedding_representation) #(dataset_size, max_sequence_length, embedding_size)


In [None]:
# At this point `model` is still the fastText word-vector model; it is used to
# build the description embeddings.
embedding_layer = MyEmbedding(desc_knowledge, model, max_description_length)

# NOTE: from here on `model` is rebound to the Keras sequence model, so the
# fastText model is no longer reachable through this name.
model = tf.keras.models.Sequential([
    tf.keras.layers.Input(shape=(split.max_length, len(split.vocab))),
    embedding_layer,
    tf.keras.layers.LSTM(100),
    # BinaryCrossentropy (from_logits=False) expects per-label probabilities,
    # so the output layer uses a sigmoid rather than an unbounded relu.
    tf.keras.layers.Dense(len(split.vocab), activation='sigmoid'),
])
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.optimizers.Adam(),
    metrics=['CategoricalAccuracy'])

In [None]:
# Output of the embedding layer on the training inputs before the model is fitted.
embedding_layer(split.train_x)

In [None]:
# Fit the sequence model on the training split for 100 epochs.
model.fit(split.train_x, split.train_y, epochs=100)

In [None]:
# Output of the embedding layer on the training inputs after fitting, for comparison.
embedding_layer(split.train_x)

In [None]:
embedding_size = 16

# `model` is rebound to the Keras sequence model in an earlier cell, which has
# no `get_word_vector`. When that has happened, reload the fastText model so
# this cell works under Restart & Run All.
if hasattr(model, 'get_word_vector'):
    word_vector_model = model
else:
    word_vector_model = fasttext.load_model('cc.en.300.bin')
word_vector_dim = word_vector_model.get_dimension()

embeddings = {}
# One trainable, randomly initialised embedding per entry of the feature vocabulary.
for name, idx in desc_knowledge.vocab.items():
    embeddings[idx] = tf.Variable(
        initial_value=tf.random.normal(shape=(1,embedding_size)),
        trainable=True,
        name=name,
    )

# One frozen pre-trained word vector per entry of the word vocabulary.
for name, idx in desc_knowledge.words_vocab.items():
    embeddings[idx] = tf.Variable(
        initial_value=tf.constant(word_vector_model.get_word_vector(name), shape=(1,word_vector_dim)),
        trainable=False,
        name=name,
    )

# Stack the feature embeddings and add a singleton axis.
concatenated_embeddings = tf.Variable(
    tf.expand_dims(
        tf.concat(
            [embeddings[idx] for idx in range(len(desc_knowledge.vocab))],
            axis=0),
        1),
    trainable=True,
    name='concatenated_embeddings',
)

concatenated_embeddings.shape # (num_variables, 1, embedding_size)

In [None]:
num_variables = len(desc_knowledge.vocab)
num_words = len(desc_knowledge.words_vocab)
# Word indices are read from `embeddings` in the range directly after the feature indices.
word_index_range = range(num_variables, num_variables + num_words)
# Placeholder row for words that are not part of a feature's description.
zero_embedding = tf.zeros(embeddings[num_variables].shape, dtype=tf.float32)

word_embeddings = {}
# The loop variable is deliberately not called `words`, so the notebook-level
# `words` set built in the first cell is not overwritten.
for idx, description_word_set in desc_knowledge.descriptions_set.items():
    word_idx = {desc_knowledge.words_vocab[word] for word in description_word_set}
    id_neighbour_embeddings = [
        embeddings[x] if x in word_idx else zero_embedding
        for x in word_index_range
    ]
    word_embeddings[idx] = tf.concat(id_neighbour_embeddings, axis=0) # (num_words, word_embedding_dim)

all_neighbour_embeddings = [
    word_embeddings[idx] for idx in range(num_variables)
]
# Stack the per-feature matrices along a new leading axis.
concatenated_neighbour_embeddings = tf.Variable(
    tf.stack(all_neighbour_embeddings, axis=0),
    trainable=True,
    name='concatenated_description_embeddings',
)

concatenated_neighbour_embeddings.shape # (num_variables, num_words, word_embedding_dim)

In [None]:
hidden_size = 32

# Additive attention: score every (feature, word) pair from the feature
# embedding and the word embedding.
w1 = tf.keras.layers.Dense(hidden_size)
w2 = tf.keras.layers.Dense(hidden_size)
u = tf.keras.layers.Dense(1)
score = u(tf.nn.tanh(
    w1(concatenated_embeddings) + w2(concatenated_neighbour_embeddings)
)) # shape: (num_features, num_words, 1)
print(score.shape)

# Normalise over the word axis (axis=1), the same axis that is summed below,
# so each feature's weights over its words sum to 1.
# NOTE(review): words outside a feature's description have all-zero embeddings
# but still receive attention weight here; consider masking them before the softmax.
attention_weights = tf.nn.softmax(score, axis=1) # shape: (num_features, num_words, 1)
print(attention_weights.shape)

context_vector = attention_weights * concatenated_neighbour_embeddings  # shape: (num_features, num_words, word_embedding_dim)
print(context_vector.shape)

context_vector = tf.reduce_sum(context_vector, axis=1)  # shape: (num_features, word_embedding_dim)
print(context_vector.shape)
