In [195]:
import os
import sys
import pandas as pd
import numpy as np
import tensorflow as tf
import fasttext.util

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)


from src.features.preprocessing import ICDDescriptionPreprocessor
from src.features.knowledge import DescriptionKnowledge
from src.features.sequences import SequenceHandler

import string


In [196]:
# Translation table that turns every punctuation character into a blank, so
# punctuation acts as a word separator instead of sticking to the words.
punctuation_to_space = str.maketrans({symbol: ' ' for symbol in string.punctuation})

text_descriptions = ICDDescriptionPreprocessor('../data/D_ICD_DIAGNOSES.csv').load_descriptions()

# Distinct lower-cased, whitespace-trimmed, non-empty words over all descriptions.
words = {
    token.lower().strip()
    for description in text_descriptions['description']
    for token in description.translate(punctuation_to_space).split(' ')
    if token.lower().strip()
}

len(words)

5809

In [197]:
# Toy data: each sequence is a list of visits, each visit a list of labels.
toy_sequences = [
    [['a', 'b'], ['a', 'c']],          # sequence1: visit1, visit2
    [['a', 'b', 'c'], ['a'], ['d']],   # sequence2
    [['a', 'b'], ['a', 'd']],          # sequence3
]
sequence_df = pd.DataFrame({'sequence': toy_sequences})

toy_labels = ['a', 'b', 'c', 'd']

# Raw descriptions with deliberately messy casing and punctuation.
description_df = pd.DataFrame({
    'label': toy_labels,
    'description': [
        'THIS! is apple.....',
        'this is: Banana',
        'this is COOL* curly fries',
        'this iS another,description',
    ],
})

# The word lists the raw descriptions above are expected to turn into.
result_description_df = pd.DataFrame({
    'label': toy_labels,
    'description': [
        ['this', 'is', 'apple'],
        ['this', 'is', 'banana'],
        ['this', 'is', 'cool', 'curly', 'fries'],
        ['this', 'is', 'another', 'description'],
    ],
})

handler = SequenceHandler(flatten=True)
split = handler.transform_train_test_split(sequence_df, 'sequence')

# Glue train and test back together along the dataset axis.
combined_x, combined_y = (
    tf.concat([train_part, test_part], axis=0)
    for train_part, test_part in (
        (split.train_x, split.test_x),
        (split.train_y, split.test_y),
    )
)

# combined_x: (dataset_size, max_length, feature_size)
# combined_y: (dataset_size, 1, feature_size)
for combined in (combined_x, combined_y):
    print(combined.shape)

Transforming splitted sequences to tensors: 100%|██████████| 3/3 [00:00<00:00, 2772.79it/s]
Transforming splitted sequences to tensors: 100%|██████████| 1/1 [00:00<?, ?it/s](4, 2, 4)
(4, 1, 4)



In [198]:
# Turn the toy descriptions into per-label word lists. The resulting dict is
# keyed by integers (presumably each label's index in split.vocab -- verify
# against DescriptionKnowledge) and maps to the cleaned, lower-cased words.
desc_knowledge = DescriptionKnowledge()
desc_knowledge.build_knowledge_from_df(description_df, split.vocab)
# Display the dict for inspection.
desc_knowledge.descriptions

{1: ['this', 'is', 'apple'],
 3: ['this', 'is', 'banana'],
 2: ['this', 'is', 'cool', 'curly', 'fries'],
 0: ['this', 'is', 'another', 'description']}

In [199]:
# Download the English fastText vectors (if_exists='ignore' is passed so an
# existing download is not fetched again), then load them from cc.en.300.bin.
fasttext.util.download_model('en', if_exists='ignore')
model = fasttext.load_model('cc.en.300.bin')
# Reduce the word vectors to 100 dimensions; the embedding cells below
# hard-code this size for their padding vectors.
fasttext.util.reduce_model(model, 100)



<fasttext.FastText._FastText at 0x2111a24a9d0>

In [200]:
# Number of ICD description words that are not in the fastText vocabulary.
len(words - set(model.words))

498

In [201]:
# Word count of the longest description; every description is padded to this length.
max_description_length = max(map(len, desc_knowledge.descriptions.values()))
max_description_length

5

In [202]:
embeddings = {}               # idx -> list of per-word vectors, zero-padded
concatenated_embeddings = {}  # idx -> tensor of shape (max_words, embedding_dim)

# All-zero vector used to fill descriptions up to max_description_length.
pad_vector = tf.constant(0.0, shape=(100,))
for idx, description_words in desc_knowledge.descriptions.items():
    word_vectors = [tf.constant(model.get_word_vector(word)) for word in description_words]
    # A non-positive count yields no padding at all.
    missing = max_description_length - len(description_words)
    embeddings[idx] = word_vectors + [pad_vector] * missing
    concatenated_embeddings[idx] = tf.stack(embeddings[idx], axis=0)

concatenated_embeddings[1].shape # (max_words, embedding_dim)

TensorShape([5, 100])

In [203]:
filter_dim, kernel_dim = 16, 2

# One (max_words, embedding_dim) matrix per variable, ordered by variable index.
stacked = tf.stack(
    [concatenated_embeddings[i] for i in range(len(desc_knowledge.descriptions))],
    axis=0,
) # shape: (num_variables, max_words, embedding_dim)

layer = tf.keras.layers.Conv1D(filter_dim, kernel_dim, activation='relu', input_shape=(max_description_length, 100))
layer2 = tf.keras.layers.MaxPooling1D(pool_size=2, strides=2)

# Run the convolution once and reuse its output for both the print and the pooling.
conv_out = layer(stacked) # shape: (num_variables, max_words - kernel_dim + 1, filter_dim)
print(conv_out.shape)

# shape: (num_variables, layers, filter_dim),
# layers = floor((max_words - kernel_dim + 1 - pool_size) / strides) + 1
res = layer2(conv_out)
print(res.shape)

# Collapse the pooled feature maps into one row per variable, then project
# the one-hot visit encodings onto those rows.
embedding_matrix = tf.keras.layers.Flatten()(res) # shape: (num_variables, layers*filter_dim)
tf.linalg.matmul(combined_x, embedding_matrix)


(4, 4, 16)
(4, 2, 16)


<tf.Tensor: shape=(4, 2, 32), dtype=float32, numpy=
array([[[0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        ],
        [0.9737123 , 0.41674018, 0.8597754 , 0.03577794, 0.523233  ,
         0.42020476, 0.48317412, 0.        , 0.        , 0.9359788 ,
         0.3944397 , 0.03953928, 0.        , 0.        , 0.        ,
         0.03241515, 0.05803989, 0.12829792, 0.24835162, 0.08965056,
         0.05661978, 0.24180092, 0.        , 0.        , 0.18219194,
         0.06665231, 0.35654786, 0.24350911, 0.        , 0.10087187,
         0.03620609, 0.        ]],

       [[0.9737123 , 0.41674018, 0.8597754 , 0.035

In [206]:
class MyEmbedding(tf.keras.Model):
    """Embed label vectors through a small CNN over the labels' text descriptions.

    Each label's description is turned into a (max_description_length,
    embedding_dim) matrix of word vectors. Conv1D -> MaxPooling1D -> Flatten
    reduces that matrix to one row per label. Inputs are multiplied with the
    resulting label matrix and passed through a final Dense layer.

    Args:
        desc_knowledge: object whose ``descriptions`` attribute maps label
            index -> list of words. The indices are assumed to be exactly
            ``0 .. len(descriptions) - 1`` so they line up with the feature
            axis of the inputs -- verify against DescriptionKnowledge.
        model: word-vector model offering ``get_word_vector(word)`` and
            ``get_dimension()`` (e.g. a loaded fastText model).
        max_description_length: number of words every description is padded
            (or truncated) to.
        filter_dim: number of Conv1D filters.
        kernel_dim: Conv1D kernel size, in words.
        output_dim: size of the final Dense projection.
    """

    def __init__(self, desc_knowledge, model, max_description_length,
                 filter_dim=16, kernel_dim=2, output_dim=16):
        super().__init__()
        self.embeddings = {}
        self.concatenated_embeddings = {}
        self.desc_knowledge = desc_knowledge

        # Ask the word-vector model for its size instead of hard-coding 100,
        # so the class keeps working if the model is reduced to another size.
        embedding_dim = model.get_dimension()
        pad_vector = tf.constant(0.0, shape=(embedding_dim,))
        for idx, description_words in desc_knowledge.descriptions.items():
            # Truncate over-long descriptions; otherwise tf.stack below would
            # fail on matrices of different heights.
            used_words = description_words[:max_description_length]
            num_padding = max_description_length - len(used_words)
            word_vectors = [
                tf.constant(model.get_word_vector(word)) for word in used_words
            ] + [pad_vector] * num_padding

            self.embeddings[idx] = word_vectors
            # Stack this instance's own vectors. The previous code read a
            # notebook-global `embeddings` dict here by mistake.
            self.concatenated_embeddings[idx] = tf.stack(word_vectors, axis=0)

        # shape: (num_variables, max_words, embedding_dim)
        self.stacked = tf.stack(
            [self.concatenated_embeddings[i] for i in range(len(self.desc_knowledge.descriptions))],
            axis=0)

        self.conv_layer = tf.keras.layers.Conv1D(
            filter_dim, kernel_dim, activation='relu',
            input_shape=(max_description_length, embedding_dim))
        self.pool_layer = tf.keras.layers.MaxPooling1D(pool_size=2, strides=2)
        self.flatten_layer = tf.keras.layers.Flatten()
        self.final_layer = tf.keras.layers.Dense(output_dim)

    @property
    def embedding_matrix(self):
        """Label embedding matrix, shape (num_variables, layers*filter_dim).

        Recomputed from the current convolution weights on every access
        rather than frozen at construction time, so the Conv1D layer is part
        of the forward pass and receives gradients during training.
        """
        return self.flatten_layer(
            self.pool_layer(
                self.conv_layer(self.stacked)))

    def call(self, values): # values shape: (dataset_size, max_sequence_length, num_used_nodes)
        """Project ``values`` onto the label embeddings and apply the Dense layer.

        Returns a tensor of shape (dataset_size, max_sequence_length, output_dim).
        """
        # shape: (dataset_size, max_sequence_length, layers*filter_dim)
        embedding_representation = tf.linalg.matmul(values, self.embedding_matrix)
        return self.final_layer(embedding_representation)


In [207]:
# Uses the fastText `model` loaded earlier to look up the word vectors.
embedding_layer = MyEmbedding(desc_knowledge, model, max_description_length)

# NOTE: `model` is rebound here from the fastText model to the Keras model.
# The name is kept because the training cell calls `model.fit`, but this cell
# cannot be re-run without first reloading the fastText model.
model = tf.keras.models.Sequential([
    tf.keras.layers.Input(shape=(split.max_length, len(split.vocab))),
    embedding_layer,
    tf.keras.layers.LSTM(100),
    # BinaryCrossentropy (from_logits=False) expects per-label probabilities
    # in [0, 1]; sigmoid provides them, whereas relu is unbounded above.
    tf.keras.layers.Dense(len(split.vocab), activation='sigmoid'),
])
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.optimizers.Adam(),
    metrics=['CategoricalAccuracy'])

In [208]:
# Show the embedding layer's output on the training inputs before fitting.
embedding_layer(split.train_x)

<tf.Tensor: shape=(3, 2, 16), dtype=float32, numpy=
array([[[ 0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.09915152,  0.12963243,  0.04308442, -0.08373883,
          0.26392236, -0.15848292,  0.04170059,  1.1119654 ,
          0.09878664, -0.42984957, -0.44174978,  0.34599936,
         -0.1638078 , -0.4805175 ,  0.80220634, -0.14489104]],

       [[ 0.09915152,  0.12963243,  0.04308442, -0.08373883,
          0.26392236, -0.15848292,  0.04170059,  1.1119654 ,
          0.09878664, -0.42984957, -0.44174978,  0.34599936,
         -0.1638078 , -0.4805175 ,  0.80220634, -0.14489104],
        [ 0.01124601,  0.02596796, -0.04627941,  0.007903  ,
          0.09927685,  0.02314363,  0.0145088 ,  0.41476563,
         -0.05339865, -0.1507785 , -0.21187967,  0.05877149,
         -0.08967217, -0.153

In [209]:
# The network emits one vector per sequence, (batch, num_labels), while the
# targets carry an extra singleton axis, (batch, 1, num_labels). Drop that
# axis so the loss compares each prediction with its own target instead of
# broadcasting the two shapes against each other.
model.fit(split.train_x, tf.squeeze(split.train_y, axis=1), epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x211377ed460>

In [210]:
# Show the embedding layer's output on the same training inputs after fitting,
# for comparison with the pre-training values.
embedding_layer(split.train_x)

<tf.Tensor: shape=(3, 2, 16), dtype=float32, numpy=
array([[[ 0.02547146,  0.06282518,  0.04444942,  0.00735468,
          0.0177834 , -0.04009539,  0.02137122,  0.02424395,
          0.01750123, -0.04729794,  0.00691855,  0.06655074,
         -0.0437462 ,  0.00925681,  0.02784125, -0.02378744],
        [ 0.29274797,  0.5790432 ,  0.33464086, -0.02503546,
          0.33848804, -0.48976815,  0.18594624,  1.266968  ,
          0.22155604, -0.7139135 , -0.38516307,  0.79723334,
         -0.46084324, -0.40556607,  0.99010473, -0.2537799 ]],

       [[ 0.29274797,  0.5790432 ,  0.33464086, -0.02503546,
          0.33848804, -0.48976815,  0.18594624,  1.266968  ,
          0.22155604, -0.7139135 , -0.38516307,  0.79723334,
         -0.46084324, -0.40556607,  0.99010473, -0.2537799 ],
        [ 0.07831022,  0.18636012,  0.06073102,  0.02788023,
          0.13332018, -0.08745211,  0.06674856,  0.47177342,
         -0.00957186, -0.25893375, -0.19288921,  0.22399798,
         -0.19748554, -0.127