# Model's architecture

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [37]:
class ehrModel(nn.Module):
    
    def __init__(self, vocab_size, emb_dim, kernel_size, k_max):
        super(ehrModel, self).__init__()
        
        self.vocab_size = vocab_size
        self.emb_dim = emb_dim 
        self.kernel_size = kernel_size
        self.k_max = k_max #k value for the k-max pooling to uniform dimension among temporal matrices
        
        padding = int((kernel_size - 1) / 2) #input sequence length = output sequence length (if stride=dilation=1)
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.cnn = nn.Conv1d(emb_dim, emb_dim, kernel_size=kernel_size, padding=padding)
        self.FC = nn.Linear(emb_dim * k_max, emb_dim * k_max)
    
    def kmaxPooling(self, x, k_max):
        return torch.topk(x, k_max, sorted=False)[0]
    
    def forward(self, x):
        embeds = self.embedding(x)
        embeds = embeds.view(1, self.emb_dim, -1) #reshape the temporal matrix with n_channels=emb_dim
        out = F.relu(self.cnn(embeds))
        out = F.max_pool1d(out, kernel_size)
        out = F.relu(self.cnn(out))
        out = self.kmaxPooling(out, self.k_max)
        out = out.view(-1, self.emb_dim * self.k_max) #[N, [emb_dim * temporal_dim]] N is the batch_size
        #we basically concatenate the k-max selected from each embedding dimension
        out = F.relu(self.FC(out))

        in_vect = out
        l = out.shape[1]
        
        ehrAE = ae(l)
        encoded_vect, out = ehrAE(out)

        return(in_vect, out, encoded_vect)

In [38]:
class ae(nn.Module):
    def __init__(self, dim):
        super(ae, self).__init__()
        self.dim = dim #input dimension (vector length)
        h_dim = int(dim / 2)
        h2_dim = int(h_dim / 2)
        self.encoder = nn.Sequential(
        nn.Linear(dim, h_dim),
        nn.ReLU(True),
        nn.Linear(h_dim, h2_dim),
        nn.ReLU(True))
        self.decoder = nn.Sequential(
        nn.Linear(h2_dim, h_dim),
        nn.ReLU(True),
        nn.Linear(h_dim, dim),
        nn.ReLU(True))
        
    def forward(self, x):
        x = self.encoder(x)
        encoded_vect = x
        x = self.decoder(x)
        return(encoded_vect, x)

In [40]:
#supponiamo di avere tre pazienti con EHRs di lunghezze diverse 
#vogliamo applicare CBOW considerando il contesto K (user-defined) separatamente per ogni "sentence",
#cioe' separatamente per ogni paziente
#Creiamo one list with different arrays storing the tokens of the medical terms
# one array for each patient
P1 = "A wiki is run using wiki software, otherwise known as a wiki engine.\
         A wiki engine is a type of content management system, but it differs\
         from most other such systems, including blog software, in that the\
         content is created without any defined owner or leader, and wikis have\
         little implicit structure, allowing structure to emerge according to the\
         needs of the users."

P2 = "The online encyclopedia project Wikipedia is by far the most popular wiki-based\
         website, and is one of the most widely viewed sites of any kind in the world,\
         having been ranked in the top ten since 2007."

P3 = "Wikipedia is not a single wiki but rather a collection of hundreds of wikis,\
         one for each language. There are tens of thousands of other wikis in use, both\
         public and private, including wikis functioning as knowledge management resources,\
         notetaking tools, community websites and intranets. The English-language Wikipedia\
         has the largest collection of articles; as of September 2016, it had over five\
         million articles."

##ARRAY with tokens from patients
H = [P1.split(), P2.split(), P3.split()]

##Fun1: creiamo il nostro dictionary (associamo univocamente un token a un intero)
##Input: array of sentences
def create_dict(array_sentences): 
    word_to_ix = {}
    ix_to_word = {}
    
    for _, j in enumerate(array_sentences):
        for word in j:
            if word not in word_to_ix:
                word_to_ix[word] = len(word_to_ix)
                ix_to_word[len(ix_to_word)] = word
    return word_to_ix, ix_to_word

word_to_ix, ix_to_word = create_dict(H)

data = [[word_to_ix[mt] for mt in a] for _, a in enumerate(H)]

vocab_size = len(word_to_ix)
embedding_dim = 10
kernel_size = 3
k_max = 10

model = ehrModel(vocab_size, embedding_dim, kernel_size, k_max)

optimizer = torch.optim.Adam(
    model.parameters(), lr=0.001, weight_decay=1e-5)

##in order to compute the gradient with respect to both the input and the output:
def mse_loss(input, target):
    return torch.sum((input - target)**2) / input.data.nelement()

for epoch in range(5):
    for i, d in enumerate(data):
        print("vect {0:1d}".format(i))
        in_vect, out, encoded_vect = model(torch.tensor(d))
    
        MSE_loss = mse_loss(in_vect, out)
        print("MSE loss: {0}".format(MSE_loss))
    
        optimizer.zero_grad()
        MSE_loss.backward()
        optimizer.step()

vect 0
MSE loss: 0.016088027507066727
vect 1
MSE loss: 0.008628617972135544
vect 2
MSE loss: 0.0070979357697069645
vect 0
MSE loss: 0.007977444678544998
vect 1
MSE loss: 0.006150913890451193
vect 2
MSE loss: 0.005911903455853462
vect 0
MSE loss: 0.005895484704524279
vect 1
MSE loss: 0.00632022600620985
vect 2
MSE loss: 0.004825858399271965
vect 0
MSE loss: 0.005441510118544102
vect 1
MSE loss: 0.00419011153280735
vect 2
MSE loss: 0.0044592078775167465
vect 0
MSE loss: 0.004393381532281637
vect 1
MSE loss: 0.0043781534768640995
vect 2
MSE loss: 0.0038139401003718376
