In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

1) __Embedding__ of medical terms (`nn.Embedding(vocab_size, embedding_dim)`):

$$
E \in \mathbb{R}^{T\times d}
$$

where $T$ is the number of medical terms in the patient's time-ordered sequence (i.e. the sequence length) and $d$ is the _embedding dimension_.

We obtain a tensor of size:
```
torch.Size([T, d])
```

2) __Multidimensional matrix__

$$
W \in \mathbb{R}^{1\times d \times T}
$$

Basically we have a vector of length $T$ with $d$ channels.

```
embeds = embeds.view(1, self.embedding_dim, -1) 
```

In this way we prepare the tensor for the 1-dimensional convolution, which takes as input $(N,C_{in},L_{in})$, where $N$ is the batch size, $C_{in}$ is the number of input channels and $L_{in}$ is the sequence length. 

```
torch.Size([1, d, T])
```

3) __Convolution__

One-dimensional convolution 

```
self.cnn = nn.Conv1d(embedding_dim, embedding_dim, kernel_size=kernel_size, padding=(kernel_size - 1) // 2)
```

With padding equal to $\frac{(k-1)}{2}$, where $k$ is the (odd) kernel size, we obtain an output with length equal to the input (i.e. $L_{in} = L_{out}$), provided that _stride_ and _dilation_ are both equal to 1.

The `Conv1d` class is instantiated with the input and output channels as the first two arguments.

Hence, we obtain a tensor:
```
torch.Size([1, d, T])
```

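__Remark1:__ in this way the number of channels, i.e. the embedding dimension $d$, is preserved by the convolution ($d$ channels in, $d$ channels out), which is useful if we want to keep the original dimension for as long as possible. Note that a standard `Conv1d` still mixes information across input channels; keeping the channels truly separate would require `groups=embedding_dim`. We need to remember this when we feed the data to the autoencoder.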

__Remark2:__ all these steps are performed with `batch_size=1` in order to preserve the different time stamps for as long as possible.

__Remark3:__ check the classical architecture of 1-dimensional CNN.

__Remark4:__ one way to solve the different dimensions problem is to apply a convolution that makes the output bigger (_dilation_?).

In [130]:
class ehrModel(nn.Module):
    """Embed one patient's token sequence, convolve it and reconstruct the embeddings.

    Pipeline for a single sequence of ``T`` token ids (``d = embedding_dim``):
    embedding ``[T, d]`` -> channel-first ``[1, d, T]`` -> Conv1d + ReLU ->
    max-pool over time -> Conv1d + ReLU (the same conv layer is reused) ->
    k-max pooling over time ``[k_max, d]`` -> Linear + ReLU -> flatten ->
    autoencoder ``ae``.

    The ``ae`` class defined in this notebook expects ``d * d`` input features,
    so the flattened ``k_max * d`` vector only fits when ``k_max == embedding_dim``.

    Args:
        vocab_size: number of rows of the embedding table.
        embedding_dim: embedding size ``d``; also the Conv1d channel count.
        kernel_size: Conv1d kernel width, also used as the max-pool window.
        k_max: number of time steps kept by k-max pooling.
        verbose: when True (default) print intermediate tensors and shapes.
    """

    def __init__(self, vocab_size, embedding_dim, kernel_size, k_max, verbose=True):
        super(ehrModel, self).__init__()

        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.kernel_size = kernel_size
        self.k_max = k_max
        self.verbose = verbose
        # "Same" padding: output length equals input length for odd kernels.
        padding = int((kernel_size - 1) / 2)

        # Embedding
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # Convolution (d channels in, d channels out)
        self.cnn = nn.Conv1d(embedding_dim, embedding_dim, kernel_size=kernel_size, padding=padding)
        self.FC = nn.Linear(embedding_dim, embedding_dim)

    def _trace(self, *args):
        """Print debugging information only when ``self.verbose`` is set."""
        if self.verbose:
            print(*args)

    def forward(self, x):
        """Run the full pipeline on one sequence.

        Args:
            x: 1-D LongTensor with the ``T`` token ids of a single sequence.

        Returns:
            Tuple ``(embeds_mat, out)``: the ``[T, d]`` embedding matrix and the
            flat autoencoder reconstruction of length ``T * d``.

        Raises:
            RuntimeError: from ``torch.topk`` when fewer than ``k_max`` time
                steps are left after pooling.
        """
        embeds_mat = self.embedding(x)  # [T, d]
        self._trace(embeds_mat[0])
        self._trace("Embedding dimension: {0}".format(embeds_mat.shape))

        # Conv1d expects (N, C_in, L_in). view(1, d, -1) would only re-chunk the
        # row-major buffer and interleave time steps with embedding dimensions;
        # a transpose is needed to really move the embedding axis to channels.
        embeds = embeds_mat.t().unsqueeze(0)  # [1, d, T]
        self._trace("Reshaping before convolution: {0}".format(embeds.shape))

        out = self.cnn(embeds)
        self._trace(out[0][0])
        self._trace("CNN: {0}".format(out.shape))
        out = F.relu(out)
        self._trace("ReLU: {0}".format(out.shape))
        # Use the instance's kernel size, not a notebook-level global.
        out = F.max_pool1d(out, self.kernel_size)
        self._trace("Pooling: {0}".format(out.shape))

        # Second convolution shares its weights with the first one.
        out = self.cnn(out)
        self._trace(out[0][0])
        out = F.relu(out)
        self._trace("RELU(CNN): {0}".format(out.shape))

        # Back to time-major [T', d]; again a transpose, not a view.
        out = out.squeeze(0).t()
        self._trace("Reshape: {0}".format(out.shape))
        out = self.kmaxPooling(out, self.k_max)[0]  # keep the values, drop the indices
        self._trace("KMAXpool: {0}".format(out.shape))
        out = out.reshape(-1, self.embedding_dim)
        self._trace("Reshaping: {0}".format(out.shape))
        out = F.relu(self.FC(out))
        self._trace("ReLU + FC: {0}".format(out.shape))
        out = out.reshape(-1)
        self._trace("Before AE: {0}".format(out.shape))

        seq_len = embeds.shape[2]
        self._trace(seq_len)
        # NOTE(review): the autoencoder is rebuilt with fresh random weights on
        # every call because its output size depends on the sequence length.
        # Its parameters are therefore not part of model.parameters() and are
        # never updated by an optimizer built from them.
        ehrAE = ae(self.embedding_dim, seq_len)
        encoded_vect, out = ehrAE(out)
        self._trace("Encoded: {0}".format(encoded_vect.shape))
        self._trace("Reconstruction: {0}".format(out.shape))

        return (embeds_mat, out)

    def kmaxPooling(self, x, k):
        """Return ``torch.topk``'s ``(values, indices)`` for the ``k`` largest entries along dim 0.

        ``sorted=False`` means the order of the ``k`` selected entries is unspecified.
        """
        return torch.topk(x, k, dim=0, sorted=False)

In [181]:
# Scratch check: top-3 along the last axis, then flatten each sample to 6 values.
pair = [[1, 2, 3, 4], [4, 5, 6, 2]]
a = torch.tensor([pair, pair])
print(a.shape)

a = a.topk(3)[0]  # keep the values, drop the indices
print(a, a.shape)
a = a.view(-1, 6)
a.shape

torch.Size([2, 2, 4])
tensor([[[ 4,  3,  2],
         [ 6,  5,  4]],

        [[ 4,  3,  2],
         [ 6,  5,  4]]]) torch.Size([2, 2, 3])


torch.Size([2, 6])

In [131]:
##Autoencoder
class ae(nn.Module):
    """Fully connected autoencoder with a one-unit bottleneck.

    The encoder maps ``d * d`` features to ``int(d / 2)`` and then to a single
    unit; the decoder mirrors it back to ``d * d`` and finally projects to
    ``c_size * d`` outputs. Every layer except the last is followed by an
    in-place ReLU.

    Args:
        d: base dimension; the encoder expects ``d * d`` input features.
        c_size: multiplier for the output size (``c_size * d`` values).
    """

    def __init__(self, d, c_size):
        super(ae, self).__init__()
        self.c_size = c_size

        flat_dim = d * d
        hidden_dim = int(d / 2)

        encoder_layers = [
            nn.Linear(flat_dim, hidden_dim),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_dim, 1),
            nn.ReLU(inplace=True),
        ]
        decoder_layers = [
            nn.Linear(1, hidden_dim),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_dim, flat_dim),
            nn.ReLU(inplace=True),
            nn.Linear(flat_dim, c_size * d),
        ]
        self.encoder = nn.Sequential(*encoder_layers)
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Return ``(code, reconstruction)`` for input ``x``."""
        encoded_vect = self.encoder(x)
        reconstruction = self.decoder(encoded_vect)
        return (encoded_vect, reconstruction)

In [154]:
# Suppose we have three patients whose EHRs have different lengths.
# We want to apply CBOW with a user-defined context size K separately for
# each "sentence", i.e. separately for each patient.
# Build one list holding one token array per patient; the Wikipedia snippets
# below stand in for the patients' sequences of medical terms.
P1 = "A wiki is run using wiki software, otherwise known as a wiki engine.\
         A wiki engine is a type of content management system, but it differs\
         from most other such systems, including blog software, in that the\
         content is created without any defined owner or leader, and wikis have\
         little implicit structure, allowing structure to emerge according to the\
         needs of the users."

P2 = "The online encyclopedia project Wikipedia is by far the most popular wiki-based\
         website, and is one of the most widely viewed sites of any kind in the world,\
         having been ranked in the top ten since 2007."

P3 = "Wikipedia is not a single wiki but rather a collection of hundreds of wikis,\
         one for each language. There are tens of thousands of other wikis in use, both\
         public and private, including wikis functioning as knowledge management resources,\
         notetaking tools, community websites and intranets. The English-language Wikipedia\
         has the largest collection of articles; as of September 2016, it had over five\
         million articles."

# One whitespace-tokenised list per patient.
H = [patient_text.split() for patient_text in (P1, P2, P3)]

##Fun1: build the vocabulary (map every distinct token to a unique integer)
##Input: array of sentences
def create_dict(array_sentences):
    """Build token<->index lookup tables from tokenised sentences.

    Indices are assigned in order of first appearance, starting at 0.

    Args:
        array_sentences: iterable of sentences, each an iterable of tokens.

    Returns:
        Tuple ``(word_to_ix, ix_to_word)`` of two dicts that are inverses of
        each other.
    """
    word_to_ix = {}
    for sentence in array_sentences:
        for token in sentence:
            # setdefault leaves tokens that were already seen untouched.
            word_to_ix.setdefault(token, len(word_to_ix))

    ix_to_word = {index: token for token, index in word_to_ix.items()}
    return word_to_ix, ix_to_word

# Seed the RNG before any weights are created so that the embedding/conv
# initialisation (and therefore the printed losses) is reproducible across
# kernel restarts.
SEED = 0
torch.manual_seed(SEED)

word_to_ix, ix_to_word = create_dict(H)

# Encode every patient's token list as a list of vocabulary indices.
data = [[word_to_ix[mt] for mt in sentence] for sentence in H]

vocab_size = len(word_to_ix)
embedding_dim = 10
kernel_size = 3
# The autoencoder defined in this notebook takes embedding_dim * embedding_dim
# inputs, so k_max has to equal embedding_dim for the shapes to line up.
k_max = 10

model = ehrModel(vocab_size, embedding_dim, kernel_size, k_max)

optimizer = torch.optim.Adam(
    model.parameters(), lr=0.001, weight_decay=1e-5)

##hand-written MSE so that gradients can flow to both the input and the target:
def mse_loss(input, target):
    """Return the mean squared error between ``input`` and ``target``.

    The squared differences are summed and divided by the number of elements
    of ``input``.
    """
    squared_error = (input - target).pow(2)
    return squared_error.sum() / input.numel()

# Scratch tensor left over from shape experiments; the loop below does not use it.
a = torch.tensor([[1,2,3],[1,2,3]])
a.shape

# Train one sequence at a time: the model handles a single patient per call.
N_EPOCHS = 5
for epoch in range(N_EPOCHS):
    for i, d in enumerate(data):
        print("vect {0:1d}".format(i))
        optimizer.zero_grad()

        embeds, out = model(torch.tensor(d))
        # The reconstruction comes back flat; give it the [T, d] layout of the
        # embeddings before comparing the two.
        reconstruction = out.view(embeds.shape[0], -1)
        MSE_loss = mse_loss(embeds, reconstruction)
        print("MSE loss: {0}".format(MSE_loss))

        MSE_loss.backward()
        optimizer.step()

In [24]:
# NOTE: `testData` is built in a cell further down the notebook; that cell has
# to be executed first, otherwise this one raises a NameError.
model.eval()

with torch.no_grad():
    # A single pass is enough: no weights change under no_grad, so repeating
    # the evaluation only overwrote `test_loss` with another estimate.
    test_loss = 0.0
    for d in testData:
        pre, out = model(torch.tensor(d))
        # The reconstruction is flat; reshape it to the [T, d] layout of the
        # embeddings, exactly as the training loop does, before comparing.
        loss = mse_loss(pre, out.view(pre.shape[0], -1))
        test_loss += loss.item()
    # Average loss per test sequence.
    test_loss = test_loss/len(testData)

NameError: name 'testData' is not defined

In [201]:
test_loss

0.09958014823496342

In [197]:
# Held-out test "patients": two raw text snippets that are split into tokens
# in a later cell. The backslash continuations keep the indentation of the
# following lines inside the string literal.
T1 = "A wiki engine is a type of content management system, but it differs\
         from most other such systems, including blog software, in that the\
         content is created without any defined owner or leader, and wikis have"
    
T2 = "one for each language. There are tens of thousands of other wikis in use, both\
         public and private, including wikis functioning as knowledge management resources,\
         notetaking tools, community websites and intranets. The English-language Wikipedia"

In [198]:
# Tokenise the test snippets and encode them with the training vocabulary.
# A token missing from `word_to_ix` raises KeyError.
T = [snippet.split() for snippet in (T1, T2)]
testData = [[word_to_ix[mt] for mt in tokens] for tokens in T]

In [131]:
# Scratch tensor of shape [1, 2, 3] used below to try out view() and t_().
prova = torch.tensor([[[1,2,3], [4,5,6]]])

In [132]:
prova

tensor([[[ 1,  2,  3],
         [ 4,  5,  6]]])

In [133]:
prova.shape

torch.Size([1, 2, 3])

In [134]:
# Drop the leading singleton dimension: [1, 2, 3] -> [2, 3].
prova = prova.view(-1, 3)

In [135]:
prova

tensor([[ 1,  2,  3],
        [ 4,  5,  6]])

In [137]:
# t_() transposes in place, so `prova` itself becomes [3, 2] after this cell.
prova.t_()

tensor([[ 1,  4],
        [ 2,  5],
        [ 3,  6]])

In [138]:
prova

tensor([[ 1,  4],
        [ 2,  5],
        [ 3,  6]])