In [2]:
import torch
import torch.nn as nn
import numpy as np

#### Kalchbrenner, N. 2014
Convolution operation followed by a pooling operation. 

- start with a projected sentence matrix of size $d\times s$, where $d$ is the embedding dimension and $s$ is the number of words ($\mathbf{w}_i$) in the sentence $\mathbf{s}$. The values in the embeddings $\mathbf{w}_i$ are parameters that are optimized during training.  
- Convolutional layer: convolves a matrix of weights $\mathbf{m} \in \mathbb{R}^{d\times m}$ with the matrix at the layer below.
- Resulting matrix $\mathbf{c} \in \mathbb{R}^{d\times (s+m-1)}$.
- $k-\max$ pooling (_dynamic_) is applied. Given a value $k$ and a sequence $\mathbf{p}\in \mathbb{R}^{p}$, with $p\geq k$, $k-\max$ pooling selects the subsequence $\mathbf{p}^k_{\max}$ of the $k$ highest values of $\mathbf{p}$. This guarantees that the input of the fully connected layers is independent of the length of the input sentence.
- a bias $\mathbf{b}\in\mathbb{R}^d$ and a non-linear function $g$ are applied component-wise to the pooled matrix. (_Fully connected layer_)
- Folding: between convolution and pooling in the last layer. Sum of every two rows component-wise in a feature map. Hence, dimension $d$ is halved ($d/2$).

### Vocabulary

In [None]:
def make_context_vector(context, word_to_ix):
    idxs = [word_to_ix[w] for w in context] ##word_to_ix is the dictionary with words in text and numbers
    tensor = torch.LongTensor(idxs)
    return tensor

def get_index_max(input):
    index = 0
    for i in range(1, len(input)):
        if input[i] > input[index]:
            index = i
        return index

## Model
We can start by considering the whole EHR of a patient as a sentence.
1. embedding
2. CNN: wide convolution with `kMaxPooling`.
3. AE: autoencoder whose code dimension is less than the input dimension (__undercomplete__). We want to learn an undercomplete representation of the training data in order to capture their most salient features. The learning process is described simply as minimizing a loss function
$$
L(\mathbf{x}, g(f(\mathbf{x})))
$$
where L is a loss function penalizing $g(f(\mathbf{x}))$ for being dissimilar from $\mathbf{x}$ (e.g. mean squared error).

The size of the hidden layer after a convolution is $\frac{(W-F+2P)}{S}+1$, where $W$ is the first layer size, $F$ is the filter size, $P$ is the padding and $S$ is the stride. 

In [135]:
class ehrStrat(nn.Module):
    def __init__(self, vocab_size, embedding_dim, kernel_size, word_to_ix, ix_to_word, k_max1, k_max2):
        super(ehrStrat, self).__init__()
        self.k_max1 = k_max1
        self.k_max2 = k_max2
        self.word_to_ix = word_to_ix
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.kernel_size = kernel_size
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.cnn = nn.Conv1d(embedding_dim, embedding_dim, kernel_size=kernel_size, padding=1)
        self.cnn2 = nn.Conv1d(1, 1, kernel_size=(kernel_size, 1), padding=0)

    def forward(self, x):
        embeds = self.embedding(x)
        print(embeds.shape)
        embeds = embeds.view(1, self.embedding_dim, -1)                     
        #embeds = embeds.view(1,1,-1, self.embedding_dim)
        print(embeds, embeds.shape)
        out = self.cnn(embeds)
        print(out, out.shape)
        out = out.view(out.shape[2], out.shape[3])
        print(out, out.shape)
        out = self.kmaxPooling(out, self.k_max1)[0]
        out = out.view(1, 1, self.k_max1, 1)
        out = self.cnn2(out)
        out = out.view(out.shape[2])
        print(out, out.shape)
        out = self.kmaxPooling(out, self.k_max2)[0]
        out = out.view(1, 1, self.k_max2, 1)
        d = out.shape[2] * out.shape[3]
        out = out.view(d)
        print(out.shape)
        ehrAE = ae(d)
        pre_vec = out
        out = ehrAE(out) 
        
        return pre_vec, out
        
    def kmaxPooling(self, x, k):
        return torch.topk(x, k, dim=0, sorted=False) #returns 
    
    def get_word_embedding(self, word):
        word = torch.LongTensor([word_to_ix[word]])
        return self.embedding(word).view(1, -1)
    
    def pat_embedding(self, sentence, emb_dim):
        mat_all = []
        emb = torch.empty(len(sentence), emb_dim)

        for num in sentence:
            for med_term in sentence:
                emb = torch.cat((emb, self.get_word_embedding(ix_to_word[med_term]).detach()),0)
            mat_all += [emb]

        return mat_all
#k-max pooling so the matrix has the same size for each patient
#heredit ae class

In [136]:
t = torch.tensor([1,2,3])

In [137]:
t.shape

torch.Size([3])

The encoder and decoder are still trained together, but once we have the weights, we can use the two separately — maybe we use the encoder to generate a more meaningful representation of some data we’re feeding into another neural network, or the decoder to let the network generate new data we haven’t shown it before.

In [138]:
class ae(nn.Module):
    def __init__(self, d):
        super(ae, self).__init__()
        self.encoder = nn.Sequential(
        nn.Linear(d, 20),
        nn.ReLU(True),
        nn.Linear(20, 10),
        nn.ReLU(True))
        self.decoder = nn.Sequential(
        nn.Linear(10, 20),
        nn.ReLU(True),
        nn.Linear(20, d))
        
    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return(x)

In [139]:
#supponiamo di avere tre pazienti con EHRs di lunghezze diverse 
#vogliamo applicare CBOW considerando il contesto K (user-defined) separatamente per ogni "sentence",
#cioe' separatamente per ogni paziente
#Creiamo one list with different arrays storing the tokens of the medical terms
# one array for each patient
P1 = "A wiki is run using wiki software, otherwise known as a wiki engine.\
         A wiki engine is a type of content management system, but it differs\
         from most other such systems, including blog software, in that the\
         content is created without any defined owner or leader, and wikis have\
         little implicit structure, allowing structure to emerge according to the\
         needs of the users."

P2 = "The online encyclopedia project Wikipedia is by far the most popular wiki-based\
         website, and is one of the most widely viewed sites of any kind in the world,\
         having been ranked in the top ten since 2007."

P3 = "Wikipedia is not a single wiki but rather a collection of hundreds of wikis,\
         one for each language. There are tens of thousands of other wikis in use, both\
         public and private, including wikis functioning as knowledge management resources,\
         notetaking tools, community websites and intranets. The English-language Wikipedia\
         has the largest collection of articles; as of September 2016, it had over five\
         million articles."

In [140]:
##ARRAY with tokens from patients
H = [P1.split(), P2.split(), P3.split()]

In [141]:
##Fun1: creiamo il nostro dictionary (associamo univocamente un token a un intero)
##Input: array of sentences
def create_dict(array_sentences): 
    word_to_ix = {}
    ix_to_word = {}
    
    for _, j in enumerate(array_sentences):
        for word in j:
            if word not in word_to_ix:
                word_to_ix[word] = len(word_to_ix)
                ix_to_word[len(ix_to_word)] = word
    return word_to_ix, ix_to_word

In [142]:
word_to_ix, ix_to_word = create_dict(H)

In [143]:
data = [[word_to_ix[mt] for mt in a] for _, a in enumerate(H)]

In [144]:
for sequence in data:
    print(sequence)

[0, 1, 2, 3, 4, 1, 5, 6, 7, 8, 9, 1, 10, 0, 1, 11, 2, 9, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 5, 27, 28, 29, 14, 2, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 45, 29, 48, 13, 29, 49]
[50, 51, 52, 53, 54, 2, 55, 56, 29, 21, 57, 58, 59, 37, 2, 60, 13, 29, 21, 61, 62, 63, 13, 32, 64, 27, 29, 65, 66, 67, 68, 27, 29, 69, 70, 71, 72]
[54, 2, 73, 9, 74, 1, 17, 75, 9, 76, 13, 77, 13, 78, 60, 79, 80, 81, 82, 83, 84, 13, 85, 13, 22, 38, 27, 86, 87, 88, 37, 89, 25, 38, 90, 8, 91, 15, 92, 93, 94, 95, 96, 37, 97, 50, 98, 54, 99, 29, 100, 76, 13, 101, 8, 13, 102, 103, 18, 104, 105, 106, 107, 108]


In [145]:
EMBEDDING_DIM = 10
vocab_size = len(word_to_ix)
model = ehrStrat(len(word_to_ix), 10, 3, word_to_ix, ix_to_word, 20, 10)

loss_function = nn.NLLLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

criterion = nn.BCELoss() ##Binary Cross Entropy
optimizer = torch.optim.Adam(
    model.parameters(), lr=0.001, weight_decay=1e-5)

In [146]:
##in order to compute the gradient with respect to both the input and the output:
def mse_loss(input, target):
    return torch.sum((input - target)**2) / input.data.nelement()

In [147]:
for epoc in range(50):
    total_loss = 0
    for sequence in data:
        sequence = torch.tensor(sequence)
        #==========forward=============
        pre_ae, output = model(sequence)
        #pre_ae = pre_ae.detach()
        print(output)
        print(pre_ae) ##is not being optimized!!
        #loss = loss_function(output, pre_ae)
        #print(loss)
        MSE_loss = mse_loss(pre_ae, output)
        print(MSE_loss)
        #==========backward============
        optimizer.zero_grad()
        MSE_loss.backward()
        optimizer.step
        
        total_loss += MSE_loss.data

torch.Size([63, 10])
tensor([[[-0.6643,  0.8762, -0.6143, -0.3364,  0.4487,  1.1991,  0.9402,
           0.1418, -0.7355, -1.1137,  0.3531,  0.1092, -0.8300, -0.0348,
          -0.8249, -0.0847, -0.6956,  0.5716,  1.2761, -0.0599,  2.2704,
          -0.5221,  0.6090,  0.2705, -0.0831,  1.5431,  2.4835,  1.4663,
           0.2456,  0.9603,  0.4076,  0.1794, -0.1108, -0.6419,  0.7720,
          -1.3293,  0.0716, -0.1680, -0.0377,  1.3265,  0.4640, -1.3670,
           1.6538, -0.7660,  0.3306,  0.3221, -0.1960,  0.1538,  1.7378,
          -0.1399,  0.3531,  0.1092, -0.8300, -0.0348, -0.8249, -0.0847,
          -0.6956,  0.5716,  1.2761, -0.0599,  0.5106, -1.2484, -2.3522],
         [-1.8015,  0.6001,  0.3625, -1.1247, -0.8012,  0.0658, -0.7842,
          -0.4731, -0.6165,  0.9291,  0.5663, -1.1135, -1.4214,  0.0254,
           0.9612, -0.9145, -0.3315, -0.3892,  0.2691,  0.7364,  0.3031,
          -0.9939, -1.0225, -0.5356, -1.0026, -0.3885,  0.3667,  0.2567,
           1.4668, -1.5100,  

IndexError: tuple index out of range

In [92]:
len(H[0])

63

In [93]:
model.parameters()

<generator object Module.parameters at 0x7f8132f89780>

In [94]:
m = ae()

TypeError: __init__() missing 1 required positional argument: 'd'

In [None]:
prova = m(x)

In [None]:
class persona(nn.Module):
    def __init__(self, nome, age):
        super(persona, self).__init__()
        self.nome = nome
        self.age = age
        
    def forward(self, x):
        print(x)
        print(self.nome)
        print(self.prova("benegrazie", x))
        ciao = lavoro(3,4)
        print(ciao.vai())
        print()
        
    def prova(self,cosa,x):
        print(x)
        #print("benegrazie")
        return(cosa)
    
class lavoro():
    def __init__(self, anni, tipo):
        self.anni = anni
        self.tipo = tipo
        
    def vai(self):
        return("macche")

In [124]:
ciao = lavoro(3,4)

In [128]:
ciao.vai()

'macche'

In [130]:
matteo = persona(nome="matteo", age=30)

In [131]:
matteo('boh')

boh
matteo
boh
benegrazie
macche



In [78]:
matteo.nome


'matteo'

In [79]:
matteo.age

30

In [80]:
import torch

In [88]:
x = torch.Tensor([[1,2,3,4,5,6], [2,3,4,5,6,7]])

TypeError: new() received an invalid combination of arguments - got (list, list), but expected one of:
 * (torch.device device)
 * (tuple of ints size, torch.device device)
      didn't match because some of the arguments have invalid types: ([31;1mlist[0m, [31;1mlist[0m)
 * (torch.Storage storage)
 * (Tensor other)
 * (object data, torch.device device)
      didn't match because some of the arguments have invalid types: ([31;1mlist[0m, [31;1mlist[0m)


In [85]:
x

tensor([[ 1.,  2.,  3.,  4.,  5.,  6.],
        [ 2.,  3.,  4.,  5.,  6.,  7.]])

In [89]:
x.topk(2, sorted=False)

(tensor([[ 5.,  6.],
         [ 6.,  7.]]), tensor([[ 4,  5],
         [ 4,  5]]))

In [93]:
prova = torch.rand(2,3,2)
print(prova)

tensor([[[ 0.6754,  0.5990],
         [ 0.0938,  0.9188],
         [ 0.4129,  0.1722]],

        [[ 0.6449,  0.0940],
         [ 0.4869,  0.6581],
         [ 0.3957,  0.1160]]])


In [95]:
prova.topk(1)[0]

tensor([[[ 0.6754],
         [ 0.9188],
         [ 0.4129]],

        [[ 0.6449],
         [ 0.6581],
         [ 0.3957]]])

# PROVE

## Embedding

In [123]:
import torch
import torch.nn as nn
import numpy as np
import torchtext

In [124]:
#supponiamo di avere tre pazienti con EHRs di lunghezze diverse 
#vogliamo applicare CBOW considerando il contesto K (user-defined) separatamente per ogni "sentence",
#cioe' separatamente per ogni paziente
#Creiamo one array con different arrays storing the tokens of the medical terms
# one array for each patient
P1 = "A wiki is run using wiki software, otherwise known as a wiki engine.\
         A wiki engine is a type of content management system, but it differs\
         from most other such systems, including blog software, in that the\
         content is created without any defined owner or leader, and wikis have\
         little implicit structure, allowing structure to emerge according to the\
         needs of the users."

P2 = "The online encyclopedia project Wikipedia is by far the most popular wiki-based\
         website, and is one of the most widely viewed sites of any kind in the world,\
         having been ranked in the top ten since 2007."

P3 = "Wikipedia is not a single wiki but rather a collection of hundreds of wikis,\
         one for each language. There are tens of thousands of other wikis in use, both\
         public and private, including wikis functioning as knowledge management resources,\
         notetaking tools, community websites and intranets. The English-language Wikipedia\
         has the largest collection of articles; as of September 2016, it had over five\
         million articles."

In [125]:
##ARRAY with tokens from patients
H = [P1.split(), P2.split(), P3.split()]

In [126]:
##Fun1: creiamo il nostro dictionary (associamo univocamente un token a un intero)
##Input: array of sentences
def create_dict(array_sentences): 
    word_to_ix = {}

    for _, j in enumerate(array_sentences):
        for word in j:
            if word not in word_to_ix:
                word_to_ix[word] = len(word_to_ix)
    return word_to_ix

In [127]:
word_to_ix = create_dict(H)

In [130]:
##Fun2: now we want to create the context on which to train the CBOW, 
##considering separately the sentences (i.e. the patients)        
def create_context(dim_context, array_sentences):
    data = []

    for sentence in array_sentences:
        for i in range(dim_context, len(sentence) - dim_context):
            context = [sentence[j] for j in range(i-dim_context, i-1)] + [sentence[j] for j in range(i + 1, i + dim_context)]
            target = sentence[i]
            data.append((context, target))
    return data

In [131]:
data = create_context(4, H)

In [132]:
##Fun3: this function will translate context into a tensor of integers based on the dictionary
##We will use this function during the CBOW() training
def make_context_vector(context, word_to_ix):
    idxs = [word_to_ix[w] for w in context] ##word_to_ix is the dictionary with words in text and numbers
    tensor = torch.LongTensor(idxs)
    return tensor

In [133]:
##CBOW class for word embedding
class CBOW(nn.Module):
    
    def __init__(self, vocab_size, embedding_dim):
        super(CBOW, self).__init__()
        
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(embedding_dim, 128)
        self.activation1 = nn.ReLU()
        self.linear2 = nn.Linear(128, vocab_size)
        self.activation2 = nn.LogSoftmax(dim=-1)
        
    def forward(self, inputs):
        embeds = sum(self.embeddings(inputs)).view((1, -1))
        out = self.linear1(embeds)
        out = self.activation1(out)
        out = self.linear2(out)
        out = self.activation2(out)
        return out
    
    def get_word_embedding(self, word):
        word = torch.LongTensor([word_to_ix[word]])
        return self.embeddings(word).view(1, -1)

In [134]:
##TRAINING
EMBEDDING_DIM = 10
vocab_size = len(word_to_ix)
model = CBOW(vocab_size, EMBEDDING_DIM)

loss_function = nn.NLLLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

In [135]:
for epoc in range(50):
    total_loss = 0
    for context, target in data:
        context_vector = make_context_vector(context, word_to_ix)
        model.zero_grad()
        log_probs = model(context_vector)
        loss = loss_function(log_probs, torch.LongTensor([word_to_ix[target]]))
        loss.backward()
        optimizer.step()
        
        total_loss += loss.data

In [136]:
##Fun4: return the array of the embedding tensors, one for each patien
##dimensions of the tensors vary w.r.t patients, although the embedding dimension doesn't vary
##tensor([T_i,d]), where T_i depends on the length of the EHR for patient i and d is the user-defined embedding dimension
def pat_embedding(array_sentences, emb_dim, model):
    mat_all = []

    for sentence in array_sentences:
        emb = torch.empty(len(sentence), emb_dim)
        for med_term in sentence:
            emb = torch.cat((emb, model.get_word_embedding(med_term).detach()),0)
        mat_all += [emb]

    return mat_all

In [137]:
mat_all = pat_embedding(H, EMBEDDING_DIM, model)

In [142]:
##TEST
context = "tens of of other wikis".split()
context_vector = make_context_vector(context, word_to_ix)
pred = model(context_vector).data.numpy()

index = 0
for i in range(pred.shape[1]):
    if pred[0, i] > pred[0, index]:
        index = i
print("find word {0:d}".format(index))

find word 13


In [143]:
word_to_ix.items()

dict_items([('wikis,', 78), ('articles;', 101), ('differs', 19), ('world,', 65), ('language.', 81), ('management', 15), ('not', 73), ('such', 23), ('and', 37), ('largest', 100), ('little', 40), ('is', 2), ('ranked', 68), ('otherwise', 6), ('most', 21), ('A', 0), ('a', 9), ('by', 55), ('public', 88), ('any', 32), ('including', 25), ('2016,', 103), ('the', 29), ('websites', 96), ('structure,', 42), ('structure', 44), ('collection', 76), ('notetaking', 93), ('have', 39), ('had', 104), ('website,', 59), ('million', 107), ('implicit', 41), ('to', 45), ('been', 67), ('wiki', 1), ('system,', 16), ('it', 18), ('systems,', 24), ('thousands', 85), ('using', 4), ('leader,', 36), ('private,', 89), ('single', 74), ('but', 17), ('from', 20), ('for', 79), ('online', 51), ('as', 8), ('created', 30), ('intranets.', 97), ('English-language', 98), ('use,', 86), ('engine', 11), ('hundreds', 77), ('since', 71), ('of', 13), ('over', 105), ('or', 35), ('allowing', 43), ('type', 12), ('owner', 34), ('kind', 6