## Dataset and DataLoader
We create two dictionaries, the first stores `{mrn:[patient_sequence]}`, the second is of the form `{'train':[mrns], 'test':[mrns]}`.

In [1]:
import pandas as pd
import os
import re
import csv
from collections import OrderedDict, defaultdict
import random
import numpy as np
import torch
from torch.utils import data
from torch.utils.data.dataset import Dataset
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
DATA_PATH = os.path.expanduser("~/data1/multMyeloma/data/")
EHR_FILENAME = "MMehr_data.csv"

In [3]:
##Read the dictionaries of medical terms
with open(os.path.join(DATA_PATH, 'rxnorm_to_ix.csv'), 'r') as infile:
    file = csv.reader(infile, delimiter=',')
    rxnorm_to_ix = {}
    for el in file:
        rxnorm_to_ix[el[0]] = int(el[1])
        
with open(os.path.join(DATA_PATH, 'ix_to_rxnorm.csv'), 'r') as infile:
    file = csv.reader(infile, delimiter=',')
    ix_to_rxnorm = {}
    for el in file:
        ix_to_rxnorm[int(el[0])] = el[1]

In [4]:
##Read the data
with open(os.path.join(DATA_PATH, 'mrn_unique.csv'), 'r', newline='') as f1:
    rows = csv.reader(f1, delimiter=",", quotechar='"')
    mrn_unique = [int(row[0]) for row in rows]

with open(os.path.join(DATA_PATH, 'patient_sequences.csv'), 'r', newline='') as f2:
    rows = csv.reader(f2, delimiter=',', quotechar='"')
    patient_sequences = []
    for row in rows:
        patient_sequences += [[int(row[i]) for i in range(len(row))]]

In [5]:
ehrs = {}

for idp in range(len(mrn_unique)):
    ehrs[mrn_unique[idp]] = patient_sequences[idp]

In [6]:
random.seed(42)
traintest = {'train':[], 'test':[]}
test_idx = random.sample(range(len(mrn_unique)), int(len(mrn_unique) * 0.2))

for i in range(len(mrn_unique)):
    if i in test_idx:
        traintest['test'].append(mrn_unique[i])
    else:
        traintest['train'].append(mrn_unique[i])

In [7]:
class MMdata(Dataset):
    def __init__(self, list_mrn, dic_seq):
        self.list_mrn = list_mrn
        self.dic_seq = dic_seq
        
    def __getitem__(self, index):
        
        mrn_idx = self.list_mrn[index]
        sequence = self.dic_seq[mrn_idx]
        #sequence = torch.tensor(sequence)
        
        #print(type(sequence), type(mrn_idx))
        return sequence, mrn_idx

    def __len__(self):
        return len(self.list_mrn)
    
def my_collate(batch):
    data = [item[0] for item in batch]
    data = torch.tensor(data)
    target = [item[1] for item in batch]
    target = torch.LongTensor(target)
    return [data, target]

## Training and testing

In [8]:
##import the model class ehrModel()
%run ehrStratModel.ipynb

In [9]:
# By default, each worker will have its PyTorch seed set to base_seed + worker_id, where base_seed is a long generated by main process using its RNG. 
params = {'batch_size': 5,
          'shuffle': True}
max_epochs = 10

torch.manual_seed(12)
torch.cuda.manual_seed(12)
np.random.seed(12)
random.seed(12)

training_set, test_set = MMdata(traintest['train'], ehrs), MMdata(traintest['test'], ehrs)
training_generator = data.DataLoader(training_set, **params, collate_fn=my_collate)
test_generator = data.DataLoader(test_set, **params, collate_fn=my_collate)

In [10]:
#p = next(iter(test_generator))

In [11]:
len(rxnorm_to_ix)

14081

We have an input of size $(N, 1, T)$, where $N$ is the _batch size_, and $T=65979$ is the length of the ehr sequence for all patients.

In [14]:
vocab_size = len(rxnorm_to_ix)
embedding_dim = 100
kernel_size = 5

device = torch.device('cuda')
model = ehrModel(vocab_size, embedding_dim, kernel_size)
#model = torch.nn.DataParallel(model, device_ids=[0, 2, 3])
print(model.cuda())
model.to(device)

optimizer = torch.optim.Adam(
    model.parameters(), lr=0.001, weight_decay=1e-5)

##in order to compute the gradient with respect to both the input and the output:
def mse_loss(input, target):
    return torch.sum((input - target)**2) / input.data.nelement()

for epoch in range(max_epochs):
    for batch, mrns in training_generator:
        batch = batch.to(device, torch.long)
        #print("vect {0:1d}".format(i))
        
        optimizer.zero_grad()
        input_mat, out, encoded_vect = model(batch)
        print(out[1:3])
        print(out.shape)
        print(batch[1:3])
        mlml = nn.MultiLabelMarginLoss()
        result = mlml(out, batch)
        #CEloss = nn.CrossEntropyLoss()
        #result = CEloss(out, batch)
        #MSE_loss = mse_loss(batch, out)
#         print("MSE loss: {0}".format(MSE_loss))
        print("CE loss: {0}".format(result))
    
#         MSE_loss.backward()
        result.backward()
        optimizer.step()

ehrModel(
  (embedding): Embedding(14081, 100, padding_idx=0)
  (cnn): Conv2d(1, 1, kernel_size=(5, 100), stride=(1, 1), padding=(2, 0))
  (cnn2): Conv1d(1, 1, kernel_size=(5, 1), stride=(1,), padding=(2, 0))
)
Shape original mat: torch.Size([5, 65979])
Size embedding matrix: torch.Size([5, 65979, 100])
Embedding reshaping dimension: torch.Size([5, 1, 65979, 100])
Output cnn: torch.Size([5, 1, 65979, 1])
Dimension after first maxpooling: torch.Size([5, 1, 13196, 1])
After Convolution dimension: torch.Size([5, 1, 2640, 1])
Reshaping dimension: torch.Size([5, 2640])
Output from CNN: torch.Size([5, 2640])
Reshaping dimension: torch.Size([5, 2640])
AE input vector: 2640
65979
AE output dimension: torch.Size([5, 65979])
Dimension of the encoded vector: torch.Size([5, 660])
tensor(1.00000e-02 *
       [[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  1.4080,  0.0281],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  1.5011,  0.0000]], device='cuda:0')
torch.Size([5, 65979])
tensor([[     0,   

KeyboardInterrupt: 

In [None]:
model.cuda()

In [None]:
net.cuda()