In [1]:
import csv

with open('../../neurodev-disorder/cohort-new_vocab.csv') as f:
    vocab_ndd = {}
    rd = csv.reader(f)
    next(rd)
    for r in rd:
        vocab_ndd[r[1]] = r[0]
with open('../../ehr100k/cohort-new_vocab.csv') as f:
    vocab_ehr100 = {}
    rd = csv.reader(f)
    next(rd)
    for r in rd:
        vocab_ehr100[r[0]] = r[1]

print("Length neurodevelopmental disorder vocab: %d" % len(vocab_ndd))
print("Length ehr 100k vocab: %d" % len(vocab_ehr100))

k_ndd = set(vocab_ndd.values())
k_ehr100 = set(vocab_ehr100.keys())

int_term = list(k_ehr100.intersection(k_ndd))

print("Intersection dimension: %d" % len(int_term))

print("Number of dropped terms: %d" % (len(vocab_ndd) - len(int_term)))

Length neurodevelopmental disorder vocab: 12256
Length ehr 100k vocab: 30016
Intersection dimension: 11688
Number of dropped terms: 568


In [2]:
with open('../../neurodev-disorder/cohort-new_ehr.csv') as f:
    rd = csv.reader(f)
    ndd_ehr = []
    mrns = []
    for r in rd:
        ndd_ehr.append(list(map(int, r[1::])))
        mrns.append(r[0])
        
new_ehr = []
for ehr in ndd_ehr:
    tmp = []
    for e in ehr:
        if vocab_ndd[str(e)] in vocab_ehr100.keys():
            tmp.append(vocab_ehr100[vocab_ndd[str(e)]])
    new_ehr.append(tmp)

In [3]:
with open('../../ehr100k/eval_neurodev/cohort-eval_ehr.csv', 'w') as f:
    wr = csv.writer(f)
    for mrn, ehr in zip(mrns, new_ehr):
        wr.writerow([mrn] + [e for e in ehr])

eval_vocab = {}
for t in vocab_ndd:
    if vocab_ndd[t] in vocab_ehr100.keys():
        eval_vocab[vocab_ndd[t]] = vocab_ehr100[vocab_ndd[t]]

with open('../../ehr100k/eval_neurodev/cohort-eval_vocab.csv', 'w') as f:
    wr = csv.writer(f)
    for s, t in eval_vocab.items():
        wr.writerow([s, t])

In [4]:
"""
Evaluate representation model
"""
import torch
import numpy as np
from model.data_loader import EHRdata, ehr_collate
import model.net as net
from torch.utils.data import DataLoader
import torch.nn as nn
import torch
import utils as ut
import csv
import os

def evaluate(model, loss_fn, data_iter, metrics, best_eval=False):
    model.eval()
    summ = []
    encoded_list = []
    mrn_list = []

    with torch.no_grad():
        for idx, (list_batch, list_mrn) in enumerate(data_iter):
            for batch, mrn in zip(list_batch, list_mrn):
                batch = batch.cuda()
                out, encoded = model(batch)
                loss = loss_fn(out, batch)
                out.cpu()
                encoded.cpu()
                summary_batch = {metric: metrics[metric](out, batch).item()
                                 for metric in metrics}
                summary_batch['loss'] = loss.item()
                summ.append(summary_batch)
                if best_eval:
                    encoded_list.append(
                        np.mean(encoded.tolist(), axis=0).tolist())
                    mrn_list.append(mrn)
        metrics_mean = {metric: np.mean(
            [x[metric] for x in summ]) for metric in summ[0]}
        metrics_string = " -- ".join("{}: {:05.3f}".format(k.capitalize(), v)
                                     for k, v in sorted(metrics_mean.items()))
        print(metrics_string)

        return mrn_list, encoded_list, metrics_mean

In [9]:
"""
Test ehr100k model on neurodevelopmental disorders dataset
"""

def learn_patient_representations(indir,
                                  outdir,
                                  eval_baseline=False,
                                  sampling=None):

    # get the vocabulary size
    fvocab = os.path.join(indir, 'cohort-eval_vocab.csv')
    with open(fvocab) as f:
        rd = csv.reader(f)
        next(rd)
        vocab_size = sum(1 for r in rd) + 1
    print('Vocabulary size: {0}'.format(vocab_size))

    # set random seed for experiment reproducibility
    torch.manual_seed(123)
    torch.cuda.manual_seed(123)

    # load data
    data = EHRdata(indir, 'cohort-eval_ehr.csv', sampling)
    data_generator = DataLoader(
        data,
        ut.model_param['batch_size'],
        shuffle=True,
        collate_fn=ehr_collate)

    print('Cohort Size: {0} -- Max Sequence Length: {1}\n'.format(
        len(data), ut.len_padded))

    # define model and optimizer
    print('Learning rate: {0}'.format(ut.model_param['learning_rate']))
    print('Batch size: {0}'.format(ut.model_param['batch_size']))
    print('Kernel size: {0}\n'.format(ut.model_param['kernel_size']))

    model = torch.load('best_model.pt')

    # model.cuda()
    loss_fn = net.criterion
    mrn, encoded, metrics_avg = evaluate(model, loss_fn, data_generator,
                                         net.metrics)
    print(encoded)
    # save results

    # encoded vectors (representations)
    outfile = os.path.join(outdir, 'NDDencoded_vect.csv')
    with open(outfile, 'w') as f:
        wr = csv.writer(f)
        wr.writerows(encoded)

    # MRNs to keep track of the order
    outfile = os.path.join(outdir, 'NDDmrns.csv')
    with open(outfile, 'w') as f:
        wr = csv.writer(f)
        for m in mrn:
            wr.writerow([m])

    # metrics (loss and accuracy)
    outfile = os.path.join(outdir, 'NDDmetrics.txt')
    with open(outfile, 'w') as f:
        f.write('Mean loss: %.3f\n' % metrics_avg['loss'])
        f.write('Accuracy: %.3f\n' % metrics_avg['accuracy'])

    return

In [10]:
indir = os.path.expanduser('~/data1/stratification_ILRM/data/ehr100k/eval_neurodev')
outdir = os.path.expanduser('~/data1/stratification_ILRM/data/experiments/ehr100k-2019-03-14-05-09-15-nobn-noact-norelu-10-l64')

learn_patient_representations(indir, outdir)

Vocabulary size: 11688
Cohort Size: 13120 -- Max Sequence Length: 64

Learning rate: 0.0001
Batch size: 16
Kernel size: 5

Accuracy: 0.504 -- Loss: 4.275
[]
