In [1]:
import pandas as pd 
import os
import spacy
import string
import re
import numpy as np
from spacy.symbols import ORTH
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence 
from sklearn.model_selection import train_test_split

pd.options.mode.chained_assignment = None

In [2]:
mimic_data = pd.read_csv("data/text_binary.csv")
mimic_data.head()

Unnamed: 0,study_id,subject_id,text,Atelectasis,Cardiomegaly,Consolidation,Edema,Enlarged Cardiomediastinum,Fracture,Lung Lesion,Lung Opacity,No Finding,Pleural Effusion,Pleural Other,Pneumonia,Pneumothorax,Support Devices
0,58522792,16567081,"b"" FINAL REPOR...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,58213163,16567081,b' FINAL REPOR...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,59835582,16043746,b' FINAL REPOR...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,51487790,16456872,b' FINAL REPOR...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,59750073,16824069,b' FINAL REPOR...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [3]:
binary_data = mimic_data[['text','No Finding']]
binary_data.text = binary_data.text.str.lower()
binary_data.head()

Unnamed: 0,text,No Finding
0,"b"" final repor...",0.0
1,b' final repor...,0.0
2,b' final repor...,1.0
3,b' final repor...,1.0
4,b' final repor...,1.0


In [4]:
binary_data.groupby('No Finding').count()

Unnamed: 0_level_0,text
No Finding,Unnamed: 1_level_1
0.0,152372
1.0,75455


In [5]:
binary_data.text[0]

'b"                                 final report\\n type of examination:  chest pa and lateral.\\n \\n indication:  ___-year-old male patient with recent pneumonia diagnosed and\\n treated at another facility.  x-ray not available, now with continued cough\\n and wheeze, history of copd, remaining evidence of pneumonia?\\n \\n findings:  pa and lateral chest views were obtained with patient in upright\\n position.  analysis is performed in direct comparison with the next preceding\\n chest examination of ___.  the heart size remains normal.  no\\n typical configurational abnormality is seen.  the thoracic aorta is moderately\\n widened and somewhat elongated but no local contour abnormalities are\\n identified.  the pulmonary vasculature is not congested.  there exists,\\n however, some irregular peripheral vascular distribution most marked on the\\n bases and coinciding with some slightly hypertranslucent pulmonary areas and\\n flattened low positioned diaphragms are indicative of cop

In [8]:
y = binary_data['No Finding']
X = binary_data.drop(columns=['No Finding'])
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
X_val = X_val.reset_index(drop=True)
y_val = y_val.reset_index(drop=True)

In [9]:
X_train.head()

Unnamed: 0,text
0,b' final repor...
1,b' final repor...
2,b' final repor...
3,b' final repor...
4,b' final repor...


In [8]:
# remove all '\\n' from the text
re_newlines = re.compile('\\\\n')
def sub_newlines(x): return re_newlines.sub('',x)

# remove all special characters from the text, keep only alphanumeric and spaces
re_letters = re.compile('[^A-Za-z0-9 ]')
def sub_letters(x): return re_letters.sub('', x)

# remove excessive spacing otherwise you end up with " " substrings
re_spaces = re.compile('\s+')
def sub_spaces(x): return re_spaces.sub(' ', x)
                
# tokenize all words.
my_tok = spacy.load('en')
def spacy_tok(x): 
    return [tok.text for tok in my_tok.tokenizer(sub_spaces
                                                 (sub_letters
                                                 (sub_newlines(x))))]

In [9]:
sub_spaces(sub_letters(sub_newlines(binary_data.text[0])))

'b final report type of examination chest pa and lateral indication yearold male patient with recent pneumonia diagnosed and treated at another facility xray not available now with continued cough and wheeze history of copd remaining evidence of pneumonia findings pa and lateral chest views were obtained with patient in upright position analysis is performed in direct comparison with the next preceding chest examination of the heart size remains normal no typical configurational abnormality is seen the thoracic aorta is moderately widened and somewhat elongated but no local contour abnormalities are identified the pulmonary vasculature is not congested there exists however some irregular peripheral vascular distribution most marked on the bases and coinciding with some slightly hypertranslucent pulmonary areas and flattened low positioned diaphragms are indicative of copd when direct comparison is made with the previous examination of there is a hazy mild degree of density in the left 

In [10]:
spacy_tok(binary_data.text[0])[1:]

['final',
 'report',
 'type',
 'of',
 'examination',
 'chest',
 'pa',
 'and',
 'lateral',
 'indication',
 'yearold',
 'male',
 'patient',
 'with',
 'recent',
 'pneumonia',
 'diagnosed',
 'and',
 'treated',
 'at',
 'another',
 'facility',
 'xray',
 'not',
 'available',
 'now',
 'with',
 'continued',
 'cough',
 'and',
 'wheeze',
 'history',
 'of',
 'copd',
 'remaining',
 'evidence',
 'of',
 'pneumonia',
 'findings',
 'pa',
 'and',
 'lateral',
 'chest',
 'views',
 'were',
 'obtained',
 'with',
 'patient',
 'in',
 'upright',
 'position',
 'analysis',
 'is',
 'performed',
 'in',
 'direct',
 'comparison',
 'with',
 'the',
 'next',
 'preceding',
 'chest',
 'examination',
 'of',
 'the',
 'heart',
 'size',
 'remains',
 'normal',
 'no',
 'typical',
 'configurational',
 'abnormality',
 'is',
 'seen',
 'the',
 'thoracic',
 'aorta',
 'is',
 'moderately',
 'widened',
 'and',
 'somewhat',
 'elongated',
 'but',
 'no',
 'local',
 'contour',
 'abnormalities',
 'are',
 'identified',
 'the',
 'pulmonary',

In [11]:
binary_train = binary_data.copy()
binary_train.head()

Unnamed: 0,text,No Finding
0,"b"" final repor...",0.0
1,b' final repor...,0.0
2,b' final repor...,1.0
3,b' final repor...,1.0
4,b' final repor...,1.0


In [12]:
def get_counts(text):
    counts = Counter()
    for word in text:
        counts.update(spacy_tok(word)[1:])
    return counts

In [13]:
word_count = get_counts(binary_train.text)

In [14]:
len(word_count.keys())

37093

In [15]:
for word in list(word_count):
    if word_count[word] < 3:
        del word_count[word]

In [16]:
len(word_count.keys())

15527

In [17]:
vocab2index = {"<PAD>":0, "UNK":1} # init with padding and unknown
words = ["<PAD>", "UNK"]
for word in word_count:
    vocab2index[word] = len(words)
    words.append(word)

In [18]:
len(words)

15529

In [19]:
def encode_sentence(text):
    enc = np.array([vocab2index.get(w, vocab2index["UNK"]) for w in text.split()])
    return enc

In [20]:
encode_sentence(binary_train.text[0])

array([  1,   2,   1,   4,   5,   1,   7,   8,   9,   1,   1,   1,   1,
        13,  14,  15,  16,  17,  18,   1,  19,  20,  21,   1,   1,  24,
         1,  26,  15,  27,   1,   9,   1,  30,   5,   1,  32,  33,   5,
         1,   1,   1,   8,   9,  10,   7,  35,  36,  37,  15,  14,  38,
         1,   1,  41,  42,  43,  38,  44,  45,  15,  46,  47,   1,   7,
         6,   5,   1,  46,  49,  50,  51,   1,   1,  54,  55,  56,  42,
         1,  46,  58,  59,  42,   1,  61,   9,  62,  63,  64,  53,  65,
        66,  67,   1,   1,  46,  70,  71,  42,  24,   1,  73,   1,   1,
        76,  77,  78,  79,  80,  81,  82,  83,   1,  84,   9,  85,  15,
        76,  86,  87,  70,  88,   1,  89,  90,  91,  92,  68,  93,   5,
         1,  94,   1,  45,  42,  95,  15,  46,  96,   6,   5,   1,  73,
        42,  97,  98,   1, 100,   5, 101,  38,  46, 102, 103, 104,  97,
       105,  65, 106,   1,  46, 107, 108,   5,  46, 109, 110,  83,  46,
       102,   1, 112, 113,   1, 114,  15,  76, 115, 116, 117, 11

## Dataset and data loaders

In [21]:
class Binary_Mimic(Dataset):
    def __init__(self, X, y, vocab):
        self.x = [encode_sentence(x) for x in X.text]
        self.y = y
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        x = self.x[idx]
        y = self.y[idx]
        return x, y

In [22]:
b_train = Binary_Mimic(X_train, y_train, vocab2index)
b_valid = Binary_Mimic(X_val, y_val, vocab2index)

In [23]:
def collate_fn(data):
    """Creates mini-batch tensors from the list of tuples (sentences, labels).
    
    Need custom collate_fn because merging sequences (including padding) is not 
    supported in default. Sequences are padded to the maximum length of mini-batch 
    sequences (dynamic padding).
    
    Args:
        data: list of tuple (sentence, label). 
            - list of word indices of variable length
            - label, 0 or 1
    Returns:
        packed_batch: (PackedSequence), see torch.nn.utils.rnn.pack_padded_sequence
        sencences: torch tensor of shape (batch_size, max_len).
        labels: torch tensor of shape (batch_size, 1).
        lengths: list; valid length for each padded sentence. 
    """
    # Sort a data list by sentences length (descending order).
    data.sort(key=lambda x: len(x[0]), reverse=True)
    sentences, labels = zip(*data)
    
    # stack labels
    labels = torch.Tensor(labels)
    
    # Merge sentences
    lengths = [len(s) for s in sentences]
   
    sents = torch.zeros(len(sentences), max(lengths)).long()
    for i, s in enumerate(sentences):
        end = lengths[i]
        sents[i, :end] = torch.Tensor(s[:end])        
    
    return sents, lengths, labels

In [24]:
data = [b_train[0], b_train[1], b_train[2]]
data

[(array([   1,    2,    1,    1,    7,    1,    1,    1,    1,    1,  156,
          157,  517,   15,    1,    1,  565,    1,    1, 1147,    1,    1,
         7051, 1956,    5,   70,    1,    1,    1,    1,    1,  119,  250,
          165,    1,    7,    1,   73,  132,  310,  318,    1,  319,   38,
           46,  324,    5,   46,    7, 1798,  190,  817,  300, 2071,    1,
           46,  238,    1]), 1.0),
 (array([   1,  283,    1,    1,    1,    1,    1,    1,   53,  191,  204,
            1,   53,   33,    5,  191, 1396,   38,   46,    1,    1, 1002,
          190,  844,  741,    1, 1658,  844,  194,  468,    1,    1,  283,
          284,  510,    1,    1,    1,    1,    1,    1,   99,  387,   38,
           46,  699,  168,  238,  103,  442,  443,    1,   64,    1,  485,
          494,  153,   17,   38,   46,  168,  144,    1,    1,    2,    1,
            1,    7,    1,    1,    1,    1,    1,   15, 1490,   16,    1,
         1351, 1752,    1,  238,    1,   83, 1004,    1,  641,   

In [25]:
collate_fn(data)

(tensor([[   1,  283,    1,    1,    1,    1,    1,    1,   53,  191,  204,    1,
            53,   33,    5,  191, 1396,   38,   46,    1,    1, 1002,  190,  844,
           741,    1, 1658,  844,  194,  468,    1,    1,  283,  284,  510,    1,
             1,    1,    1,    1,    1,   99,  387,   38,   46,  699,  168,  238,
           103,  442,  443,    1,   64,    1,  485,  494,  153,   17,   38,   46,
           168,  144,    1,    1,    2,    1,    1,    7,    1,    1,    1,    1,
             1,   15, 1490,   16,    1, 1351, 1752,    1,  238,    1,   83, 1004,
             1,  641,    1,    1,    1,    1,    7,    8,    9,    1,    1,    1,
            45,   42,   95,    7,  194,  182,    1,    1,    1,    1,    1,    1,
             1,   46,  200,   68,  461,  492,    9,    1,   73,   42,   53,  139,
           163,    1,    1,   46,  224,  225,   42,    1,   53,    1,    5,  191,
          1396,   42,   57,   38,   46,    1,    1,    1,    1,    1,   53,  191,
           204, 

In [26]:
train_loader = DataLoader(b_train, batch_size=2, shuffle=True, collate_fn=collate_fn)
sents, lengths, labels = next(iter(train_loader))
sents

tensor([[    1,     2,     1,     1,     7,     1,     9,     1,     1,     1,
             1,    15,     1,     1,   189,   190,   191,     1,     1,     1,
             7,     8,     9,     1,     1,     1,     1,     1,     1,     1,
             1,    46,    49,     1,     1,     9,   198,   199,    68,     1,
            46,   200,     1,   202,   211,   139,     1,   212,     1,   153,
             1,     1,     1,     1,     1,    53,   191,   204,     1],
        [    1,     2,     1,     1,     7,     1,     1,     1, 11398,   452,
             1,  1642,     1,     1,     1,     7,     8,     9,     1,     1,
             1,     1,     1,     1,     1,     1,    49,    50,    42,     1,
           224,   225,     9,   198,   199,     1,     1,   200,    68,     1,
           139,   780,    68,   202,   211,   163,     1,     1,     1,     1,
             1,     1,    52,     7,     1,     0,     0,     0,     0]])

In [27]:
sents.shape, labels.shape

(torch.Size([2, 59]), torch.Size([2]))

In [28]:
lengths

[59, 55]

## Training

In [29]:
def train_epocs(model, optimizer, train_dl, valid_dl, epochs=10):
    for i in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        for x, s, y in train_dl:
            x = x.long()#.cuda()
            y = y.float()#.cuda()
            y_pred = model(x, s)
            optimizer.zero_grad()
            loss = F.binary_cross_entropy_with_logits(y_pred, y.unsqueeze(1))
            loss.backward()
            optimizer.step()
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
        val_loss, val_acc = val_metrics(model, valid_dl)
        print("Epoch #%.f: train loss %.3f val loss %.3f and val accuracy %.3f" % 
              (i+1,sum_loss/total, val_loss, val_acc))

In [30]:
def val_metrics(model, valid_dl):
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    for x, s, y in valid_dl:
        x = x.long()#.cuda()
        y = y.float().unsqueeze(1)#.cuda()
        y_hat = model(x, s)
        loss = F.binary_cross_entropy_with_logits(y_hat, y)
        y_pred = y_hat > 0
        correct += (y_pred.float() == y).float().sum()
        total += y.shape[0]
        sum_loss += loss.item()*y.shape[0]
    return sum_loss/total, correct/total

## Basic GRU Model

In [31]:
class GRUModel(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim) :
        super(GRUModel,self).__init__()
        self.hidden_dim = hidden_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.gru = nn.GRU(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 1)
        self.dropout = nn.Dropout(0.2)
        
    def forward(self, x, lengths):
        print(x.shape)
        x = self.embeddings(x)
        x = self.dropout(x)
        pack = pack_padded_sequence(x, lengths, batch_first=True)
        out_pack, ht = self.gru(pack)
        return self.linear(ht[-1])

In [32]:
batch_size = 50000
train_dl = DataLoader(b_train, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dl = DataLoader(b_valid, batch_size=batch_size, collate_fn=collate_fn)

In [33]:
vocab_size = len(words)
print(vocab_size)
model = GRUModel(vocab_size, 50, 50)#.cuda()

parameters = filter(lambda p: p.requires_grad, model.parameters())
optimizer = torch.optim.Adam(parameters, lr=0.01)

15529


In [None]:
train_epocs(model, optimizer, train_dl, valid_dl, epochs=10)

## Use Glove to create pre-trained embedding 

In [34]:
def loadGloveModel(gloveFile='data/glove.6B.50d.txt'):
    """ Loads word vectors into a dictionary."""
    f = open(gloveFile,'r')
    word_vecs = {}
    for line in f:
        splitLine = line.split()
        word = splitLine[0]
        word_vecs[word] = np.array([float(val) for val in splitLine[1:]])
    return word_vecs

In [35]:
word_vecs = loadGloveModel()

In [38]:
word_count = get_counts(binary_train.text)

In [39]:
print(len(word_vecs.keys()), len(word_count.keys()))

400000 37093


In [40]:
def delete_rare_words(word_vecs, word_count, min_df=4):
    """ Deletes rare words from word_count
    
    Deletes words from word_count if they are not in word_vecs
    and don't have at least min_df occurrencies in word_count.
    """
    words_delete = []
    for word in word_count:
        if word_count[word] < min_df and word not in word_vecs:
            words_delete.append(word)
    for word in words_delete: word_count.pop(word)
    return word_count

In [41]:
word_count = delete_rare_words(word_vecs, word_count, min_df=3)
print(len(word_count.keys()))

20222


In [42]:
def create_embedding_matrix(word_vecs, word_count, min_df=4, emb_size=50):
    """Creates embedding matrix from word vectors. """
    word_count = delete_rare_words(word_vecs, word_count, min_df)
    V = len(word_count.keys()) + 2
    vocab2index = {}
    W = np.zeros((V, emb_size), dtype="float32")
    vocab = ["", "UNK"]
    # adding a vector for padding
    W[0] = np.zeros(emb_size, dtype='float32')
    # adding a vector for rare words 
    W[1] = np.random.uniform(-0.25, 0.25, emb_size)
    vocab2index["UNK"] = 1
    i = 2
    for word in word_count:
        if word in word_vecs:
            W[i] = word_vecs[word]
            vocab2index[word] = i
            vocab.append(word)
            i += 1
        else:
            W[i] = np.random.uniform(-0.25,0.25, emb_size)
            vocab2index[word] = i
            vocab.append(word)
            i += 1   
    return W, np.array(vocab), vocab2index

In [43]:
pretrained_weight, vocab, vocab2index = create_embedding_matrix(word_vecs, word_count, min_df=3)

In [48]:
len(pretrained_weight)

20224

In [49]:
emb_size = 50
V = len(pretrained_weight)
emb = nn.Embedding(V, emb_size)
emb.weight.data.copy_(torch.from_numpy(pretrained_weight))

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.1946, -0.1763, -0.2316,  ..., -0.0365,  0.2413,  0.1304],
        [-0.4124,  0.6493, -0.5585,  ...,  0.2621,  0.1045, -0.4430],
        ...,
        [-0.1321, -0.3749, -0.5517,  ...,  0.9329,  0.3362, -0.2008],
        [ 0.8547,  0.1916,  0.0963,  ...,  0.1707, -0.2616,  0.5350],
        [-0.0111, -1.1496, -0.5092,  ...,  0.0186, -0.9912,  0.1102]])

# GRU Model with Pretrained embedding layer:

In [None]:
class GRUModel_pre(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim, glove_weights=None) :
        super(GRUModel_pre,self).__init__()
        self.hidden_dim = hidden_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        if glove_weights is not None:
            self.embeddings.weight.data.copy_(torch.from_numpy(glove_weights))
            self.embeddings.weight.requires_grad = False ## freeze embeddings
        self.gru = nn.GRU(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 1)
        self.dropout = nn.Dropout(0.5)
        
    def forward(self, x, lengths):
        x = self.embeddings(x)
        x = self.dropout(x)
        pack = pack_padded_sequence(x, lengths, batch_first=True)
        out_pack, ht = self.gru(pack)
        return self.linear(ht[-1])

In [None]:
vocab_size = len(words)
model = GRUModel_pre(vocab_size, 50, 50, glove_weights=pretrained_weight)#.cuda()

parameters = filter(lambda p: p.requires_grad, model.parameters())
optimizer = torch.optim.Adam(parameters, lr=0.01)

In [None]:
train_epocs(model, optimizer,train_dl, valid_dl, epochs=5)

## Basic LSTM

In [55]:
class LSTM(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim) :
        super(LSTM,self).__init__()
        self.hidden_dim = hidden_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 1)
        self.dropout = nn.Dropout(0.5)
        
    def forward(self, x):
        print(x.shape)
        x = self.embeddings(x)
        x = self.dropout(x)
        out_pack, (ht, ct) = self.lstm(x)
        return self.linear(ht[-1])

In [62]:
def train_epocs_v0(model, optimizer, train_dl, val_dl, epochs=10):
    for i in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        for x, s, y in train_dl:
            # s is not used in this model
            x = x.long()#.cuda()
            y = y.float()#.cuda()
            y_pred = model(x)
            optimizer.zero_grad()
            loss = F.binary_cross_entropy_with_logits(y_pred, y.unsqueeze(1))
            loss.backward()
            optimizer.step()
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
        val_loss, val_acc = val_metrics_v0(model, val_dl)
        if i % 5 == 1:
            print("train loss %.3f val loss %.3f and val accuracy %.3f" % (sum_loss/total, val_loss, val_acc))

In [63]:
def val_metrics_v0(model, valid_dl):
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    for x, s, y in valid_dl:
        # s is not used here
        x = x.long()#.cuda()
        y = y.float().unsqueeze(1)#.cuda()
        y_hat = model(x)
        loss = F.binary_cross_entropy_with_logits(y_hat, y)
        y_pred = y_hat > 0
        correct += (y_pred.float() == y).float().sum()
        total += y.shape[0]
        sum_loss += loss.item()*y.shape[0]
    return sum_loss/total, correct/total

In [64]:
batch_size = 50000
train_dl = DataLoader(b_train, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dl = DataLoader(b_valid, batch_size=batch_size, collate_fn=collate_fn)

In [65]:
vocab_size = len(words)
print(vocab_size)
model = LSTM(vocab_size, 50, 50)

parameters = filter(lambda p: p.requires_grad, model.parameters())
optimizer = torch.optim.Adam(parameters, lr=0.01)

15529


In [66]:
train_epocs_v0(model, optimizer, train_dl, valid_dl, epochs=10)

torch.Size([50000, 527])


KeyboardInterrupt: 