In [1]:
import string
import re
import random
import time
import math

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F
from collections import Counter
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import gensim
from gensim.models import KeyedVectors
import numpy as np

In [2]:
class Language():
    """Vocabulary, word<->index maps and optional pretrained embeddings for one corpus.

    The corpus file is read line by line, each line is lower-cased, stripped,
    cleaned of ``filter_special_characters`` and split on whitespace.

    Args:
        filename: path of the corpus file, one sentence per line.
        filter_special_characters: string whose characters are deleted from every line.
        word_count_threshold: a word enters the vocabulary only when its corpus
            count is strictly greater than this value.
        special_symbols_list: symbols placed at the start of the vocabulary, in order.
        pretrained_word2vec_path: optional path of a binary word2vec file.
        word_embedding_dim: width of the pretrained vectors; embeddings are only
            built when both this and ``pretrained_word2vec_path`` are given.
    """

    def __init__(self, filename, filter_special_characters, word_count_threshold, special_symbols_list, pretrained_word2vec_path=None, word_embedding_dim=None):
        self.contents = self.read_file(filename)
        self.contents = self.normalize_file_contents(self.contents, filter_special_characters)
        self.vocabulary = self.generate_vocabulary(self.contents, word_count_threshold, special_symbols_list)
        self.word2index = self.generate_word2index(self.vocabulary)
        self.index2word = self.generate_index2word(self.vocabulary)
        self.filter_special_characters = filter_special_characters
        self.special_symbols_list = special_symbols_list
        if pretrained_word2vec_path is not None and word_embedding_dim is not None:
            self.word_embedding_dim = word_embedding_dim
            self.word_embeddings = self.initialize_word_vectors(pretrained_word2vec_path)
        else:
            self.word_embedding_dim = None
            self.word_embeddings = None

    @staticmethod
    def _remove_characters(text, characters):
        """Return ``text`` without any character that occurs in ``characters``.

        Replaces the Python-2-only ``str.translate(None, characters)`` call with
        a form that behaves the same on Python 2 and Python 3.
        """
        if not characters:
            return text
        banned = set(characters)
        return ''.join(ch for ch in text if ch not in banned)

    # Returns contents of a file as list of sentences
    def read_file(self, filename):
        """Return the raw lines of ``filename`` (line terminators included)."""
        # The context manager closes the handle even if reading raises.
        with open(filename, 'r') as _file:
            return list(_file)

    # Lowercase, trim, and remove filter_special_characters
    def normalize_file_contents(self, contents, filter_special_characters):
        """Return one token list per line: lower-cased, stripped, filtered, split."""
        normalized_contents = []
        for line in contents:
            line = line.lower().strip()
            line = self._remove_characters(line, filter_special_characters)
            normalized_contents.append(line.split())
        return normalized_contents

    # Returns the vocabulary- special symbols first, then words occurring more than the threshold
    def generate_vocabulary(self, contents, word_count_threshold, special_symbols_list):
        """Build the vocabulary list.

        The special symbols come first, in the given order, followed by every
        corpus word whose count is strictly greater than ``word_count_threshold``.
        A corpus word that equals a special symbol is not added a second time,
        so every vocabulary entry owns exactly one index.
        """
        vocab = list(special_symbols_list)
        reserved = set(vocab)

        counter = Counter()
        for line in contents:
            counter.update(line)

        # .items() exists on both Python 2 and Python 3 (iteritems does not).
        for word, count in counter.items():
            if count > word_count_threshold and word not in reserved:
                vocab.append(word)
        return vocab

    # maps word to index
    def generate_word2index(self, vocabulary):
        """Return a dict mapping each vocabulary word to its list position."""
        return {word: index for index, word in enumerate(vocabulary)}

    # maps index to word
    def generate_index2word(self, vocabulary):
        """Return a dict mapping each list position to its vocabulary word."""
        return dict(enumerate(vocabulary))

    def initialize_word_vectors(self, pretrained_word2vec_path):
        """Return a (len(vocabulary), word_embedding_dim) numpy array of word vectors.

        Every row starts as a uniform random draw from [-0.1, 0.1); rows whose
        word is present in the pretrained binary word2vec model are then
        overwritten with the pretrained vector.
        """
        word2vec_model = KeyedVectors.load_word2vec_format(pretrained_word2vec_path, binary=True)
        word_vectors = np.random.uniform(-0.1, 0.1, (len(self.vocabulary), self.word_embedding_dim))
        for index, word in self.index2word.items():
            if word in word2vec_model:
                word_vectors[index, :] = word2vec_model[word]
        return word_vectors

In [9]:
# Natural-language side of the corpus. Every punctuation character except '_'
# is stripped; '_' is kept because tokens such as problem_1 contain it.
# str.replace is used instead of the Python-2-only translate(None, '_') so the
# cell runs unchanged on Python 2 and Python 3.
sourceLanguage = Language(filename= '../training_data/mimic_tr.nl.tem',
                          filter_special_characters= string.punctuation.replace('_', ''),
                          word_count_threshold= 1,
                          special_symbols_list=['<sos>','<eos>','<unk>'],
                          pretrained_word2vec_path='/scratch/at3577/nli/data/wikipedia-pubmed-and-PMC-w2v.bin', 
                          word_embedding_dim=200)

In [10]:
# SQL side of the corpus: no characters are filtered and no pretrained
# embeddings are loaded.
# TODO: '<unk>' had to be added to the SQL vocabulary as well -- check this.
targetLanguage = Language(
    filename='../training_data/mimic_tr.sql.tem',
    special_symbols_list=['<sos>', '<eos>', '<unk>'],
    word_count_threshold=1,
    filter_special_characters='',
)

In [11]:
class Nl2SqlDataset(Dataset):
    """Dataset of (natural language, SQL) pairs read from a CSV file.

    Column 0 of the CSV is served under the key 'nl' and column 1 under the
    key 'sql'. When a ``transform`` callable is supplied, every sample dict is
    passed through it before being returned.
    """

    def __init__(self, csv_file, transform=None):
        self.transform = transform
        self.dataframe = pd.read_csv(csv_file)

    def __len__(self):
        # One sample per CSV row.
        return len(self.dataframe)

    def __getitem__(self, idx):
        frame = self.dataframe
        sample = dict(nl=frame.iloc[idx, 0], sql=frame.iloc[idx, 1])

        if not self.transform:
            return sample
        return self.transform(sample)

# Transformers:
# 1. Lowercase, trim, and remove filter_special_characters
# 2. add SOS, EOS and UNK to the sentence
# 3. converts sentence to index
# 4. to tensor

class Transformer(object):
    """Turn a raw {'nl': str, 'sql': str} sample into index tensors.

    Each sentence is normalized with its language's filter characters,
    out-of-vocabulary words are replaced by the language's third special symbol,
    the first and second special symbols are added at the start and end, and the
    resulting words are mapped to indices and wrapped in a ``torch.LongTensor``.

    Args:
        sourceLanguage: language object used for the 'nl' field.
        targetLanguage: language object used for the 'sql' field.
            Both must expose ``filter_special_characters``,
            ``special_symbols_list`` and ``word2index``.
    """

    def __init__(self, sourceLanguage, targetLanguage):
        self.source_language = sourceLanguage
        self.target_language = targetLanguage

    def __call__(self, sample):
        nl, sql = sample['nl'], sample['sql']

        # 1. Lowercase, trim, and remove filter_special_characters
        nl  = self.normalize_line(nl, self.source_language.filter_special_characters)
        sql = self.normalize_line(sql, self.target_language.filter_special_characters)

        # 2. replace unknown words, then add the start/end symbols
        nl = self.replace_unknown_words(nl, self.source_language, self.source_language.special_symbols_list[2])
        sql = self.replace_unknown_words(sql, self.target_language, self.target_language.special_symbols_list[2])
        nl = [self.source_language.special_symbols_list[0]] + nl + [self.source_language.special_symbols_list[1]]
        sql= [self.target_language.special_symbols_list[0]] + sql + [self.target_language.special_symbols_list[1]]

        # 3. converts sentence to index
        nl  = self.sentence2index(nl, self.source_language)
        sql = self.sentence2index(sql, self.target_language)

        # 4. to tensor
        nl = torch.LongTensor(nl)
        sql = torch.LongTensor(sql)

        return {'nl': nl, 'sql': sql}

    def normalize_line(self, line, filter_special_characters):
        """Lower-case and strip ``line``, delete the filter characters, split on whitespace."""
        line = line.lower().strip()
        if filter_special_characters:
            # Version-neutral replacement for the Python-2-only
            # str.translate(None, filter_special_characters).
            banned = set(filter_special_characters)
            line = ''.join(ch for ch in line if ch not in banned)
        return line.split()

    def replace_unknown_words(self, sentence, language, unknown_symbol):
        """Overwrite, in place, every word missing from the vocabulary; return ``sentence``."""
        # word2index has exactly the vocabulary words as keys, so membership is a
        # constant-time dict lookup instead of a scan of the vocabulary list.
        known_words = language.word2index
        for idx, word in enumerate(sentence):
            if word not in known_words:
                sentence[idx] = unknown_symbol
        return sentence

    def sentence2index(self, sentence, language):
        """Return the list of vocabulary indices for the words of ``sentence``."""
        return [language.word2index[word] for word in sentence]

In [20]:
# Each dataset gets its own Transformer built from the same two languages.
# NOTE(review): dev_dataset reads the same CSV as training_dataset, so the
# "dev" loss is measured on training data -- confirm whether a separate dev
# file was intended.
training_dataset = Nl2SqlDataset(
    '../training_data/mimic_tr.tem.csv',
    transform=Transformer(sourceLanguage, targetLanguage),
)
dev_dataset = Nl2SqlDataset(
    '../training_data/mimic_tr.tem.csv',
    transform=Transformer(sourceLanguage, targetLanguage),
)
#test_dataset = Nl2SqlDataset('../training_data/geo_test.tem.csv', transform=Transformer(sourceLanguage, targetLanguage))

In [21]:
class Encoder(nn.Module):
    """Embedding + dropout + bidirectional LSTM over one source sentence.

    The embedding table has one row per word of ``source_language.vocabulary``
    and is initialised from ``source_language.word_embeddings``.
    """

    def __init__(self, source_language, lstm_hidden_size, lstm_num_layers, dropout_prob):
        super(Encoder, self).__init__()
        vocabulary_size = len(source_language.vocabulary)
        embedding_width = source_language.word_embedding_dim

        # Sizes kept as attributes; lstm_hidden_size is also read by AttnDecoder.
        self.num_embeddings = vocabulary_size
        self.embedding_dim = embedding_width
        self.lstm_hidden_size = lstm_hidden_size
        self.lstm_num_layers = lstm_num_layers

        self.embedding = nn.Embedding(vocabulary_size, embedding_width)
        self.bilstm = nn.LSTM(
            input_size=embedding_width,
            hidden_size=lstm_hidden_size,
            num_layers=lstm_num_layers,
            bidirectional=True,
        )
        self.dropout = nn.Dropout(dropout_prob)

        self.initialize_embeddings(source_language.word_embeddings)

    def initialize_embeddings(self, initial_word_embeddings):
        """Copy a numpy array of word vectors into the embedding weights."""
        pretrained = torch.from_numpy(initial_word_embeddings)
        self.embedding.weight.data.copy_(pretrained)

    def forward(self, source_language_sentence):
        """Encode one sentence of word indices.

        Returns the LSTM output for every position together with the final
        hidden and cell states. The sentence is fed as a batch of size 1.
        """
        word_vectors = self.dropout(self.embedding(source_language_sentence))

        # Reshape to (seq_len, batch=1, embedding_dim) as expected by nn.LSTM.
        steps = len(source_language_sentence)
        output, final_state = self.bilstm(word_vectors.view(steps, 1, self.embedding_dim))
        hidden_state, cell_state = final_state

        return output, hidden_state, cell_state

In [22]:
class AttnDecoder(nn.Module):
    """Single-step attention decoder over the encoder outputs.

    Each call embeds one target word, attends over ``encoder_outputs`` with
    dot-product attention, runs one LSTM step on [embedding ; context] and
    projects [LSTM output ; context] to log-probabilities over the target
    vocabulary.

    Args:
        target_language: object whose ``vocabulary`` length sets the output size.
        encoder: object whose ``lstm_hidden_size`` sets the LSTM hidden size.
        target_embedding_dim: width of the target word embeddings.
        lstm_num_layers: number of LSTM layers.
        dropout_prob: dropout probability applied to the word embedding.
    """

    def __init__(self, target_language, encoder, target_embedding_dim, lstm_num_layers, dropout_prob):
        super(AttnDecoder, self).__init__()
        self.num_embeddings = len(target_language.vocabulary)
        self.embedding_dim = target_embedding_dim
        self.lstm_hidden_size = encoder.lstm_hidden_size
        self.lstm_num_layers = lstm_num_layers

        self.embedding = nn.Embedding(self.num_embeddings, self.embedding_dim)
        self.bilstm = nn.LSTM(input_size=self.embedding_dim + encoder.lstm_hidden_size*2,
                              hidden_size=self.lstm_hidden_size,
                              num_layers= lstm_num_layers,
                              bidirectional=True)
        self.output_layer = nn.Linear(self.lstm_hidden_size*2 + encoder.lstm_hidden_size*2, len(target_language.vocabulary))
        self.dropout = nn.Dropout(dropout_prob)
        # The output layer receives a (1, vocab) matrix, so normalise over dim 1.
        # Naming the dimension avoids the deprecated implicit-dim behaviour.
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, target_language_word, hidden_state, cell_state, encoder_outputs):
        """Run one decoding step.

        Returns ``(log_probs, hidden_state, cell_state)`` where ``log_probs`` has
        shape (1, len(target vocabulary)).
        """
        #1 Embedding layer
        embedded = self.embedding(target_language_word)
        embedded  = self.dropout(embedded)
        bilstm_input = embedded.view(1, 1, self.embedding_dim)

        #2 Attention layer
        attn_weights = self.attn(hidden_state, encoder_outputs) #(1, 1, seqlen)
        context = torch.bmm(attn_weights, torch.transpose(encoder_outputs, 0, 1)) #(1,1,2*encoder_lstm_hidden_size)

        #3 Bilstm
        bilstm_input = torch.cat((bilstm_input, context), 2)
        bilstm_output, (hidden_state, cell_state) = self.bilstm(bilstm_input, (hidden_state, cell_state))

        #4 Output Layer
        output_layer_input = torch.cat((bilstm_output, context), 2)
        output_layer_input = torch.squeeze(output_layer_input, 0) #(1, self.lstm_hidden_size*2 + encoder.lstm_hidden_size*2)
        output = self.softmax(self.output_layer(output_layer_input)) #(1, len(target_language.vocabulary))

        return output, hidden_state, cell_state

    def attn(self, hidden_state, encoder_outputs):
        """Return dot-product attention weights of shape (1, 1, seqlen).

        The flattened ``hidden_state`` is scored against every encoder output
        and the scores are normalised with a softmax over the sequence.
        """
        seqlen = len(encoder_outputs)
        # One matrix-vector product replaces the per-position torch.dot loop.
        # torch.dot only accepts 1-D operands on current PyTorch, and the old
        # loop wrote into a buffer that was always allocated on the CPU.
        query = hidden_state.contiguous().view(-1)
        keys = encoder_outputs.contiguous().view(seqlen, -1)
        attn_energies = torch.mv(keys, query)

        return F.softmax(attn_energies, dim=0).unsqueeze(0).unsqueeze(0)

In [23]:
def train(source_language_sentence, target_language_sentence, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, clip_gradient):
    """Run one teacher-forced training step on a single sentence pair.

    Args:
        source_language_sentence: 1-D tensor of source word indices.
        target_language_sentence: 1-D tensor of target word indices; needs at
            least two entries so that there is something to predict.
        encoder, decoder: the two modules being trained.
        encoder_optimizer, decoder_optimizer: their optimizers.
        criterion: loss called as ``criterion(decoder_output, target)``.
        clip_gradient: maximum gradient norm, applied to each module separately.

    Returns:
        The summed step loss divided by the target length, as a Python float.

    Raises:
        ValueError: if the target sentence has fewer than two tokens.
    """
    # set training to true for dropout layers
    encoder.train()
    decoder.train()

    # Zero gradients of both optimizers
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    target_length = target_language_sentence.size()[0]
    if target_length < 2:
        # Without this check the loop below never runs and loss stays the int 0,
        # which fails later with an obscure AttributeError on .backward().
        raise ValueError('target sentence must contain at least two tokens')

    # Run words through encoder
    encoder_outputs, encoder_hidden_state, encoder_cell_state = encoder(source_language_sentence)
    # encoder_outputs -> (input_length, 1, 2*encoder_hidden_size)
    # encoder_hidden_state -> (2, 1, encoder_hidden_size)

    # The decoder starts from the encoder's final hidden state and a zero cell
    # state; the encoder's cell state is not used.
    decoder_hidden_state = encoder_hidden_state
    decoder_cell_state = Variable(torch.zeros(decoder_hidden_state.size()))

    #only using teacher forcing for now --> Check this
    loss = 0
    for i in range(target_length-1):
        decoder_output, decoder_hidden_state, decoder_cell_state = decoder(target_language_sentence[i].view(1,1), decoder_hidden_state, decoder_cell_state, encoder_outputs)
        # .view(1) gives the target a batch dimension on every PyTorch version
        # (indexing a 1-D tensor yields a 0-dim tensor on current releases).
        loss += criterion(decoder_output, target_language_sentence[i+1].view(1))

    # Backpropagation
    loss.backward()
    # Prefer the in-place spelling where it exists; the old name is deprecated.
    clip_grad_norm = getattr(torch.nn.utils, 'clip_grad_norm_', None) or torch.nn.utils.clip_grad_norm
    clip_grad_norm(encoder.parameters(), clip_gradient)
    clip_grad_norm(decoder.parameters(), clip_gradient)
    encoder_optimizer.step()
    decoder_optimizer.step()

    # loss.data[0] fails on 0-dim tensors; .item() is the replacement, with a
    # fallback for releases that predate it.
    loss_value = loss.item() if hasattr(loss, 'item') else loss.data[0]
    # Note: the sum covers target_length-1 steps but is divided by target_length.
    return loss_value / target_length

In [24]:
def evaluate(dataset, num_samples, source_language, target_language, encoder, decoder, criterion, verbose=False):
    """Greedy-decode ``num_samples`` random samples and return their mean loss.

    For each sample drawn with ``random.choice(dataset)`` the decoder is fed its
    own top-1 prediction at every step (no teacher forcing). Decoding stops once
    '<eos>' is predicted or target_length-1 steps have run. The per-sample loss
    is the summed step loss divided by the target length.

    Args:
        dataset: indexable collection of {'nl': tensor, 'sql': tensor} samples.
        num_samples: number of random draws to evaluate.
        source_language, target_language: language objects; ``target_language``
            must map '<eos>' in ``word2index``, and both need ``index2word``
            when ``verbose`` is true.
        encoder, decoder: the trained modules (switched to eval mode here).
        criterion: loss called as ``criterion(decoder_output, target)``.
        verbose: when true, print source, target and predicted sentences.

    Returns:
        The mean of the per-sample losses as a Python float.
    """
    # set training to false for dropout layers
    encoder.eval()
    decoder.eval()

    eos_index = target_language.word2index['<eos>']

    total_loss = 0
    for n_sample in range(num_samples):
        sample = random.choice(dataset)
        source_language_sentence, target_language_sentence = Variable(sample['nl']), Variable(sample['sql'])

        target_length = target_language_sentence.size()[0]

        # Run words through encoder
        encoder_outputs, encoder_hidden_state, encoder_cell_state = encoder(source_language_sentence)
        # encoder_outputs -> (input_length, 1, 2*encoder_hidden_size)
        # encoder_hidden_state -> (2, 1, encoder_hidden_size)

        # Prepare decoder input and output
        decoder_hidden_state = encoder_hidden_state
        decoder_cell_state = Variable(torch.zeros(decoder_hidden_state.size()))

        loss = 0
        decoder_input = target_language_sentence[0].view(1,1)
        predicted_sentence = []
        for i in range(target_length-1):
            decoder_output, decoder_hidden_state, decoder_cell_state = decoder(decoder_input, decoder_hidden_state, decoder_cell_state, encoder_outputs)
            # .view(1) gives the target a batch dimension on every PyTorch version.
            loss += criterion(decoder_output, target_language_sentence[i+1].view(1))

            # Choose top word from output
            topv, topi = decoder_output.data.topk(1)
            # Convert to a plain int: on current PyTorch topi[0][0] is a 0-dim
            # tensor, which cannot be used as an index2word key.
            ni = int(topi[0][0])
            if ni == eos_index:
                break

            # Next input is chosen word
            decoder_input = Variable(torch.LongTensor([[ni]]))
            predicted_sentence.append(ni)

        # loss.data[0] fails on 0-dim tensors; fall back to it only on releases
        # that predate .item().
        loss_value = loss.item() if hasattr(loss, 'item') else loss.data[0]
        total_loss += (loss_value / target_length)
        if verbose:
            # Single-argument print(...) gives the same output on Python 2 and 3.
            print('Sample ' + str(n_sample))
            print('Source sentence = ' + str(to_sentence(source_language_sentence.data.numpy(), source_language)))
            print('Target sentence = ' + str(to_sentence(target_language_sentence.data.numpy(), target_language)))
            print('Predicted sentence = ' + str(to_sentence(predicted_sentence, target_language)))

    return total_loss/num_samples

def to_sentence(index_list ,language):
    """Map every index in ``index_list`` to its word via ``language.index2word``."""
    lookup = language.index2word
    return [lookup[position] for position in index_list]

In [25]:
# Hyper-parameters, models, optimizers and the loss function (criterion).

# --- optimisation ---
learning_rate = 0.001
batch_size = 100      # used by the training loop as the number of single-sentence updates per epoch
num_epochs = 70
clip_gradient = 5.0   # maximum gradient norm passed to train()

# --- encoder ---
encoder_lstm_hidden_size = 100
encoder_lstm_num_layers = 1
encoder_dropout_prob = 0.05

# --- decoder ---
decoder_target_embedding_dim = 100
decoder_lstm_num_layers = 1
decoder_dropout_prob = 0.05

# The encoder is built first; the decoder reads its hidden size from it.
encoder = Encoder(
    source_language=sourceLanguage,
    lstm_hidden_size=encoder_lstm_hidden_size,
    lstm_num_layers=encoder_lstm_num_layers,
    dropout_prob=encoder_dropout_prob,
)
decoder = AttnDecoder(
    target_language=targetLanguage,
    encoder=encoder,
    target_embedding_dim=decoder_target_embedding_dim,
    lstm_num_layers=decoder_lstm_num_layers,
    dropout_prob=decoder_dropout_prob,
)

# One Adam optimizer per module, sharing the same learning rate.
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)

# NLLLoss is applied to the log-probabilities returned by the decoder.
criterion = nn.NLLLoss()

In [26]:
# Start training
# Every epoch performs batch_size single-sentence updates on samples drawn at
# random (with replacement) from training_dataset, then reports the mean
# training loss and the loss on 10 random samples of dev_dataset.
for n_epochs in range(num_epochs):
    training_loss = 0
    for batch in range(batch_size):
        sample = random.choice(training_dataset)
        source_language_sentence, target_language_sentence = Variable(sample['nl']), Variable(sample['sql'])
        training_loss += train(source_language_sentence, target_language_sentence, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, clip_gradient)
    
    training_loss = training_loss/batch_size
    dev_loss = evaluate(dev_dataset, 10, sourceLanguage, targetLanguage, encoder, decoder, criterion)
    # print(...) with one argument works on Python 2 and Python 3; the bare
    # print statement is a syntax error on Python 3.
    print('Training loss after ' + str(n_epochs) + ' epoch is ' + str(training_loss) + ' dev loss= ' + str(dev_loss))



Training loss after 0 epoch is 2.24132088039 dev loss= 3.75725344034
Training loss after 1 epoch is 0.626169562493 dev loss= 2.36812647828
Training loss after 2 epoch is 0.350380761856 dev loss= 2.17197119295
Training loss after 3 epoch is 0.244380665842 dev loss= 2.43190895258
Training loss after 4 epoch is 0.139430442641 dev loss= 3.13085800388
Training loss after 5 epoch is 0.120988297409 dev loss= 0.0939236086692
Training loss after 6 epoch is 0.0907304931697 dev loss= 0.0681077098176
Training loss after 7 epoch is 0.08055973015 dev loss= 0.565917118298
Training loss after 8 epoch is 0.0507540254568 dev loss= 0.0185877809504
Training loss after 9 epoch is 0.053658393525 dev loss= 0.220232151594
Training loss after 10 epoch is 0.0497627610749 dev loss= 0.607401155891
Training loss after 11 epoch is 0.0519580220017 dev loss= 0.025421864002
Training loss after 12 epoch is 0.0350311793248 dev loss= 0.0411268949335
Training loss after 13 epoch is 0.0359319721932 dev loss= 0.015483326106

In [27]:
# Spot-check the model on 10 random samples, printing the source, target and
# predicted sentences for each.
# NOTE(review): the samples are drawn from training_dataset, not from a
# held-out test set.
test_loss = evaluate(
    training_dataset,
    10,
    sourceLanguage,
    targetLanguage,
    encoder,
    decoder,
    criterion,
    verbose=True,
)



Sample 0
Source sentence = ['<sos>', 'how', 'many', 'problem_1', 'patients', 'have', 'been', 'tested', 'for', 'test_1', 'below', 'numeric_1', '<eos>']
Target sentence = ['<sos>', 'select', 'count(distinct', 'subject_id)', 'from', 'diagnoses_icd', 'where', 'icd9_code', 'in', '(select', 'distinct', 'icd9_code', 'from', 'd_icd_diagnoses', 'where', 'long_title', 'like', "'%problem_1%')", 'and', 'subject_id', 'in', '(select', 'distinct', 'subject_id', 'from', 'labevents', 'where', 'itemid', 'in', '(', 'select', 'distinct', 'itemid', 'from', 'd_labitems', 'where', 'label', 'like', "'%test_1%')", 'and', 'valuenum', '<', 'numeric_1', ')', '<eos>']
Predicted sentence = ['select', 'count(distinct', 'subject_id)', 'from', 'diagnoses_icd', 'where', 'icd9_code', 'in', '(select', 'distinct', 'icd9_code', 'from', 'd_icd_diagnoses', 'where', 'long_title', 'like', "'%problem_1%')", 'and', 'subject_id', 'in', '(select', 'distinct', 'subject_id', 'from', 'labevents', 'where', 'itemid', 'in', '(', 'select

In [28]:
# Display the mean loss returned by the preceding evaluate(...) call.
test_loss

1.9266274811125673e-05

In [None]:
# To start/stop the database server
#pg_ctl -D /scratch/at3577/postgresql/data -l /scratch/at3577/postgresql/logfile start
#pg_ctl stop -D /scratch/at3577/postgresql/data -m smart

In [4]:
from pg import DB

# Connect to the local 'mimic' database as user 'postgres' and print the first
# ten rows of mimiciii.admissions as a connectivity check.
db = DB(
    dbname='mimic',
    host='localhost',
    user='postgres',
)
print(db.query('select * from mimiciii.admissions limit 10'))

row_id|subject_id|hadm_id|     admittime     |     dischtime     |     deathtime     |admission_type|   admission_location    |   discharge_location    |insurance|language|    religion     |marital_status|      ethnicity      |     edregtime     |     edouttime     |                       diagnosis                        |hospital_expire_flag|has_chartevents_data
------+----------+-------+-------------------+-------------------+-------------------+--------------+-------------------------+-------------------------+---------+--------+-----------------+--------------+---------------------+-------------------+-------------------+--------------------------------------------------------+--------------------+--------------------
    21|        22| 165315|2196-04-09 12:26:00|2196-04-10 15:54:00|                   |EMERGENCY     |EMERGENCY ROOM ADMIT     |DISC-TRAN CANCER/CHLDRN H|Private  |        |UNOBTAINABLE     |MARRIED       |WHITE                |2196-04-09 10:06:00|2196-04-09 13:24:00|B

In [5]:
# Close the database connection opened in the query cell above.
db.close()