# LSTM Classification with Attention

We use the dataset and embeddings from a previous problem to demonstrate attention in TensorFlow. 

## Modules:

In [1]:
import pandas as pd
import csv
import collections
import matplotlib.pyplot as plt
from matplotlib import pylab
import numpy as np
import random
import math
import string
import itertools
import tensorflow as tf
from sklearn.manifold import TSNE
import time
import copy
from tqdm import tqdm
import os
import pickle

## Data Import/Initial Processing:

Note: passing `encoding = "ISO-8859-1"` to `pd.read_csv` avoids the decoding errors raised when the file is read as UTF-8.

In [2]:
# Read the raw CSV; the ISO-8859-1 encoding is passed explicitly to pd.read_csv.
pre_data = pd.read_csv('Text_Class_Prototype//TextClassification_Data.csv', encoding = "ISO-8859-1")
# Display the first rows as the cell output.
pre_data.head()

Unnamed: 0,fileid,SUMMARY,DATA,categories,sub_categories,previous_appointment,ID
0,2015561331001,Pt aware that he needs ROV for refill,{\rtf1\ansi\ftnbj{\fonttbl{\f0 \fswiss Arial;}...,PRESCRIPTION,REFILL,No,2015_5_6133_1001
1,2015561341001,Mom wants to know if the Focalin needs some do...,{\rtf1\ansi\ftnbj{\fonttbl{\f0 \fswiss Arial;}...,ASK_A_DOCTOR,MEDICATION RELATED,No,2015_5_6134_1001
2,2015561351001,pt called to discuss nortryptiline. she says s...,xxxx-xxxx\f0 \fswiss Arial;}}{\colortbl ;\red2...,ASK_A_DOCTOR,MEDICATION RELATED,No,2015_5_6135_1001
3,2015561361001,FYI Nortryptline medication.,xxxx-xxxx\f0 \fswiss Arial;}}{\colortbl ;\red2...,MISCELLANEOUS,OTHERS,No,2015_5_6136_1001
4,2015561371001,Letter of patient establishment request,{\rtf1\ansi\ftnbj{\fonttbl{\f0 \fswiss Arial;}...,MISCELLANEOUS,"SHARING OF HEALTH RECORDS (FAX, E-MAIL, ETC.)",No,2015_5_6137_1001


Reduce the dataset to the text and action-category columns, drop unnecessary data points, and fix category errors.

In [3]:
# Keep only the text and category columns and drop rows where either value is missing.
pre_data = pre_data[['SUMMARY', 'categories']].dropna()
def fix(data_frame):
    """Return a cleaned copy of ``data_frame``.

    Rows whose ``categories`` value is exactly ``'JUNK'`` are removed, and the
    two mis-cased values ``'mISCELLANEOUS'`` and ``'asK_A_DOCTOR'`` are replaced
    by their upper-case spellings. The input frame is not modified.
    """
    # Index labels of the rows to discard.
    junk_rows = data_frame.index[data_frame['categories'] == 'JUNK']
    without_junk = data_frame.drop(junk_rows, axis = 0)
    # The replacement applies to whole-cell matches in every column.
    wrong_spellings = ['mISCELLANEOUS', 'asK_A_DOCTOR']
    right_spellings = ['MISCELLANEOUS','ASK_A_DOCTOR']
    return without_junk.replace(wrong_spellings, right_spellings)

# Apply the cleaning step, then print how many examples remain per category.
pre_data = fix(pre_data)
print(collections.Counter(pre_data['categories']))

Counter({'PRESCRIPTION': 14500, 'APPOINTMENTS': 12960, 'ASK_A_DOCTOR': 11744, 'MISCELLANEOUS': 10463, 'LAB': 4246})


# LSTM with Attention 

### Forward Only and BiDirectional

In [4]:
# Load previously trained Word2Vec 5-gram embeddings.

def load_embeddings(embedding_name):
    """Load an embedding matrix and its two vocabulary dictionaries from disk.

    All three files are read from the directory ``<cwd>/<embedding_name>/``:

    * ``<embedding_name>.npy``           -- loaded with ``np.load``
    * ``Dictionary<suffix>.pickle``      -- unpickled
    * ``RevDictionary<suffix>.pickle``   -- unpickled

    where ``<suffix>`` is ``embedding_name`` with its first five characters
    removed.

    Returns the tuple ``(embeddings, dictionary, reverse_dictionary)``.
    """
    base_dir = os.path.join(os.getcwd(), embedding_name)
    suffix = embedding_name[5:]

    embeddings = np.load(os.path.join(base_dir, embedding_name + ".npy"))

    # NOTE(review): pickle.load can execute arbitrary code; only point this at
    # files you created yourself.
    unpickled = []
    for prefix in ("Dictionary", "RevDictionary"):
        pickle_path = os.path.join(base_dir, prefix + suffix + ".pickle")
        with open(pickle_path, "rb") as handle:
            unpickled.append(pickle.load(handle))
    dictionary, reverse_dictionary = unpickled
    return embeddings, dictionary, reverse_dictionary

# Load the embedding matrix and both vocabulary lookups from the "Embed_2019-10-06_16.57.49" directory.
embeddings, dictionary, reverse_dictionary = load_embeddings("Embed_2019-10-06_16.57.49")

In [5]:
# Test that embeddings and dictionary loaded properly:
# look up the row index stored for 'refill' and display that row of the embedding matrix.

embeddings[dictionary['refill']]

array([-3.54758091e-02,  4.42251340e-02, -1.04927778e-01, -6.09349944e-02,
        3.98972146e-02,  9.31131281e-03, -1.25223726e-01,  7.53500080e-03,
       -1.38559267e-01, -1.09519631e-01,  5.10512032e-02, -4.92008999e-02,
        1.50272110e-02, -8.88878256e-02, -5.26870079e-02,  1.06981238e-02,
       -3.02361138e-02,  3.60170864e-02,  1.22638404e-01,  3.42754312e-02,
        1.66808501e-01,  9.66842175e-02, -1.10033818e-01,  9.19144452e-02,
        1.57749012e-01, -7.89843574e-02, -6.32723495e-02,  3.49363312e-02,
       -3.63217667e-02,  1.63602401e-02,  1.64437354e-01, -2.45347177e-03,
        3.99290361e-02, -2.66418164e-03,  3.39282607e-03, -3.54395360e-02,
        8.31402689e-02, -6.25631437e-02,  4.52763308e-03, -6.81839585e-02,
       -5.21346368e-02,  8.64811316e-02, -4.14496334e-03,  7.28750825e-02,
       -7.67338574e-02, -6.07562065e-03,  3.65152992e-02, -1.11119702e-01,
        7.68361986e-02, -9.15446039e-03, -8.86462480e-02,  6.94847032e-02,
        7.60806128e-02, -

# Data Formatting

In [6]:
# Split the two remaining columns into parallel arrays:
# column 0 holds the SUMMARY text, column 1 the category name.
text_data = pre_data.values[...,0]
text_labels = pre_data.values[...,1]

In [7]:
def encode_labels(labels, print_stats = True):
    """One-hot encode an array of class labels.

    Parameters
    ----------
    labels : numpy array of labels (one entry per example).
    print_stats : when True, round-trip the encoding through the decoder and
        print either a success summary or 'Bijection failure.'.

    Returns
    -------
    out_labels : float32 array with one row per example and one column per
        distinct label (columns follow the order of ``np.unique(labels)``).
    lookup_function : label -> its one-hot row.
    reverse_lookup_function : one-hot row -> label (IndexError if no match).
    decode_labels : array of one-hot rows -> array of labels.
    """
    classes = np.unique(labels)
    # Comparing the distinct labels against themselves yields an identity matrix.
    one_hot_rows = (classes == classes[:,None]).astype(np.float32)
    label_to_vector = {name: one_hot_rows[k] for k, name in enumerate(classes.tolist())}
    out_labels = (classes == labels[:,None]).astype(np.float32)

    def lookup_function(label):
        return label_to_vector[label]

    def reverse_lookup_function(encoded_label):
        # Linear scan over the classes; returns the first matching label.
        for name, vector in label_to_vector.items():
            if np.array_equal(vector, encoded_label):
                return name
        raise IndexError('list index out of range')

    def decode_labels(encoded_label_array):
        row_count = encoded_label_array.shape[0]
        return np.array([reverse_lookup_function(encoded_label_array[r]) for r in range(row_count)])

    if print_stats:
        round_trip_ok = np.array_equal(decode_labels(out_labels), labels)
        if not round_trip_ok:
            print('Bijection failure.')
        else:
            print('Successful label encoding.')
            print('Number of classes: ' + str(classes.shape[0]))
            print('Number of labels: ' + str(out_labels.shape[0]))
    return out_labels, lookup_function, reverse_lookup_function, decode_labels

# One-hot encode the category names and keep the helper functions for converting in both directions.
labels, label2vec, vec2label, decode_labels = encode_labels(text_labels)

Successful label encoding.
Number of classes: 5
Number of labels: 53913


In [8]:
# Substrings that are surrounded by spaces so they become separate tokens.
stop_list = [".",",","!","?",":",";","/","(",")","-", "\ufeff", "\"", "\'s"]
# Unit suffixes that are split from a digit written directly in front of them.
measurements = ["mg", "cc", "lb", "lbs", "kg", "kgs"]
end_signature = '[..end..]'

def format_string(string):
    """Prepare a raw string for whitespace tokenisation.

    1. Every occurrence of each ``stop_list`` entry is padded with a space on
       both sides.
    2. For each entry of ``measurements``, if its FIRST occurrence is directly
       preceded by an ASCII digit, a space is inserted between the digit and
       the unit (e.g. ``"50mg"`` -> ``"50 mg"``). Later occurrences of the same
       unit are left untouched.
    3. The result is lower-cased.

    Returns the formatted string; callers are expected to ``.split()`` it.
    """
    for item in stop_list:
        string = string.replace(item, " " + item + " ")
    for item in measurements:
        index = string.find(item)
        # index > 0 excludes both "not found" (-1) and a unit at the very start
        # of the string; without this guard string[index - 1] would wrap around
        # to the last character and could insert a spurious leading space.
        if index > 0 and "0" <= string[index - 1] <= "9":
            string = string[:index] + " " + string[index:]
    return string.lower()

def format_data_LSTM(data):
    """Convert each string in ``data`` into an array of dictionary indices.

    Works on a deep copy of ``data``: every entry is passed through
    ``format_string``, split on whitespace, and each token is looked up in the
    module-level ``dictionary`` (a token missing from it raises ``KeyError``).
    Prints "Formatting complete." and returns the converted copy.
    """
    encoded = copy.deepcopy(data)
    for position in range(len(data)):
        tokens = format_string(encoded[position]).split()
        encoded[position] = np.array([dictionary[token] for token in tokens])
    print("Formatting complete.")
    return encoded

# Convert every summary string into an array of dictionary indices.
data = format_data_LSTM(text_data)

Formatting complete.


In [9]:
# Sanity check: map the first five encoded sequences back to words through
# reverse_dictionary and print each one on its own line, terminated by "<end>".

for j in range(5):
    for i in range(len(data[j])):
        print(reverse_dictionary[data[j][i]], end = " ")
    print("<end>")

pt aware that he needs rov for refill <end>
mom wants to know if the focalin needs some dosage adjusting <end>
pt called to discuss nortryptiline . she says she has a weird tas <end>
fyi nortryptline medication . <end>
letter of patient establishment request <end>


## Format Sequences for LSTM

Each string is formatted as an array of embedding lookup indices (integers).

### Append Origin to Embeddings:

In [10]:
embedding_dimension = int(embeddings.shape[1])
# Reserve index -1 for the empty-string token used as padding.
dictionary[""] = -1
reverse_dictionary[-1] = ""
# Append an all-zero row. Because it becomes the last row of the matrix,
# indexing embeddings with -1 returns this zero vector.
origin = np.zeros((embedding_dimension,))
embeddings = np.append(embeddings, np.array([origin]), axis = 0)
# NOTE(review): re-running this cell appends one more zero row each time.

### Sequence Padding:

In [11]:
# Length of the longest encoded sequence (0 when there are no sequences).

max_length = max((len(sequence) for sequence in data), default=0)
print(max_length)

28


In [12]:
def left_padding(sequence_array):
    """Return a deep copy of ``sequence_array`` with every sequence padded on
    the LEFT with -1 until it has ``max_length`` entries.

    ``max_length`` is read from module scope. A sequence longer than
    ``max_length`` makes ``np.full`` raise ``ValueError``.
    """
    padded = copy.deepcopy(sequence_array)
    for position in range(len(padded)):
        sequence = padded[position]
        missing = max_length - len(sequence)
        filler = np.full(missing, -1, dtype=int)
        # axis=None flattens both pieces into a single 1-D array.
        padded[position] = np.concatenate((filler, sequence), axis = None)
    return padded

def right_padding(sequence_array):
    """Return a deep copy of ``sequence_array`` with every sequence padded on
    the RIGHT with -1 until it has ``max_length`` entries.

    ``max_length`` is read from module scope. A sequence longer than
    ``max_length`` makes ``np.full`` raise ``ValueError``.
    """
    padded = copy.deepcopy(sequence_array)
    for position in range(len(padded)):
        sequence = padded[position]
        missing = max_length - len(sequence)
        filler = np.full(missing, -1, dtype=int)
        # axis=None flattens both pieces into a single 1-D array.
        padded[position] = np.concatenate((sequence, filler), axis = None)
    return padded

# Pad on the left so that the real tokens sit at the end of every sequence.
left_padded_data = left_padding(data)

In [13]:
# Sanity check: decode the first five padded sequences, skipping the padding
# entries (index -1 maps to the empty string), and print each on its own line.

for j in range(5):
    for i in range(len(left_padded_data[j])):
        if reverse_dictionary[left_padded_data[j][i]] != "":
            print(reverse_dictionary[left_padded_data[j][i]], end = " ")
    print("<end>")

pt aware that he needs rov for refill <end>
mom wants to know if the focalin needs some dosage adjusting <end>
pt called to discuss nortryptiline . she says she has a weird tas <end>
fyi nortryptline medication . <end>
letter of patient establishment request <end>


## Balanced Train/Valid/Test Split

In [14]:
def train_valid_test_split(data_array, label_array, 
                           train_test_ratio = .85, 
                           train_valid_ratio = .85, 
                           shuffle_count = 7,
                           print_shapes = True):
    """Split the data into train/validation/test sets class by class.

    For every distinct row of ``label_array`` the matching example indices are
    shuffled; the first ``floor(train_test_ratio * n)`` go to train+validation
    and the rest to test. Of the train+validation part, the first
    ``floor(train_valid_ratio * m)`` go to train and the rest to validation.
    The three pooled index lists are then shuffled ``shuffle_count`` times.

    ``data_array`` must support indexing with a list of indices, and each
    selected entry must have the same length so the entries can be stacked.

    Returns ``(train_data, train_labels, valid_data, valid_labels, test_data,
    test_labels)``; the data arrays are stacked and cast to ``int``. When
    ``print_shapes`` is True the six shapes are printed.
    """
    train_indices, valid_indices, test_indices = [], [], []
    distinct_labels = np.unique(label_array, axis = 0)  # Distinct label values (rows) present in label_array.
    for label in distinct_labels.tolist():
        members = [row for row, value in enumerate(label_array) if np.array_equal(value, label)]
        test_start = int(math.floor(train_test_ratio*len(members)))
        random.shuffle(members)
        train_valid_pool = members[:test_start]
        valid_start = int(math.floor(train_valid_ratio*len(train_valid_pool)))
        test_indices.extend(members[test_start:])
        train_indices.extend(train_valid_pool[:valid_start])
        valid_indices.extend(train_valid_pool[valid_start:])

    # Mix the classes together inside each split.
    for _ in range(shuffle_count):
        random.shuffle(train_indices)
        random.shuffle(valid_indices)
        random.shuffle(test_indices)

    def gather(indices):
        # Stack the selected sequences into one 2-D int array and pick the matching labels.
        return np.stack(data_array[indices], axis = 0).astype(int), label_array[indices]

    train_data, train_labels = gather(train_indices)
    valid_data, valid_labels = gather(valid_indices)
    test_data, test_labels = gather(test_indices)

    if print_shapes:
        report = (
            ("Train data shape: ", train_data),
            ("Train labels shape: ", train_labels),
            ("Valid data shape: ", valid_data),
            ("Valid labels shape: ", valid_labels),
            ("Test data shape: ", test_data),
            ("Test labels shape: ", test_labels),
        )
        for caption, array in report:
            print(caption + str(array.shape))
    return train_data, train_labels, valid_data, valid_labels, test_data, test_labels

In [15]:
# Split the left-padded sequences and their one-hot labels using the default ratios.
train_data, train_labels, valid_data, valid_labels, test_data, test_labels = train_valid_test_split(left_padded_data,
                                                                                                    labels)

Train data shape: (38949, 28)
Train labels shape: (38949, 5)
Valid data shape: (6876, 28)
Valid labels shape: (6876, 5)
Test data shape: (8088, 28)
Test labels shape: (8088, 5)


### LSTM Batch Generator

**TODO:** Refactor the batching so that a single data array is kept and only the indices of the drawn examples are shuffled between epochs.

In [16]:
batch_size = 64
# One LSTM step per position of the padded sequences.
number_of_unrollings = max_length 
number_of_classes = len(labels[0])
embedding_dimension = len(embeddings[0])
# Read cursor used by generate_LSTM_batch through `global data_index`.
data_index = 0

def generate_LSTM_batch(data, labels, batch_size, number_of_unrollings, embedding_dimension):
    """Build one batch of embedded sequences and their labels.

    Starting at the module-level cursor ``data_index``, takes ``batch_size``
    consecutive examples from ``data`` (wrapping around at the end), replaces
    every index in each sequence by its row of the module-level ``embeddings``
    matrix, and advances the cursor.

    Parameters
    ----------
    data : 2-D array of embedding indices, one row of ``number_of_unrollings``
        indices per example.
    labels : 2-D array with one label row per example; its second dimension
        sets the width of the returned label array.
    batch_size, number_of_unrollings, embedding_dimension : output dimensions.

    Returns
    -------
    batch : float32 array, shape (batch_size, number_of_unrollings, embedding_dimension).
    labels_out : float32 array, shape (batch_size, labels.shape[1]).
    """
    global data_index
    data_index = data_index % len(data)
    # The label width comes from `labels` itself rather than from a global
    # constant, so the function works for any one-hot label matrix.
    label_width = np.shape(labels)[1]
    batch = np.zeros(shape=(batch_size, number_of_unrollings, embedding_dimension), dtype=np.float32)
    labels_out = np.zeros(shape=(batch_size, label_width), dtype=np.float32)
    for i in range(batch_size):
        indices = data[data_index].astype(int)
        batch[i] = embeddings[indices].reshape(number_of_unrollings, embedding_dimension)
        labels_out[i] = labels[data_index]
        # Wrap around so a batch may span the end of the data.
        data_index = (data_index + 1) % len(data)
    return batch, labels_out

# Embed the complete validation set as a single batch (batch size = number of examples).
valid_total, valid_total_labels = generate_LSTM_batch(valid_data, 
                                                      valid_labels, 
                                                      len(valid_data), 
                                                      number_of_unrollings, 
                                                      embedding_dimension)

# Embed the complete test set as a single batch.
test_total, test_total_labels = generate_LSTM_batch(test_data, 
                                                    test_labels, 
                                                    len(test_data), 
                                                    number_of_unrollings, 
                                                    embedding_dimension)

# Pick training example 14 for the worked example: setting the cursor first makes
# the batch of size 1 below contain exactly the sentence stored in example_sentence.
data_index = 14
example_sentence = train_data[data_index]
example_data, example_label = generate_LSTM_batch(train_data, 
                                                  train_labels, 
                                                  1, 
                                                  number_of_unrollings, 
                                                  embedding_dimension)

### Accuracy Scoring Functions:

In [17]:
def first_order_accuracy(predicted_labels, true_labels):
    """Percentage of rows whose highest-scoring class equals the true class.

    Both arguments are 2-D arrays with one row per example; the class of a row
    is the position of its largest entry.
    """
    predicted_classes = np.argmax(predicted_labels, 1)
    true_classes = np.argmax(true_labels, 1)
    hit_count = np.sum(predicted_classes == true_classes)
    return 100 * hit_count / predicted_labels.shape[0]

def vanish_max_axis1(array):
    """Return a copy of ``array`` in which the largest entry of every row is set to 0.

    ``array`` is a 2-D array (one row per example). The input itself is left
    untouched, so callers can keep using their original scores afterwards.
    """
    result = np.array(array, copy=True)
    top_positions = np.argmax(result, 1)
    # Zero exactly one entry per row: (row k, column top_positions[k]).
    result[np.arange(len(result)), top_positions] = 0
    return result

def second_order_accuracy(predicted_labels, true_labels):
    """Percentage of rows whose second-highest-scoring class equals the true class.

    The top score of each row is zeroed on a copy of ``predicted_labels`` and
    the arg-max of what remains is compared with the arg-max of
    ``true_labels``. ``predicted_labels`` is not modified.
    """
    second_predicted_labels = vanish_max_axis1(predicted_labels)
    return(100*np.sum(np.argmax(second_predicted_labels, 1)==np.argmax(true_labels, 1))
           /predicted_labels.shape[0])

### Forward LSTM with Attention Graph:

In [18]:
### Hyperparameters ###
number_of_nodes = 64
initial_learning_rate = 1.
init_truncation = .01
#######################

# Builds the TensorFlow graph for a forward-only LSTM whose per-step outputs are
# combined by an attention layer before a softmax classifier.
LSTM_UniAttn_graph = tf.Graph()
with LSTM_UniAttn_graph.as_default():
    # Input
    T = tf.placeholder(tf.float32, shape = [batch_size, number_of_unrollings, embedding_dimension])
    Tlabel = tf.placeholder(tf.float32, shape = [batch_size, number_of_classes])
    # V holds the validation set; it is not referenced again in this cell.
    V = tf.constant(valid_total)
    E = tf.constant(test_total)
    
    # LSTM variables
    # NOTE(review): the 2nd and 3rd positional arguments of tf.truncated_normal are
    # `mean` and `stddev`, so every call below samples with mean -init_truncation and
    # stddev init_truncation rather than from the range [-init_truncation, init_truncation].
    # Confirm this is intended.
    # Input gate.
    X_i = tf.Variable(tf.truncated_normal([embedding_dimension, number_of_nodes], -init_truncation, init_truncation))
    M_i = tf.Variable(tf.truncated_normal([number_of_nodes, number_of_nodes], -init_truncation, init_truncation))
    b_i = tf.Variable(tf.zeros([1, number_of_nodes]))
    # Forget gate.
    X_f = tf.Variable(tf.truncated_normal([embedding_dimension, number_of_nodes], -init_truncation, init_truncation))
    M_f = tf.Variable(tf.truncated_normal([number_of_nodes, number_of_nodes], -init_truncation, init_truncation))
    b_f = tf.Variable(tf.zeros([1, number_of_nodes]))
    # Memory cell.
    X_c = tf.Variable(tf.truncated_normal([embedding_dimension, number_of_nodes], -init_truncation, init_truncation))
    M_c = tf.Variable(tf.truncated_normal([number_of_nodes, number_of_nodes], -init_truncation, init_truncation))
    b_c = tf.Variable(tf.zeros([1, number_of_nodes]))
    # Output gate.
    X_o = tf.Variable(tf.truncated_normal([embedding_dimension, number_of_nodes], -init_truncation, init_truncation))
    M_o = tf.Variable(tf.truncated_normal([number_of_nodes, number_of_nodes], -init_truncation, init_truncation))
    b_o = tf.Variable(tf.zeros([1, number_of_nodes]))
    
    # Attention variables.
    # Context vector.
    C_attn = tf.Variable(tf.truncated_normal([number_of_nodes//2 , 1], -init_truncation, init_truncation))
    # Attention weights and biases.
    W_attn = tf.Variable(tf.truncated_normal([number_of_nodes, number_of_nodes//2], -init_truncation, init_truncation))
    b_attn = tf.Variable(tf.zeros([number_of_nodes//2]))
    
    # Classifier weights and biases.
    W = tf.Variable(tf.truncated_normal([number_of_nodes, number_of_classes], -init_truncation, init_truncation))
    b = tf.Variable(tf.zeros([number_of_classes]))
    
    # Initialize time 0 state and previous output. 
    initial_output = tf.Variable(tf.zeros([batch_size, number_of_nodes]), trainable=False)
    initial_state = tf.Variable(tf.zeros([batch_size, number_of_nodes]), trainable=False)
     
    def LSTM_cell(sequence_element, previous_output, state):
        """One LSTM step: returns (new output, new cell state)."""
        input_gate = tf.sigmoid(tf.matmul(sequence_element, X_i) + tf.matmul(previous_output, M_i) + b_i)
        forget_gate = tf.sigmoid(tf.matmul(sequence_element, X_f) + tf.matmul(previous_output, M_f) + b_f)
        update = tf.matmul(sequence_element, X_c) + tf.matmul(previous_output, M_c) + b_c
        state = forget_gate * state + input_gate * tf.tanh(update)
        output_gate = tf.sigmoid(tf.matmul(sequence_element, X_o) + tf.matmul(previous_output, M_o) + b_o)
        return output_gate * tf.tanh(state), state

    # Unrolled LSTM loop.
    def LSTM_loop(data, number_of_unrollings, initial_output, initial_state):
        """Apply LSTM_cell to data[:, i] for each step i and return the list of per-step outputs."""
        outputs = list()
        output = initial_output
        state = initial_state
        for i in range(number_of_unrollings):
            output, state = LSTM_cell(data[:,i], output, state)
            outputs.append(output)
        return outputs
    
    # Attention model:
    def attention_layer(outputs, W_attn, b_attn, C_attn):
        """Weight each per-step output by a softmax attention score.

        Returns (list of weighted outputs, S), where S holds one attention
        weight per example and step (steps along axis 1).
        """
        # Hidden representation of every step: tanh(output . W_attn + b_attn).
        U = list()
        for i in range(len(outputs)):
            u = tf.tanh(tf.nn.xw_plus_b(outputs[i], W_attn, b_attn))
            U.append(u)
    
        # Score of every step: projection onto the context vector.
        C = list()
        for i in range(len(U)):
            c = tf.matmul(U[i], C_attn)
            C.append(c)
    
        # Normalise the scores across the steps.
        C_concat = tf.concat(C, 1)
        S = tf.nn.softmax(C_concat, 1)
        
        # This loop uses number_of_unrollings from the enclosing scope, not len(outputs).
        attn_outputs = list()
        for i in range(number_of_unrollings):
            attn_output = tf.multiply(outputs[i], S[:,i][:, tf.newaxis])
            attn_outputs.append(attn_output)
        return attn_outputs, S
    
    outputs = LSTM_loop(T, number_of_unrollings, initial_output, initial_state)
    attn_outputs, S = attention_layer(outputs, W_attn, b_attn, C_attn)
    # attn_outputs, last_attn = attention_layer(outputs, W_attn, b_attn, C_attn)
    # Compute logits and loss for attention-weighted sum of outputs.
    L = tf.nn.xw_plus_b(tf.math.add_n(attn_outputs), W, b)
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels = Tlabel, logits=L))
    
    # Decaying learning rate for optimizer.
    global_step = tf.Variable(0)
    learning_rate = tf.train.exponential_decay(initial_learning_rate, global_step, 5000, 0.1, staircase=True)
    optimizer = tf.train.AdagradOptimizer(learning_rate)    
    
    # Apply gradient clipping. 
    gradients, v = zip(*optimizer.compute_gradients(loss))
    gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
    # From here on `optimizer` names the training op, not the optimizer object.
    optimizer = optimizer.apply_gradients(zip(gradients, v), global_step=global_step)
    
    # Training predictions. 
    train_predict = tf.nn.softmax(L)
    
    # Test predictions. 
    # Initialize output, state at origin. 
    test_initial_output = tf.Variable(tf.zeros([len(test_total), number_of_nodes]), trainable = False)
    test_initial_state = tf.Variable(tf.zeros([len(test_total), number_of_nodes]), trainable = False)
    
    # Same weights as the training path, applied to the whole test set at once.
    test_outputs = LSTM_loop(E, number_of_unrollings, test_initial_output, test_initial_state)
    test_attn_outputs, _ = attention_layer(test_outputs, W_attn, b_attn, C_attn)
    test_predict = tf.nn.softmax(tf.nn.xw_plus_b(tf.math.add_n(test_attn_outputs), W, b))

### Session:

In [19]:
number_of_iterations = 1001
# Restart batch generation from the first training example.
data_index = 0

# Train the forward-only attention model, capture attention weights at two
# points during training, then score the test set.
with tf.Session(graph=LSTM_UniAttn_graph) as session:
    tf.global_variables_initializer().run()
    print('Initialized')
    # Running sum of the per-batch loss; it is accumulated but not reported in this cell.
    mean_loss = 0
    for iteration in tqdm(range(number_of_iterations)):
        batch_data, batch_labels = generate_LSTM_batch(train_data, 
                                                       train_labels, 
                                                       batch_size, 
                                                       number_of_unrollings, 
                                                       embedding_dimension)
        feed_dict = {T : batch_data, Tlabel : batch_labels}
        _, l, train_predictions, lr = session.run([optimizer, loss, train_predict, learning_rate], feed_dict=feed_dict)
        mean_loss += l
        # Snapshot of the attention weights early in training, for comparison later.
        if iteration == 10:
            ten_attention_coeff = S.eval(feed_dict = feed_dict)
        
    # Attention weights for the last training batch (feed_dict still holds it).
    last_attention_coeff = S.eval(feed_dict = feed_dict)
    test_predictions = test_predict.eval()
    
    test_acc1 = first_order_accuracy(test_predictions, test_total_labels)
    test_acc2 = second_order_accuracy(test_predictions, test_total_labels)
    print('====== Test Report ======')
    print('First-order accuracy: %.1f%%' % test_acc1)
    print('Second-order accuracy: %.1f%%' % test_acc2)
    print('Accuracy when first and second choice are counted: %.1f%%' % (test_acc1+test_acc2))

Initialized


100%|██████████████████████████████████████████████████████████████████████████████| 1001/1001 [00:11<00:00, 87.60it/s]


First-order accuracy: 73.2%
Second-order accuracy: 14.7%
Accuracy when first and second choice are counted: 87.9%


In [20]:
# Sanity check: each row of the attention matrix should sum to 1. Comparing the
# iteration-10 snapshot with the last one shows how the maximum weight changed.

print("Shape of attention matrix:")
print(last_attention_coeff.shape, end = '\n\n')

# First three examples of the batch seen at iteration 10.
print("Iteration 10:", end = "\n\n")
print("Sum / Max / Argmax")
for i in range(3):
    print(np.sum(ten_attention_coeff[i]), end =" / ")
    print(np.max(ten_attention_coeff[i]), end =" / ")
    print(np.argmax(ten_attention_coeff[i]), end = "\n\n")

# First three examples of the last training batch.
print("Last iteration:", end = "\n\n")
print("Sum / Max / Argmax")
for i in range(3):
    print(np.sum(last_attention_coeff[i]), end =" / ")
    print(np.max(last_attention_coeff[i]), end =" / ")
    print(np.argmax(last_attention_coeff[i]), end = "\n\n")

# Full weight vectors, rounded to three decimals.
print("Attention weights examples:")
for i in range(3):
    print([round(entry, 3) for entry in last_attention_coeff[i]])

Shape of attention matrix:
(64, 28)

Iteration 10:

Sum / Max / Argmax
1.0 / 0.035736352 / 27

0.9999999 / 0.03572114 / 27

0.9999999 / 0.035744026 / 26

Last iteration:

Sum / Max / Argmax
0.99999976 / 0.8471151 / 18

1.0 / 0.7061416 / 27

1.0 / 0.70872587 / 16

Attention weights examples:
[0.001, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.114, 0.847, 0.033, 0.0, 0.001, 0.002, 0.0, 0.0, 0.0, 0.0, 0.0]
[0.003, 0.001, 0.001, 0.0, 0.0, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.277, 0.706]
[0.001, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.005, 0.147, 0.035, 0.004, 0.709, 0.001, 0.009, 0.0, 0.022, 0.002, 0.051, 0.001, 0.0, 0.003, 0.009, 0.0]


### Bidirectional LSTM with Attention Graph:

In [21]:
### Hyperparameters ###
number_of_nodes = 64
initial_learning_rate = 1.
init_truncation = .1
#######################

# Builds the TensorFlow graph for a bidirectional LSTM: a forward and a backward
# LSTM run over the same input, their per-step outputs are concatenated, and an
# attention layer combines the steps before a softmax classifier.
BiLSTM_graph = tf.Graph()
with BiLSTM_graph.as_default():
    # Input
    T = tf.placeholder(tf.float32, shape = [batch_size, number_of_unrollings, embedding_dimension])
    Tlabel = tf.placeholder(tf.float32, shape = [batch_size, number_of_classes])
    # V holds the validation set; it is not referenced again in this cell.
    V = tf.constant(valid_total)
    E = tf.constant(test_total)
    
    # NOTE(review): the 2nd and 3rd positional arguments of tf.truncated_normal are
    # `mean` and `stddev`, so every call below samples with mean -init_truncation and
    # stddev init_truncation rather than from the range [-init_truncation, init_truncation].
    # Confirm this is intended.
    # Forward LSTM variables
    # Input gate.
    X_if = tf.Variable(tf.truncated_normal([embedding_dimension, number_of_nodes], -init_truncation, init_truncation))
    M_if = tf.Variable(tf.truncated_normal([number_of_nodes, number_of_nodes], -init_truncation, init_truncation))
    b_if = tf.Variable(tf.zeros([1, number_of_nodes]))
    # Forget gate.
    X_ff = tf.Variable(tf.truncated_normal([embedding_dimension, number_of_nodes], -init_truncation, init_truncation))
    M_ff = tf.Variable(tf.truncated_normal([number_of_nodes, number_of_nodes], -init_truncation, init_truncation))
    b_ff = tf.Variable(tf.zeros([1, number_of_nodes]))
    # Memory cell.
    X_cf = tf.Variable(tf.truncated_normal([embedding_dimension, number_of_nodes], -init_truncation, init_truncation))
    M_cf = tf.Variable(tf.truncated_normal([number_of_nodes, number_of_nodes], -init_truncation, init_truncation))
    b_cf = tf.Variable(tf.zeros([1, number_of_nodes]))
    # Output gate.
    X_of = tf.Variable(tf.truncated_normal([embedding_dimension, number_of_nodes], -init_truncation, init_truncation))
    M_of = tf.Variable(tf.truncated_normal([number_of_nodes, number_of_nodes], -init_truncation, init_truncation))
    b_of = tf.Variable(tf.zeros([1, number_of_nodes]))
    
    # Backward LSTM variables
    # Input gate.
    X_ib = tf.Variable(tf.truncated_normal([embedding_dimension, number_of_nodes], -init_truncation, init_truncation))
    M_ib = tf.Variable(tf.truncated_normal([number_of_nodes, number_of_nodes], -init_truncation, init_truncation))
    b_ib = tf.Variable(tf.zeros([1, number_of_nodes]))
    # Forget gate.
    X_fb = tf.Variable(tf.truncated_normal([embedding_dimension, number_of_nodes], -init_truncation, init_truncation))
    M_fb = tf.Variable(tf.truncated_normal([number_of_nodes, number_of_nodes], -init_truncation, init_truncation))
    b_fb = tf.Variable(tf.zeros([1, number_of_nodes]))
    # Memory cell.
    X_cb = tf.Variable(tf.truncated_normal([embedding_dimension, number_of_nodes], -init_truncation, init_truncation))
    M_cb = tf.Variable(tf.truncated_normal([number_of_nodes, number_of_nodes], -init_truncation, init_truncation))
    b_cb = tf.Variable(tf.zeros([1, number_of_nodes]))
    # Output gate.
    X_ob = tf.Variable(tf.truncated_normal([embedding_dimension, number_of_nodes], -init_truncation, init_truncation))
    M_ob = tf.Variable(tf.truncated_normal([number_of_nodes, number_of_nodes], -init_truncation, init_truncation))
    b_ob = tf.Variable(tf.zeros([1, number_of_nodes]))
    
    # Attention variables.
    # The attention input is the concatenation of both directions, hence 2*number_of_nodes rows in W_attn.
    # Context vector.
    C_attn = tf.Variable(tf.truncated_normal([number_of_nodes , 1], -init_truncation, init_truncation))
    # Attention weights and biases.
    W_attn = tf.Variable(tf.truncated_normal([2*number_of_nodes, number_of_nodes], -init_truncation, init_truncation))
    b_attn = tf.Variable(tf.zeros([number_of_nodes]))
    
    # Classifier weights and biases.
    W = tf.Variable(tf.truncated_normal([2*number_of_nodes, number_of_classes], -init_truncation, init_truncation))
    b = tf.Variable(tf.zeros([number_of_classes]))
    
    # Initialize time 0 state and previous output. 
    initial_output = tf.Variable(tf.zeros([batch_size, number_of_nodes]), trainable=False)
    initial_state = tf.Variable(tf.zeros([batch_size, number_of_nodes]), trainable=False)
     
    def forward_LSTM_cell(sequence_element, previous_output, state):
        """One step of the forward LSTM: returns (new output, new cell state)."""
        input_gate = tf.sigmoid(tf.matmul(sequence_element, X_if) + tf.matmul(previous_output, M_if) + b_if)
        forget_gate = tf.sigmoid(tf.matmul(sequence_element, X_ff) + tf.matmul(previous_output, M_ff) + b_ff)
        update = tf.matmul(sequence_element, X_cf) + tf.matmul(previous_output, M_cf) + b_cf
        state = forget_gate * state + input_gate * tf.tanh(update)
        output_gate = tf.sigmoid(tf.matmul(sequence_element, X_of) + tf.matmul(previous_output, M_of) + b_of)
        return output_gate * tf.tanh(state), state
    
    def backward_LSTM_cell(sequence_element, previous_output, state):
        """One step of the backward LSTM: returns (new output, new cell state)."""
        input_gate = tf.sigmoid(tf.matmul(sequence_element, X_ib) + tf.matmul(previous_output, M_ib) + b_ib)
        forget_gate = tf.sigmoid(tf.matmul(sequence_element, X_fb) + tf.matmul(previous_output, M_fb) + b_fb)
        update = tf.matmul(sequence_element, X_cb) + tf.matmul(previous_output, M_cb) + b_cb
        state = forget_gate * state + input_gate * tf.tanh(update)
        output_gate = tf.sigmoid(tf.matmul(sequence_element, X_ob) + tf.matmul(previous_output, M_ob) + b_ob)
        return output_gate * tf.tanh(state), state

    # Unrolled LSTM loops.
    def forward_LSTM_loop(data, number_of_unrollings, initial_output, initial_state):
        """Run the forward cell over steps 0..n-1; outputs[i] belongs to step i."""
        outputs = list()
        output = initial_output
        state = initial_state
        for i in range(number_of_unrollings):
            output, state = forward_LSTM_cell(data[:,i], output, state)
            outputs.append(output)
        return outputs
    
    def backward_LSTM_loop(data, number_of_unrollings, initial_output, initial_state):
        """Run the backward cell from the last step to the first; outputs[i] belongs to step n-1-i."""
        outputs = list()
        output = initial_output
        state = initial_state
        for i in range(number_of_unrollings):
            output, state = backward_LSTM_cell(data[:,-i-1], output, state)
            outputs.append(output)
        return outputs
    
    def back_and_forth_concat(forward_outputs, backward_outputs, number_of_unrollings):
        """Concatenate, for every step i, the forward and backward outputs of that same step."""
        outputs = list()
        for i in range(number_of_unrollings):
            # backward_outputs is stored in reverse step order, so index -i-1 is step i.
            output = tf.concat([forward_outputs[i], backward_outputs[-i-1]], axis = 1)
            outputs.append(output)
        return outputs
    
    # Attention model:
    def attention_layer(outputs, W_attn, b_attn, C_attn):
        """Weight each per-step output by a softmax attention score.

        Returns (list of weighted outputs, S), where S holds one attention
        weight per example and step (steps along axis 1).
        """
        # Hidden representation of every step: tanh(output . W_attn + b_attn).
        U = list()
        for i in range(len(outputs)):
            u = tf.tanh(tf.nn.xw_plus_b(outputs[i], W_attn, b_attn))
            U.append(u)
    
        # Score of every step: projection onto the context vector.
        C = list()
        for i in range(len(U)):
            c = tf.matmul(U[i], C_attn)
            C.append(c)
    
        # Normalise the scores across the steps.
        C_concat = tf.concat(C, 1)
        S = tf.nn.softmax(C_concat, 1)
        
        # This loop uses number_of_unrollings from the enclosing scope, not len(outputs).
        attn_outputs = list()
        for i in range(number_of_unrollings):
            attn_output = tf.multiply(outputs[i], S[:,i][:, tf.newaxis])
            attn_outputs.append(attn_output)
        return attn_outputs, S
    
    # Model.
    # Both directions start from the same zero output/state variables.
    forward_outputs = forward_LSTM_loop(T, number_of_unrollings, initial_output, initial_state)
    backward_outputs = backward_LSTM_loop(T, number_of_unrollings, initial_output, initial_state)
    outputs = back_and_forth_concat(forward_outputs, backward_outputs, number_of_unrollings)
    attn_outputs, S = attention_layer(outputs, W_attn, b_attn, C_attn)
    
    # Compute logits and loss for attention-weighted sum of outputs.
    L = tf.nn.xw_plus_b(tf.math.add_n(attn_outputs), W, b)
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels = Tlabel, logits=L))
    
    # Decaying learning rate for optimizer.
    global_step = tf.Variable(0)
    learning_rate = tf.train.exponential_decay(initial_learning_rate, global_step, 5000, 0.1, staircase=True)
    optimizer = tf.train.AdagradOptimizer(learning_rate)    
    
    # Apply gradient clipping. 
    gradients, v = zip(*optimizer.compute_gradients(loss))
    gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
    # From here on `optimizer` names the training op, not the optimizer object.
    optimizer = optimizer.apply_gradients(zip(gradients, v), global_step=global_step)
    
    # Training predictions. 
    train_predict = tf.nn.softmax(L)
    
    # Test predictions. 
    
    # Initialize output, state at origin. 
    test_initial_output = tf.Variable(tf.zeros([len(test_total), number_of_nodes]), trainable = False)
    test_initial_state = tf.Variable(tf.zeros([len(test_total), number_of_nodes]), trainable = False)
    
    # Same weights as the training path, applied to the whole test set at once.
    test_forward_outputs = forward_LSTM_loop(E, number_of_unrollings, test_initial_output, test_initial_state)
    test_backward_outputs = backward_LSTM_loop(E, number_of_unrollings, test_initial_output, test_initial_state)
    test_outputs = back_and_forth_concat(test_forward_outputs, test_backward_outputs, number_of_unrollings)
    test_attn_outputs, _ = attention_layer(test_outputs, W_attn, b_attn, C_attn)
    test_predict = tf.nn.softmax(tf.nn.xw_plus_b(tf.math.add_n(test_attn_outputs), W, b))
    
    # Example generator.
    # A third copy of the model path with batch size 1, fed with the single example sentence.
    evaluation_example = tf.constant(example_data)
    
    example_initial_output = tf.Variable(tf.zeros([1, number_of_nodes]), trainable = False)
    example_initial_state = tf.Variable(tf.zeros([1, number_of_nodes]), trainable = False)
    
    example_forward_outputs = forward_LSTM_loop(evaluation_example, number_of_unrollings, example_initial_output, example_initial_state)
    example_backward_outputs = backward_LSTM_loop(evaluation_example, number_of_unrollings, example_initial_output, example_initial_state)
    example_outputs = back_and_forth_concat(example_forward_outputs, example_backward_outputs, number_of_unrollings)
    example_attn_outputs, S_example = attention_layer(example_outputs, W_attn, b_attn, C_attn)
    example_predict = tf.nn.softmax(tf.nn.xw_plus_b(tf.math.add_n(example_attn_outputs), W, b))
    

### Session:

In [22]:
number_of_iterations = 1001
# Restart batch generation from the first training example.
data_index = 0

# Train the bidirectional attention model, capture attention weights, and
# evaluate the single example sentence.
with tf.Session(graph=BiLSTM_graph) as session:
    tf.global_variables_initializer().run()
    print('Initialized')
    # Running sum of the per-batch loss; it is accumulated but not reported in this cell.
    mean_loss = 0
    for iteration in tqdm(range(number_of_iterations)):
        batch_data, batch_labels = generate_LSTM_batch(train_data, 
                                                       train_labels, 
                                                       batch_size, 
                                                       number_of_unrollings, 
                                                       embedding_dimension)
        feed_dict = {T : batch_data, Tlabel : batch_labels}
        _, l, train_predictions, lr = session.run([optimizer, loss, train_predict, learning_rate], feed_dict=feed_dict)
        mean_loss += l
        # Snapshot of the attention weights early in training.
        if iteration == 10:
            ten_attention_coeff = S.eval(feed_dict = feed_dict)
        
    # Attention weights for the last training batch (feed_dict still holds it).
    last_attention_coeff = S.eval(feed_dict = feed_dict)
    
    # Test-set scoring is switched off; set `test` to True to print the report.
    test = False
    if test:
        test_predictions = test_predict.eval()
        test_acc1 = first_order_accuracy(test_predictions, test_total_labels)
        test_acc2 = second_order_accuracy(test_predictions, test_total_labels)
        print('====== Test Report ======')
        print('First-order accuracy: %.1f%%' % test_acc1)
        print('Second-order accuracy: %.1f%%' % test_acc2)
        print('Accuracy when first and second choice are counted: %.1f%%' % (test_acc1+test_acc2))
    
    # Evaluate the example. 
    # NOTE(review): this rebinds `example_predict` from the graph tensor to its
    # evaluated array, so this cell cannot be re-run without first re-running
    # the graph-definition cell.
    example_predict = example_predict.eval()
    example_attn = S_example.eval()

Initialized


100%|██████████████████████████████████████████████████████████████████████████████| 1001/1001 [00:17<00:00, 56.09it/s]


### Example:

In [23]:
# Print the example sentence, its true label, the predicted class scores and
# the attention weight given to every sequence position.
print("==== EXAMPLE ====", end = "\n\n")
print("Text:", end = " ")
# Padding positions decode to the empty string and are skipped.
for i in range(len(example_sentence)):
    if reverse_dictionary[example_sentence[i]] != "":
        print(reverse_dictionary[example_sentence[i]], end = " ")
print("<end>")
print("True Label:", end = " ")
# reshape(-1) flattens the single-example arrays without hard-coding the
# number of classes or the number of sequence positions.
print(example_label.reshape(-1), end = " = ")
print(vec2label(example_label.reshape(-1)))
print("Prediction Score:", end = " ")
print([round(entry, 3) for entry in example_predict.reshape(-1)])
print([round(entry, 3) for entry in example_attn.reshape(-1)])

==== EXAMPLE ====

Text: naratriptan rx - pls sign on 3 / 3 request <end>
True Label: [0. 0. 0. 0. 1.] = PRESCRIPTION
Prediction Score: [0.006, 0.022, 0.001, 0.006, 0.965]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.003, 0.004, 0.011, 0.018, 0.851, 0.02, 0.055, 0.001, 0.025, 0.003, 0.0, 0.001, 0.0]
