In [10]:
# Code by Sarah Wiegreffe (saw@gatech.edu)
# Fall 2019

import numpy as np

import torch
from torch import nn
import random

####### Do not modify these imports.
import torch.nn.functional as F
import math
from torch.autograd import Variable
import math, copy, time
class ClassificationTransformer(nn.Module):
    """
    A single-layer Transformer which encodes a sequence of text and 
    performs binary classification.

    The model has a vocab size of V, works on
    sequences of length T, has an hidden dimension of H, uses word vectors
    also of dimension H, and operates on minibatches of size N.
    """
    def __init__(self, word_to_ix, hidden_dim=128, num_heads=2, dim_feedforward=2048, dim_k=96, dim_v=96, dim_q=96, max_length=43):
        '''
        :param word_to_ix: dictionary mapping words to unique indices
        :param hidden_dim: the dimensionality of the output embeddings that go into the final layer
        :param num_heads: the number of Transformer heads to use
        :param dim_feedforward: the dimension of the feedforward network model
        :param dim_k: the dimensionality of the key vectors
        :param dim_q: the dimensionality of the query vectors
        :param dim_v: the dimensionality of the value vectors
        '''        
        super(ClassificationTransformer, self).__init__()
        assert hidden_dim % num_heads == 0
        
        self.num_heads = num_heads
        self.word_embedding_dim = hidden_dim
        self.hidden_dim = hidden_dim
        self.dim_feedforward = dim_feedforward
        self.max_length = max_length
        self.vocab_size = len(word_to_ix)
        
        self.dim_k = dim_k
        self.dim_v = dim_v
        self.dim_q = dim_q
        
        seed_torch(0)
        
        ##############################################################################
        # Deliverable 1: Initialize what you need for the embedding lookup (1 line). #
        # Hint: you will need to use the max_length parameter above.                 #
        ##############################################################################
        # create constant 'pe' matrix with values dependant on 
        # pos and i
        # import math
        # pe = torch.zeros(max_length, hidden_dim)
        # for pos in range(max_length):
        #     for i in range(0, hidden_dim, 2):
        #         pe[pos, i] = \
        #         math.sin(pos / (10000 ** ((2 * i)/hidden_dim)))
        #         pe[pos, i + 1] = \
        #         math.cos(pos / (10000 ** ((2 * (i + 1))/hidden_dim)))
                
        # pe = pe.unsqueeze(0)
#         self.emb_posi = PositionalEncoding(hidden_dim, 0, max_length)
        self.emb_posi = nn.Embedding(max_length, hidden_dim)
        self.emb_word = nn.Embedding(self.vocab_size, hidden_dim)
        # self.embed = PositionalEncoder()
        ##############################################################################
        #                               END OF YOUR CODE                             #
        ##############################################################################
        
        
        ##############################################################################
        # Deliverable 2: Initializations for multi-head self-attention.              #
        # You don't need to do anything here. Do not modify this code.               #
        ##############################################################################
        
        # Head #1
        self.k1 = nn.Linear(self.hidden_dim, self.dim_k)
        self.v1 = nn.Linear(self.hidden_dim, self.dim_v)
        self.q1 = nn.Linear(self.hidden_dim, self.dim_q)
        
        # Head #2
        self.k2 = nn.Linear(self.hidden_dim, self.dim_k)
        self.v2 = nn.Linear(self.hidden_dim, self.dim_v)
        self.q2 = nn.Linear(self.hidden_dim, self.dim_q)
        
        self.softmax = nn.Softmax(dim=2)
        self.attention_head_projection = nn.Linear(self.dim_v * self.num_heads, self.hidden_dim)
        self.norm_mh = nn.LayerNorm(self.hidden_dim)

        
        ##############################################################################
        # Deliverable 3: Initialize what you need for the feed-forward layer.        # 
        # Don't forget the layer normalization.                                      #
        ##############################################################################
        self.fc1 = nn.Linear(self.hidden_dim, self.dim_feedforward, bias = True)
        self.fc2 = nn.Linear(self.dim_feedforward, self.hidden_dim, bias = True)
        self.norm_fc = nn.LayerNorm(self.hidden_dim)
        
        ##############################################################################
        #                               END OF YOUR CODE                             #
        ##############################################################################

        
        ##############################################################################
        # Deliverable 4: Initialize what you need for the final layer (1-2 lines).   #
        ##############################################################################
        self.classify = nn.Linear(self.hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()
        
        ##############################################################################
        #                               END OF YOUR CODE                             #
        ##############################################################################

        
    def forward(self, inputs):
        '''
        This function computes the full Transformer forward pass.
        Put together all of the layers you've developed in the correct order.

        :param inputs: a PyTorch tensor of shape (N,T). These are integer lookups. 

        :returns: the model outputs. Should be normalized scores of shape (N,1).
        '''
        outputs = None
        #############################################################################
        # Deliverable 5: Implement the full Transformer stack for the forward pass. #
        # You will need to use all of the methods you have previously defined above.#
        # You should only be calling ClassificationTransformer class methods here.  #
        #############################################################################
        x = self.embed(inputs)
        x = self.multi_head_attention(x)
        x = self.feedforward_layer(x)
        outputs = self.final_layer(x)
        
        ##############################################################################
        #                               END OF YOUR CODE                             #
        ##############################################################################
        return outputs
    
    
    def embed(self, inputs):
        """
        :param inputs: intTensor of shape (N,T)
        :returns embeddings: floatTensor of shape (N,T,H)
        """
        embeddings = None
        #############################################################################
        # Deliverable 1: Implement the embedding lookup.                            #
        # Note: word_to_ix has keys from 0 to self.vocab_size - 1                   #
        # This will take a few lines.                                               #
        #############################################################################
        # N_size = inputs.size()[0]
        # T_size = inputs.size()[1]
        N_size, T_size = inputs.shape
        tmp_embd = torch.zeros(N_size, T_size, self.hidden_dim)
        # for i in range(T_size):
        #     tmp_embd[:, i, :] += self.emb_word(inputs[:, i])
        #     tmp_embd[:, i, :] += self.emb_posi(i * torch.ones(N_size, dtype = torch.long))

        embeddings = tmp_embd
        ##############################################################################
        #                               END OF YOUR CODE                             #
        ##############################################################################
        return embeddings
        
    def multi_head_attention(self, inputs):
        """
        :param inputs: float32 Tensor of shape (N,T,H)
        :returns outputs: float32 Tensor of shape (N,T,H)
        
        Traditionally we'd include a padding mask here, so that pads are ignored.
        This is a simplified implementation.
        """
        
        outputs = None
        #############################################################################
        # Deliverable 2: Implement multi-head self-attention followed by add + norm.#
        # Use the provided 'Deliverable 2' layers initialized in the constructor.   #
        #############################################################################
        
        N = inputs.size()[0]
        T = inputs.size()[1]

        Q1 = self.q1(inputs)
        K1 = self.k1(inputs)
        V1 = self.v1(inputs)

        Q2 = self.q2(inputs)
        K2 = self.k2(inputs)
        V2 = self.v2(inputs)


        mat1 = self.softmax( torch.bmm(Q1, K1.transpose(1,2)) / math.sqrt(self.dim_k))
        attention1 = torch.matmul(mat1, V1)

        mat2 = self.softmax( torch.bmm(Q2, K2.transpose(1,2) ) / math.sqrt(self.dim_k))
        attention2 = torch.matmul(mat2, V2)


        attention = torch.cat((attention1, attention2), dim = 2)
        sublayer = self.attention_head_projection(attention)
        outputs = self.norm_mh(inputs + sublayer)
        ##############################################################################
        #                               END OF YOUR CODE                             #
        ##############################################################################
        return outputs
    
    
    def feedforward_layer(self, inputs):
        """
        :param inputs: float32 Tensor of shape (N,T,H)
        :returns outputs: float32 Tensor of shape (N,T,H)
        """
        outputs = None
        #############################################################################
        # Deliverable 3: Implement the feedforward layer followed by add + norm.    #
        # Use a ReLU activation and apply the linear layers in the order you        #
        # initialized them.                                                         #
        # This should not take more than 3-5 lines of code.                         #
        #############################################################################
        x = self.fc1(inputs)
        x = F.relu(x)
        x = self.fc2(x)
        outputs = self.norm_fc(inputs + x)
        
        ##############################################################################
        #                               END OF YOUR CODE                             #
        ##############################################################################
        return outputs
        
    
    def final_layer(self, inputs):
        """
        :param inputs: float32 Tensor of shape (N,T,H)
        :returns outputs: float32 Tensor of shape (N,1)
        """
        outputs = None
        #############################################################################
        # Deliverable 4: Implement the final layer for the Transformer classifier.  #
        # This should not take more than 2 lines of code.                         #
        #############################################################################
        x = self.classify(inputs[:, 0, :])
        outputs = self.sigmoid(x)
        
        ##############################################################################
        #                               END OF YOUR CODE                             #
        ##############################################################################
        return outputs
        

def seed_torch(seed=0):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


In [11]:
import numpy as np
import csv
import torch

train_inxs = np.load('./gt_7643/datasets/train_inxs.npy')
val_inxs = np.load('./gt_7643/datasets/val_inxs.npy')
train_labels = np.load('./gt_7643/datasets/train_labels.npy')
val_labels = np.load('./gt_7643/datasets/val_labels.npy')

# load dictionary
word_to_ix = {}
with open("./gt_7643/datasets/word_to_ix.csv", "r") as f:
    reader = csv.reader(f)
    for line in reader:
        word_to_ix[line[0]] = line[1]
print("Vocabulary Size:", len(word_to_ix))
        
print(train_inxs.shape) # 7000 training instances, of (maximum/padded) length 43 words.
print(val_inxs.shape) # 1551 validation instances, of (maximum/padded) length 43 words.
print(train_labels.shape)
print(val_labels.shape)

# load checkers
d1 = torch.load('./gt_7643/datasets/d1.pt')
d2 = torch.load('./gt_7643/datasets/d2.pt')
d3 = torch.load('./gt_7643/datasets/d3.pt')
d4 = torch.load('./gt_7643/datasets/d4.pt')

Vocabulary Size: 1542
(7000, 43)
(1551, 43)
(7000,)
(1551,)


In [12]:
inputs = train_inxs[0:2]
inputs = torch.LongTensor(inputs)

model = ClassificationTransformer(word_to_ix, hidden_dim=128, num_heads=2, dim_feedforward=2048, dim_k=96, 
                                  dim_v=96, dim_q=96, max_length=train_inxs.shape[1])

embeds = model.embed(inputs)

try:
    print("Difference:", torch.sum(torch.pairwise_distance(embeds, d1)).item()) # should be very small (<0.01)
except:
    print("NOT IMPLEMENTED")

Difference: 2368.49365234375


In [15]:
inputs

tensor([  10,   12,    0,    0,   13,   14,    0,   15, 1540, 1541, 1541, 1541,
        1541, 1541, 1541, 1541, 1541, 1541, 1541, 1541, 1541, 1541, 1541, 1541,
        1541, 1541, 1541, 1541, 1541, 1541, 1541, 1541, 1541, 1541, 1541, 1541,
        1541, 1541, 1541, 1541, 1541, 1541, 1541])

In [50]:
class PositionwiseFeedForward(nn.Module):
    "Implements FFN equation."
    def __init__(self, d_model, d_ff, dropout=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        return self.w_2(self.dropout(F.relu(self.w_1(x))))
    
class Embeddings(nn.Module):
    def __init__(self, d_model, vocab):
        super(Embeddings, self).__init__()
        self.lut = nn.Embedding(vocab, d_model)
        self.d_model = d_model

    def forward(self, x):
        return self.lut(x) * math.sqrt(self.d_model)
    
class PositionalEncoding(nn.Module):
    "Implement the PE function."
    def __init__(self, d_model, dropout, max_len=43):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        
        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0., max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0., d_model, 2) * -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)
        
    def forward(self, x):
        x = x + Variable(self.pe[:, :x.size(1)], 
                         requires_grad=False)
        return self.dropout(x)
    

In [87]:
class PositionalEncoder_2(nn.Module):
    def __init__(self, max_seq_len = 43, d_model=128):
        super().__init__()
        self.d_model = d_model
        
        # create constant 'pe' matrix with values dependant on 
        # pos and i
        pe = torch.zeros(max_seq_len, d_model)
        for pos in range(max_seq_len):
            for i in range(0, d_model, 2):
                pe[pos, i] = \
                math.sin(pos / (10000 ** ((2 * i)/d_model)))
                pe[pos, i + 1] = \
                math.cos(pos / (10000 ** ((2 * (i + 1))/d_model)))
                
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)
 
    
    def forward(self, inputs):
        # make embeddings relatively larger
        inputs = inputs * math.sqrt(self.d_model)
        #add constant to embedding
        seq_len = inputs.size(1)
        inputs = inputs + Variable(self.pe[:,:seq_len], \
        requires_grad=False)
        return inputs
    
class Embedder(nn.Module):
    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.d_model = d_model
        self.embed = nn.Embedding(vocab_size, d_model)
    def forward(self, x):
        return self.embed(x)

In [75]:
train_inxs.shape

(7000, 43)

In [76]:
pe = PositionalEncoding(128,0 )
y = pe.forward(d1)
y.shape

torch.Size([2, 43, 128])

In [77]:
inputs = train_inxs[0:2]
inputs = torch.LongTensor(inputs)

In [78]:
N, T = inputs.shape

In [79]:
tmp_embd = torch.zeros(N, T, 128)

In [80]:
posi = nn.Embedding(43, 128)
word = nn.Embedding(len(word_to_ix), 128)
posi_2 = PositionalEncoder_2(43,128)

In [94]:
inputs = train_inxs[0:2]
inputs = torch.LongTensor(inputs)

model = ClassificationTransformer(word_to_ix, hidden_dim=128, num_heads=2, dim_feedforward=2048, dim_k=96, 
                                  dim_v=96, dim_q=96, max_length=train_inxs.shape[1])

embeds = posi_2(word(inputs))

try:
    print("Difference:", torch.sum(torch.pairwise_distance(embeds, d1)).item()) # should be very small (<0.01)
except:
    print("NOT IMPLEMENTED")

Difference: 17768.669921875


In [101]:
ok = np.ones((3,3,3))


In [120]:
for i in range(3):
    for j in range(3):
        for k in range(3):
            ok[i][j][k] = (i+1)*(j+2)+(k+3)
    
ok

array([[[ 5.,  6.,  7.],
        [ 6.,  7.,  8.],
        [ 7.,  8.,  9.]],

       [[ 7.,  8.,  9.],
        [ 9., 10., 11.],
        [11., 12., 13.]],

       [[ 9., 10., 11.],
        [12., 13., 14.],
        [15., 16., 17.]]])

In [123]:
ok[0,:,:]

array([[5., 6., 7.],
       [6., 7., 8.],
       [7., 8., 9.]])