<a href="https://colab.research.google.com/github/kimbo/deep-resolver/blob/master/tranformer_resolver.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import math, copy, time
from torch.autograd import Variable
import matplotlib.pyplot as plt
import seaborn
seaborn.set_context(context="talk")
%matplotlib inline

  import pandas.util.testing as tm


# Model

## Model Helpers


In [0]:
class PositionwiseFeedForward(nn.Module):
    """Two-layer feed-forward sublayer: Linear -> ReLU -> Dropout -> Linear.

    Args:
        d_model: feature size of the input and of the output.
        d_ff: feature size of the hidden layer.
        dropout: dropout probability applied to the hidden activations.
    """

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply the expand/activate/drop/project pipeline to ``x``."""
        hidden = F.relu(self.w_1(x))
        hidden = self.dropout(hidden)
        return self.w_2(hidden)
    
class Embeddings(nn.Module):
    """Embedding lookup whose output is multiplied by ``sqrt(d_model)``.

    Args:
        d_model: size of each embedding vector.
        vocab: number of rows in the embedding table.
    """

    def __init__(self, d_model, vocab):
        super().__init__()
        self.lut = nn.Embedding(vocab, d_model)
        self.d_model = d_model

    def forward(self, x):
        """Look up the indices in ``x`` and rescale the resulting vectors."""
        scale = math.sqrt(self.d_model)
        return self.lut(x) * scale
    
class Generator(nn.Module):
    """Final projection to vocabulary size followed by a log-softmax.

    Args:
        d_model: feature size of the incoming hidden states.
        vocab: number of output classes.
    """

    def __init__(self, d_model, vocab):
        super().__init__()
        self.proj = nn.Linear(d_model, vocab)

    def forward(self, x):
        """Return log-probabilities over the last dimension."""
        logits = self.proj(x)
        return F.log_softmax(logits, dim=-1)

class LayerNorm(nn.Module):
    """Layer normalisation over the last dimension with learnable gain and bias.

    Args:
        features: size of the last dimension of the inputs.
        eps: constant added to the standard deviation (not the variance)
            before dividing.
    """

    def __init__(self, features, eps=1e-6):
        super().__init__()
        self.a_2 = nn.Parameter(torch.ones(features))    # gain
        self.b_2 = nn.Parameter(torch.zeros(features))   # bias
        self.eps = eps

    def forward(self, x):
        """Normalise ``x`` along its last dimension, then scale and shift."""
        centered = x - x.mean(-1, keepdim=True)
        spread = x.std(-1, keepdim=True) + self.eps
        return self.a_2 * centered / spread + self.b_2

class SublayerConnection(nn.Module):
    """Pre-norm residual wrapper: ``x + dropout(sublayer(norm(x)))``.

    The layer norm is applied to the sublayer's input rather than to the
    residual sum.

    Args:
        size: feature size handed to the layer norm.
        dropout: dropout probability applied to the sublayer output.
    """

    def __init__(self, size, dropout):
        super().__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        """Run ``sublayer`` on the normalised input and add the residual."""
        normed = self.norm(x)
        return x + self.dropout(sublayer(normed))
    
def clones(module, N):
    """Return an ``nn.ModuleList`` holding ``N`` independent deep copies of ``module``."""
    copies = nn.ModuleList()
    for _ in range(N):
        copies.append(copy.deepcopy(module))
    return copies

## Encoder

The encoder is composed of a stack of $N=6$ identical layers. 

In [0]:
class Encoder(nn.Module):
    """Stack of ``N`` copies of ``layer`` followed by a final layer norm.

    Args:
        layer: prototype layer; it must expose a ``size`` attribute and is
            deep-copied ``N`` times.
        N: number of stacked layers.
    """

    def __init__(self, layer, N):
        super().__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, mask):
        """Feed ``x`` (with ``mask``) through every layer, then normalise."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, mask)
        return self.norm(hidden)


    
class EncoderLayer(nn.Module):
    """One encoder layer: self-attention then feed-forward, each residual-wrapped.

    Args:
        size: feature size, also exposed as ``self.size``.
        self_attn: attention module called as ``self_attn(q, k, v, mask)``.
        feed_forward: module applied to the attention output.
        dropout: dropout probability for both residual wrappers.
    """

    def __init__(self, size, self_attn, feed_forward, dropout):
        super().__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 2)
        self.size = size

    def forward(self, x, mask):
        """Apply masked self-attention followed by the feed-forward block."""
        def attend(normed):
            # query, key and value are all the same normalised input
            return self.self_attn(normed, normed, normed, mask)

        attn_wrapper, ff_wrapper = self.sublayer
        x = attn_wrapper(x, attend)
        return ff_wrapper(x, self.feed_forward)

## Decoder

The decoder is also composed of a stack of $N=6$ identical layers.  


In [0]:
class Decoder(nn.Module):
    """Stack of ``N`` copies of ``layer`` followed by a final layer norm.

    Args:
        layer: prototype decoder layer; it must expose a ``size`` attribute
            and is deep-copied ``N`` times.
        N: number of stacked layers.
    """

    def __init__(self, layer, N):
        super().__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, memory, src_mask, tgt_mask):
        """Feed ``x`` through every layer with the same memory and masks."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, memory, src_mask, tgt_mask)
        return self.norm(hidden)
    
class DecoderLayer(nn.Module):
    """One decoder layer: self-attention, attention over memory, feed-forward.

    Each of the three steps is wrapped in its own ``SublayerConnection``.

    Args:
        size: feature size, also exposed as ``self.size``.
        self_attn: attention module used on the decoder input itself.
        src_attn: attention module whose keys and values are the memory.
        feed_forward: module applied last.
        dropout: dropout probability for the three residual wrappers.
    """

    def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
        super().__init__()
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 3)

    def forward(self, x, memory, src_mask, tgt_mask):
        """Run the three sublayers in order and return the result."""
        def attend_self(normed):
            return self.self_attn(normed, normed, normed, tgt_mask)

        def attend_memory(normed):
            # queries come from the decoder, keys and values from the memory
            return self.src_attn(normed, memory, memory, src_mask)

        self_wrapper, memory_wrapper, ff_wrapper = self.sublayer
        x = self_wrapper(x, attend_self)
        x = memory_wrapper(x, attend_memory)
        return ff_wrapper(x, self.feed_forward)
    
def subsequent_mask(size):
    """Return a ``(1, size, size)`` mask that is true on and below the diagonal.

    Entry ``[0, i, j]`` is true when ``j <= i``, i.e. position ``i`` may look
    at itself and at earlier positions only.
    """
    # Ones strictly above the diagonal mark the positions to hide.
    hidden = np.triu(np.ones((1, size, size), dtype=np.uint8), k=1)
    return torch.from_numpy(hidden).eq(0)

## Implement Attention


https://arxiv.org/pdf/1706.03762.pdf         
                                                                                                                                                                     

In [0]:
def attention(query, key, value, mask=None):
    """Compute scaled dot-product attention.

    ``softmax(Q K^T / sqrt(d_k)) V`` is evaluated over the last two
    dimensions; any leading dimensions are treated as batch dimensions, so
    both ``(batch, seq, d_k)`` and ``(batch, heads, seq, d_k)`` inputs work.

    Args:
        query: tensor of shape ``(..., q_len, d_k)``.
        key: tensor of shape ``(..., k_len, d_k)``.
        value: tensor of shape ``(..., k_len, d_v)``.
        mask: optional tensor broadcastable to ``(..., q_len, k_len)``;
            positions where it equals 0 are excluded from the softmax.

    Returns:
        Tensor of shape ``(..., q_len, d_v)``.
    """
    scale = math.sqrt(query.size(-1))

    # scores = QK^T / scale; matmul (unlike bmm) accepts extra batch dims
    scores = torch.matmul(query, key.transpose(-2, -1)) / scale
    if mask is not None:
        # A large negative score becomes a zero weight after the softmax.
        scores = scores.masked_fill(mask == 0, -1e9)

    weights = F.softmax(scores, dim=-1)
    return torch.matmul(weights, value)

In [0]:
class MultiHeadedAttention(nn.Module):
    """Multi-head attention built from ``h`` separate per-head projections.

    Each head has its own query/key/value ``nn.Linear``; the head outputs are
    concatenated along the feature dimension and mixed by ``self.proj``.

    Args:
        h: number of attention heads.
        d_model: feature size of the inputs and of the output.
        dropout: accepted for signature compatibility; this module does not
            apply dropout itself.
    """

    def __init__(self, h, d_model, dropout=0.1):
        super(MultiHeadedAttention, self).__init__()
        self.h = h
        self.d_k = self.d_v = d_model // h

        # One linear projection per head for queries, keys and values.
        # clones() deep-copies its prototype, so the h heads of each group
        # start from the same initial weights.
        self.w_qs = clones(nn.Linear(d_model, self.d_k), h)
        self.w_ks = clones(nn.Linear(d_model, self.d_k), h)
        self.w_vs = clones(nn.Linear(d_model, self.d_v), h)

        # Mixes the concatenated head outputs back to d_model features.
        self.proj = nn.Linear(self.d_v * h, d_model)

    def forward(self, query, key, value, mask):
        """Attend with every head and project the concatenated result.

        Args:
            query, key, value: inputs passed through the per-head
                projections before being handed to ``attention``.
            mask: forwarded unchanged to ``attention`` for every head.

        Returns:
            The output of ``self.proj`` applied to the concatenated heads.
        """
        heads = [
            attention(w_q(query), w_k(key), w_v(value), mask)
            for w_q, w_k, w_v in zip(self.w_qs, self.w_ks, self.w_vs)
        ]
        return self.proj(torch.cat(heads, dim=-1))

## Positional Encoding                                                                                                                             
Since our model contains no recurrence and no convolution, in order for the model to make use of the order of the sequence, we must inject some information about the relative or absolute position of the tokens in the sequence.  To this end, we add "positional encodings" to the input embeddings at the bottoms of the encoder and decoder stacks.  The positional encodings have the same dimension $d_{\text{model}}$ as the embeddings, so that the two can be summed.   There are many choices of positional encodings, learned and fixed [(cite)](https://arxiv.org/pdf/1705.03122.pdf). 

In this work, we use sine and cosine functions of different frequencies:                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  
$$PE_{(pos,2i)} = \sin(pos / 10000^{2i/d_{\text{model}}})$$

$$PE_{(pos,2i+1)} = \cos(pos / 10000^{2i/d_{\text{model}}})$$
where $pos$ is the position and $i$ is the dimension.  That is, each dimension of the positional encoding corresponds to a sinusoid.  The wavelengths form a geometric progression from $2\pi$ to $10000 \cdot 2\pi$.  We chose this function because we hypothesized it would allow the model to easily learn to attend by relative positions, since for any fixed offset $k$, $PE_{pos+k}$ can be represented as a linear function of $PE_{pos}$. 

In addition, we apply dropout to the sums of the embeddings and the positional encodings in both the encoder and decoder stacks.  For the base model, we use a rate of $P_{drop}=0.1$. 
                                                                                                                                                                                                                                                    


In [0]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to the input, then apply dropout.

    Even feature columns hold ``sin(pos / 10000^(2i/d_model))`` and odd
    columns the matching ``cos``. The table is computed once and stored as
    the non-trainable buffer ``pe`` of shape ``(1, max_len, d_model)``.

    Args:
        d_model: feature size of the inputs (even or odd).
        dropout: dropout probability applied to the sum.
        max_len: number of positions in the precomputed table.
    """

    def __init__(self, d_model, dropout, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Precompute the encodings for every position once.
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = 1 / (10000 ** (torch.arange(0., d_model, 2) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even column,
        # so drop the surplus frequency instead of failing on the assignment.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add the encodings for the first ``x.size(1)`` positions to ``x``."""
        # Buffers never require grad, so no wrapper is needed around the slice.
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)

## Full Model

In [0]:
class TransformerModel(nn.Module):
    """Encoder-decoder transformer assembled from the building blocks above.

    Args:
        src_vocab: size of the source vocabulary.
        tgt_vocab: size of the target vocabulary.
        N: number of encoder layers and of decoder layers.
        d_model: feature size used throughout the model.
        d_ff: hidden size of the feed-forward sublayers.
        h: number of attention heads.
        dropout: dropout probability.

    ``forward`` returns the decoder output; ``self.generator`` is stored on
    the model but is not applied inside ``forward``.
    """

    def __init__(self, src_vocab, tgt_vocab, N=6, d_model=256, d_ff=1024, h=8, dropout=0.1):
        super().__init__()

        # Prototype sublayers; every use below receives its own deep copy.
        attn_proto = MultiHeadedAttention(h, d_model)
        ff_proto = PositionwiseFeedForward(d_model, d_ff, dropout)
        pos_proto = PositionalEncoding(d_model, dropout)
        dup = copy.deepcopy

        # Submodules are created and registered in a fixed order: it decides
        # the order of self.parameters() and how random numbers are consumed.
        encoder_layer = EncoderLayer(d_model, dup(attn_proto), dup(ff_proto), dropout)
        self.encoder = Encoder(encoder_layer, N)
        decoder_layer = DecoderLayer(
            d_model, dup(attn_proto), dup(attn_proto), dup(ff_proto), dropout)
        self.decoder = Decoder(decoder_layer, N)
        self.src_embed = nn.Sequential(Embeddings(d_model, src_vocab), dup(pos_proto))
        self.tgt_embed = nn.Sequential(Embeddings(d_model, tgt_vocab), dup(pos_proto))
        self.generator = Generator(d_model, tgt_vocab)

        # Re-initialise every parameter with more than one dimension.
        for param in self.parameters():
            if param.dim() > 1:
                nn.init.xavier_uniform_(param)

    def forward(self, src, tgt, src_mask, tgt_mask):
        """Encode ``src`` and decode ``tgt`` against the encoder output."""
        memory = self.encode(src, src_mask)
        return self.decode(memory, src_mask, tgt, tgt_mask)

    def encode(self, src, src_mask):
        """Embed ``src`` and run it through the encoder stack."""
        embedded = self.src_embed(src)
        return self.encoder(embedded, src_mask)

    def decode(self, memory, src_mask, tgt, tgt_mask):
        """Embed ``tgt`` and run it through the decoder stack over ``memory``."""
        embedded = self.tgt_embed(tgt)
        return self.decoder(embedded, memory, src_mask, tgt_mask)

# Training


## Batches and Masking

In [0]:
class Batch:
    """Hold one batch of source/target tensors together with their masks.

    Attributes set for every batch:
        src: the source tensor as given.
        src_mask: ``src != pad`` with a singleton dimension inserted
            second-to-last.

    Attributes set only when ``trg`` is given:
        trg: target without its last column (decoder input).
        trg_y: target without its first column (prediction target).
        trg_mask: padding mask combined with the subsequent-position mask.
        ntokens: number of non-padding entries in ``trg_y``.
    """

    def __init__(self, src, trg=None, pad=0):
        self.src = src
        self.src_mask = (src != pad).unsqueeze(-2)
        if trg is None:
            return
        self.trg = trg[:, :-1]
        self.trg_y = trg[:, 1:]
        self.trg_mask = self.make_std_mask(self.trg, pad)
        self.ntokens = (self.trg_y != pad).data.sum()

    @staticmethod
    def make_std_mask(tgt, pad):
        """Return a mask that hides both padding and later positions."""
        not_pad = (tgt != pad).unsqueeze(-2)
        no_peek = subsequent_mask(tgt.size(-1)).type_as(not_pad.data)
        return not_pad & Variable(no_peek)
    
    
global max_src_in_batch, max_tgt_in_batch
def batch_size_fn(new, count, sofar):
    """Return the padded size of the batch once ``new`` has been added.

    The module-level running maxima are reset when ``count == 1`` and then
    updated with the lengths of ``new``. The result is the larger of the
    padded source and padded target element counts. ``sofar`` is unused.
    """
    global max_src_in_batch, max_tgt_in_batch
    if count == 1:
        # First example of a fresh batch: restart both running maxima.
        max_src_in_batch = max_tgt_in_batch = 0
    src_len = len(new.src)
    # +2: presumably room for the init/eos tokens the TGT field adds.
    tgt_len = len(new.trg) + 2
    max_src_in_batch = max(max_src_in_batch, src_len)
    max_tgt_in_batch = max(max_tgt_in_batch, tgt_len)
    return max(count * max_src_in_batch, count * max_tgt_in_batch)

## Label Smoothing

During training, we employed label smoothing of value $\epsilon_{ls}=0.1$ [(cite)](https://arxiv.org/abs/1512.00567).  This hurts perplexity, as the model learns to be more unsure, but improves accuracy and BLEU score.  

In [0]:
class LabelSmoothing(nn.Module):
    """KL-divergence loss against a label-smoothed target distribution.

    Args:
        size: number of classes; ``x`` must have this many columns.
        padding_idx: class index treated as padding; it never receives
            probability mass, and rows whose target is padding are zeroed.
        smoothing: probability mass spread over the non-target classes.

    The most recent target distribution is kept in ``self.true_dist``.
    """

    def __init__(self, size, padding_idx, smoothing=0.0):
        super().__init__()
        self.criterion = nn.KLDivLoss(reduction='sum')
        self.padding_idx = padding_idx
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.size = size
        self.true_dist = None

    def forward(self, x, target):
        """Return the summed KL divergence between ``x`` and the smoothed targets.

        ``x`` is passed to ``nn.KLDivLoss`` as its input and has shape
        ``(rows, size)``; ``target`` holds one class index per row.
        """
        assert x.size(1) == self.size
        gold = target.data
        # Spread the smoothing mass over size - 2 classes: every class except
        # the target and the padding class.
        smoothed = torch.full_like(x.data, self.smoothing / (self.size - 2))
        smoothed.scatter_(1, gold.unsqueeze(1), self.confidence)
        smoothed[:, self.padding_idx] = 0
        pad_rows = torch.nonzero(gold == self.padding_idx)
        if pad_rows.dim() > 0:
            # Rows whose target is padding contribute nothing to the loss.
            smoothed.index_fill_(0, pad_rows.squeeze(), 0.0)
        self.true_dist = smoothed
        return self.criterion(x, Variable(smoothed, requires_grad=False))

## Data Loading


## Training Code

In [0]:
from torchtext import data, datasets

class LossFunction:
    """Callable that computes the loss, backpropagates and optionally steps.

    Args:
        generator: callable applied to the model output before the criterion.
        criterion: callable taking the flattened predictions and targets.
        opt: optional optimizer; when given, ``step()`` and then
            ``zero_grad()`` are called after every backward pass.
    """

    def __init__(self, generator, criterion, opt=None):
        self.generator = generator
        self.criterion = criterion
        self.opt = opt

    def __call__(self, x, y, norm):
        """Return the un-normalised loss for one batch.

        The criterion output is divided by ``norm`` before ``backward()``
        (which runs even when no optimizer is set) and multiplied by
        ``norm`` again for the returned value.
        """
        scores = self.generator(x)
        flat_scores = scores.contiguous().view(-1, scores.size(-1))
        flat_gold = y.contiguous().view(-1)
        loss = self.criterion(flat_scores, flat_gold) / norm
        loss.backward()
        optimizer = self.opt
        if optimizer is not None:
            optimizer.step()
            optimizer.zero_grad()
        return loss.data * norm

class DataIterator(data.Iterator):
    """``data.Iterator`` whose training batches group similarly sized examples."""

    def create_batches(self):
        """Populate ``self.batches``.

        Evaluation: batch the data in order and sort each batch by
        ``sort_key``. Training: take chunks of ``100 * batch_size``, sort
        each chunk by ``sort_key``, cut it into batches and yield those
        batches in shuffled order.
        """
        if not self.train:
            self.batches = [
                sorted(chunk, key=self.sort_key)
                for chunk in data.batch(self.data(), self.batch_size,
                                        self.batch_size_fn)
            ]
            return

        def pool(examples, random_shuffler):
            for chunk in data.batch(examples, self.batch_size * 100):
                ordered = sorted(chunk, key=self.sort_key)
                buckets = data.batch(ordered, self.batch_size, self.batch_size_fn)
                yield from random_shuffler(list(buckets))

        self.batches = pool(self.data(), self.random_shuffler)

def rebatch(pad_idx, batch):
    """Convert a torchtext batch into a ``Batch`` with batch-first tensors.

    ``batch.src`` and ``batch.trg`` have their first two dimensions swapped.
    The tensors are moved to the GPU when CUDA is available; on a CPU-only
    host they stay where they are instead of raising.

    Args:
        pad_idx: padding index forwarded to ``Batch``.
        batch: object exposing ``src`` and ``trg`` tensors.

    Returns:
        A ``Batch`` built from the transposed tensors.
    """
    src = batch.src.transpose(0, 1)
    trg = batch.trg.transpose(0, 1)
    if torch.cuda.is_available():
        src, trg = src.cuda(), trg.cuda()
    return Batch(src, trg, pad_idx)

    
def run_epoch(data_iter, model, loss_compute):
    """Run the model over every batch and return the loss per token.

    Args:
        data_iter: iterable of batches exposing ``src``, ``trg``,
            ``src_mask``, ``trg_mask``, ``trg_y`` and ``ntokens``.
        model: object whose ``forward(src, trg, src_mask, trg_mask)`` is called.
        loss_compute: callable ``(out, trg_y, ntokens) -> loss``.

    Returns:
        The summed loss divided by the summed token count.

    A progress line is printed whenever the step index satisfies
    ``step % 50 == 1``; the throughput in it covers the tokens seen since the
    previous progress line.
    """
    window_start = time.time()
    window_tokens = 0
    seen_tokens = 0
    loss_sum = 0
    for step, batch in enumerate(data_iter):
        out = model.forward(batch.src, batch.trg,
                            batch.src_mask, batch.trg_mask)
        loss = loss_compute(out, batch.trg_y, batch.ntokens)
        loss_sum += loss
        seen_tokens += batch.ntokens
        window_tokens += batch.ntokens
        if step % 50 != 1:
            continue
        elapsed = time.time() - window_start
        print("Epoch Step: %d Loss: %f Tokens per Sec: %f" %
              (step, loss / batch.ntokens, window_tokens / elapsed))
        # Start a new throughput window.
        window_start = time.time()
        window_tokens = 0
    return loss_sum / seen_tokens
    

## Train

In [12]:
# Download DNS queries and responses (questions and answers)
!wget  -O ./queries "https://kimbo.s3-us-west-1.amazonaws.com/dl/current/all-A-questions-only.txt"
!wget -O ./responses "https://kimbo.s3-us-west-1.amazonaws.com/dl/current/all-A-answers-only.txt"

--2020-04-13 21:33:49--  https://kimbo.s3-us-west-1.amazonaws.com/dl/current/all-A-questions-only.txt
Resolving kimbo.s3-us-west-1.amazonaws.com (kimbo.s3-us-west-1.amazonaws.com)... 52.219.120.81
Connecting to kimbo.s3-us-west-1.amazonaws.com (kimbo.s3-us-west-1.amazonaws.com)|52.219.120.81|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7017655 (6.7M) [text/plain]
Saving to: ‘./queries’


2020-04-13 21:33:51 (5.22 MB/s) - ‘./queries’ saved [7017655/7017655]

--2020-04-13 21:33:53--  https://kimbo.s3-us-west-1.amazonaws.com/dl/current/all-A-answers-only.txt
Resolving kimbo.s3-us-west-1.amazonaws.com (kimbo.s3-us-west-1.amazonaws.com)... 52.219.112.137
Connecting to kimbo.s3-us-west-1.amazonaws.com (kimbo.s3-us-west-1.amazonaws.com)|52.219.112.137|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 18247656 (17M) [text/plain]
Saving to: ‘./responses’


2020-04-13 21:33:55 (9.58 MB/s) - ‘./responses’ saved [18247656/18247656]



In [13]:
!python -m spacy download en

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [14]:
from torchtext import data, datasets
import torchtext
import spacy
import pdb

nlp = spacy.load("en_core_web_sm")

def tokenize(text):
    """Split ``text`` with the tokenizer of ``nlp`` and return the token strings."""
    doc = nlp.tokenizer(text)
    return [token.text for token in doc]

# Special tokens shared by the source and target fields.
BOS_WORD = '<s>'
EOS_WORD = '</s>'
BLANK_WORD = "<blank>"

# Queries are the source side; responses additionally get begin/end markers.
SRC = data.Field(tokenize=tokenize, pad_token=BLANK_WORD)
TGT = data.Field(
    tokenize=tokenize,
    init_token=BOS_WORD,
    eos_token=EOS_WORD,
    pad_token=BLANK_WORD,
)

print("Loading Dataset...")
limit = 100000  # keep at most this many lines from each file
with open('./queries', 'r') as qf, open('./responses', 'r') as rf:
    # Queries lose surrounding whitespace and any leading/trailing '?'.
    query_lines = [line.strip().strip('?') for line in qf][:limit]
    response_lines = [line.strip() for line in rf][:limit]

# Line k of the queries is paired with line k of the responses.
assert len(query_lines) == len(response_lines)

print('Loaded {} query/response pairs'.format(len(response_lines)))
print('Example query: {}'.format(query_lines[0]))
print('Example response: {}'.format(response_lines[0]))

Loading Dataset...
Loaded 100000 query/response pairs
Example query: ipayment.com. A
Example response: ipayment.com. 600 IN A 184.168.131.241


In [15]:
import tqdm

# Pair every query line with the response line at the same index and wrap the
# pair in a torchtext Example: "src" is processed by SRC, "trg" by TGT.
fields = (["src", SRC], ["trg", TGT])
examples = [torchtext.data.Example.fromlist((query_lines[i], response_lines[i]), fields) 
            for i in tqdm.trange(len(response_lines), position=0, desc='Creating examples...')]

print('Creating torchtext.data.Dataset...')
# split() is called with its default arguments; the ratio is not set here.
train, val = torchtext.data.Dataset(examples, fields=fields).split()
print('Building vocab...')
# Both vocabularies are built from the training split only.
# NOTE(review): tokens that occur only in `val` are presumably mapped to
# <unk>, which would explain the <unk> entries in the decoding output below.
MIN_FREQ = 1
SRC.build_vocab(train.src, min_freq=MIN_FREQ)
print('SRC vocab done.')
TGT.build_vocab(train.trg, min_freq=MIN_FREQ)
print('TGT vocab done.')

Creating examples...: 100%|██████████| 100000/100000 [00:32<00:00, 3089.10it/s]


Creating torchtext.data.Dataset...
Building vocab...
SRC vocab done.
TGT vocab done.


In [19]:
import gc
# Run a garbage-collection pass before building a new model in this cell.
gc.collect()

# Number of encoder layers and of decoder layers passed to TransformerModel.
N = 2
# Index of the padding token in the target vocabulary.
pad_idx = TGT.vocab.stoi["<blank>"]
# The model is created on the GPU; this cell requires CUDA.
model = TransformerModel(len(SRC.vocab), len(TGT.vocab), N=N).cuda()
n_epochs = 100
device = torch.device('cuda')
# Learning rate handed to Adam inside scope().
lr = 5e-4
# lr = 0.003

def scope():
    """Train the module-level ``model`` for ``n_epochs`` epochs.

    Builds the label-smoothing criterion, the training and validation
    iterators and an Adam optimizer, then runs ``run_epoch`` on the training
    iterator once per epoch. The validation iterator is created but is not
    consumed here.
    """
    criterion = LabelSmoothing(size=len(TGT.vocab), padding_idx=pad_idx, smoothing=0.1)
    criterion.cuda()
    BATCH_SIZE = 1000

    def length_key(example):
        return (len(example.src), len(example.trg))

    def make_iter(dataset, is_train):
        return DataIterator(dataset, batch_size=BATCH_SIZE, device=device,
                            repeat=False, sort_key=length_key,
                            batch_size_fn=batch_size_fn, train=is_train)

    train_iter = make_iter(train, True)
    valid_iter = make_iter(val, False)

    model_opt = torch.optim.Adam(model.parameters(), lr=lr)
    for epoch in range(n_epochs):
        model.train()
        batches = (rebatch(pad_idx, b) for b in train_iter)
        run_epoch(batches, model,
                  LossFunction(model.generator, criterion, model_opt))
        # Leave the model in eval mode between epochs and after the last one.
        model.eval()
scope()

Epoch Step: 1 Loss: 10.243751 Tokens per Sec: 2480.402588
Epoch Step: 51 Loss: 4.558431 Tokens per Sec: 4216.205566
Epoch Step: 101 Loss: 3.945638 Tokens per Sec: 4255.244141
Epoch Step: 151 Loss: 3.607424 Tokens per Sec: 4184.705078
Epoch Step: 201 Loss: 3.465854 Tokens per Sec: 4261.685547
Epoch Step: 251 Loss: 3.448779 Tokens per Sec: 4140.226074
Epoch Step: 301 Loss: 3.491423 Tokens per Sec: 4172.465332
Epoch Step: 351 Loss: 3.414208 Tokens per Sec: 4173.915039
Epoch Step: 401 Loss: 2.993244 Tokens per Sec: 4152.592285
Epoch Step: 451 Loss: 3.395380 Tokens per Sec: 4241.622559
Epoch Step: 501 Loss: 3.394483 Tokens per Sec: 4076.358887
Epoch Step: 551 Loss: 3.392555 Tokens per Sec: 4181.548828
Epoch Step: 601 Loss: 2.877407 Tokens per Sec: 4294.947754
Epoch Step: 1 Loss: 3.416408 Tokens per Sec: 2533.921631
Epoch Step: 51 Loss: 3.446699 Tokens per Sec: 4246.486328
Epoch Step: 101 Loss: 3.439760 Tokens per Sec: 4260.419922
Epoch Step: 151 Loss: 3.466607 Tokens per Sec: 4132.993652
Ep

KeyboardInterrupt: ignored

## Translate

In [17]:
!pip install dnspython



In [21]:
# Save the whole model object (not just a state_dict) to the working directory.
# NOTE(review): presumably loading this file later needs the class
# definitions from this notebook to be importable; confirm before reuse.
torch.save(model, 'deepresolver-model.pt')

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


In [20]:
import pdb
import dns.message

def greedy_decode(model, src, src_mask, max_len, start_symbol, end_symbol=None):
    """Decode one sequence by always picking the highest-scoring next token.

    Args:
        model: object providing ``encode``, ``decode`` and ``generator``.
        src: source tensor; the output has the same dtype and device.
        src_mask: mask passed to ``encode`` and ``decode``.
        max_len: maximum length of the output, start symbol included.
        start_symbol: index placed at the first output position.
        end_symbol: optional index that stops decoding as soon as it has
            been produced. With the default ``None`` exactly
            ``max_len - 1`` tokens are generated.

    Returns:
        Tensor of shape ``(1, length)`` holding the start symbol followed by
        the generated indices.
    """
    # Inference only: avoid recording an autograd graph for every pass.
    with torch.no_grad():
        memory = model.encode(src, src_mask)
        ys = torch.ones(1, 1).fill_(start_symbol).type_as(src.data)
        for _ in range(max_len - 1):
            tgt_mask = subsequent_mask(ys.size(1)).type_as(src.data)
            out = model.decode(memory, src_mask, ys, tgt_mask)
            # Only the last position is needed to predict the next token.
            prob = model.generator(out[:, -1])
            _, next_word = torch.max(prob, dim=1)
            next_word = next_word.data[0]
            ys = torch.cat([ys,
                            torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=1)
            if end_symbol is not None and next_word.item() == end_symbol:
                break
    return ys

# Greedily decode the first example of every validation batch and print the
# query, the generated response and the reference response.
BATCH_SIZE = 1000
n_train_iters = len(train) / BATCH_SIZE
valid_iter = DataIterator(val, batch_size=BATCH_SIZE, device=device,
                        repeat=False, sort_key=lambda x: (len(x.src), len(x.trg)),
                        batch_size_fn=batch_size_fn, train=False)

# The inner loops use their own index names (j, k, m) so that the batch index
# `i` is still intact when the stop condition at the bottom is evaluated.
for i, batch in enumerate(valid_iter):
    # Keep only the first example of the batch, batch dimension first.
    src = batch.src.transpose(0, 1)[:1].cuda()
    src_mask = (src != SRC.vocab.stoi["<blank>"]).unsqueeze(-2).cuda()
    out = greedy_decode(model, src, src_mask, 
                        max_len=305, start_symbol=TGT.vocab.stoi["<s>"])
    print("Query:", end="\t\t\t\t")
    q = ''
    for j in range(0, src.size(1)):
        sym = SRC.vocab.itos[src[0, j]]
        if sym == "</s>": break
        q += sym
        # No space after the first token, one after every later token.
        if j > 0:
          q += " "
        #print(sym, end =" ")
    print(q)
    print()
    print("Generated response:", end="\t\t")
    s = "\n"
    # Position 0 of `out` is the start symbol, so printing starts at 1.
    for k in range(1, out.size(1)):
        sym = TGT.vocab.itos[out[0, k]]
        if sym == "</s>": break
        print(sym, end =" ")
        s += sym
        if k > 1:
          s += " "
          print(" ", end="")
    # try:
    #   msg = dns.message.from_text(s)
    # except Exception as e:
    #   print('ERROR DECODING DNS MESSAGE: {}'.format(e))
    #   print('\n{}'.format(s))
    # else:
    #   print("SUCCESS!!!\n{}".format(msg))
    print('\n')
    print("Target response:\t", end="\t")
    r = ""
    # batch.trg is indexed position-first here; column 0 is the first example.
    for m in range(1, batch.trg.size(0)):
        sym = TGT.vocab.itos[batch.trg.data[m, 0]]
        if sym == "</s>": break
        #print(sym, end =" ")
        r += sym
        if m > 1:
          r += " "
    print(r)
    print()
    
    # Stop once more than 1000 batches have been shown.
    if 1000 < i < 1100:
        break

Query:				<unk>. A 

Generated response:		sharevideo1.com .  600  IN  A  162.242.193.45  

Target response:		<unk>. 21600 IN A <unk> 

Query:				<unk>. A 

Generated response:		elcheapo.tv .  14400  IN  A  162.241.219.194  

Target response:		<unk>. 600 IN A <unk> 

Query:				<unk>. A 

Generated response:		sharevideo1.com .  600  IN  A  35.231.33.159  

Target response:		<unk>. 21599 IN A <unk> 

Query:				<unk>. A 

Generated response:		tritonsws.com .  3600  IN  A  50.62.57.138  

Target response:		<unk>. 30 IN A <unk> 

Query:				<unk>. A 

Generated response:		elcheapo.tv .  14400  IN  A  162.208.49.126  

Target response:		<unk>. 7200 IN A <unk> 

Query:				<unk>. A 

Generated response:		latinbayarea.com .  7200  IN  A  206.188.192.247  

Target response:		<unk>. 1800 IN A <unk> 



KeyboardInterrupt: ignored

<div id="disqus_thread"></div>
<script>
    /**
     *  RECOMMENDED CONFIGURATION VARIABLES: EDIT AND UNCOMMENT THE SECTION BELOW TO INSERT DYNAMIC VALUES FROM YOUR PLATFORM OR CMS.
     *  LEARN WHY DEFINING THESE VARIABLES IS IMPORTANT: https://disqus.com/admin/universalcode/#configuration-variables
     */
    /*
    var disqus_config = function () {
        this.page.url = PAGE_URL;  // Replace PAGE_URL with your page's canonical URL variable
        this.page.identifier = PAGE_IDENTIFIER; // Replace PAGE_IDENTIFIER with your page's unique identifier variable
    };
    */
    (function() {  // REQUIRED CONFIGURATION VARIABLE: EDIT THE SHORTNAME BELOW
        var d = document, s = d.createElement('script');
        
        s.src = 'https://EXAMPLE.disqus.com/embed.js';  // IMPORTANT: Replace EXAMPLE with your forum shortname!
        
        s.setAttribute('data-timestamp', +new Date());
        (d.head || d.body).appendChild(s);
    })();
</script>
<noscript>Please enable JavaScript to view the <a href="https://disqus.com/?ref_noscript" rel="nofollow">comments powered by Disqus.</a></noscript>