# UTF-8 Embedding and Decoder Test with Autoencoder

This file is dedicated to test Overfitting Autoencoder to compress the size of the input embedding from the multi-hot to a dense vector



In [1]:
import langmodels.utf8codec as utf8codec
from langmodels.utf8codec import *
import torch.nn.functional as F


In [2]:
# load the codebook and all the dictionaries mapping the data
code_matrix, txt2code, code2txt, txt2num, num2txt = utf8codec._load_codebook()

In [3]:
type(num2txt)

collections.OrderedDict

In [4]:
all_data = np.array(list(num2txt.keys()))
all_data = all_data.reshape((-1,1))

In [5]:
# all_data

In [6]:
# np.random.shuffle(all_data)

In [7]:
# all_data

In [8]:
def _prepare_overfit_batch(num2txt, batch_size):
    """
    The idea is to prepare the list of all the numbers in batches, the batches are randomly mixed to avoid issues.
    each batch contains:
    (batch size, seq width, index)  ??
    (batch size, index)  ??
    :param num2txt: numeric index 2 string conversion dictionary containing the entire vocabulary
    :return:
    """
    # assert type(num2txt) == 'dict'
    all_data = np.array(list(num2txt.keys()))
    all_data = all_data.reshape((-1,1))
#     print(all_data.shape)
    # assume that we can hold all in memory
    arr = []
    for i in range(batch_size):
        data = np.copy(all_data)
#         print(data.shape)
        np.random.shuffle(data)
#         print(data.shape)
        arr.append(data.transpose())
        
    ret = np.stack(arr, axis=1)
    ret = ret.reshape(batch_size,-1)
#     print(ret.shape)
    return ret


In [9]:
# %%time
# btch = _prepare_overfit_batch(num2txt, 100)
# btch = utf8codec._prepare_overfit_batch(num2txt, 100)

In [43]:

def train_overfit(model, optimizer, loss_function, batches, epoch, device, log_interval=10):
#     model.train()
    train_loss = 0
    batch_loss = []
    batch_idx = 0
    for b in batches:
        tensor_data = torch.from_numpy(b).to(device).long()  #.double()  #.float()
        optimizer.zero_grad()
        # emb is obtained from the the pre-computed utf8codebook
        emb, res = model(tensor_data)
        print(emb.shape,emb.dtype, res.shape, res.dtype)
        loss = loss_function(emb, res)
        loss.backward()
        train_loss += loss.data.item()  # [0]
        optimizer.step()
        if batch_idx % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx , len(batches),
                100. * batch_idx / len(tensor_data),
                train_loss / len(tensor_data)))
            batch_loss.append(train_loss)
        batch_idx += 1
    print('====> Epoch: {} Average loss: {:.8f}'.format(epoch, train_loss / len(batches)))
    return batch_loss


def test(model, test_data, epoch, device):
    model.eval()
    test_loss = 0
    for d in test_data:
        tensor_data = torch.from_numpy(d).to(device)
        res = model(data)
        test_loss += loss_function(tensor_data, res).data.item()  # [0]

    test_loss /= len(test_data)
    print('epoch: {}====> Test set loss: {:.4f}'.format(epoch, test_loss))



In [11]:
# from https://stackoverflow.com/questions/434287/what-is-the-most-pythonic-way-to-iterate-over-a-list-in-chunks
def chunker(seq, size):
    return (seq[pos:pos + size] for pos in range(0, len(seq), size))

In [12]:

def _get_activation_fn(activation):
    if activation == "sigmoid":
        return F.sigmoid
    elif activation == "tanh":
        return F.tanh
    elif activation == "relu":
        return F.relu
    elif activation == "gelu":
        return F.gelu
    else:
        return None
        # raise RuntimeError("activation should be sigmoid/tanh/relu/gelu, not %s." % activation)


In [13]:
model = UTF8Autoencoder(code_matrix)

In [14]:
# prepare many batches so I have everything ready to train
nbatches = 2000
batch_size = 64
batches = []

In [24]:
%%time
for i in range(nbatches):
    btch = _prepare_overfit_batch(num2txt, batch_size)
    batches.append(btch)

CPU times: user 2min 39s, sys: 471 ms, total: 2min 40s
Wall time: 2min 39s


In [27]:
len(batches)

4000

In [16]:
# encoder(batches[0])

In [62]:
epochs = chunker(batches, 100)

In [28]:
device = device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = model.to(device)  #.float()

In [61]:
# optimizer = torch.optim.Adam(model.parameters(), lr=1e-6, weight_decay=0, amsgrad=False )
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-4, weight_decay=1e-4)
loss_function = F.mse_loss

In [None]:
epoch_loss = []

In [63]:
%%time
epoch_count = 1
for e in epochs:
    eloss = train_overfit(model, optimizer, loss_function, e, epoch_count, device, log_interval=10)
    epoch_count+=1
#     if epoch_count == 20:
#         print("epoch {} decreasing learning_rate to {}".format(epoch_count, 1e-5))
#         optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, weight_decay=1e-4)
    epoch_loss.append(eloss)

torch.Size([64, 1984]) <class 'torch.Tensor'> torch.int64
torch.Size([64, 1984, 324]) torch.float32 torch.Size([64, 1984, 324]) torch.float32
torch.Size([64, 1984]) <class 'torch.Tensor'> torch.int64
torch.Size([64, 1984, 324]) torch.float32 torch.Size([64, 1984, 324]) torch.float32
torch.Size([64, 1984]) <class 'torch.Tensor'> torch.int64
torch.Size([64, 1984, 324]) torch.float32 torch.Size([64, 1984, 324]) torch.float32
torch.Size([64, 1984]) <class 'torch.Tensor'> torch.int64
torch.Size([64, 1984, 324]) torch.float32 torch.Size([64, 1984, 324]) torch.float32
torch.Size([64, 1984]) <class 'torch.Tensor'> torch.int64
torch.Size([64, 1984, 324]) torch.float32 torch.Size([64, 1984, 324]) torch.float32
torch.Size([64, 1984]) <class 'torch.Tensor'> torch.int64
torch.Size([64, 1984, 324]) torch.float32 torch.Size([64, 1984, 324]) torch.float32
torch.Size([64, 1984]) <class 'torch.Tensor'> torch.int64
torch.Size([64, 1984, 324]) torch.float32 torch.Size([64, 1984, 324]) torch.float32
torch.

It seems to be processing about 1.5M chars/sec in my gtx1080

    In [61]: 4000*1984*100                                                                                                             
    Out[61]: 793600000
    
    In [62]: _ / (60*8+49)                                                                                                             
    Out[62]: 1500189.0359168241

And wall time about 1.1M chars/sec:

    In [63]: 4000*1984*100                                                                                                                                        
    Out[63]: 793600000

    In [64]: _ / (60*11+26)                                                                                                                                       
    Out[64]: 1156851.3119533528


In [65]:
len(epoch_loss), len(epoch_loss[-1])

(182, 10)

In [64]:
model.save_model("2segments", "trained_models")

Now what needs to be worked on (with the current model already pre-trained to overfitting for the mapping) is to actually decode to index and character to see the kind of errors in decoding end-to-end.

For this I have to make the decoder from the utf8codebook embedding to the code index and then to the utf-8 character to visually analyze the kind of errors.
