# UTF-8 Embedding and Decoder Test with Autoencoder

This file is dedicated to test Overfitting Autoencoder to compress the size of the input embedding from the multi-hot to a dense vector



In [1]:
import langmodels.utf8codec as utf8codec
from langmodels.utf8codec import *
import torch.nn.functional as F
import torch.nn as nn
import torch


Loading faiss with AVX2 support.
Loading faiss.


In [2]:
# load the codebook and all the dictionaries mapping the data
code_matrix, txt2code, code2txt, txt2num, num2txt = utf8codec._load_codebook()

In [3]:
type(num2txt)

collections.OrderedDict

In [4]:
all_data = np.array(list(num2txt.keys()))
all_data = all_data.reshape((-1,1))

In [5]:
# all_data

In [6]:
# np.random.shuffle(all_data)

In [7]:
# all_data

In [8]:
def _prepare_overfit_batch(num2txt, batch_size):
    """
    The idea is to prepare the list of all the numbers in batches, the batches are randomly mixed to avoid issues.
    each batch contains:
    (batch size, seq width, index)  ??
    (batch size, index)  ??
    :param num2txt: numeric index 2 string conversion dictionary containing the entire vocabulary
    :return:
    """
    # assert type(num2txt) == 'dict'
    all_data = np.array(list(num2txt.keys()))
    all_data = all_data.reshape((-1,1))
#     print(all_data.shape)
    # assume that we can hold all in memory
    arr = []
    for i in range(batch_size):
        data = np.copy(all_data)
#         print(data.shape)
        np.random.shuffle(data)
#         print(data.shape)
        arr.append(data.transpose())
        
    ret = np.stack(arr, axis=1)
    ret = ret.reshape(batch_size,-1)
#     print(ret.shape)
    return ret


In [9]:
# %%time
# btch = _prepare_overfit_batch(num2txt, 100)
# btch = utf8codec._prepare_overfit_batch(num2txt, 100)

In [34]:

def train_overfit(model, optimizer, loss_function, batches, epoch, device, log_interval=10):
#     model.train()
    train_loss = 0
    batch_loss = []
    batch_idx = 0
    for b in batches:
        tensor_data = torch.from_numpy(b).to(device).long()  #.double()  #.float()
        optimizer.zero_grad()
        # emb is obtained from the the pre-computed utf8codebook
        emb, res = model(tensor_data)
#         print(emb.shape,emb.dtype, res.shape, res.dtype)
        loss = loss_function(emb, res)
        loss.backward()
        train_loss += loss.data.item()  # [0]
        optimizer.step()
        if batch_idx % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx , len(batches),
                100. * batch_idx / len(tensor_data),
                train_loss / len(tensor_data)))
            batch_loss.append(train_loss)
        batch_idx += 1
    print('====> Epoch: {} Average loss: {:.8f}'.format(epoch, train_loss / len(batches)))
    return batch_loss


def test(model, test_data, epoch, device):
    model.eval()
    test_loss = 0
    for d in test_data:
        tensor_data = torch.from_numpy(d).to(device)
        res = model(data)
        test_loss += loss_function(tensor_data, res).data.item()  # [0]

    test_loss /= len(test_data)
    print('epoch: {}====> Test set loss: {:.4f}'.format(epoch, test_loss))



In [11]:
# from https://stackoverflow.com/questions/434287/what-is-the-most-pythonic-way-to-iterate-over-a-list-in-chunks
def chunker(seq, size):
    return (seq[pos:pos + size] for pos in range(0, len(seq), size))

In [12]:

# def _get_activation_fn(activation):
#     if activation == "sigmoid":
#         return F.sigmoid
#     elif activation == "tanh":
#         return F.tanh
#     elif activation == "relu":
#         return F.relu
#     elif activation == "gelu":
#         return F.gelu
#     else:
#         return None
#         # raise RuntimeError("activation should be sigmoid/tanh/relu/gelu, not %s." % activation)


In [46]:
model = UTF8Autoencoder(code_matrix, dim=64)

In [14]:
# prepare many batches so I have everything ready to train
nbatches = 4000
batch_size = 64
batches = []

In [47]:
%%time
for i in range(nbatches):
    btch = _prepare_overfit_batch(num2txt, batch_size)
    batches.append(btch)

CPU times: user 5min 23s, sys: 744 ms, total: 5min 24s
Wall time: 5min 24s


In [35]:
len(batches)

4000

In [36]:
# encoder(batches[0])

In [48]:
epochs = chunker(batches, batch_size)

In [49]:
device = device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = model.to(device)  #.float()

In [50]:
# optimizer = torch.optim.Adam(model.parameters(), lr=1e-6, weight_decay=0, amsgrad=False )
optimizer = torch.optim.AdamW(model.parameters())
# optimizer = torch.optim.AdamW(model.parameters(), lr=2e-4, weight_decay=1e-4)
loss_function = F.mse_loss
# loss_function = F.cross_entropy  # nn.CrossEntropyLoss()
# loss_function = nn.NLLLoss()
# loss_function = F.kl_div  # KL divergence

In [40]:
epoch_loss = []

In [51]:
%%time
epoch_count = 1
for e in epochs:
    eloss = train_overfit(model, optimizer, loss_function, e, epoch_count, device, log_interval=10)
    epoch_count+=1
#     if epoch_count == 20:
#         print("epoch {} decreasing learning_rate to {}".format(epoch_count, 1e-5))
#         optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, weight_decay=1e-4)
    epoch_loss.append(eloss)

====> Epoch: 1 Average loss: 0.00598492
====> Epoch: 2 Average loss: 0.00518080
====> Epoch: 3 Average loss: 0.00357404
====> Epoch: 4 Average loss: 0.00333345
====> Epoch: 5 Average loss: 0.00329011
====> Epoch: 6 Average loss: 0.00326059
====> Epoch: 7 Average loss: 0.00321791
====> Epoch: 8 Average loss: 0.00302285
====> Epoch: 9 Average loss: 0.00259405
====> Epoch: 10 Average loss: 0.00429060
====> Epoch: 11 Average loss: 0.00259621
====> Epoch: 12 Average loss: 0.00170907
====> Epoch: 13 Average loss: 0.00096458
====> Epoch: 14 Average loss: 0.00073451
====> Epoch: 15 Average loss: 0.00066962
====> Epoch: 16 Average loss: 0.00063368
====> Epoch: 17 Average loss: 0.00062811
====> Epoch: 18 Average loss: 0.00062659
====> Epoch: 19 Average loss: 0.00062238
====> Epoch: 20 Average loss: 0.00061781
====> Epoch: 21 Average loss: 0.00058747
====> Epoch: 22 Average loss: 0.00056874
====> Epoch: 23 Average loss: 0.00056622
====> Epoch: 24 Average loss: 0.00056436
====> Epoch: 25 Average l

### For an encoding of dimension 32
It seems to be processing about 1.5M chars/sec in my gtx1080

    In [61]: 4000*1984*100                                                                                                             
    Out[61]: 793600000
    
    In [62]: _ / (60*8+49)                                                                                                             
    Out[62]: 1500189.0359168241

And wall time about 1.1M chars/sec:

    In [63]: 4000*1984*100                                                                                                                                        
    Out[63]: 793600000

    In [64]: _ / (60*11+26)                                                                                                                                       
    Out[64]: 1156851.3119533528


    ====> Epoch: 40 Average loss: 0.00016435
    CPU times: user 8min 49s, sys: 2min 38s, total: 11min 28s
    Wall time: 11min 26s
    
    
### For an encoding of dimension 48 

    number of batches: 4000

    ====> Epoch: 63 Average loss: 0.00001490
    CPU times: user 8min 36s, sys: 2min 33s, total: 11min 10s
    Wall time: 11min 10s

Loss has gotten down by a wide margin and processing time seems about the same for this network.

The loss is much less with dimension 48 instead of 32 a lot less epochs, it seems that loss could be made less with more epochs

### For an encoding of dimension 64

    number of batches 8000
    
    ====> Epoch: 125 Average loss: 0.00000317
    CPU times: user 17min 38s, sys: 5min 17s, total: 22min 56s
    Wall time: 22min 56s
    
Processing time rests the same while loss goes down by another order of magnitude.


The issue with dimensionality is that good things start to happen with big dimensions due to the exponential growth in representational power, so for low dimensions many things won't work. The point is to find a balance with vectors of dimension big enough to make the representational power sufficient, and small enough to make it work in my PCs GPU once the network starts to grow with the next iterations on the complexity of the networks.

In [42]:
# len(epoch_loss), len(epoch_loss[-1])

In [52]:
model.save_model("2segments_d64", "trained_models")

Now what needs to be worked on (with the current model already pre-trained to overfitting for the mapping) is to actually decode to index and character to see the kind of errors in decoding end-to-end.

For this I have to make the decoder from the utf8codebook embedding to the code index and then to the utf-8 character to visually analyze the kind of errors.


In [None]:
#TODO FIXME the actual loss reporting is broken and results are BAD, so I have to make something better and include tensorboard