# Test whether LanguageModelLoader delivers batches in which each row is a continuous sequence of tokens


Status:

- The test passes with MyLanguageModelLoader.

- With fastai's LanguageModelLoader, however, an exception is thrown while indexing the jagged array.


In [None]:
# Load IPython's autoreload extension in mode 2 (per the IPython docs: reload all
# modules before executing code), so edits to imported modules are picked up
# without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [None]:
from fastai.text import * 
from languagemodelloader import *
# Rebind the LanguageModelPreLoader attribute of `text` to MyLanguageModelPreLoader.
# NOTE(review): presumably this makes fastai code that looks the class up through
# `text` use the local implementation — confirm against languagemodelloader.
text.LanguageModelPreLoader = MyLanguageModelPreLoader


In [None]:
def printJagged_(jagged, count=-1):
    "Print every row of `jagged` with its row index; print `count` first when it is not negative."
    if count >= 0:
        print(f"count:{count}")
    for row_ix, row in enumerate(jagged):
        print(f"row {row_ix}:{row}")
def getAllBatches(data,epochs=1,log=False):
    """Iterate `data` for `epochs` epochs and concatenate all x-batches column-wise.

    `data` must be iterable, yielding `(xb, yb)` pairs where `xb.data.numpy()`
    returns an array, and must provide `on_epoch_begin()` / `on_epoch_end()`,
    which are called around every epoch.

    Returns `(x, countIte)`:
      * `x` is the concatenation along axis 1 of a copy of every `xb`, in the
        order they were produced, or None when no batch was produced.
      * `countIte` is the number of batches seen in the LAST epoch (the counter
        is reset at the start of each epoch); 0 when `epochs` is 0.

    `log` is accepted for interface compatibility and is not used.
    """
    chunks   = []
    countIte = 0  # defined up front so that epochs=0 returns cleanly instead of raising
    for _ in range(epochs):
        data.on_epoch_begin()
        countIte = 0
        for xb,_ in data:
            countIte += 1
            # copy, so the collected data does not alias a buffer the loader may reuse
            chunks.append(xb.data.numpy().copy())
        data.on_epoch_end()
    # a single concatenate at the end avoids re-copying the growing result on every batch
    x = np.concatenate(chunks, axis=1) if chunks else None
    return x,countIte

In [None]:
def jaggedArrayWithConsecutiveNumbers(nSentences,sentence_length,iterations,minTokens):
    """Create a jagged array with a random layout, filled with consecutive numbers.

    Rows ("sentences") are appended until at least
    `nSentences*sentence_length*iterations` numbers have been generated. Row
    lengths are drawn as `minTokens + int(np.random.random()*sentence_length)`,
    except that a tail shorter than `sentence_length` becomes one final row.
    The numbers run 1, 2, 3, ... across the rows without gaps.

    Returns `(jagged, count)`:
      * `jagged` is always a 1-D object array whose elements are the 1-D integer
        rows, even when all rows happen to have the same length.
      * `count` is the total number of generated tokens, i.e. the last number
        used. It can exceed the target when `minTokens` is greater than 1.
    """
    total = nSentences*sentence_length*iterations
    rows  = []
    count = 0
    while count < total:
        remaining = total-count
        if remaining < sentence_length:
            nb = remaining
        else:
            nb = minTokens+int(np.random.random() * sentence_length)
        # never emit an empty row: it would hold no "last number" and make no progress
        nb = max(1, nb)
        rows.append(np.arange(count+1,count+1+nb))
        count += nb
    # Fill an object array element by element: np.asarray on ragged rows raises
    # ValueError on recent NumPy, and on equal-length rows it would silently
    # return a 2-D array instead of a jagged one.
    jagged = np.empty(len(rows), dtype=object)
    for i,row in enumerate(rows): jagged[i] = row
    return jagged, count

In [None]:
def get__ri(jagged, toks,backwards):
    """Return, for every token in `toks`, its offset inside its row of `jagged`, or -1.

    The offset is only recorded when the row continues past the token in the
    reading direction: forwards the token must not be the last element of its
    row, backwards it must not be the first. Tokens that end a row, or that are
    not found in any row, keep the value -1. The result has the shape of `toks`.
    """
    offsets = np.zeros_like(toks).flatten() - 1
    for pos, tok in enumerate(toks.flatten()):
        for sentence in jagged:
            hits = np.flatnonzero(sentence == tok)
            if len(hits) == 0:
                continue
            first = hits[0]
            # does the sentence continue beyond this token in the reading direction?
            continues = first > 0 if backwards else first + 1 < len(sentence)
            if continues:
                offsets[pos] = first
    return offsets.reshape(toks.shape)
    
def test_datadirection( bs,seq_len,sentence_length, iterations, minTokens, nSentences, nEpochs=1,shuffle=False,
                        backwards=False, nbInoutGenerations=10, log=False):
    """Check that unfinished sentences continue seamlessly from one batch to the next.

    For each of `nbInoutGenerations` rounds a fresh jagged array of consecutive
    numbers is generated, wrapped in a TextLMDataBunch (used as both train and
    valid ids) and read through MyLanguageModelPreLoader + DataLoader for
    `nEpochs` epochs. All x-batches are concatenated column-wise, and at every
    boundary between two batches the difference between the first token of the
    next batch and the last token of the previous one is computed. Wherever
    get__ri reports that the last token did not end its sentence, that
    difference must be 1 (or -1 when `backwards` is True).

    Raises AssertionError when the number of iterations of the last epoch
    differs from len(dl), or when a sentence is broken across a batch boundary.
    With `log` True the intermediate arrays are printed/displayed.

    NOTE(review): `shuffle` is accepted but not forwarded — the pre-loader and
    the DataLoader are both built with shuffle=False. Confirm this is intended.
    """
    for i in range(nbInoutGenerations):
        if log:print("\nnew generation")
        jagged,countTokens = jaggedArrayWithConsecutiveNumbers(nSentences,sentence_length,iterations,minTokens)
        if log: printJagged_(jagged)
        
        # the same generated ids serve as both training and validation data
        trainIDS = validIDS = jagged
        db      = TextLMDataBunch.from_ids( ".", None, trainIDS, validIDS, bptt=seq_len, bs=bs, no_check=True)
        data    = MyLanguageModelPreLoader(db.train_ds, bs=bs, bptt=seq_len, backwards=backwards, shuffle=False)
        #data    = LanguageModelPreLoader(db.train_ds, bs=bs, bptt=seq_len, backwards=backwards, shuffle=shuffle)
        dl      = DataLoader(data, bs, shuffle=False)
        batches, countIte = getAllBatches(dl,nEpochs,log)
        if log: 
            print("concatenated batchs")
            display(pd.DataFrame(data=batches))

        # getAllBatches reports the number of batches of the last epoch
        assert countIte==len(dl), f"number of iteration does not match: countIte:{countIte}!= len(data):{len(dl)} "
        
        # The difference from one column to the next must be 1 for aligned mini-batches
        # with forward indexing of the data (forward is the default, i.e. backwards=False),
        # and -1 when reading backwards.
        
        diff_value = -1 if backwards else 1
        nr,nc = batches.shape

        # Column indices at which a new batch starts in the concatenated array (covers the
        # transitions from sequence to sequence and from epoch to epoch).
        # NOTE(review): assumes every batch is exactly seq_len columns wide and that
        # len(data)//bs is the number of batches per epoch — confirm against the loader.
        ixs    = np.arange(1,nEpochs*len(data)//bs)*seq_len
        #print(f"len(data):{len(data)}\nixs:{ixs}\ndiff_value {diff_value}")
        # first token of each batch minus the last token of the preceding batch, per row
        b_diff = batches[:,ixs] - batches[:,ixs-1]
        
        # Get the current offset into the jagged array for the last token of each batch.
        # An offset >= 0 means the sentence is not finished and must continue in the
        # next batch; -1 marks tokens after which no continuity is expected.
        ri = get__ri(jagged,batches[:,ixs-1],backwards) # offset in jagged for the last token in the batch
        # keep only the boundaries at which a sentence is expected to continue
        b_sub_diff = b_diff[ri>=0]
        if log: 
            print(f"get__ri.toks:\n{batches[:,ixs-1]}\nri:\n{ri}")
            print(f"diff_value {diff_value} \nb_diff:\n{b_diff}\nb_sub_diff:\n{b_sub_diff}")
        if b_sub_diff.size > 0: 
            assert (b_sub_diff.flatten()==diff_value).all(), f"broken sequences ri:\n{ri}\nb_diff:\n{b_diff}"
        elif log: 
            print(f"no expected continuity between batches")

# Test continuity of tokens in batches loaded forwards and backwards

In [None]:
# Create test data so that we can check whether the LanguageModelLoader returns contiguous tokens.
# The generated data must be sized so that the batches will not wrap around.
bs         =  8
seq_len    =  3  # passed as bptt to the data bunch and the loader
sentence_length = 1*seq_len
iterations =  1
minTokens  =  1 # lower bound used when drawing random row lengths for the ragged array
shuffle    =  True
nSentences =  10*bs
nEpochs    =  2

In [None]:
# Quick forward check: one generated data set, one epoch, backwards=False.
test_datadirection( bs, seq_len, sentence_length, iterations, minTokens, nSentences, nEpochs=1, shuffle=shuffle, 
                         backwards=False, nbInoutGenerations=1, log=False)

In [None]:
# Timed forward run: 1000 generated data sets, nEpochs epochs each, backwards=False.
%time test_datadirection( bs, seq_len, sentence_length, iterations, minTokens, nSentences, \
                   nEpochs=nEpochs, shuffle=shuffle, backwards=False, nbInoutGenerations=1000, log=False)

In [None]:
# Quick backward check: one generated data set, one epoch, backwards=True.
test_datadirection( bs, seq_len,  sentence_length, iterations, minTokens, nSentences,
                    nEpochs=1, shuffle=shuffle, backwards=True, nbInoutGenerations=1, log=False)

In [None]:
# Timed backward run: 1000 generated data sets, nEpochs epochs each, backwards=True.
%time test_datadirection( bs, seq_len, sentence_length, iterations, minTokens, nSentences, \
                   nEpochs=nEpochs, shuffle=shuffle, backwards=True, nbInoutGenerations=1000, log=False)