# Test whether LanguageModelLoader delivers batches where each row is a contiguous sequence of tokens


Status of the test:

- It passes with MyLanguageModelLoader.

- With fastai's LanguageModelLoader, however, it fails: the loader throws an exception while indexing the jagged array.


In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
from fastai.text import * 
from languagemodelloader import *

In [3]:
def getAllBatches(data,epochs=1):
    """Run `data` for `epochs` epochs and stitch all input batches together.

    `data` must provide `on_epoch_begin()` / `on_epoch_end()` and iterate as `(xb, yb)`
    pairs where `xb.data.numpy()` gives an array with at least two dimensions.
    The targets `yb` are ignored.

    Returns `(x, countIte)`:
      x        -- every `xb` (over all epochs) concatenated along axis 1,
                  or None when no batch was yielded
      countIte -- number of batches seen in the last epoch, 0 when `epochs` is 0
    """
    chunks = []
    countIte = 0  # defined up front so that epochs=0 returns 0 instead of raising UnboundLocalError
    for _ in range(epochs):
        data.on_epoch_begin()
        countIte = 0  # the count is per epoch, so it can be compared with len(data)
        for xb,_yb in data:
            countIte += 1
            # copy, in case the loader hands out views of a buffer it reuses for later batches
            chunks.append(xb.data.numpy().copy())
        data.on_epoch_end()
    # one concatenate at the end avoids re-copying the growing array for every batch
    x = np.concatenate(chunks, axis=1) if chunks else None
    return x,countIte

In [4]:
def jaggedArrayWithConsecutiveNumbers(bs,sentence_length,iterations,minTokens):
    """Create a jagged array with a random layout, filled with consecutive numbers.

    The numbers 1..total, with total = bs*sentence_length*iterations, are split into rows.
    A row normally gets minTokens + int(r*sentence_length) tokens, r drawn from
    np.random.random(), clamped to at least 1 and at most the number of tokens still
    missing. Once fewer than sentence_length tokens are missing they all go into one last row.

    Row lengths come from numpy's global random state, so np.random.seed makes the
    layout reproducible.

    Returns (jagged, count): jagged is a 1-d object array holding one integer array per row,
    count is the last number generated (equal to total, or 0 when total <= 0).
    """
    rows  = []
    count = 0
    total = bs*sentence_length*iterations
    while count < total:
        remaining = total-count
        if remaining < sentence_length:
            nb = remaining
        else:
            nb = minTokens+int(np.random.random() * sentence_length)
            # an empty row (possible when minTokens<=0) would never advance count, and a row
            # longer than `remaining` (possible when minTokens>1) would overshoot total
            nb = min(max(nb, 1), remaining)
        rows.append(np.arange(count+1,count+1+nb))
        count += nb
    # np.asarray(rows) raises ValueError for ragged rows on NumPy >= 1.24 and silently gives a
    # 2-d array when all rows happen to be equally long; filling an object array row by row
    # always yields a 1-d array of arrays.
    jagged = np.empty(len(rows), dtype=object)
    for i,row in enumerate(rows): jagged[i] = row
    return jagged, count

def printJagged(jagged, count):
    "print the token count on the first line, then every row of `jagged` on a line of its own"
    lines = [f"count:{count}"]
    lines.extend(str(row) for row in jagged)
    print("\n".join(lines))

In [5]:
def test_datadirection( bs,seq_len,sentence_length, iterations,minTokens, backwards=False, nbInoutGenerations=10, log=False):
    """Assert that batches read through LanguageModelPreLoader hold contiguous tokens.

    Repeats `nbInoutGenerations` times: build a random jagged array of consecutive numbers,
    use it as both the training and the validation ids of a TextLMDataBunch, read the training
    set back through LanguageModelPreLoader + DataLoader, and check that
      * the number of batches iterated equals len() of the DataLoader,
      * neighbouring columns of the concatenated batches differ by +1 (-1 when `backwards`),
      * the last token of each row, advanced by that same step, equals the first token of the
        following row.
    An AssertionError is raised at the first check that fails. With `log=True` the jagged array,
    the concatenated batches and the column differences are printed/displayed.

    NOTE: `sentence_length` is accepted but never used; `seq_len` is what gets handed to
    jaggedArrayWithConsecutiveNumbers as its sentence length.
    """
    # expected step from one token to the next: +1 reading forwards, -1 reading backwards
    step = -1 if backwards else 1

    for _ in range(nbInoutGenerations):
        ids, n_tokens = jaggedArrayWithConsecutiveNumbers(bs, seq_len, iterations, minTokens)
        if log:
            print("jagged array")
            printJagged(ids, n_tokens)

        # the same ids serve as training and as validation set
        db = TextLMDataBunch.from_ids(".", None, ids, ids, bptt=seq_len, bs=bs)
        # to exercise the custom loader instead, use:
        #   MyLanguageModelPreLoader(db.train_ds, bs, seq_len, backwards=backwards, shuffle=False, log=False)
        loader = LanguageModelPreLoader(db.train_ds, bs=bs, bptt=seq_len, backwards=backwards, shuffle=False)
        dl = DataLoader(loader, bs, shuffle=False)
        batches, n_iter = getAllBatches(dl)
        if log:
            print("concatenated batchs")
            display(pd.DataFrame(data=batches))

        assert n_iter==len(dl), f"number of iteration does not match: countIte:{n_iter}!= len(data):{len(dl)} "

        # within a row, every token must follow its left neighbour by exactly one step
        col_diff = batches[:,1:] - batches[:,:-1]
        if log:
            print("column diffs")
            display(pd.DataFrame(data=col_diff))
        assert (col_diff.flatten()==step).all(), "the sequences of batch rows are not contiguous"

        # across rows, each row must start one step after the end of the row above it
        row = np.arange(1,len(batches))
        prev_last, this_first = batches[row-1,-1], batches[row,0]
        assert np.all(prev_last+step == this_first), f"last token i row-1 {prev_last}+{step} must be equal to first element in row:{this_first}"
        

# Test continuity of tokens in batches loaded forwards and backwards

In [6]:
# Dimensions of the generated test data, used to check whether the LanguageModelLoader
# returns contiguous tokens. The data must be sized so that the batches do not wrap around.
bs = 4
seq_len = 3                    # = bptt
sentence_length = 7 * seq_len
iterations = 2
minTokens = 1                  # minimum number of tokens in one row of the ragged array

In [7]:
%time test_datadirection( bs, seq_len, sentence_length, iterations, minTokens, backwards=False, nbInoutGenerations=1, log=True)

jagged array
count:24
[1 2]
[3 4]
[5]
[6 7]
[8]
[ 9 10]
[11 12]
[13 14 15]
[16 17]
[18 19 20]
[21]
[22 23]
[24]


ValueError: need at least one array to concatenate

In [8]:
%time test_datadirection( bs, seq_len,  sentence_length, iterations, minTokens, backwards=True, nbInoutGenerations=1, log=True)

jagged array
count:24
[1]
[2 3 4]
[5 6 7]
[ 8  9 10]
[11]
[12 13]
[14 15 16]
[17]
[18 19]
[20 21 22]
[23 24]


ValueError: need at least one array to concatenate