In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
from fastai import * 
from fastai.text import * 
import numpy as np
import io
import pathlib
import asyncio
from fastai_sentencepiece import *
from filetokenizer import *
from languagemodelloader import *

In [3]:
lang="en"
minToks = 10

In [4]:
#pathData       = Path("../../data/nlp")
pathData       = Path("../nlp-data")
path           = pathData / lang
pathDump       = path/"wiki-dump"
pathJson       = path/"wiki-json"

pathTrainValid = path/"wiki-train_valid"
pathTxt        = pathTrainValid/"txt"
pathToks       = pathTrainValid/"toks"
pathcsv        = pathTrainValid/"wiki.csv"

cache_name   = "sp-model"
pathVocab    = pathTrainValid / cache_name

In [5]:
# Build the SentencePiece tokenizer factory plus the train/valid file tokenizers.
# Original note: sections with fewer than `minToks` tokens are discarded
# (minToks is handed to FileTokenizer below).
spt_func  = partial(SentencepieceTokenizer.create, pathVocab=pathVocab)
# functools.partial objects carry no __name__, so the class name is copied onto the partial.
spt_func.__name__ = SentencepieceTokenizer.__name__
# Use the notebook-level `lang` setting instead of repeating the literal "en",
# so changing `lang` in the config cell also changes the tokenizer language.
spt       = spt_func(lang=lang)
# NOTE(review): `text` is presumably the fastai.text module here; a later cell rebinds
# `text` to a plain string, after which re-running this line would fail.
pad_idx   = spt.vocab().numericalize([text.transform.PAD])[0]
vocab,max_vocab = spt.vocab(), len(spt.vocab().itos)
# Leave one CPU free for the notebook itself, but always use at least one worker.
trainTokenizer = FileTokenizer(pathToks/"train", spt_func,lang,vocab,minToks=minToks,n_cpus=max(defaults.cpus-1,1))
validTokenizer = FileTokenizer(pathToks/"valid", spt_func,lang,vocab,minToks=minToks,n_cpus=max(defaults.cpus-1,1))

print(trainTokenizer)
print("size og vocabulary:", max_vocab)
print("pad_idx:",pad_idx)

print(spt.vocab().numericalize( ["xxunk" ,"xxbos","xxeos" ,"xxpad" ,"xxmaj" ,"xxup" ,"xxrep" ,"xxwrep", "xxfld"]  ))
#sentence = ["She is tall.", "He is small"]
#tokenizer._process_all_1(sentence)

self.dtype:<class 'numpy.int16'>
self.dtype:<class 'numpy.int16'>
Tokenizer SentencepieceTokenizer in en with the following rules:
 - fix_html
 - replace_rep
 - replace_wrep
 - spec_add_spaces
 - rm_useless_spaces
 - rm_extra_lineshift
 - replace_all_caps
 - deal_caps

size og vocabulary: 32000
pad_idx: 2
[0, 1, 0, 2, 3, 4, 5, 6, 7]


In [6]:
# Collect the raw text files and split them into a training and a validation part.
files   = np.asarray( list(pathTxt.glob("*.txt")) )
nrows   = len(files)
split   = 0.2   # fraction of the files that goes to the validation list
# NOTE(review): `index` (a random permutation of the file indices) is computed but never
# applied below, so the split follows glob order - confirm whether shuffling was intended.
splitindex, index = int(nrows*split+.5), np.random.permutation(np.arange(nrows)) 

chunksize=0

# Slice with an explicit boundary counted from the front. With negative slicing a
# splitindex of 0 (very few files) gives files[:-0] == [] and files[-0:] == all files,
# i.e. an empty training list and everything in validation.
trainList = TextList( files[:nrows-splitindex], vocab=vocab, pad_idx=pad_idx, 
                      processor=[FileTokenizeProcessor(tokenizer=trainTokenizer, 
                                                       chunksize=chunksize, mark_fields=False)])

validList = TextList( files[nrows-splitindex:], vocab=vocab, pad_idx=pad_idx, 
                      processor=[FileTokenizeProcessor(tokenizer=validTokenizer, 
                                                       chunksize=chunksize, mark_fields=False)])


In [7]:
%time trainIDS = trainTokenizer.getIds()

CPU times: user 1min 50s, sys: 7.87 s, total: 1min 58s
Wall time: 2min 4s


In [8]:
%time validIDS=validTokenizer.getIds()

CPU times: user 35.3 s, sys: 3.74 s, total: 39 s
Wall time: 39.7 s


In [9]:
from languagemodelloader import *

# Upper bounds used to slice trainIDS / validIDS before building the databunch.
# Both must be positive for the slicing to apply; use the commented line to take everything.
# NOTE(review): the slice counts elements of trainIDS/validIDS - whether one element is a
# token or a whole section cannot be told from this cell; confirm against getIds().
nTrainToks, nValidToks = int(5e6),int(1e6)
#nTrainToks, nValidToks = -1,-1
trainIDS_, validIDS_ = (
    (trainIDS[0:nTrainToks], validIDS[0:nValidToks])
    if nTrainToks>0 and nValidToks>0
    else (trainIDS, validIDS)
)

dblm = MyTextLMDataBunch.from_ids( pathTrainValid, vocab, trainIDS_, validIDS_, bptt=400, p_bptt=0.0, bs=6)


MyTextLMDataBunch def create
LanguageModelLoader.__init__ Used GB memory:9.35 batches:169556 nToks:406932154 bptt:400 p_bptt:0.0 shuffle:True backwards:False
LanguageModelLoader.__init__ Used GB memory:9.35 batches:35863 nToks:86070425 bptt:400 p_bptt:0.0 shuffle:False backwards:False
LanguageModelLoader.__init__ Used GB memory:9.38 batches:169556 nToks:406932154 bptt:400 p_bptt:0.0 shuffle:False backwards:False


In [10]:
len(trainIDS)/1e6

24.871686

In [11]:
# Probe whether `xc`, `r` and dblm.train_dl.x.items refer to the same underlying data:
# write through one name and print the value seen through all three.
xc = dblm.train_dl.x.items
r = xc[0]
original_first = r[0]   # remembered so the probe write can be undone afterwards
print(f"{xc[0][0]} - {dblm.train_dl.x.items[0][0]} - {r[0]}")
try:
    dblm.train_dl.x.items[0][0]=2
    print(f"{xc[0][0]} - {dblm.train_dl.x.items[0][0]} - {r[0]}")
finally:
    # The write above changes the real training data; restore it so later cells
    # do not train on a corrupted first token.
    dblm.train_dl.x.items[0][0]=original_first
xc[0].size

4464 - 4464 - 4464
2 - 2 - 2


134

In [None]:
#%load_ext line_profiler

data = MyLanguageModelLoader(dblm.train_dl,bs=32)
print(len(data))
def getAllBatches(data,epochs=1):
    """Walk through every batch of `data` for `epochs` passes with fastprogress bars.

    Each item yielded by `data` is unpacked into two values and then discarded;
    the function exists only to drive the iterator (e.g. for timing). Returns None.
    """
    from fastprogress import master_bar, progress_bar
    epoch_bar = master_bar(range(epochs))
    for _epoch in epoch_bar:
        # The inner bar is attached to the epoch bar so both render together.
        for _xb,_yb in progress_bar(data, parent=epoch_bar):
            pass
            
def getAllBatches2(data,epochs=1):
    """Iterate over all of `data` `epochs` times without any progress display.

    Every item is unpacked into two values and dropped; nothing is returned.
    Useful as a plain driver when timing or profiling the loader.
    """
    for _ in range(epochs):
        for _xb,_yb in data:
            pass
#%time getAllBatches(data)
#%lprun -f MyLanguageModelLoader.fill_row getAllBatches2(data)
#%lprun -f MyLanguageModelLoader.CircularIndex.__getitem__ getAllBatches2(data)
%time getAllBatches2(data)

LanguageModelLoader.__init__ Used GB memory:10.67 batches:181667 nToks:406932154 bptt:70 p_bptt:0.95 shuffle:False backwards:False
181667
LanguageModelLoader.allocate_buffers Used GB memory:10.67 shuffle:False backwards:False


In [None]:
toks   =  np.asarray([v for v in spt.vocab().itos])
tokslen = np.asarray([len(v) for v in spt.vocab().itos])
ix_sort = np.argsort(tokslen)
#tokslen = tokslen[ix_sort]
#toks    = toks[ix_sort]
print("number og len(toks)=)1", np.sum(tokslen==1))

In [None]:
charaters = ",".join(toks[tokslen==1])
#np.sum(toks=='a')
charaters

In [None]:
spt.vocab().numericalize(["a","b","ab","xxbos"])

In [None]:
spt.vocab().textify([56,299])

In [None]:
#np.asarray([65536],dtype=np.int16)
np.asarray([65536/2-1],dtype=np.int16)

In [None]:
text="Oak Creek, Colorado\n\n\nOak Creek is a Statutory Town in Routt County, Colorado, United States. The population was 849 at the 2000 census. It was incorporated in 1907 as a coal mining town. The community was named for scrub oak near the original town site.\nOak Creek is located at (40.275049, -106.957607)."
text="[[Russian Revolution (1917)|Russian Revolution|Russian Revolution]]]"

text="in the historical canon), are the anarchist territories during the [[Spanish Revolution of 1936|Spanish Revolution]] and "+\
     "the [[Free Territory]] during the [[Russian Revolution (1917)|Russian Revolution]].Through the efforts "+\
     "and influence of the [[Anarchism in Spain|Spanish anarchists]] during the Spanish ,詹,誠,词,遇,邱,郵,釋,鎭,閑,"
text=text+text
#text=["\nin the historical canon),", "\nare the anarchist territories during the \n[[Spanish Revolution of 1936|Spanish Revolution]] and ",
#      "the [[Free Territory]] during the [[Russian Revolution (1917)|Russian Revolution]].\nThrough the efforts ","and influence of the [[Anarchism in Spain|Spanish anarchists]] during the Spanish\n\n\n"]

In [None]:
#import fastai_sentencepiece

def extract_link_title(t:str) -> str:
    """Reduce wiki links in `t` to their target: ``[[target|label]]`` -> ``target``.

    How the substitution works:
      * the first alternative matches ``[[`` plus the target (up to ``]``, ``^``, ``|``,
        ``[`` or ``:``) and is replaced by the target itself (group 1);
      * the second alternative matches any run free of brackets and colons that sits
        directly before ``]]`` (typically ``|label``) and is replaced by nothing,
        because group 1 did not take part in that match;
      * any ``]]`` still left over is stripped afterwards.

    Caveat: plain text that directly precedes a ``]]`` is removed as well.

    The pattern and replacement are raw strings so that ``\\[`` and ``\\g`` are not
    treated as (invalid) string escapes; the regex itself is unchanged.
    """
    return re.sub(r'\[\[([^\]^|\[:]+)|([^\]\[:]+)\]\]', r'\g<1>', t).replace(']]','')
def remove_first_empty_lines(t:str) -> str:
    """Strip one leading newline from `t`, if there is one.

    Only a single ``\\n`` at the very start is removed; further leading newlines
    and all newlines elsewhere in the text are kept.
    """
    if t.startswith('\n'):
        return t[1:]
    return t

def keep_western(t:str) -> str:
    """Return `t` with every character outside printable ASCII (U+0020..U+007E) removed.

    The kept set is the same as the original character class (letters, digits, space
    and ASCII punctuation). Newlines, tabs and accented letters are dropped, as before.

    Two fixes compared to the previous pattern:
      * digits are restricted to ASCII 0-9; ``\\d`` on a str pattern also matched
        non-western Unicode digits, which contradicts the purpose of the function;
      * the pattern is a raw string, so no invalid string escapes are involved.
    """
    return re.sub(r'[^\x20-\x7E]', '', t)
                   
#\p{IsLatin}|[! -/:-@\[-`{-~]|[0-9]
                         
print(text)
print("------------") 
#for i,t in enumerate(text):
#    print(extract_link_title(t))
#    print(remove_first_empty_lines(t))
#%timeit re.sub('\[\[([^\]^|\[:]+)|([^\]\[:]+)\]\]', '\g<1>', text)
print(keep_western(text))
print("------------")



In [None]:
defaults.cpus

In [None]:
lang     = "en"
pathData = Path("../nlp-data")
path     = pathData / lang
pToks    = path/"wiki-train_valid"/"toks"


In [None]:
import re
#p = re.compile('([\r\n]+.?)+', r'\r\n')
p = re.compile("[\r\n]+.?")
txt = " 1\n\n\n\n\n"
p.sub("\n",txt)

In [None]:
imax = int(1e8)
def tuppel():
    """Micro-benchmark body: per iteration build a 2-tuple and read both items by index.

    Runs `imax` iterations (`imax` is a notebook-level variable) and returns None;
    all intermediate values are thrown away. Meant to be run under %timeit.
    """
    for i in range(imax): 
        # Pack the two values into a tuple ...
        r = (i,i+1)
        # ... and index into it again; the difference is computed only to do some work.
        l = r[1]-r[0]
%timeit tuppel()    

In [None]:
def simpleParams():
    """Micro-benchmark body: same arithmetic as the tuple variant, using two plain locals.

    Runs `imax` iterations (`imax` is a notebook-level variable) and returns None;
    all intermediate values are thrown away. Meant to be run under %timeit.
    """
    for i in range(imax): 
        # Keep the two values in separate local variables instead of a tuple.
        r0 = i
        r1 = i+1
        # The difference is computed only to do some work; it is never used.
        l  = r1-r0
%timeit simpleParams()        

In [None]:
lang     = "en"
pathData = Path("../nlp-data")
path     = pathData / lang
pTrain   = path/"wiki-train_valid"/"toks"/"train"

def loadAll(pTrain):
    """Read every ``*.npy`` file directly inside `pTrain` with np.load and drop the result.

    Nothing is returned or kept; the function only exercises the loading path
    (e.g. to time it). `pTrain` must offer ``glob`` like a pathlib.Path.
    """
    for npy_path in pTrain.glob("*.npy"):
        with npy_path.open("rb") as fh:
            np.load(fh)
%time loadAll(pTrain)
#s=None
gc.collect()

In [None]:
np.load??

In [None]:
?np.load

In [None]:
%%timeit
lang     = "en"
pathData = Path("../nlp-data")
path     = pathData / lang
p        = path/"wiki-train_valid"/"toks"/"train"/"0-ids.npy"
    
a=None
gc.collect()

In [None]:
            import tracemalloc
            tracemalloc.start()
            snapshot1 = tracemalloc.take_snapshot()

            -----your interesting code
            
            snapshot2 = tracemalloc.take_snapshot()
            top_stats = snapshot2.compare_to(snapshot1, 'lineno')
            print(f"Top 10 of {len(top_stats)}")
            for stat in top_stats[:10]: print(stat)
          

In [None]:
data = buffer[:bs*3].reshape(bs,3)

In [None]:
-1%9

In [None]:
l=9
for i in range(2*l):print(f"i:{i} i-backwards:{l-1 - i%l}")

In [None]:
seq_len=3
a = np.arange(bs*seq_len,-1,-1)
ix = np.arange(len(a))
print(ix)
print(a)
ts[ix] = a[ix]
print(t.shape)
print(t)

In [None]:
t.numel()/3

In [None]:
bs=4
sl=3
bs_dim=t.numel()/sl
t.view(-1,14)

In [None]:
import numpy as np
a = np.ones(5)
b = torch.from_numpy(a)
np.add(a, 1, out=a)
print(a)
print(b)

In [None]:
import asyncio
async def myfun():
    """Coroutine that sleeps for one second (via asyncio.sleep) and then returns 5."""
    delay_seconds, answer = 1, 5
    await asyncio.sleep(delay_seconds)
    return answer

In [None]:
await myfun()
await myfun()


In [None]:
async def hello():
    """Coroutine that sleeps for one second (via asyncio.sleep) and returns "hello"."""
    await asyncio.sleep(1)
    return "hello"

async def test():
    """Run ten `hello()` coroutines concurrently and return their results as a list.

    asyncio.gather schedules the coroutines on the loop that is already running this
    coroutine, so no loop lookup is needed. The previous version fetched the loop with
    get_event_loop(), re-installed that same loop with set_event_loop(), and threw the
    gathered results away (returning None).
    """
    return await asyncio.gather(*(hello() for _ in range(10)))

In [None]:
#await test()

loop = asyncio.get_event_loop()
tasks = []
for i in range(10):
    tasks.append(loop.create_task(hello()))
# all the tasks will automatically run
asyncio.set_event_loop(loop)
rest = asyncio.gather(*tasks)
await rest

In [None]:
# Experiment: run `test` through a thread pool (multiprocessing.dummy provides a
# thread-backed Pool with the same API as the process Pool).
from multiprocessing import Pool
from multiprocessing.dummy import Pool as ThreadPool 

# A pool with a single worker thread.
pool = ThreadPool(1) 
# NOTE(review): map() takes an iterable as its second argument; None is not iterable, so
# this call looks like it raises TypeError - confirm what input was intended.
# NOTE(review): `test` is declared with `async def` in this notebook, so calling it from a
# worker would only create a coroutine object; nothing here awaits or runs it.
results = pool.map(test,None)
# Stop accepting work, then wait for the worker thread to finish.
pool.close() 
pool.join()


In [None]:
        """
        async def hello():
            await asyncio.sleep(1)
            return "hello"
    
        fileblocks = partition_by_cores(files, n_cpus)
        print(f"fileblocks:{len(fileblocks)}")
        tasks = []
        i=0
        loop = asyncio.get_event_loop()
        for fb in fileblocks:
            print(f"task:{i}")
            i += 1
            results = tasks.append( loop.create_task( FileTokenizer.getIds_from_file(fb) ) )
        asyncio.set_event_loop(loop)
        results = asyncio.gather(*tasks)
        await results
        """


In [None]:
lang            = "en"
pathData       = Path("../nlp-data")
path           = pathData / lang
pathTrainValid = path/"wiki-train_valid"
pathNPY        = pathTrainValid/"dummy.npy"
pathNPZ        = pathTrainValid/"dummy.npz"


In [None]:
arrays = np.empty(10, dtype=object)
arrays

In [None]:
import io
a=np.arange(3,dtype=np.int32).astype(np.int32)
b=np.arange(5,dtype=np.int32).astype(np.int32)
c=np.arange(10,dtype=np.int32).astype(np.int32)


In [None]:
with pathNPY.open("wb") as f:
    np.save(f,a, allow_pickle=False, fix_imports=False)
    np.save(f,b, allow_pickle=False, fix_imports=False)
    np.save(f,c, allow_pickle=False, fix_imports=False)

with pathNPY.open("rb") as f:
    f.seek(0, 2); file_size = f.tell(); f.seek(0)
    while f.tell() != file_size:
        print(np.load(f))


In [None]:
arrays = np.empty(0, dtype=object)
arrays

In [None]:
arrays = [] #np.empty(3, dtype=object)
for i in range(3):
    #for i,a in enumerate(arrays):
    arrays.append( np.arange(i+1) )

#arrays2 = np.empty(3, dtype=object)
#arrays2[:3] = arrays[:3]
#arrays[1][0] = 10
print(arrays)

def save(arrays):
    """Write `arrays` to the file at `pathNPY` in .npy format (pickling allowed).

    A Python list is first converted to an object-dtype ndarray; anything else is
    passed to np.save unchanged. The target file is opened (and truncated) before
    the conversion takes place.
    """
    with pathNPY.open("wb") as fh:
        payload = np.asarray(arrays,dtype=object) if isinstance(arrays,list) else arrays
        np.save(fh,payload, allow_pickle=True, fix_imports=False)
        
def load():
    """Read back the array stored at `pathNPY` and return it.

    `save` writes lists as pickled object-dtype arrays. np.load refuses to unpickle
    unless allow_pickle=True is passed (the default is False since NumPy 1.16.3),
    so without the flag this function raises ValueError on those files.

    Security note: unpickling can execute arbitrary code. Only use this on files
    written by this notebook, never on files from an untrusted source.
    """
    with pathNPY.open("rb") as f:
        return np.load(f, allow_pickle=True)

save(arrays)
load()

In [None]:
%timeit save(arrays)

In [None]:
%timeit load()
print(load())

In [None]:
arrays = [] #np.empty(0, dtype=object)
for i in range(3):
    a = load()
    if len(a) > 0: arrays.extend( a.tolist() )
arrays = np.asarray(arrays,dtype=object)    
print(len(arrays))
print(arrays)