In [1]:
#hide
%reload_ext autoreload
%autoreload 2

from fastai2.basics import *
from fastai2.text.all import *
from fastai2.callback.all import *

from transformers import AutoModelWithLMHead, AutoModel, AutoModelForPreTraining, AutoTokenizer, AutoConfig

import tqdm
import json, pickle

In [2]:
# import pickle
# TODO: clean lines (words longer then 10)
# offsets = []
# with open("he.txt", "r") as file:
#     while True:
#         line = file.readline()
#         if not line:
#             break
#         offsets.append(file.tell())
#         if len(offsets) % 10000 == 0:
#             print(len(offsets))
# # pickle.dump(offsets, open("offsets.pkl", "wb"))


# lines_offsets = random.choices(offsets, k=1000000)
# lines_offsets = sorted(lines_offsets)
# create_dict(lines_offsets)

In [3]:
class OscarDataset(torch.utils.data.Dataset):
    def __init__(self, indexs):
        self.indexs = indexs
        self.file = open("he.txt", "r")
        
    def __len__(self):
        return len(self.indexs)
    
    def __getitem__(self, idx):
        return self.get_line(idx)
    
    
    def get_line(self, idx): 
        point = self.indexs[idx]
        self.file.seek(point)
        line = self.file.readline()
        return line
    
import sentencepiece
import collections
import os
def create_dict(lines_offsets, size_v=5000, name="m"):
    lines = [get_line(offset) for offset in tqdm.tqdm_notebook(lines_offsets)]
    if os.path.isfile("file.txt"): os.remove("file.txt")
    with open(f"file.txt", "w") as f:
        for w in lines:
            f.writelines(w + "\n")

    sentencepiece.SentencePieceTrainer.Train(f'--input=file.txt --model_prefix={name} --vocab_size={size_v}')


def get_vocab(name = "row.dic", path = None):
    spm = sentencepiece.SentencePieceProcessor()
    spm.file_path = name
    spm.Load(spm.file_path)
    spm.words = [spm.id_to_piece(id) for id in range(spm.get_piece_size())]
    spm.vocab = collections.defaultdict(int,{v:k for k,v in enumerate(spm.words)})         
    spm.words_len = len(spm.words)
    return spm

def get_oscar(bs=4):
    train_idx = offsets[:int(len(offsets) * 0.9)]
    valid_idx = offsets[int(len(offsets) * 0.9):]
    train_ds = OscarDataset(train_idx)
    valid_ds = OscarDataset(valid_idx)
    train_dl = DataLoader(train_ds, batch_size=bs, after_batch=collate_fn)
    valid_dl = DataLoader(valid_ds, batch_size=bs, after_batch=collate_fn)
    return train_dl, valid_dl

In [4]:
def get_line(point): 
    file.seek(point)
    line = file.readline().replace("\n", "")
    return line

def get_words_by_len():
    file = open("he.txt")
    lines_offsets = random.choices(offsets, k=100000)
    lines_offsets = sorted(lines_offsets)
    lines = [get_line(offset) for offset in tqdm.tqdm_notebook(lines_offsets)]
    file.close()
    words = [j for l in lines for j in l.split(" ")]
    words_count = Counter(words).most_common()
    words = [i[0] for i in words_count if i[1] > 5]
    words = [i for i in words if len(i) < 10]
    words_by_len = {}
    for w in words:
        subwords = spm.encode_as_pieces(w)
        words_by_len.setdefault(len(subwords), []).append(w)
    return words_by_len

# pickle.dump(words_by_len, open("words_by_len.pkl", "wb"))

In [5]:
spm = get_vocab("./sp_heb.model")
offsets = pickle.load(open("offsets.pkl", "rb"))
words_by_len = pickle.load(open("words_by_len.pkl", "rb"))

In [13]:

def pad_lists(lists, pad_value = 0):
    num_of_list = len(lists)
    max_len = max([len(i) for i in lists])
    pad_tensor  = torch.zeros(num_of_list, max_len).long() + pad_value
    for n, t in enumerate(lists):
        pad_tensor[n, :len(t)] = torch.tensor(t)
    return pad_tensor

In [15]:
sys.path.append("/home/ubuntu/bert/")
from models import Config, Transformer
c = Config(vocab_size = spm.words_len, dim=100, n_layers=4, n_heads=4)

class Model(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.Transformer = Transformer(cfg)
        self.layer = nn.Linear(cfg.dim, cfg.vocab_size)

    def forward(self, x):
        x = self.Transformer(x)
        x = self.layer(x)
        return x

m = Model(c)

In [41]:
a = list(range(10))

6

In [47]:
def create_input_output_from_lines(lines, max_len=512):
    input_subwords, target_subword, mask = [], [], []
    for line in lines:
        line_input_subwords, line_target_subwords, line_mask = [], [], []
        words = line.split(" ")
# TODO: shift in some way
#         if len(words) > max_len:
#             idx = random.randint(0, len(words) - max_len)
#             words = words[idx:idx + max_len]
        for word in words:
            word_subword = spm.encode_as_ids(word)
            if len(line_target_subwords) + len(word_subword) > max_len:
                continue
            line_target_subwords.extend(word_subword)
            r = random.random()
            if r < 0.15 and len(word_subword) <= 10:
                if r < 0.12: # mask
                    line_input_subwords.extend([2] * len(word_subword)) 
                elif r < 0.135: # replace
                    random_word = random.choice(words_by_len[len(word_subword)])
                    line_input_subwords.extend(spm.encode_as_ids(random_word)) 
                else: # keep
                    line_input_subwords.extend(word_subword)
                line_mask.extend([1] * len(word_subword)) 
            else:
                line_input_subwords.extend(word_subword)
                line_mask.extend([0] * len(word_subword)) 
        input_subwords.append(line_input_subwords)
        target_subword.append(line_target_subwords)
        mask.append(line_mask)
    return input_subwords, target_subword, mask

def collate_fn(lines):
    input_subwords, target_subword, mask = create_input_output_from_lines(lines)    
    input_tensor = pad_lists(input_subwords, 1)
    target_tensor = pad_lists(target_subword, 1)
    mask_tensor = pad_lists(mask)
    return input_tensor, [target_tensor, mask_tensor]

In [48]:
tdl, vdl = get_oscar(16)
dls = DataLoaders(*[tdl, vdl])
for (x,y) in tdl:
    print(x.size())
    r = m(x)
    _ = loss_func(r, y)
    _ = acc_func(r, y)

torch.Size([16, 154])
torch.Size([16, 373])
torch.Size([16, 95])
torch.Size([16, 243])
torch.Size([16, 185])
torch.Size([16, 191])
torch.Size([16, 161])
torch.Size([16, 186])
torch.Size([16, 246])
torch.Size([16, 336])
torch.Size([16, 189])
torch.Size([16, 329])
torch.Size([16, 254])
torch.Size([16, 512])
torch.Size([16, 251])
torch.Size([16, 178])
torch.Size([16, 156])
torch.Size([16, 137])
torch.Size([16, 151])
torch.Size([16, 161])
torch.Size([16, 159])
torch.Size([16, 264])
torch.Size([16, 512])
torch.Size([16, 121])
torch.Size([16, 180])
torch.Size([16, 174])
torch.Size([16, 178])
torch.Size([16, 353])
torch.Size([16, 194])
torch.Size([16, 168])
torch.Size([16, 450])
torch.Size([16, 163])
torch.Size([16, 148])
torch.Size([16, 265])
torch.Size([16, 277])
torch.Size([16, 295])
torch.Size([16, 444])
torch.Size([16, 273])
torch.Size([16, 288])
torch.Size([16, 384])
torch.Size([16, 61])
torch.Size([16, 198])
torch.Size([16, 458])
torch.Size([16, 264])
torch.Size([16, 330])
torch.Size([

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3331, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-48-94876011a1be>", line 5, in <module>
    r = m(x)
  File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/nn/modules/module.py", line 532, in __call__
    result = self.forward(*input, **kwargs)
  File "<ipython-input-15-72cb819d11ab>", line 12, in forward
    x = self.Transformer(x)
  File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/nn/modules/module.py", line 532, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/ubuntu/bert/models.py", line 154, in forward
    h = block(h, mask)
  File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/nn/modules/module.py", line 532, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/ubu

KeyboardInterrupt: 

In [9]:
def loss_func(output_tensor, target):
    target_tensor, target_mask = target
    return F.cross_entropy(output_tensor[target_mask == 1],
                           target_tensor[target_mask == 1])

def acc_func(output_tensor, target):
    target_tensor, target_mask = target
    return accuracy(output_tensor[target_mask == 1],
                           target_tensor[target_mask == 1].float())

In [12]:
learner = Learner(dls, 
                  m,
                  loss_func=loss_func,
                  metrics=[acc_func])

In [None]:
learner.fit_one_cycle(1)

epoch,train_loss,valid_loss,acc_func,time
