In [1]:
import os
import torch
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence
#from fairseq.data.dictionary import Dictionary
from tqdm import tqdm
from copy import deepcopy

class IMDbDataset(Dataset):
    """Raw-text dataset over the review files found below ``path``.

    Every file found (recursively) under the ``pos``, ``neg`` and ``unsup``
    sub-directories of ``path`` is one sample. Missing sub-directories simply
    contribute no samples.

    Args:
        path: directory that contains the ``pos`` / ``neg`` / ``unsup`` folders.
    """

    def __init__(self, path):
        self.path = path
        self.precompute()

    def precompute(self):
        """Collect the sample file paths and cache their count.

        Files are ordered split by split (``pos``, ``neg``, ``unsup``) and
        sorted by path within a split, so that an index refers to the same
        file regardless of the order in which the filesystem lists entries.
        """
        self.sample_files = []
        for split in ('pos', 'neg', 'unsup'):
            split_root = os.path.join(self.path, split)
            split_files = []
            # `_subdirs` is deliberately not called `dirs`, to avoid rebinding
            # any outer name while walking.
            for root, _subdirs, files in os.walk(split_root, topdown=False):
                split_files.extend(os.path.join(root, name) for name in files)
            self.sample_files.extend(sorted(split_files))
        self.length = len(self.sample_files)

    def __len__(self):
        """Return the number of sample files discovered by ``precompute``."""
        return self.length

    def __getitem__(self, idx):
        """Return the text of sample ``idx`` with HTML line breaks removed.

        Raises:
            IndexError: if ``idx`` is outside the list of sample files.
        """
        fpath = self.sample_files[idx]
        # Decode explicitly as UTF-8 instead of relying on the locale default,
        # which differs between platforms.
        with open(fpath, encoding='utf-8') as fp:
            contents = fp.read()
        # The break tags are dropped without inserting a replacement character.
        for ignore in ('<br>', '<br/>', '<br />'):
            contents = contents.replace(ignore, '')
        return contents
    


In [None]:
import os
import torch
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence
#from fairseq.data.dictionary import Dictionary
from tqdm import tqdm
from copy import deepcopy
from .imdb_enhanced import IMDbEnhancedDataset
from .imdb_dataset import IMDbDataset
from .vocab_builder import VocabBuilder

class TensorIMDbDataset(Dataset):
    """Dataset that turns tokenised reviews into padded, masked id sequences.

    Each item is a ``(srcs, tgts, length, mask)`` tuple, see ``__getitem__``.

    Args:
        path: dataset directory, forwarded to ``IMDbEnhancedDataset`` (and to
            ``IMDbDataset`` / ``VocabBuilder`` when a vocabulary must be built).
        tokenizer: tokenizer forwarded to the wrapped dataset / vocab builder.
        mask_builder: callable ``mask_builder(n)`` returning the positions to
            mask; must also expose a ``mask_token`` attribute.
        truncate_length: maximum number of tokens kept per sample.
        vocab: optional pre-built vocabulary; built from ``path`` when ``None``.
    """

    def __init__(self, path, tokenizer, mask_builder, truncate_length, vocab=None):
        self.path = path
        self.tokenizer = tokenizer
        self.mask_builder = mask_builder
        self.truncate_length = truncate_length
        self._dataset = IMDbEnhancedDataset(path, tokenizer, truncate_length)
        self.vocab = vocab
        self._construct_vocabulary()

    def _construct_vocabulary(self):
        """Build ``self.vocab`` from the raw dataset unless one was supplied."""
        if self.vocab is not None:
            return
        raw_dataset = IMDbDataset(self.path)
        self.vocab = VocabBuilder(raw_dataset, self.tokenizer, self.path).vocab()

    def __len__(self):
        """Return the number of samples in the wrapped dataset."""
        return len(self._dataset)

    def __getitem__(self, idx):
        """Build the source/target pair for sample ``idx``.

        Returns:
            A tuple ``(srcs, tgts, length, mask)`` where

            * ``srcs`` is the list of token ids followed by the eos id, padded
              with the pad id up to ``truncate_length + 1`` entries, with the
              masked positions overwritten by the mask-token id;
            * ``tgts`` is the eos id followed by the token ids, padded the
              same way;
            * ``length`` is ``len(srcs)`` taken after padding;
            * ``mask`` is a ``torch.zeros`` tensor as long as ``tgts`` holding
              1 at ``position + 1`` for every masked position.
        """
        pieces = self._dataset[idx]
        keep = min(self.truncate_length, len(pieces))
        masked_positions = self.mask_builder(keep)
        pieces = pieces[:keep]
        mask_id = self.vocab.index(self.mask_builder.mask_token)

        ids = [self.vocab.index(piece) for piece in pieces]

        # Source ends with eos, target starts with it (target is shifted by one).
        srcs = ids + [self.vocab.eos()]
        tgts = [self.vocab.eos()] + ids

        # Both sequences are filled up to truncate_length + 1 entries; a
        # non-positive repeat count leaves an already long list untouched.
        padded_size = self.truncate_length + 1
        srcs = srcs + [self.vocab.pad()] * (padded_size - len(srcs))
        tgts = tgts + [self.vocab.pad()] * (padded_size - len(tgts))

        mask = torch.zeros(len(tgts))
        eos_offset = 1  # targets carry a leading eos
        for position in masked_positions:
            mask[position + eos_offset] = 1
            srcs[position] = mask_id

        # NOTE(review): the length is measured after padding, so it is the
        # padded size rather than the number of real tokens - confirm that
        # this is what the consumers of `lengths` expect.
        return (srcs, tgts, len(srcs), mask)

    def get_collate_fn(self):
        """Return the collate function to pass to a ``DataLoader``."""
        return TensorIMDbDataset.collate

    @staticmethod
    def collate(samples):
        """Batch a sequence of ``__getitem__`` tuples.

        Returns:
            ``(srcs, tgts, lengths, masks)`` with ``lengths`` sorted in
            descending order and the rows of the three other tensors
            reordered to match.
        """
        src_rows, tgt_rows, sizes, mask_rows = list(zip(*samples))

        src_batch = torch.LongTensor(src_rows)
        tgt_batch = torch.LongTensor(tgt_rows)

        lengths, sort_order = torch.LongTensor(sizes).sort(descending=True)

        srcs = pad_sequence(src_batch, batch_first=True).index_select(0, sort_order)
        tgts = pad_sequence(tgt_batch, batch_first=True).index_select(0, sort_order)
        masks = torch.stack(mask_rows, dim=0).index_select(0, sort_order)

        return (srcs, tgts, lengths, masks)

In [3]:
from torch import nn
import torch
import sentencepiece as spm

class Tokenizer(nn.Module):
    """Common ``nn.Module`` base type for the tokenizers defined here."""


class SpaceTokenizer(Tokenizer):
    """Tokenizer whose ``forward`` splits its input with ``split()``."""

    def forward(self, seq):
        """Return ``seq.split()``, i.e. the pieces separated by whitespace runs."""
        pieces = seq.split()
        return pieces


class SentencePieceTokenizer:
    """Tokenizer backed by a SentencePiece model and its vocabulary file.

    Args:
        model_prefix: path prefix of the trained model; ``<prefix>.model`` is
            loaded into the processor and ``<prefix>.vocab`` is read to build
            the set of accepted pieces.
    """

    def __init__(self, model_prefix):
        self.prefix = model_prefix

        self.path = {}
        for key in ['model', 'vocab']:
            self.path[key] = '{}.{}'.format(self.prefix, key)

        self.sp = spm.SentencePieceProcessor()
        self.sp.Load(self.path['model'])

        # Build vocabulary.
        self.build_vocabulary()

    def build_vocabulary(self):
        """Read ``self.path['vocab']`` into the set ``self.vocab``.

        Each non-blank line is expected to hold a piece and its score
        separated by whitespace; only the piece is kept.

        Raises:
            ValueError: if a non-blank line does not split into two fields.
        """
        self.vocab = set()
        # The file is decoded as UTF-8 explicitly (pieces are not guaranteed
        # to be ASCII) and closed deterministically by the context manager.
        with open(self.path['vocab'], encoding='utf-8') as fp:
            for line in fp:
                fields = line.strip().split()
                if not fields:
                    # Tolerate blank lines, e.g. a trailing newline.
                    continue
                word, _score = fields
                self.vocab.add(word)

    def __call__(self, text):
        """Tokenise ``text`` and keep only the pieces present in ``self.vocab``.

        Returns:
            list of ``str`` pieces, in the order produced by the processor.
        """
        tokens = self.sp.EncodeAsPieces(text)

        # Pieces may arrive as bytes or as str depending on the binding in
        # use; only bytes need (and support) decoding.
        stokens = [
            token.decode('utf-8') if isinstance(token, bytes) else token
            for token in tokens
        ]
        return [token for token in stokens if token in self.vocab]


In [8]:
# Run configuration shared by the cells below.
# 'num_rollouts' used to be listed twice (1, then 5); a dict literal keeps the
# last value, so the effective setting 5 is the one spelled out here.
# NOTE(review): 'path' names a .tar.gz archive, yet it is later combined with
# 'train' / 'test' through os.path.join as if it were a directory, while
# 'spm_prefix' points below 'data/aclImdb' - confirm which location is meant.
args = {
    'num_rollouts': 5,
    'path': 'data/aclImdb_v1.tar.gz',
    'max_epochs': 10,
    'validate_every': 5,
    'criterion': 'dummy',
    'spm_prefix': 'data/aclImdb/train/imdb',
}

In [9]:
def generate_mask(sequence_length, batch_size=None, is_present=0.7):
    """
    Draw a random 0/1 mask with ``np.random.binomial(1, is_present, ...)``.

    The result has shape ``(sequence_length,)`` when ``batch_size`` is None
    and ``(batch_size, sequence_length)`` otherwise, and is returned as a
    long tensor.

    e.g.
    returns: [1, 1, 0, 1, 0, 1]
    """
    shape = (sequence_length,) if batch_size is None else (batch_size, sequence_length)
    draws = np.random.binomial(1, is_present, size=shape)
    return torch.from_numpy(draws).long()

class Mask:
    """Base class for generators of positions to mask.

    Subclasses provide ``forward(n)`` returning a list of indices; calling the
    instance runs ``forward`` and then checks the result.
    """

    # Token string that stands in for a masked position.
    mask_token = '__<m>__'

    def __call__(self, n):
        """Return ``self.forward(n)`` after validating it.

        Raises:
            AssertionError: if ``n`` or more indices were produced, or if any
                index is not one of ``0 .. n-1``.
        """
        idxs = self.forward(n)

        # Verify indices are okay.
        assert len(idxs) < n
        valid_set = set(range(n))
        assert all(i in valid_set for i in idxs)

        return idxs


class ContiguousRandom(Mask):
    """Masks ``n_chars`` consecutive positions starting at a random offset.

    Offsets are drawn from a private ``random.Random(42)``, so the sequence of
    draws is the same for every freshly constructed instance.
    """

    def __init__(self, n_chars):
        super().__init__()
        self.n_chars = n_chars
        self.r = random.Random(42)

    def forward(self, n):
        """Return ``n_chars`` consecutive indices for a sequence of length ``n``.

        The start is drawn uniformly from ``1 .. n - n_chars - 1`` (inclusive),
        so neither index 0 nor index ``n - 1`` is ever selected.

        Raises:
            ValueError: if the sequence is too short for that range to contain
                at least one start position (``n < n_chars + 2``).
        """
        n_chars = self.n_chars
        last_start = n - n_chars - 1
        if last_start < 1:
            # randint would fail here anyway with an opaque "empty range"
            # ValueError; raise the same exception type with a usable message.
            raise ValueError(
                'sequence of length {} is too short to mask {} contiguous '
                'positions; need length >= {}'.format(n, n_chars, n_chars + 2)
            )
        start = self.r.randint(1, last_start)
        # start <= n - n_chars - 1 guarantees start + n_chars <= n.
        return list(range(start, start + n_chars))

In [10]:
# Tokenizer backed by the SentencePiece files found at args['spm_prefix'].
spm_tokenize = SentencePieceTokenizer(args['spm_prefix'])

# Compute Batch Size
max_tokens_per_device = 48000
# max_tokens_per_device = 1000
# torch.cuda.device_count() is 0 on a CPU-only machine, which would make
# batch_size 0 and have DataLoader reject it; budget for at least one device.
n_devices = max(1, torch.cuda.device_count())
max_tokens = max_tokens_per_device * n_devices
truncate_length = 20
batch_size = int(max_tokens/truncate_length)

#checkpoint_path = "/home/jerin/mgan-attempts/"
#saver = Saver(checkpoint_path)

train_path = os.path.join(args['path'], 'train')
dev_path = os.path.join(args['path'], 'test')

# NOTE(review): `rmask` is not defined in the cells visible here; it is used
# as the mask_builder argument - confirm where it is created.
train_dataset = TensorIMDbDataset(
        train_path, spm_tokenize, 
        rmask, truncate_length
)

# Constructed vocabulary from train
vocab = train_dataset.vocab
Task = namedtuple('Task', 'source_dictionary target_dictionary')
task = Task(source_dictionary=vocab, 
        target_dictionary=vocab)

# NOTE(review): `saver` is only assigned in the commented-out line above and
# `visdom` is not defined in the cells visible here - both must exist before
# this call can succeed.
trainer = MGANTrainer(args, task, saver, visdom, vocab)
def loader(dataset):
    """Wrap ``dataset`` in a shuffling ``DataLoader``.

    The loader uses the notebook-level ``batch_size``, batches samples with
    ``TensorIMDbDataset.collate`` and reads with 8 worker processes.
    """
    return DataLoader(
        dataset,
        shuffle=True,
        num_workers=8,
        batch_size=batch_size,
        collate_fn=TensorIMDbDataset.collate,
    )

#trainer.validate_dataset(loader(train_dataset))

# Dataset over dev_path; `vocab` is passed in, so no new vocabulary is built.
dev_dataset = TensorIMDbDataset(dev_path, spm_tokenize, rmask, truncate_length, vocab)

# Bundle both datasets so they can be handed around as one object.
Datasets = namedtuple('Dataset', 'train dev')
datasets = Datasets(train_dataset, dev_dataset)

RuntimeError: Internal: /sentencepiece/src/sentencepiece_processor.cc(73) [model_proto->ParseFromArray(serialized.data(), serialized.size())] 

In [3]:
from torch.utils.data import Dataset, DataLoader