In [42]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import sentencepiece as spm


# Bengio 2003
## A Neural Probabilistic Language Model

## The Data: Tiny Shakespeare

In [32]:
# Read in raw shakespeare text
text = open('tiny-shakespeare.txt','r').read()

vocab_size = 256
# Get Tokens (Character)
spm.SentencePieceTrainer.train(input='tiny-shakespeare.txt', vocab_size=vocab_size, model_type='char', model_prefix='mc')

# Get tokens (Word)
spm.SentencePieceTrainer.train(input='tiny-shakespeare.txt', vocab_size=vocab_size, model_type='word', model_prefix='mw')

# Get tokens (Unigram)
spm.SentencePieceTrainer.train(input='tiny-shakespeare.txt', vocab_size=vocab_size, model_type='unigram', model_prefix='mu')

# Get tokens (BPE)
spm.SentencePieceTrainer.train(input='tiny-shakespeare.txt', vocab_size=vocab_size, model_type='bpe', model_prefix='mb')

sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: tiny-shakespeare.txt
  input_format: 
  model_prefix: mc
  model_type: CHAR
  vocab_size: 256
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_privacy: 0
  differential_privacy_noise_level: 0
  differential_privacy_clippin

In [33]:
# Language Model (Character)
spc = spm.SentencePieceProcessor('mc.model')

# Language Model (Word)
spw = spm.SentencePieceProcessor('mw.model')

# Language Model (Unigram)
spu = spm.SentencePieceProcessor('mu.model')

# Language Model (BPE)
spb = spm.SentencePieceProcessor('mb.model')


# Show text
print (spc.Encode(text[:100]))
print ('Character size: ', len(spc.Encode(text[:100])))
print ()
print (spw.Encode(text[:100]))
print ('Word size', len(spw.Encode(text[:100])))
print ()
print (spu.Encode(text[:100]))
print ('Unigram size', len(spu.Encode(text[:100])))
print ()
print (spb.Encode(text[:100]))
print ('BPE size', len(spb.Encode(text[:100])))


[3, 51, 12, 10, 9, 5, 3, 39, 12, 5, 12, 59, 4, 11, 26, 3, 45, 4, 20, 6, 10, 4, 3, 19, 4, 3, 25, 10, 6, 21, 4, 4, 14, 3, 7, 11, 17, 3, 20, 15, 10, 5, 8, 4, 10, 18, 3, 8, 4, 7, 10, 3, 16, 4, 3, 9, 25, 4, 7, 30, 27, 3, 28, 13, 13, 26, 3, 37, 25, 4, 7, 30, 18, 3, 9, 25, 4, 7, 30, 27, 3, 51, 12, 10, 9, 5, 3, 39, 12, 5, 12, 59, 4, 11, 26, 3, 52, 6, 15]
Character size:  99

[111, 247, 0, 40, 0, 144, 0, 149, 25, 0, 111, 247, 76]
Word size 13

[3, 137, 91, 25, 119, 90, 15, 249, 56, 11, 99, 4, 158, 4, 92, 195, 68, 67, 3, 44, 13, 42, 60, 150, 6, 50, 21, 65, 241, 16, 75, 57, 11, 82, 20, 4, 10, 49, 6, 241, 16, 3, 137, 91, 25, 119, 90, 15, 249, 56, 11, 190]
Unigram size 52

[103, 63, 42, 83, 22, 206, 253, 30, 220, 67, 198, 214, 128, 84, 37, 129, 59, 65, 166, 211, 18, 120, 199, 90, 212, 41, 32, 68, 172, 145, 224, 221, 40, 21, 220, 70, 219, 145, 224, 212, 172, 145, 224, 221, 103, 63, 42, 83, 22, 206, 253, 30, 220, 138, 6]
BPE size 55


In [105]:
class Shakepeare(Dataset):
    def __init__(self, text, block_size) -> None:
        super().__init__()
        self.text = text
        self.block_size = block_size
    
    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        #ix = torch.randint(self.randlen)
        x = self.text[index:index+self.block_size]
        y = self.text[index + 1:index+self.block_size + 1]
        return x, y

data = Shakepeare(torch.tensor(spb.Encode(text)), block_size=8)
data_dl = DataLoader(data, batch_size=4, shuffle=True)

for batch in data_dl:
    x, y = batch 
    print (x)
    print (y)
    print ()
    dx = spb.Decode(x.tolist())
    dy = spb.Decode(y.tolist())

    for wx, wy in zip(dx, dy):
        print (f'x ={wx:>25s} // y = {wy:>25s}')
    break


tensor([[226, 208,  45, 177,  53, 123, 220,  38],
        [130, 200, 212,  80,  13, 205,  84,   7],
        [ 10, 203, 210,  49,  25, 101,  35, 152],
        [ 98, 201, 218, 218,  47,  98, 198, 198]])
tensor([[208,  45, 177,  53, 123, 220,  38, 204],
        [200, 212,  80,  13, 205,  84,   7, 202],
        [203, 210,  49,  25, 101,  35, 152, 198],
        [201, 218, 218,  47,  98, 198, 198, 207]])

x =    'd and shall be so: T // y =     d and shall be so: Tr
x =        No, for then we s // y =         o, for then we sh
x =        insman come to se // y =          sman come to see
x =            stabbing stee // y =              abbing steel


['talk of pride', 'go with him; And go', 'n. JULIE', 't! By and by. I']
['alk of pride:', 'o with him; And go,', '. JULIET', '! By and by. I h']


In [75]:
spb.Decode(x[0].item())

ValueError: only one element tensors can be converted to Python scalars

In [85]:
spb.Decode(x.tolist())

['my foe. FRI', 'Come, lords, a', 'l iron arms', 'framed, but force']