In [1]:
import torch
import numpy as np

In [2]:
english_file = './data/english.en'
chinese_file = './data/chinese.zh'
eng_voc_file = './data/in_vocab.pkl'
ch_voc_file = './data/out_vocab.pkl'

PAD, BOS, EOS = '<pad>', '<bos>', '<eos>'


In [3]:
with open(english_file, 'r') as file:
    english_sentences = file.readlines()
with open(chinese_file, 'r') as file:
    chinese_sentences = file.readlines()

TOTAL_SENTENCE = 100000
chinese_sentences = chinese_sentences[:TOTAL_SENTENCE]
english_sentences = english_sentences[:TOTAL_SENTENCE]
english_sentences = [sentence.rstrip('\n') for sentence in english_sentences]
chinese_sentences = [sentence.rstrip('\n') for sentence in chinese_sentences]

chinese_sentences[:10]

['1929年还是1989年?',
 '巴黎-随着经济危机不断加深和蔓延，整个世界一直在寻找历史上的类似事件希望有助于我们了解目前正在发生的情况。',
 '一开始，很多人把这次危机比作1982年或1973年所发生的情况，这样得类比是令人宽心的，因为这两段时期意味着典型的周期性衰退。',
 '如今人们的心情却是沉重多了，许多人开始把这次危机与1929年和1931年相比，即使一些国家政府的表现仍然似乎把视目前的情况为是典型的而看见的衰退。',
 '目前的趋势是，要么是过度的克制（欧洲），要么是努力的扩展（美国）。',
 '欧洲在避免债务和捍卫欧元的名义下正变得谨慎，而美国已经在许多方面行动起来，以利用这一理想的时机来实行急需的结构性改革。',
 '然而，作为地域战略学家，无论是从政治意义还是从经济意义上，让我自然想到的年份是1989年。',
 '当然，雷曼兄弟公司的倒闭和柏林墙的倒塌没有任何关系。',
 '事实上，从表面上看，两者似乎是完全是相反的：一个是象征着压抑和人为分裂的柏林墙的倒塌，而另一个是看似坚不可摧的并令人安心的金融资本主义机构的倒塌。',
 '然而，和1989年一样，2008-2009年很可能也能被视为一个划时代的改变，其带来的发人深省的后果将在几十年后仍能让我们感受得到。']

In [4]:
import sentencepiece as spm
import numpy as np

def english_tokenizer_load():
    sp_eng = spm.SentencePieceProcessor()
    sp_eng.Load('./eng.model')
    return sp_eng

def chinese_tokenizer_load():
    sp_chn = spm.SentencePieceProcessor()
    sp_chn.Load('./chn.model')
    return sp_chn

In [5]:
import torch
from torch.nn.utils.rnn import pad_sequence

sp_eng = english_tokenizer_load()
x = sp_eng.EncodeAsIds(english_sentences[:10])
y = pad_sequence([torch.LongTensor(np.array(l_)) for l_ in x], batch_first=True, padding_value=sp_eng.pad_id())
y.shape

torch.Size([10, 42])

In [14]:
def tokenize(sentences, tokenizer, max_sequence_length, start_token=True, end_token=True):
    bos_token = tokenizer.bos_id()
    eos_token = tokenizer.eos_id()
    pad_token = tokenizer.pad_id()
    tokenized_sentences = tokenizer.EncodeAsIds(sentences)
    if start_token:
        tokenized_sentences = [[bos_token] + sentence for sentence in tokenized_sentences]
    if end_token:
        tokenized_sentences = [sentence + [eos_token] for sentence in tokenized_sentences]
    for _ in range(len(tokenized_sentences[0]), max_sequence_length):
        tokenized_sentences[0].append(pad_token)
    return pad_sequence([torch.LongTensor(np.array(l_)) for l_ in tokenized_sentences], batch_first=True, padding_value=sp_eng.pad_id())

In [20]:
sp_chn.vocab_size()

32000

In [15]:
sp_chn = chinese_tokenizer_load()
t = tokenize(chinese_sentences[:10], sp_chn, 100)
t.shape

torch.Size([10, 100])

In [8]:
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):

    def __init__(self, english_sentences, chinese_sentences):
        self.english_sentences = english_sentences
        self.chinese_sentences = chinese_sentences

    def __len__(self):
        assert len(english_sentences) == len(chinese_sentences), "different length"
        return len(self.english_sentences)
    
    def __getitem__(self, index):
        return self.english_sentences[index], self.chinese_sentences[index]

In [9]:
dataset = TextDataset(english_sentences, chinese_sentences)

In [10]:
len(dataset)

100000

In [11]:
dataset[1]

('PARIS – As the economic crisis deepens and widens, the world has been searching for historical analogies to help us understand what has been happening.',
 '巴黎-随着经济危机不断加深和蔓延，整个世界一直在寻找历史上的类似事件希望有助于我们了解目前正在发生的情况。')

In [12]:
batch_size = 3
train_loader = DataLoader(dataset, batch_size)
iterator = iter(train_loader)

In [13]:
for batch_num, batch in enumerate(iterator):
    print(batch)
    if batch_num > 3:
        break

[('1929 or 1989?', 'PARIS – As the economic crisis deepens and widens, the world has been searching for historical analogies to help us understand what has been happening.', 'At the start of the crisis, many people likened it to 1982 or 1973, which was reassuring, because both dates refer to classical cyclical downturns.'), ('1929年还是1989年?', '巴黎-随着经济危机不断加深和蔓延，整个世界一直在寻找历史上的类似事件希望有助于我们了解目前正在发生的情况。', '一开始，很多人把这次危机比作1982年或1973年所发生的情况，这样得类比是令人宽心的，因为这两段时期意味着典型的周期性衰退。')]
[('Today, the mood is much grimmer, with references to 1929 and 1931 beginning to abound, even if some governments continue to behave as if the crisis was more classical than exceptional.', 'The tendency is either excessive restraint (Europe) or a diffusion of the effort (the United States).', 'Europe is being cautious in the name of avoiding debt and defending the euro, whereas the US has moved on many fronts in order not to waste an ideal opportunity to implement badly needed structural reforms.'), ('如今人们的心情却是沉重多了，许多人开始把这次危

In [16]:
batch

[('In 1989, liberal democracy triumphed over the socialist ideology incarnated and promoted by the Soviet Bloc.',
  'For many of his supporters, it was President Ronald Reagan who, with his deliberate escalation of the arms race, pushed the Soviet economy to the brink, thereby fully demonstrating the superiority of liberal societies and free markets.',
  'Of course, there are obvious differences between 1989 and now.'),
 ('1989年，自由民主战胜了由苏联集团具体化并推崇的社会主义意识形态。',
  '对于里根总统的许多的支持者来说，就是他精心策划的军备竞赛的升级，把苏联经济推向了崩溃的边缘，从而充分显示了自由社会和自由市场的优越性。',
  '当然，现在的情况和1989年的情况明显不同了。')]

In [17]:
NEG_INFTY = -1e9
max_sequence_length = 100
def create_masks(eng_batch, kn_batch):
    num_sentences = len(eng_batch)
    look_ahead_mask = torch.full([max_sequence_length, max_sequence_length] , True)
    look_ahead_mask = torch.triu(look_ahead_mask, diagonal=1)
    encoder_padding_mask = torch.full([num_sentences, max_sequence_length, max_sequence_length] , False)
    decoder_padding_mask_self_attention = torch.full([num_sentences, max_sequence_length, max_sequence_length] , False)
    decoder_padding_mask_cross_attention = torch.full([num_sentences, max_sequence_length, max_sequence_length] , False)

    for idx in range(num_sentences):
      eng_sentence_length, kn_sentence_length = len(eng_batch[idx]), len(kn_batch[idx])
      eng_chars_to_padding_mask = np.arange(eng_sentence_length + 1, max_sequence_length)
      kn_chars_to_padding_mask = np.arange(kn_sentence_length + 1, max_sequence_length)
      encoder_padding_mask[idx, :, eng_chars_to_padding_mask] = True
      encoder_padding_mask[idx, eng_chars_to_padding_mask, :] = True
      decoder_padding_mask_self_attention[idx, :, kn_chars_to_padding_mask] = True
      decoder_padding_mask_self_attention[idx, kn_chars_to_padding_mask, :] = True
      decoder_padding_mask_cross_attention[idx, :, eng_chars_to_padding_mask] = True
      decoder_padding_mask_cross_attention[idx, kn_chars_to_padding_mask, :] = True

    encoder_self_attention_mask = torch.where(encoder_padding_mask, NEG_INFTY, 0)
    decoder_self_attention_mask =  torch.where(look_ahead_mask + decoder_padding_mask_self_attention, NEG_INFTY, 0)
    decoder_cross_attention_mask = torch.where(decoder_padding_mask_cross_attention, NEG_INFTY, 0)
    print(f"encoder_self_attention_mask {encoder_self_attention_mask.size()}: {encoder_self_attention_mask[0, :10, :10]}")
    print(f"decoder_self_attention_mask {decoder_self_attention_mask.size()}: {decoder_self_attention_mask[0, :10, :10]}")
    print(f"decoder_cross_attention_mask {decoder_cross_attention_mask.size()}: {decoder_cross_attention_mask[0, :10, :10]}")
    return encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask

In [18]:
create_masks(batch[0], batch[1])

encoder_self_attention_mask torch.Size([3, 100, 100]): tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])
decoder_self_attention_mask torch.Size([3, 100, 100]): tensor([[ 0.0000e+00, -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09,
         -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09],
        [ 0.0000e+00,  0.0000e+00, -1.0000e+09, -1.0000e+09, -1.0000e+09,
         -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00, -1.0000e+09, -1.0000e+09,
         -1.0000e

(tensor([[[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
            0.0000e+00,  0.0000e+00],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
            0.0000e+00,  0.0000e+00],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
            0.0000e+00,  0.0000e+00],
          ...,
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
            0.0000e+00,  0.0000e+00],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
            0.0000e+00,  0.0000e+00],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
            0.0000e+00,  0.0000e+00]],
 
         [[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
            0.0000e+00,  0.0000e+00],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
            0.0000e+00,  0.0000e+00],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
            0.0000e+00,  0.0000e+00],
          ...,
    

In [21]:
import torch.nn as nn
class PositionalEncoding(nn.Module):

    def __init__(self, d_model, max_sequence_length):
        super().__init__()
        self.max_sequence_length = max_sequence_length
        self.d_model = d_model

    def forward(self):
        even_i = torch.arange(0, self.d_model, 2).float()
        denominator = torch.pow(10000, even_i/self.d_model)
        position = torch.arange(self.max_sequence_length).reshape(self.max_sequence_length, 1)
        even_PE = torch.sin(position / denominator)
        odd_PE = torch.cos(position / denominator)
        stacked = torch.stack([even_PE, odd_PE], dim=2)
        PE = torch.flatten(stacked, start_dim=1, end_dim=2)
        return PE

In [69]:


class SentenceEmbedding(nn.Module):
    def __init__(self, max_sequence_length, d_model, tokenizer, START_TOKEN, END_TOKEN, PADDING_TOKEN):
        super().__init__()
        self.vocab_size = tokenizer.vocab_size()
        self.max_sequence_length = max_sequence_length
        self.embedding = nn.Embedding(self.vocab_size, d_model)
        self.tokenizer = tokenizer
        self.position_encoder = PositionalEncoding(d_model, max_sequence_length)
        self.dropout = nn.Dropout(p=0.1)
        self.START_TOKEN = START_TOKEN
        self.END_TOKEN = END_TOKEN
        self.PADDING_TOKEN = PADDING_TOKEN
    
    def tokenize(self, sentences, start_token, end_token):
        sentences = list(sentences)
        bos_token = self.tokenizer.bos_id()
        eos_token = self.tokenizer.eos_id()
        pad_token = self.tokenizer.pad_id()
        tokenized_sentences = self.tokenizer.EncodeAsIds(sentences)
        if start_token:
            tokenized_sentences = [[bos_token] + sentence for sentence in tokenized_sentences]
        if end_token:
            tokenized_sentences = [sentence + [eos_token] for sentence in tokenized_sentences]
        for _ in range(len(tokenized_sentences[0]), self.max_sequence_length):
            tokenized_sentences[0].append(pad_token)
        print(len(tokenized_sentences[0]))
        return pad_sequence([torch.LongTensor(np.array(l_)) for l_ in tokenized_sentences], batch_first=True, padding_value=sp_eng.pad_id())
    
    def forward(self, x, start_token, end_token):
        x = self.tokenize(x, start_token, end_token)
        x = self.embedding(x)
        pos = self.position_encoder()
        x = self.dropout(x + pos)
        return x

In [70]:
se = SentenceEmbedding(1000, 512, tokenizer=sp_chn, START_TOKEN=True, END_TOKEN=True, PADDING_TOKEN=True)

In [67]:
batch

[('In 1989, liberal democracy triumphed over the socialist ideology incarnated and promoted by the Soviet Bloc.',
  'For many of his supporters, it was President Ronald Reagan who, with his deliberate escalation of the arms race, pushed the Soviet economy to the brink, thereby fully demonstrating the superiority of liberal societies and free markets.',
  'Of course, there are obvious differences between 1989 and now.'),
 ('1989年，自由民主战胜了由苏联集团具体化并推崇的社会主义意识形态。',
  '对于里根总统的许多的支持者来说，就是他精心策划的军备竞赛的升级，把苏联经济推向了崩溃的边缘，从而充分显示了自由社会和自由市场的优越性。',
  '当然，现在的情况和1989年的情况明显不同了。')]

In [62]:
a = list(batch[0])
len(a[1])

235

In [72]:
se(batch[0], True, True).shape

1000


torch.Size([3, 1000, 512])