In [2]:
# Both operands start out False; `|` on two bools yields a bool.
a = b = False
a | b

False

In [4]:
import torch
import torch.nn as nn
import numpy as np

class Tokenizer:
    """Whitespace tokenizer over a fixed vocabulary.

    A word's id is its position in ``vocab``.
    """

    def __init__(self, vocab):
        """Keep ``vocab`` and build the word->id and id->word tables."""
        self.vocab = vocab
        self.word2idx = dict((entry, position) for position, entry in enumerate(vocab))
        self.idx2word = dict(enumerate(vocab))

    def tokenize(self, text):
        """Return the ids of the whitespace-separated words in ``text``.

        Raises ``KeyError`` if a word is not in the vocabulary.
        """
        return list(map(self.word2idx.__getitem__, text.split()))

    def detokenize(self, tokens):
        """Return the words for ``tokens`` joined by single spaces.

        Raises ``KeyError`` if an id is not in the vocabulary.
        """
        return ' '.join(map(self.idx2word.__getitem__, tokens))

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch-first input.

    The table is stored as a non-trainable buffer ``pe`` of shape
    ``(1, max_len, d_model)`` so that it broadcasts over the batch
    dimension of an input shaped ``(batch, seq_len, d_model)``.

    Args:
        d_model: Size of the last (feature) dimension. May be odd.
        max_len: Number of positions to precompute.
    """

    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
        angles = position * div_term
        # Even feature columns carry the sine, odd columns the cosine.
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even column,
        # so trim the angle table to the number of cosine slots available.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # Leading singleton axis is the batch axis: (1, max_len, d_model).
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encoding of each sequence position.

        Args:
            x: Tensor shaped ``(batch, seq_len, d_model)``.

        Raises:
            ValueError: If ``seq_len`` exceeds the precomputed ``max_len``.
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {self.pe.size(1)}"
            )
        # Slice along the sequence axis (dim 1), not the batch axis, so that
        # position i of every batch element receives the i-th encoding.
        return x + self.pe[:, :seq_len]

class ScaledDotProductAttention(nn.Module):
    """Parameter-free attention: softmax(Q K^T / sqrt(d_k)) V."""

    def __init__(self):
        super().__init__()

    def forward(self, Q, K, V, mask=None):
        """Attend from ``Q`` over ``K``/``V``.

        Args:
            Q, K, V: Tensors; ``K`` is transposed on its last two dims and
                the scores are scaled by the square root of ``Q.size(-1)``.
            mask: Optional tensor. Score entries where ``mask == 0`` are
                replaced by ``-1e9`` before the softmax, so ``mask`` must
                broadcast against the score tensor.

        Returns:
            A tuple ``(weighted_values, attention_weights)``.
        """
        scale = np.sqrt(Q.size(-1))
        logits = torch.matmul(Q, K.transpose(-2, -1)) / scale
        if mask is not None:
            blocked = mask == 0
            logits = logits.masked_fill(blocked, -1e9)
        # Normalise over the key axis (last dim of the score tensor).
        weights = logits.softmax(dim=-1)
        context = torch.matmul(weights, V)
        return context, weights

class MultiHeadAttention(nn.Module):
    """Multi-head attention over batch-first inputs.

    Query, key and value are each projected with their own ``d_model ->
    d_model`` linear layer, split into ``num_heads`` heads of width
    ``d_model // num_heads``, attended per head, re-joined and passed
    through a final output projection.

    Args:
        d_model: Feature size of the inputs and of the output.
        num_heads: Number of heads; must divide ``d_model`` evenly.
    """

    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        assert d_model % num_heads == 0
        self.d_model = d_model
        self.d_k = d_model // num_heads
        self.num_heads = num_heads
        # One projection each for query, key and value, in that order.
        self.linear_layers = nn.ModuleList([nn.Linear(d_model, d_model) for _ in range(3)])
        self.output_layer = nn.Linear(d_model, d_model)
        self.attention = ScaledDotProductAttention()

    def forward(self, query, key, value, mask=None):
        """Return the attended representation, shaped like ``query``.

        Args:
            query, key, value: Tensors shaped ``(batch, seq_len, d_model)``.
            mask: Optional tensor; positions where it equals 0 are not
                attended to. A 3-D mask is interpreted as
                ``(batch, q_len, k_len)`` (``q_len`` may be 1) and is
                shared across all heads. 2-D and 4-D masks are passed
                through unchanged and must broadcast against
                ``(batch, num_heads, q_len, k_len)``; pass per-head masks
                as 4-D.
        """
        batch_size = query.size(0)
        if mask is not None and mask.dim() == 3:
            # Scores are (batch, heads, q_len, k_len). Without this extra
            # axis a (batch, q_len, k_len) mask would line its batch axis
            # up with the heads axis when broadcasting.
            mask = mask.unsqueeze(1)
        # Project, then reshape to (batch, heads, seq_len, d_k).
        query, key, value = [l(x).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
                             for l, x in zip(self.linear_layers, (query, key, value))]
        x, _ = self.attention(query, key, value, mask)
        # Merge the heads back into a single (batch, seq_len, d_model) tensor.
        x = x.transpose(1, 2).contiguous().view(batch_size, -1, self.num_heads * self.d_k)
        return self.output_layer(x)

class EncoderLayer(nn.Module):
    """Self-attention followed by a two-layer feed-forward network.

    Each sub-layer's output is added to its input and the sum is passed
    through its own ``LayerNorm``.

    Args:
        d_model: Feature size of the input and output.
        num_heads: Number of attention heads.
        d_ff: Hidden width of the feed-forward network.
    """

    def __init__(self, d_model, num_heads, d_ff):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        widen = nn.Linear(d_model, d_ff)
        narrow = nn.Linear(d_ff, d_model)
        self.feed_forward = nn.Sequential(widen, nn.ReLU(), narrow)
        # Index 0 follows self-attention, index 1 follows the feed-forward.
        self.layer_norms = nn.ModuleList([nn.LayerNorm(d_model), nn.LayerNorm(d_model)])

    def forward(self, x, mask=None):
        """Apply the layer to ``x``; ``mask`` is forwarded to self-attention."""
        attn_norm, ff_norm = self.layer_norms
        attended = self.self_attn(x, x, x, mask)
        x = attn_norm(x + attended)
        transformed = self.feed_forward(x)
        return ff_norm(x + transformed)

class DecoderLayer(nn.Module):
    """Self-attention, attention over ``memory``, then a feed-forward network.

    Each of the three sub-layers' outputs is added to its input and the sum
    is passed through its own ``LayerNorm``.

    Args:
        d_model: Feature size of the input and output.
        num_heads: Number of attention heads in both attention modules.
        d_ff: Hidden width of the feed-forward network.
    """

    def __init__(self, d_model, num_heads, d_ff):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        widen = nn.Linear(d_model, d_ff)
        narrow = nn.Linear(d_ff, d_model)
        self.feed_forward = nn.Sequential(widen, nn.ReLU(), narrow)
        # Index 0: after self-attention, 1: after cross-attention,
        # 2: after the feed-forward network.
        self.layer_norms = nn.ModuleList(
            [nn.LayerNorm(d_model), nn.LayerNorm(d_model), nn.LayerNorm(d_model)]
        )

    def forward(self, x, memory, tgt_mask=None, memory_mask=None):
        """Apply the layer to ``x``.

        ``tgt_mask`` is forwarded to self-attention; ``memory_mask`` is
        forwarded to the attention that uses ``x`` as query and ``memory``
        as both key and value.
        """
        self_norm, cross_norm, ff_norm = self.layer_norms
        own_context = self.self_attn(x, x, x, tgt_mask)
        x = self_norm(x + own_context)
        memory_context = self.cross_attn(x, memory, memory, memory_mask)
        x = cross_norm(x + memory_context)
        transformed = self.feed_forward(x)
        return ff_norm(x + transformed)

class Encoder(nn.Module):
    """Token embedding, positional encoding and a stack of ``EncoderLayer``s.

    Args:
        vocab_size: Number of rows in the embedding table.
        d_model: Embedding / feature size.
        num_heads: Attention heads per layer.
        d_ff: Feed-forward hidden width per layer.
        num_layers: How many ``EncoderLayer`` instances to stack.
        max_len: Forwarded to ``PositionalEncoding``.
    """

    def __init__(self, vocab_size, d_model, num_heads, d_ff, num_layers, max_len=5000):
        super().__init__()
        self.token_emb = nn.Embedding(vocab_size, d_model)
        self.positional_enc = PositionalEncoding(d_model, max_len)
        stack = []
        for _ in range(num_layers):
            stack.append(EncoderLayer(d_model, num_heads, d_ff))
        self.layers = nn.ModuleList(stack)

    def forward(self, src, mask=None):
        """Embed ``src`` token ids and run them through every layer in order.

        ``mask`` is handed unchanged to each layer.
        """
        hidden = self.token_emb(src)
        hidden = self.positional_enc(hidden)
        for block in self.layers:
            hidden = block(hidden, mask)
        return hidden

class Decoder(nn.Module):
    """Token embedding, positional encoding and a stack of ``DecoderLayer``s.

    Args:
        vocab_size: Number of rows in the embedding table.
        d_model: Embedding / feature size.
        num_heads: Attention heads per layer.
        d_ff: Feed-forward hidden width per layer.
        num_layers: How many ``DecoderLayer`` instances to stack.
        max_len: Forwarded to ``PositionalEncoding``.
    """

    def __init__(self, vocab_size, d_model, num_heads, d_ff, num_layers, max_len=5000):
        super().__init__()
        self.token_emb = nn.Embedding(vocab_size, d_model)
        self.positional_enc = PositionalEncoding(d_model, max_len)
        stack = []
        for _ in range(num_layers):
            stack.append(DecoderLayer(d_model, num_heads, d_ff))
        self.layers = nn.ModuleList(stack)

    def forward(self, tgt, memory, tgt_mask=None, memory_mask=None):
        """Embed ``tgt`` token ids and run them through every layer in order.

        ``memory`` and both masks are handed unchanged to each layer.
        """
        hidden = self.token_emb(tgt)
        hidden = self.positional_enc(hidden)
        for block in self.layers:
            hidden = block(hidden, memory, tgt_mask, memory_mask)
        return hidden

class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper with a final linear projection to the vocabulary.

    Args:
        encoder: Module called as ``encoder(src, src_mask)``. Its first
            layer's ``self_attn.d_model`` sets the projection's input size.
        decoder: Module called as
            ``decoder(tgt, memory, tgt_mask, src_mask)``.
        vocab_size: Output size of the projection.
    """

    def __init__(self, encoder, decoder, vocab_size):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        # The feature width is read off the encoder's first layer.
        hidden_size = encoder.layers[0].self_attn.d_model
        self.generator = nn.Linear(hidden_size, vocab_size)

    def forward(self, src, tgt, src_mask=None, tgt_mask=None):
        """Encode ``src``, decode ``tgt`` against it, and project the result.

        ``src_mask`` is used both by the encoder and as the decoder's
        memory mask.
        """
        memory = self.encoder(src, src_mask)
        decoded = self.decoder(tgt, memory, tgt_mask, src_mask)
        return self.generator(decoded)

# Example usage: build a small vocabulary, construct the model and run a
# single forward pass on one source/target sentence pair.
vocab = ['<pad>', '<sos>', '<eos>', 'hello', 'world']
tokenizer = Tokenizer(vocab)
vocab_size = len(vocab)
# Model hyperparameters passed to Encoder / Decoder below.
d_model = 512
num_heads = 8
d_ff = 2048
num_layers = 6
# Forwarded by Encoder / Decoder to PositionalEncoding as its max_len.
max_len = 10

encoder = Encoder(vocab_size, d_model, num_heads, d_ff, num_layers, max_len)
decoder = Decoder(vocab_size, d_model, num_heads, d_ff, num_layers, max_len)
model = Seq2Seq(encoder, decoder, vocab_size)

src_sentence = "hello world"
tgt_sentence = "<sos> world hello <eos>"

# Every word must be in `vocab`; Tokenizer.tokenize raises KeyError otherwise.
src_tokens = tokenizer.tokenize(src_sentence)
tgt_tokens = tokenizer.tokenize(tgt_sentence)

# unsqueeze(0) adds a leading axis of size 1, giving a batch of one sentence.
src_tensor = torch.tensor(src_tokens).unsqueeze(0)
tgt_tensor = torch.tensor(tgt_tokens).unsqueeze(0)

# The last target token (<eos> here) is dropped from the decoder input —
# presumably teacher-forcing style input; confirm against the training loop.
# NOTE(review): no src_mask / tgt_mask is passed, so both default to None and
# no attention scores are masked, including in the decoder's self-attention.
output = model(src_tensor, tgt_tensor[:, :-1])
# The weights are freshly initialised (no training has happened), so the
# printed values are just the untrained model's outputs.
print(output)


tensor([[[-0.6691,  0.2654,  0.3564, -0.8418, -0.2603],
         [-0.9283,  0.0884, -0.0517, -0.2380, -0.6756],
         [-0.3518,  0.2873, -0.2232, -0.9393, -0.2547]]],
       grad_fn=<ViewBackward0>)
