# Step 1: Tokenizer and Vocabulary Creation (Add more words if needed)

## 1. Import Necessary Libraries

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
import random
random.seed(36)  # Python random seed
torch.manual_seed(36)  # PyTorch seed (CPU)

<torch._C.Generator at 0x7f2e345838f0>

In [3]:
# Step 1: Tokenizer and Vocabulary Creation (Add more words if needed)
sentence_en = "I love Dhoni ."
# sentence_fr = "J' adore l'IA ."
sentence_fr = "నాకు ధోని అంటే ఇష్టం ."

word_map_en = {"<pad>": 0, "I": 1, "love": 2, "Dhoni": 3, ".": 4}
word_map_fr = {"<pad>": 0, "నాకు": 1, "ధోని": 2, "అంటే": 3, "ఇష్టం": 4, ".": 5}

## 2. Tokenizing sentences

In [4]:
# Tokenizing sentences
def tokenize(sentence, word_map):
    return torch.tensor([word_map[word] for word in sentence.split()])

## 3.Tokenize the input and target sentences

In [5]:
# Tokenize the input and target sentences
input_tensor = tokenize(sentence_en, word_map_en).unsqueeze(0)  # Shape (1, 6)
target_tensor = tokenize(sentence_fr, word_map_fr).unsqueeze(0)  # Shape (1, 6)

# Step 2: Positional Encoding

In [6]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.PE = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        i = torch.arange(0, d_model, 2).float()
        denominator = torch.pow(10000, 2*i/d_model)
        self.PE[:, 0::2] = torch.sin(position * denominator)
        self.PE[:, 1::2] = torch.cos(position * denominator)
        self.PE = self.PE.unsqueeze(0)

    def forward(self, x):
        return x + self.PE[:, :x.size(1)]

# Step 3: Multi-Head Attention

In [7]:
# Step 3: Multi-Head Attention
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        self.d_k = d_model // num_heads
        self.d_v = d_model // num_heads

        self.query = nn.Linear(d_model, d_model)
        self.key = nn.Linear(d_model, d_model)
        self.value = nn.Linear(d_model, d_model)
        
        self.fc = nn.Linear(d_model, d_model)
        
    def forward(self, q_input, k_input, v_input, mask=None):

        batch_size, max_sequence_length, _ = q_input.size()
        
        Q = self.query(q_input)
        K = self.key(k_input)
        V = self.value(v_input)
        
        q = Q.reshape(batch_size, max_sequence_length, self.num_heads, self.d_k)
        k = K.reshape(batch_size, max_sequence_length, self.num_heads, self.d_k)
        v = V.reshape(batch_size, max_sequence_length, self.num_heads, self.d_v)
     
        q = q.transpose(1, 2) # [batch_size, num_heads, max_sequence_length, d_k]
        k = k.transpose(1, 2) # [batch_size, num_heads, max_sequence_length, d_k]
        v = v.transpose(1, 2) # [batch_size, num_heads, max_sequence_length, d_v]
       
        attn_scores = torch.matmul(q, k.transpose(-2, -1)) / torch.sqrt(torch.tensor(self.d_k, dtype=torch.float))
        if mask is not None:
            attn_scores = attn_scores.masked_fill(mask == 0, float('-inf'))  # Ensure mask matches input length
        attn_weights = F.softmax(attn_scores, dim=-1)
    
        attention_output = torch.matmul(attn_weights, v).transpose(1, 2).contiguous().view(batch_size, max_sequence_length, self.d_model)
       
        output = self.fc(attention_output)
        
        return output

# Step 4: Feed-Forward Network

In [8]:
# Step 4: Feed-Forward Network
class PositionwiseFeedForward(nn.Module):

    def __init__(self, d_model, hidden, drop_prob=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.linear1 = nn.Linear(d_model, hidden)
        self.linear2 = nn.Linear(hidden, d_model)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=drop_prob)

    def forward(self, x):
        x = self.relu(self.linear1(x))
        x = self.dropout(x)
        x = self.linear2(x)
       
        return x

# Step 5: Encoder Layer

In [9]:
# Step 5: Encoder Layer
class EncoderLayer(nn.Module):

    def __init__(self, d_model, ffn_hidden, num_heads, drop_prob):
        
        super(EncoderLayer, self).__init__()
        self.multihead_attn = MultiHeadAttention(d_model=d_model, num_heads=num_heads)
        self.norm1 = nn.LayerNorm(d_model, eps=1e-6)
        self.dropout1 = nn.Dropout(p=drop_prob)
        self.feedforward = PositionwiseFeedForward(d_model=d_model, hidden=ffn_hidden, drop_prob=drop_prob)
        self.norm2 = nn.LayerNorm(d_model, eps=1e-6)
        self.dropout2 = nn.Dropout(p=drop_prob)
        
    def forward(self, x, mask=None):
        
        
        attn_output = self.multihead_attn(x, x, x, mask=None)
        attn_output = self.dropout1(attn_output)
        x = self.norm1(attn_output + x)
        
        ff_output = self.feedforward(x)
        ff_output = self.dropout2(ff_output)
        enc_out = self.norm2(ff_output + x)
        
        return enc_out

In [10]:
class Encoder(nn.Module):
    def __init__(self, d_model, ffn_hidden, num_heads, drop_prob, num_layers):
        super().__init__()
        self.layers = nn.Sequential(*[EncoderLayer(d_model, ffn_hidden, num_heads, drop_prob)
                                     for _ in range(num_layers)])

    def forward(self, x):
        x = self.layers(x)
        return x

# Step 6: Decoder Layer

In [11]:
# Step 6: Decoder Layer
class DecoderLayer(nn.Module):

    def __init__(self, d_model, ffn_hidden, num_heads, drop_prob):
        super(DecoderLayer, self).__init__()
        self.masked_multihead_attn = MultiHeadAttention(d_model=d_model, num_heads=num_heads)
        
        self.norm1 = nn.LayerNorm(d_model, eps=1e-6)
        self.dropout1 = nn.Dropout(p=drop_prob)
       
        self.encoder_decoder_attention = MultiHeadAttention(d_model=d_model, num_heads=num_heads)
       
        self.norm2 = nn.LayerNorm(d_model, eps=1e-6)
        self.dropout2 = nn.Dropout(p=drop_prob)
        self.feedforward = PositionwiseFeedForward(d_model=d_model, hidden=ffn_hidden, drop_prob=drop_prob)
        self.norm3 = nn.LayerNorm(d_model, eps=1e-6)
        self.dropout3 = nn.Dropout(p=drop_prob)
        

    def forward(self, x, enc_out_k, enc_out_v, decoder_mask):
        attn_output1 = self.masked_multihead_attn(x, x, x, mask=decoder_mask) 
        attn_output1 = self.dropout1(attn_output1) 
        x = self.norm1(attn_output1 + x) 

        attn_output2 = self.encoder_decoder_attention(x, enc_out_k, enc_out_v, mask=None) 
        attn_output2 = self.dropout2(attn_output2)
        x = self.norm2(attn_output2 + x) 

        ff_output = self.feedforward(x) 
        ff_output = self.dropout3(ff_output) 
        dec_out = self.norm3(ff_output + x) 
        
        return dec_out

In [12]:
class SequentialDecoder(nn.Sequential):
    def forward(self, *inputs):
        x, enc_out_k, enc_out_v, mask = inputs
        for module in self._modules.values():
            y = module(x, enc_out_k, enc_out_v, mask) #30 x 200 x 512
        return y

In [13]:
class Decoder(nn.Module):
    def __init__(self, d_model, ffn_hidden, num_heads, drop_prob, num_layers):
        super().__init__()
        self.layers = SequentialDecoder(*[DecoderLayer(d_model, ffn_hidden, num_heads, drop_prob)
                                     for _ in range(num_layers)])

    def forward(self, x, enc_out_k, enc_out_v, mask):
        x = self.layers(x, enc_out_k, enc_out_v, mask)
        return x

# Step 7: Transformer Model

In [14]:
# Step 7: Transformer Model
class Transformer(nn.Module):
    def __init__(self, vocab_size, d_model, num_heads, ffn_hidden, num_encoder_layers, num_decoder_layers, drop_prob=0.2, max_len=5000):
        super(Transformer, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model, max_len)
        self.encoder = Encoder(d_model, ffn_hidden, num_heads, drop_prob, num_encoder_layers)
        self.decoder = Decoder(d_model, ffn_hidden, num_heads, drop_prob, num_decoder_layers)
        self.fc_out = nn.Linear(d_model, vocab_size)

    def forward(self, src, tgt, tgt_mask=None):
        src = self.pos_encoder(self.embedding(src))
        tgt = self.pos_encoder(self.embedding(tgt))

        encoder_output =  self.encoder(src)
        decoder_output = self.decoder(tgt, encoder_output, encoder_output, tgt_mask)
        
        
        output = self.fc_out(decoder_output)
        return output

# Step 8: Prediction Module

In [15]:
# Step 8: Prediction Module
def translate(input_sentence, word_map_en, word_map_fr, transformer):
    # Tokenize input sentence
    input_tensor = tokenize(input_sentence, word_map_en).unsqueeze(0)  # Shape (1, seq_len)

    # Generate a mask for the target sentence
    tgt_mask = torch.tril(torch.ones((input_tensor.size(1), input_tensor.size(1)))).unsqueeze(0).unsqueeze(0)  # Lower triangular mask

    # Initialize a tensor for the target sentence
    target_tensor = torch.zeros((1, input_tensor.size(1)), dtype=torch.long)

    # Predict the output sentence (translation)
    output = transformer(input_tensor, target_tensor, tgt_mask)

    # Apply Softmax to get probabilities
    softmax_output = F.softmax(output, dim=-1)

    # Get predicted token indices (Argmax)
    predicted_tokens = torch.argmax(softmax_output, dim=-1)

    # Convert predicted tokens back to words
    reverse_word_map_fr = {v: k for k, v in word_map_fr.items()}
    translated_sentence = [reverse_word_map_fr[token.item()] for token in predicted_tokens[0] if token != 0]

    return " ".join(translated_sentence)

# Step 9: Initialize Model

In [16]:
# Step 9: Initialize Model
vocab_size_en = len(word_map_en)
vocab_size_fr = len(word_map_fr)
d_model = 128
num_heads = 8
num_encoder_layers = 2
num_decoder_layers = 2
ffn_hidden = 32
drop_prob=0.2 
max_len=500

transformer = Transformer(vocab_size_en, d_model, num_heads, ffn_hidden, num_encoder_layers, num_decoder_layers, drop_prob, max_len)

# Step 10: Take Input Sentence and Generate Translation

In [17]:
# Step 10: Take Input Sentence and Generate Translation
input_sentence = "I love Dhoni ."  # Example input sentence
predicted_sentence = translate(input_sentence, word_map_en, word_map_fr, transformer)
print("Input Sentence:", input_sentence)
print("Predicted Translation:", predicted_sentence)

Input Sentence: I love Dhoni .
Predicted Translation: అంటే ఇష్టం ఇష్టం ఇష్టం
