In [None]:
!pip install datasets==2.8.0

In [None]:
# NOTE(review): this URL points at the crowd-indic-transliteration repository,
# not at the Aksharantar dataset loaded below — presumably an alternative
# source; confirm or remove.
# https://github.com/anoopkunchukuttan/crowd-indic-transliteration-data/blob/master/crowd_transliterations.hi-en.txt
from datasets import load_dataset

# Number of word pairs pulled from the stream. Set to None to materialise the
# whole training split instead of a prefix.
NUM_SAMPLES = 5000

dataset = load_dataset("ai4bharat/Aksharantar",split='train',data_files=['hin.zip'], streaming=True)
ds = list(dataset.take(NUM_SAMPLES)) if NUM_SAMPLES is not None else list(dataset)
ds[:2]

Downloading readme:   0%|          | 0.00/8.39k [00:00<?, ?B/s]



[{'unique_identifier': 'hin1',
  'native word': 'मैट्रोलॉजिस्ट',
  'english word': 'maitrologist',
  'source': 'AK-Freq'},
 {'unique_identifier': 'hin2',
  'native word': 'पीएचडब्ल्यूसीएस',
  'english word': 'phwcs',
  'source': 'AK-Freq'}]

In [None]:
# Sanity check: number of examples materialised from the stream.
len(ds)

5000

In [None]:
# Character inventories for both sides of the corpus; each printed number is
# the count of distinct characters found in the loaded words.
hin_words = [row['native word'] for row in ds]
hin_vocab = set("".join(hin_words))
print(len(hin_vocab))
eng_words = [row['english word'] for row in ds]
eng_vocab = set("".join(eng_words))
print(len(eng_vocab))

64
26


In [None]:
# Ids reserved for the special tokens; identical in both vocabularies.
PAD_IDX , BOS_IDX, EOS_IDX = 0, 1, 2
num_of_special_tokens = 3

# Source-side tables. '#', '@' and '$' are the characters that stand in for
# the padding, begin-of-sequence and end-of-sequence tokens.
stoi_input = {}
stoi_input['#'] = PAD_IDX
stoi_input['@'] = BOS_IDX
stoi_input['$'] = EOS_IDX
# A marker character occurring in the corpus would overwrite its reserved id
# in the update below and leave a hole in the inverse table.
assert not set(stoi_input) & set(hin_vocab), "special-token characters occur in the source vocabulary"
# sorted(): iterating a set of strings yields a different order on each kernel
# start, which would silently change every id between runs.
stoi_input.update({s:i+num_of_special_tokens for i,s in enumerate(sorted(hin_vocab))})
itos_input = {v:k for k,v in stoi_input.items()}

# Target-side tables, built the same way.
stoi_output = {}
stoi_output['#'] = PAD_IDX
stoi_output['@'] = BOS_IDX
stoi_output['$'] = EOS_IDX
assert not set(stoi_output) & set(eng_vocab), "special-token characters occur in the target vocabulary"
stoi_output.update({s:i+num_of_special_tokens for i,s in enumerate(sorted(eng_vocab))})
itos_output = {v:k for k,v in stoi_output.items()}

In [None]:
# Each table should hold its character count plus the three special tokens,
# and every stoi/itos pair should have the same length.
len(stoi_input),len(itos_input),len(stoi_output), len(itos_output)

(67, 67, 29, 29)

In [None]:
def encode_input(txt):
  """Return the list of `stoi_input` ids for the characters of `txt`.

  Raises KeyError if a character has no entry in `stoi_input`.
  """
  return [stoi_input[ch] for ch in txt]
def decode_input(ids):
  """Join the `itos_input` characters for each id in `ids` into one string.

  Raises KeyError if an id has no entry in `itos_input`.
  """
  return "".join(itos_input[token_id] for token_id in ids)
def encode_output(txt):
  """Return the list of `stoi_output` ids for the characters of `txt`.

  Raises KeyError if a character has no entry in `stoi_output`.
  """
  return [stoi_output[ch] for ch in txt]
def decode_output(ids):
  """Join the `itos_output` characters for each id in `ids` into one string.

  Raises KeyError if an id has no entry in `itos_output`.
  """
  return "".join(itos_output[token_id] for token_id in ids)
# Round-trip demo: encode a word to ids, decode the ids again, and print both
# so the decoded text can be compared with the original by eye.
encoded_input_ids = encode_input("मैट्रोलॉजिस्ट")
decoded_input = decode_input(encoded_input_ids)
print(encoded_input_ids, decoded_input, sep="\n")

encoded_output_ids = encode_output("maitrologist")
decoded_output = decode_output(encoded_output_ids)
print(encoded_output_ids, decoded_output, sep="\n")

[66, 58, 34, 38, 32, 63, 59, 62, 23, 7, 39, 38, 34]
मैट्रोलॉजिस्ट
[21, 3, 18, 12, 17, 15, 7, 15, 23, 18, 22, 12]
maitrologist


In [None]:
# x = [encode_input("@" + i['native word'] + "$") for i in ds]
# y = [encode_output("@" + i['english word'] + "$") for i in ds]
# x[0], y[0], decode_input(x[0]), decode_output(y[0])

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import math
from torch.utils.data import DataLoader, Dataset

# Dataset over parallel lists of source and target words
class TextToTextDataset(Dataset):
    """Pairs `input_data[i]` with `target_data[i]` and encodes both on access."""

    def __init__(self, input_data, target_data):
        self.input_data, self.target_data = input_data, target_data

    def __len__(self):
        # The size is taken from the inputs; targets are indexed in parallel.
        return len(self.input_data)

    def __getitem__(self, idx):
        """Return `(input_ids, target_ids)` as tensors for the pair at `idx`."""
        source_ids = encode_input(self.input_data[idx])
        target_ids = encode_output(self.target_data[idx])
        return torch.tensor(source_ids), torch.tensor(target_ids)

# Pad sequences in a batch
def collate_fn(batch, padding_value=0):
    """Combine variable-length (input, target) tensor pairs into padded batches.

    Args:
        batch: sequence of `(input_ids, target_ids)` pairs of 1-D tensors, the
            form returned by `TextToTextDataset.__getitem__`.
        padding_value: id written into the positions past the end of each
            sequence. The default 0 equals `PAD_IDX` in this notebook.

    Returns:
        `(padded_inputs, padded_targets)`. Both are batch-first: one row per
        example, each padded to the longest sequence on its own side.
    """
    input_seqs, target_seqs = zip(*batch)
    # pad_sequence finds the longest sequence itself, so no lengths are
    # computed here.
    padded_input_seqs = nn.utils.rnn.pad_sequence(input_seqs, batch_first=True, padding_value=padding_value)
    padded_target_seqs = nn.utils.rnn.pad_sequence(target_seqs, batch_first=True, padding_value=padding_value)

    return padded_input_seqs, padded_target_seqs

# Wrap every word in the begin ('@') and end ('$') marker characters.
train_input_data = ["@" + row['native word'] + "$" for row in ds]
train_target_data = ["@" + row['english word'] + "$" for row in ds]

# Only the training split is built; the validation/test variants stay
# commented out.
train_dataset = TextToTextDataset(train_input_data, train_target_data)
# valid_dataset = TextToTextDataset(valid_input_data, train_target_data)
# test_dataset = TextToTextDataset(test_input_data, train_target_data)

# Shuffled loader; collate_fn pads every batch.
batch_size = 256
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn,
)
# valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, collate_fn=collate_fn)
# test_dataloader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=collate_fn)

In [None]:
# Decode and print the first batch only. Rows are iterated directly instead of
# through range(batch_size): a batch can hold fewer rows than batch_size (the
# last batch, or a loader built with another batch size), and indexing past
# the end raises IndexError.
for batch_input, batch_target in train_dataloader:
  for input_ids, target_ids in zip(batch_input, batch_target):
    print(decode_input(input_ids.tolist()), decode_output(target_ids.tolist()))
  break

@लॉवेल$############## @lowell$#################
@किंगखान$############ @kingkhan$###############
@गौर$################ @gaur$###################
@सुरभि$############## @surabhi$################
@ब्रह्मचर्चाओं$###### @bhramacharchaon$########
@दमघोंटू$############ @damghontu$##############
@सॉलिस$############## @solis$##################
@रोज़ा$############## @rosa$###################
@एक्सिसटेंस$######### @existence$##############
@प्रदेशवाशियों$###### @pradeshvashiyon$########
@प्राना$############# @prana$##################
@नईदिल्ली$########### @naeedillee$#############
@याचिकाकर्ता$######## @yachikakarta$###########
@सिरती$############## @sirtee$#################
@नरसीपुरा$########### @naraseepura$############
@मैक्सियों$########## @maxiyon$################
@नागमण्डल$########### @nagmandal$##############
@रैलीः$############## @railih$#################
@हामिदन$############# @hamidan$################
@बज$################# @budge$##################
@ऑक्टा$############## @octa$############

In [None]:
# import torch
# import torch.nn as nn
# import torch.optim as optim
# import math


# class PositionalEncoding(nn.Module):
#     def __init__(self, d_model, max_len=1000):
#         super(PositionalEncoding, self).__init__()
#         self.dropout = nn.Dropout(p=0.1)
#         pe = torch.zeros(max_len, d_model)
#         position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
#         div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
#         pe[:, 0::2] = torch.sin(position * div_term)
#         pe[:, 1::2] = torch.cos(position * div_term)
#         pe = pe.unsqueeze(0).transpose(0, 1)
#         self.register_buffer('pe', pe)

#     def forward(self, x):
#         x = x + self.pe[:x.size(0), :]
#         return self.dropout(x)

# class TransformerTextToText(nn.Module):
#     def __init__(self, input_vocab_size, output_vocab_size, d_model, nhead, num_encoder_layers, num_decoder_layers):
#         super(TransformerTextToText, self).__init__()
#         self.embedding = nn.Embedding(input_vocab_size, d_model)
#         self.positional_encoding = PositionalEncoding(d_model)

#         self.transformer = nn.Transformer(d_model, nhead, num_encoder_layers, num_decoder_layers)

#         self.decoder_embedding = nn.Embedding(output_vocab_size, d_model)
#         self.decoder_positional_encoding = PositionalEncoding(d_model)

#         self.fc = nn.Linear(d_model, output_vocab_size)

#     def forward(self, src, tgt):
#         src = self.embedding(src)
#         src = self.positional_encoding(src)

#         tgt = self.decoder_embedding(tgt)
#         tgt = self.decoder_positional_encoding(tgt)

#         output = self.transformer(src, tgt)
#         output = self.fc(output)

#         return output

In [None]:
# from tqdm import tqdm
# device = 'cuda' if torch.cuda.is_available() else 'cpu'
# # Define hyperparameters
# input_vocab_size = len(stoi_input)
# output_vocab_size = len(stoi_output)
# d_model = 32
# nhead = 4
# num_encoder_layers = 1
# num_decoder_layers = 1
# model = TransformerTextToText(input_vocab_size, output_vocab_size, d_model, nhead, num_encoder_layers, num_decoder_layers)
# criterion = nn.CrossEntropyLoss()
# optimizer = optim.Adam(model.parameters(), lr=0.001)
# num_epochs = 1

# # Inside the training loop
# def train_batch(input_tensor, target_tensor):
#     optimizer.zero_grad()

#     # Generate mask for the padded tokens
#     src_key_padding_mask = (input_tensor == 63)  # Assuming 63 is the index of the padding token

#     # Apply padding mask to the input_tensor
#     input_tensor = input_tensor.masked_fill(src_key_padding_mask, 0)

#     # Generate mask for the padded tokens in the decoder
#     tgt_key_padding_mask = (target_tensor == 26)  # Assuming 26 is the index of the padding token

#     # Forward pass
#     output = model(input_tensor, target_tensor, tgt_key_padding_mask=tgt_key_padding_mask)

#     output = output.view(-1, output_vocab_size)
#     target_tensor = target_tensor.view(-1)
#     loss = criterion(output, target_tensor)
#     loss.backward()
#     optimizer.step()
#     return loss.item()

# # Example training loop
# for epoch in range(num_epochs):
#     model.train()
#     total_loss = 0

#     for batch_input, batch_target in train_dataloader:
#         batch_input = batch_input.to(device)
#         batch_target = batch_target.to(device)

#         loss = train_batch(batch_input, batch_target)
#         total_loss += loss

#     avg_loss = total_loss / len(train_dataloader)
#     print(f"Epoch [{epoch+1}/{num_epochs}] - Avg. Loss: {avg_loss:.4f}")

In [None]:
import torch
import torch.nn as nn
from torch import Tensor
from torch.nn import Transformer
import math
# Device used for the model and the tensors/masks built below: CUDA when
# available, otherwise CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# helper Module that adds positional encoding to the token embedding to introduce a notion of word order.
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position values to embeddings, then applies dropout.

    Args:
        emb_size: size of the embedding dimension (even or odd).
        dropout: dropout probability applied to the sum.
        maxlen: number of positions precomputed in the table.
    """

    def __init__(self,
                 emb_size: int,
                 dropout: float,
                 maxlen: int = 5000):
        super(PositionalEncoding, self).__init__()
        den = torch.exp(- torch.arange(0, emb_size, 2)* math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        pos_embedding = torch.zeros((maxlen, emb_size))
        # Even columns take the sine, odd columns the cosine.
        pos_embedding[:, 0::2] = torch.sin(pos * den)
        # For an odd emb_size there is one odd column fewer than even columns,
        # so the frequencies are trimmed to fit. For an even emb_size the slice
        # keeps all of `den`.
        pos_embedding[:, 1::2] = torch.cos(pos * den[:emb_size // 2])
        # Shape (maxlen, 1, emb_size): the middle axis broadcasts in forward().
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        # Registered as a buffer: moves with the module across devices but is
        # not a trainable parameter.
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: Tensor):
        """Add the first `token_embedding.size(0)` table rows, then apply dropout.

        Positions are taken from dimension 0 of `token_embedding`.
        """
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])

# helper Module that looks up token embeddings and rescales them
class TokenEmbedding(nn.Module):
    """Embedding table whose output is multiplied by `sqrt(emb_size)`."""

    def __init__(self, vocab_size: int, emb_size):
        super().__init__()
        self.emb_size = emb_size
        self.embedding = nn.Embedding(vocab_size, emb_size)

    def forward(self, tokens: Tensor):
        """Embed `tokens` (cast to long first) and scale the result."""
        scale = math.sqrt(self.emb_size)
        embedded = self.embedding(tokens.long())
        return embedded * scale

# Seq2Seq Network
class Seq2SeqTransformer(nn.Module):
    """`nn.Transformer` wrapped with token embeddings, positions and an output layer.

    Source and target ids go through their own `TokenEmbedding` and a shared
    `PositionalEncoding`; the transformer output is projected to
    `tgt_vocab_size` logits by `generator`.
    """

    def __init__(self,
                 num_encoder_layers: int,
                 num_decoder_layers: int,
                 emb_size: int,
                 nhead: int,
                 src_vocab_size: int,
                 tgt_vocab_size: int,
                 dim_feedforward: int = 512,
                 dropout: float = 0.1):
        super().__init__()
        # Sub-modules are created in a fixed order so that seeded weight
        # initialisation stays the same from run to run.
        self.transformer = Transformer(
            d_model=emb_size,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
        )
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(emb_size, dropout=dropout)

    def forward(self,
                src: Tensor,
                trg: Tensor,
                src_mask: Tensor,
                tgt_mask: Tensor,
                src_padding_mask: Tensor,
                tgt_padding_mask: Tensor,
                memory_key_padding_mask: Tensor):
        """Run encoder and decoder and return logits over the target vocabulary."""
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        # No memory mask is used; the remaining masks are passed by name.
        outs = self.transformer(
            src_emb,
            tgt_emb,
            src_mask=src_mask,
            tgt_mask=tgt_mask,
            memory_mask=None,
            src_key_padding_mask=src_padding_mask,
            tgt_key_padding_mask=tgt_padding_mask,
            memory_key_padding_mask=memory_key_padding_mask,
        )
        return self.generator(outs)

    def encode(self, src: Tensor, src_mask: Tensor):
        """Embed `src` and run only the encoder stack."""
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        return self.transformer.encoder(src_emb, src_mask)

    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        """Embed `tgt` and run only the decoder stack against `memory`."""
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(tgt))
        return self.transformer.decoder(tgt_emb, memory, tgt_mask)

def generate_square_subsequent_mask(sz, device=None):
    """Build the additive causal mask for a sequence of length `sz`.

    Args:
        sz: sequence length; the result has shape `(sz, sz)`.
        device: where to create the mask. Defaults to the notebook-wide
            `DEVICE` when omitted.

    Returns:
        A float tensor holding 0.0 on and below the diagonal and -inf above
        it, so position i can only attend to positions <= i.
    """
    if device is None:
        device = DEVICE
    # triu(diagonal=1) keeps the -inf entries strictly above the diagonal and
    # zeroes the rest.
    return torch.triu(torch.full((sz, sz), float('-inf'), device=device), diagonal=1)


def create_mask(src, tgt):
    """Build the attention and padding masks for one batch.

    Dimension 0 of `src` and `tgt` is read as the sequence length, and the
    padding masks are transposed so that axis comes second.

    Returns:
        `(src_mask, tgt_mask, src_padding_mask, tgt_padding_mask)` where
        `src_mask` is all False (nothing blocked), `tgt_mask` is the causal
        mask, and each padding mask is True wherever the ids equal `PAD_IDX`.
    """
    src_seq_len = src.shape[0]
    tgt_seq_len = tgt.shape[0]

    # Masks follow the devices of the tensors they describe, so the function
    # also works when the inputs are not on the global DEVICE.
    tgt_mask = generate_square_subsequent_mask(tgt_seq_len).to(tgt.device)
    src_mask = torch.zeros((src_seq_len, src_seq_len), dtype=torch.bool, device=src.device)

    src_padding_mask = (src == PAD_IDX).transpose(0, 1)
    tgt_padding_mask = (tgt == PAD_IDX).transpose(0, 1)
    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

# Seed the RNG before the model is built so the initial weights are repeatable.
torch.manual_seed(0)

# Vocabulary sizes come from the lookup tables; the rest are hyperparameters.
SRC_VOCAB_SIZE = len(stoi_input)
TGT_VOCAB_SIZE = len(stoi_output)
EMB_SIZE = 128
NHEAD = 4
FFN_HID_DIM = 512
BATCH_SIZE = 128
NUM_ENCODER_LAYERS = 1
NUM_DECODER_LAYERS = 1

transformer = Seq2SeqTransformer(
    num_encoder_layers=NUM_ENCODER_LAYERS,
    num_decoder_layers=NUM_DECODER_LAYERS,
    emb_size=EMB_SIZE,
    nhead=NHEAD,
    src_vocab_size=SRC_VOCAB_SIZE,
    tgt_vocab_size=TGT_VOCAB_SIZE,
    dim_feedforward=FFN_HID_DIM,
)

# Xavier-initialise every parameter with more than one dimension; 1-D
# parameters keep their default initialisation.
for p in transformer.parameters():
    if p.dim() <= 1:
        continue
    nn.init.xavier_uniform_(p)

transformer = transformer.to(DEVICE)

# Targets equal to PAD_IDX are ignored by the loss.
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

optimizer = torch.optim.Adam(
    transformer.parameters(),
    lr=0.0001,
    betas=(0.9, 0.98),
    eps=1e-9,
)



from torch.nn.utils.rnn import pad_sequence

# function to collate raw (source, target) string pairs into padded id tensors
def collate_fn(batch):
    """Encode each string pair in `batch` and pad both sides with `PAD_IDX`.

    Returns `(src, tgt)`; each is transposed after padding, so rows index the
    examples and columns index the positions.
    """
    encoded_pairs = [
        (torch.tensor(encode_input(src_sample)), torch.tensor(encode_output(tgt_sample)))
        for src_sample, tgt_sample in batch
    ]
    src_batch = [pair[0] for pair in encoded_pairs]
    tgt_batch = [pair[1] for pair in encoded_pairs]

    padded_src = pad_sequence(src_batch, padding_value=PAD_IDX)
    padded_tgt = pad_sequence(tgt_batch, padding_value=PAD_IDX)
    return padded_src.T, padded_tgt.T
# batch = [(i, j) for i, j in zip(train_input_data[:10], train_target_data[:10])]
# batch
# collate_fn(batch)

from torch.utils.data import DataLoader, Dataset

class MyDataset(Dataset):
    """Map-style dataset returning the raw `(input, target)` pair at an index."""

    def __init__(self, input_data, target_data):
        self.input_data, self.target_data = input_data, target_data

    def __len__(self):
        # The size is taken from the inputs; targets are indexed in parallel.
        return len(self.input_data)

    def __getitem__(self, idx):
        source, target = self.input_data[idx], self.target_data[idx]
        return source, target

# Rebinds train_dataset / train_dataloader to the string-based pipeline.
# NOTE(review): unlike the loader built earlier in the notebook, this one does
# not pass shuffle=True — confirm that fixed-order batches are intended.
train_dataset = MyDataset(train_input_data, train_target_data)
train_dataloader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    collate_fn=collate_fn,
)


In [None]:
# Collate the first ten (source, target) string pairs by hand to inspect the
# padded id tensors.
batch = list(zip(train_input_data[:10], train_target_data[:10]))
collate_fn(batch)


(tensor([[ 1, 66, 58, 34, 38, 32, 63, 59, 62, 23,  7, 39, 38, 34,  2,  0,  0],
         [ 1, 51, 64, 37, 56, 52, 48, 38, 59, 38, 20, 26, 39, 64, 37, 39,  2],
         [ 1, 51, 38, 32, 46,  7, 21, 38,  8, 55, 38, 21,  7, 20, 63, 14,  2],
         [ 1, 51, 38, 32, 46,  7, 20, 54,  5, 38, 46,  7,  2,  0,  0,  0,  0],
         [ 1, 37,  5, 38, 39,  7, 39, 34,  9, 14, 39,  2,  0,  0,  0,  0,  0],
         [ 1, 22, 41,  7, 59, 38, 66, 55,  7, 32, 38, 66, 19, 46, 19,  2,  0],
         [ 1, 17, 21, 38, 32, 38, 30,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0],
         [ 1, 59, 52, 41,  9, 13,  9,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0],
         [ 1, 37, 52, 66,  7, 55,  7, 39, 38, 34, 38, 32,  9,  6,  2,  0,  0],
         [ 1,  6,  7,  8, 19, 59, 19, 51, 54, 32,  8, 19,  2,  0,  0,  0,  0]]),
 tensor([[ 1, 21,  3, 18, 12, 17, 15,  7, 15, 23, 18, 22, 12,  2,  0,  0],
         [ 1, 24, 20, 25,  6, 22,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0],
         [ 1, 24, 17,  3, 12, 18, 16, 25,  3, 19, 16, 18, 

In [None]:
from torch.utils.data import DataLoader

def train_epoch(model, optimizer, print_interval=10):
    """Train `model` for one pass over the global `train_dataloader`.

    Args:
        model: network called as `model(src, tgt_input, src_mask, tgt_mask,
            src_padding_mask, tgt_padding_mask, memory_key_padding_mask)`.
        optimizer: optimizer stepped once per batch.
        print_interval: print the running mean loss every this many steps.

    Returns:
        The mean per-batch loss over the epoch.
    """
    model.train()
    total_loss = 0
    # Ask the loader for its batch count; materialising it with list() would
    # re-iterate and re-collate the whole dataset just to count.
    num_batches = len(train_dataloader)

    for step, (src, tgt) in enumerate(train_dataloader,1):
        # Swap the first two axes so dimension 0 is the one that create_mask
        # and the slices below read as sequence position.
        src = src.transpose(0,1).to(DEVICE)
        tgt = tgt.transpose(0,1).to(DEVICE)

        # The decoder sees every position except the last ...
        tgt_input = tgt[:-1, :]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        optimizer.zero_grad()

        logits = model(src, tgt_input, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask)

        # ... and is scored against the same sequence shifted by one.
        tgt_out = tgt[1:, :]
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        loss.backward()

        optimizer.step()
        total_loss += loss.item()

        if step % print_interval == 0:
            avg_loss = total_loss / step
            print(f"Step [{step}/{num_batches}], Train loss: {avg_loss:.3f}")

    return total_loss / num_batches


# def evaluate(model):
#     model.eval()
#     losses = 0

#     val_iter = Multi30k(split='valid', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
#     val_dataloader = DataLoader(val_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)

#     for src, tgt in val_dataloader:
#         src = src.to(DEVICE)
#         tgt = tgt.to(DEVICE)

#         tgt_input = tgt[:-1, :]

#         src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

#         logits = model(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask)

#         tgt_out = tgt[1:, :]
#         loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
#         losses += loss.item()

#     return losses / len(list(val_dataloader))

from timeit import default_timer as timer
NUM_EPOCHS = 10

for epoch in range(1, NUM_EPOCHS+1):
    start_time = timer()
    train_loss = train_epoch(transformer, optimizer)
    end_time = timer()
    # val_loss = evaluate(transformer)
    # No validation loss is computed (evaluate() is commented out), so only
    # the training loss is reported. Printing train_loss under a "Val loss"
    # label would misreport it.
    print(f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Epoch time = {(end_time - start_time):.3f}s")


Step [10/40], Train loss: 0.971
Step [20/40], Train loss: 0.982
Step [30/40], Train loss: 0.989
Step [40/40], Train loss: 1.023
Epoch: 1, Train loss: 1.023, Val loss: 1.023, Epoch time = 9.963s
Step [10/40], Train loss: 1.010
Step [20/40], Train loss: 0.999
Step [30/40], Train loss: 0.995


KeyboardInterrupt: ignored

In [None]:
# function to generate output sequence using greedy algorithm
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    """Decode one sequence by always taking the highest-scoring next token.

    Args:
        model: object providing `encode`, `decode` and `generator`.
        src: source ids; moved to `DEVICE` before encoding.
        src_mask: encoder attention mask; moved to `DEVICE`.
        max_len: upper bound on the length of the result, start symbol included.
        start_symbol: id the output sequence begins with.

    Returns:
        A long tensor with one id per row, starting with `start_symbol` and
        ending at the first `EOS_IDX` or when `max_len` ids have been produced.
    """
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)

    # Inference only: skip building an autograd graph for each decoding step.
    with torch.no_grad():
        # The encoder output does not change between steps, so it is computed
        # and placed on DEVICE once.
        memory = model.encode(src, src_mask).to(DEVICE)
        ys = torch.full((1, 1), start_symbol, dtype=torch.long, device=DEVICE)
        for _ in range(max_len-1):
            tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                        .type(torch.bool)).to(DEVICE)
            out = model.decode(ys, memory, tgt_mask)
            out = out.transpose(0, 1)
            # Score only the newest position.
            prob = model.generator(out[:, -1])
            _, next_word = torch.max(prob, dim=1)
            next_word = next_word.item()

            ys = torch.cat([ys,
                            torch.full((1, 1), next_word, dtype=ys.dtype, device=ys.device)], dim=0)
            if next_word == EOS_IDX:
                break
    return ys


# transliterate a single source word with the trained model
def translate(model: torch.nn.Module, src_sentence: str):
    """Return the greedy decoding of `src_sentence` as plain text.

    The input is wrapped in '@' and '$' before encoding, decoding may run for
    up to five tokens more than the input length, and the '@' / '$' markers
    are stripped from the result. The model is switched to eval mode.
    """
    model.eval()
    source_ids = encode_input("@"+src_sentence+"$")
    src = torch.tensor(source_ids).reshape(-1,1)
    num_tokens = src.shape[0]
    # All-False mask: no source position is hidden from the encoder.
    src_mask = torch.zeros(num_tokens, num_tokens).type(torch.bool)
    tgt_tokens = greedy_decode(
        model, src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX)
    decoded = decode_output(tgt_tokens.flatten().cpu().tolist())
    return decoded.replace("@", "").replace("$", "")

# Transliterate one randomly chosen sample word and print it with the output.
samples = ["ड्रिल","अन्त्यन्त","उलझानों","प्रतिष्ठावाले"]
idx = torch.randint(0,len(samples),(1,)).item()
source_word = samples[idx]
print(source_word)
print(translate(transformer, source_word))

प्रतिष्ठावाले
pratishale


In [None]:
# import torch
# import torch.nn as nn
# from torch import Tensor
# from torch.nn import Transformer
# import math
# DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# # helper Module that adds positional encoding to the token embedding to introduce a notion of word order.
# class PositionalEncoding(nn.Module):
#     def __init__(self,
#                  emb_size: int,
#                  dropout: float,
#                  maxlen: int = 5000):
#         super(PositionalEncoding, self).__init__()
#         den = torch.exp(- torch.arange(0, emb_size, 2)* math.log(10000) / emb_size)
#         pos = torch.arange(0, maxlen).reshape(maxlen, 1)
#         pos_embedding = torch.zeros((maxlen, emb_size))
#         pos_embedding[:, 0::2] = torch.sin(pos * den)
#         pos_embedding[:, 1::2] = torch.cos(pos * den)
#         pos_embedding = pos_embedding.unsqueeze(-2)

#         self.dropout = nn.Dropout(dropout)
#         self.register_buffer('pos_embedding', pos_embedding)

#     def forward(self, token_embedding: Tensor):
#         return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])

# # helper Module to convert tensor of input indices into corresponding tensor of token embeddings
# class TokenEmbedding(nn.Module):
#     def __init__(self, vocab_size: int, emb_size):
#         super(TokenEmbedding, self).__init__()
#         self.embedding = nn.Embedding(vocab_size, emb_size)
#         self.emb_size = emb_size

#     def forward(self, tokens: Tensor):
#         return self.embedding(tokens.long()) * math.sqrt(self.emb_size)

# class Seq2SeqTransformer(nn.Module):

#     def __init__(self, input_vocab_size, output_vocab_size, d_model, nhead, num_encoder_layers, num_decoder_layers, max_seq_length, src_pad_token_id, tgt_pad_token_id):
#         super(Seq2SeqTransformer, self).__init__()

#         self.embedding_src = nn.Embedding(input_vocab_size, d_model, padding_idx = src_pad_token_id)
#         self.embedding_tgt = nn.Embedding(output_vocab_size, d_model)
#         self.transformer = nn.Transformer(
#             d_model=d_model,
#             nhead=nhead,
#             num_encoder_layers=num_encoder_layers,
#             num_decoder_layers=num_decoder_layers
#         )
#         self.fc = nn.Linear(d_model, output_vocab_size)
#         self.max_seq_length = max_seq_length
#         self.src_pad_token_id = src_pad_token_id
#         self.tgt_pad_token_id = tgt_pad_token_id

#     def forward(self, src, tgt):
#         src_padding_mask = self.generate_padding_mask(src, self.src_pad_token_id)
#         tgt_padding_mask = self.generate_padding_mask(tgt, self.tgt_pad_token_id)

#         src_embedding = self.embedding_src(src)
#         tgt_embedding = self.embedding_tgt(tgt)

#         # Apply the transformer model
#         # output = self.transformer(src_embedding.transpose(0, 1), tgt_embedding.transpose(0, 1))
#         # output = self.transformer(src_embedding.transpose(0, 1), tgt_embedding.transpose(0, 1),
#         #                           src_mask=src_mask, tgt_mask=tgt_mask)
#         output = self.transformer(src_embedding.transpose(0, 1), tgt_embedding.transpose(0, 1),
#                                 src_mask=None, tgt_mask=None,
#                                 src_key_padding_mask=src_padding_mask, tgt_key_padding_mask=tgt_padding_mask)

#         # Convert the output to logits
#         output_logits = self.fc(output)

#         return output_logits

#     # def generate_padding_mask(self, src):
#     #     # return (src == self.src_pad_token_id).transpose(0, 1)
#     #       return (src == self.src_pad_token_id).unsqueeze(1).unsqueeze(2)
#     def generate_padding_mask(self, seq, pad_token_id):
#         return (seq == pad_token_id)

#     # def generate_square_subsequent_mask(self, sz):
#     #     mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
#     #     mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
#     #     return mask

# # Example parameters
# input_vocab_size = len(stoi_input)  # Size of the input vocabulary
# output_vocab_size = len(stoi_output)  # Size of the output vocabulary
# max_seq_length = 50       # Maximum sequence length
# d_model = 128             # Dimension of the model
# nhead = 4                 # Number of attention heads
# num_encoder_layers = 1    # Number of encoder layers
# num_decoder_layers = 1    # Number of decoder layers
# src_pad_token_id = input_pad_idx
# tgt_pad_token_id = output_pad_idx
# num_epochs = 1

# # Create the model
# model = Seq2SeqTransformer(input_vocab_size, output_vocab_size, d_model, nhead, num_encoder_layers, num_decoder_layers, max_seq_length, src_pad_token_id, tgt_pad_token_id)

# # Sample input and output sequences
# src_sequence = torch.randint(0, input_vocab_size, (10, max_seq_length))  # Batch size of 10
# tgt_sequence = torch.randint(0, output_vocab_size, (10, max_seq_length + 8))  # Batch size of 10, output sequence longer than input sequence

# print(src_sequence.shape, tgt_sequence.shape)

# # Forward pass
# output_logits = model(src_sequence, tgt_sequence)
# print("Output Logits Shape:", output_logits.shape)

# # Define your optimizer and loss function
# optimizer = optim.Adam(model.parameters(), lr=0.001)
# criterion = nn.CrossEntropyLoss()

# # Training loop (simplified example)
# for epoch in range(num_epochs):
#     for i, batch in enumerate(train_dataloader):  # Iterate over your data batches
#         src_batch, tgt_batch = batch
#         optimizer.zero_grad()

#         # Forward pass
#         output = model(src_batch, tgt_batch[:, :-1]) # model(src_batch, tgt_batch[:, :-1])  # Exclude the last token from target

#         # Calculate loss
#         # print(output.shape, tgt_batch.shape)
#         # print(output.reshape(-1, output_vocab_size).shape)
#         # print(tgt_batch[:, 1:].reshape(-1).shape)
#         loss = criterion(output.reshape(-1, output_vocab_size), tgt_batch[:, 1:].reshape(-1)) # criterion(output.reshape(-1, output_vocab_size), tgt_batch[:, :].reshape(-1))  # Exclude the first token from target
#         print(f"step: {i}, loss: {loss.item():.2f}")
#         # Backpropagation and optimization
#         loss.backward()
#         optimizer.step()

torch.Size([10, 50]) torch.Size([10, 58])
Output Logits Shape: torch.Size([58, 10, 29])
step: 0, loss: 3.62
step: 1, loss: 2.03
step: 2, loss: 2.12
step: 3, loss: 2.11
step: 4, loss: 2.16
step: 5, loss: 2.12
step: 6, loss: 1.96
step: 7, loss: 1.93
step: 8, loss: 1.78
step: 9, loss: 1.75
step: 10, loss: 1.91
step: 11, loss: 2.08
step: 12, loss: 1.99
step: 13, loss: 2.12
step: 14, loss: 2.13
step: 15, loss: 1.83
step: 16, loss: 2.14
step: 17, loss: 2.18
step: 18, loss: 1.81
step: 19, loss: 2.20


In [None]:
# logits = model(src_sequence, tgt_sequence[:, :-1])
# print(logits.reshape(-1, output_vocab_size).shape)
# print(tgt_sequence[:, 1:].reshape(-1).shape)
# # print(logits.reshape(-1, output_vocab_size), tgt_batch[:, 1:].reshape(-1))
# # criterion(logits, tgt_sequence)

torch.Size([570, 27])
torch.Size([570])


In [None]:
# import torch

# def transliterate_word(model, src, max_length=50):
#     model.eval()

#     # Tokenize the source sentence and convert to tensor
#     src_tokens = encode_input(src)
#     src_tensor = torch.tensor(src_tokens).unsqueeze(0)  # Add batch dimension

#     # Initialize the target sentence with the <sos> token
#     tgt = ['@']

#     with torch.no_grad():
#         for _ in range(max_length):
#             # Convert the current target sentence to tensor
#             tgt_tokens = encode_output(tgt)
#             # print(tgt_tokens)
#             tgt_tensor = torch.tensor(tgt_tokens).unsqueeze(0)  # Add batch dimension

#             # Generate the next token in the target sequence
#             output = model(src_tensor, tgt_tensor)
#             # print(output)

#             # next_token_id = output.argmax(dim=-1)[:, -1].item()

#             next_token_probs = output[0, -1, :]  # Get the probabilities for the next token
#             print(next_token_probs)
#             next_token_id = next_token_probs.argmax().item()

#             next_token = decode_output([next_token_id])
#             print(next_token)

#             # Stop if <eos> token is generated or if max length is reached
#             if next_token == '$' or len(tgt) >= max_length:
#                 break

#             tgt.append(next_token)

#     # Convert the target sentence back to a string
#     transliterated_word = ' '.join(tgt[1:])  # Exclude <sos> token
#     return transliterated_word

# # Assuming you have trained 'model', 'src_vocab', and 'tgt_vocab'

# sample = "ड्रिल"
# # sample = "उलझानों"
# output = transliterate_word(model, sample)
# print("Source Sentence:", sample)
# print("Translated Sentence:", output)

NameError: ignored

In [None]:
# a = torch.tensor([[[-0.0194,  2.4957,  1.2230, -2.5968, -1.3113, -2.7148,  0.9393,
#            1.2988, -0.5863,  1.1398,  0.6810,  0.1490, -1.8229,  1.2060,
#           -0.3787, -2.8461,  0.0497, -0.4994, -0.5241,  0.9051,  0.9477,
#            1.6556,  0.4972,  0.7228,  0.7183, -0.1150,  4.9022]],

#         [[-0.0225,  2.4873,  1.2296, -2.5983, -1.3164, -2.7160,  0.9366,
#            1.2947, -0.5868,  1.1381,  0.6856,  0.1497, -1.8209,  1.2084,
#           -0.3769, -2.8440,  0.0500, -0.5049, -0.5246,  0.9100,  0.9507,
#            1.6576,  0.4980,  0.7252,  0.7146, -0.1176,  4.8899]]])
# a.argmax(dim=-1)[:, -1]#.item()

# Scratch check: unsqueeze(0) turns the 1-D tensor [1, 2] into the 2-D [[1, 2]].
torch.tensor([1,2]).unsqueeze(0)  # Add batch dimension


tensor([[1, 2]])

In [None]:
# model

Seq2SeqTransformer(
  (embedding_src): Embedding(70, 64, padding_idx=64)
  (embedding_tgt): Embedding(27, 64)
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0): TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=64, out_features=64, bias=True)
          )
          (linear1): Linear(in_features=64, out_features=2048, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=2048, out_features=64, bias=True)
          (norm1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
      )
      (norm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
    )
    (decoder): TransformerDecoder(
      (layers): ModuleList(
   

In [None]:

# for batch_input, batch_target in train_dataloader:
#   print(batch_input)
#   print(batch_target)
#   break

tensor([[14, 55, 17, 24, 20, 55, 24, 20, 47, 48],
        [47, 34,  3, 48, 23, 48, 63, 63, 63, 63],
        [60, 49, 52, 27, 21, 63, 63, 63, 63, 63],
        [62, 17, 55, 11, 18, 20, 47, 27, 63, 63]])
(tensor([10, 25, 23, 20, 16, 25, 25, 23, 18, 25, 25, 14, 13]), tensor([14, 25,  7, 24, 13, 15, 13]), tensor([ 8, 24,  1,  1,  3, 20, 13]), tensor([ 9, 20, 23, 22, 25, 25, 14, 20]))


In [None]:
# import torch
# g = torch.Generator()
# g.manual_seed(123)
# for _ in range(5):
#   print(torch.randint(0,len(x), (2,), generator=g).tolist())

[382, 789]
[102, 610]
[580, 842]
[886, 57]
[699, 754]


In [None]:
# import torch
# g = torch.Generator()
# g.manual_seed(123)
# def get_batch(batch_size):
#   idx = torch.randint(0,len(x), (2,), generator=g).tolist()
#   return x[idx]
# get_batch(5)

TypeError: ignored

In [None]:
# import torch
# import torch.nn as nn
# import torch.optim as optim
# import math


# class PositionalEncoding(nn.Module):
#     def __init__(self, d_model, max_len=1000):
#         super(PositionalEncoding, self).__init__()
#         self.dropout = nn.Dropout(p=0.1)
#         pe = torch.zeros(max_len, d_model)
#         position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
#         div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
#         pe[:, 0::2] = torch.sin(position * div_term)
#         pe[:, 1::2] = torch.cos(position * div_term)
#         pe = pe.unsqueeze(0).transpose(0, 1)
#         self.register_buffer('pe', pe)

#     def forward(self, x):
#         x = x + self.pe[:x.size(0), :]
#         return self.dropout(x)

# class TransformerTextToText(nn.Module):
#     def __init__(self, input_vocab_size, output_vocab_size, d_model, nhead, num_encoder_layers, num_decoder_layers):
#         super(TransformerTextToText, self).__init__()
#         self.embedding = nn.Embedding(input_vocab_size, d_model)
#         self.positional_encoding = PositionalEncoding(d_model)

#         self.transformer = nn.Transformer(d_model, nhead, num_encoder_layers, num_decoder_layers)

#         self.decoder_embedding = nn.Embedding(output_vocab_size, d_model)
#         self.decoder_positional_encoding = PositionalEncoding(d_model)

#         self.fc = nn.Linear(d_model, output_vocab_size)

#     def forward(self, src, tgt):
#         src = self.embedding(src)
#         src = self.positional_encoding(src)

#         tgt = self.decoder_embedding(tgt)
#         tgt = self.decoder_positional_encoding(tgt)

#         output = self.transformer(src, tgt)
#         output = self.fc(output)

#         return output

# # Define hyperparameters
# input_vocab_size = len(hin_vocab) + len(eng_vocab)
# output_vocab_size = len(eng_vocab)
# d_model = 32
# nhead = 4
# num_encoder_layers = 1
# num_decoder_layers = 1
# model = TransformerTextToText(input_vocab_size, output_vocab_size, d_model, nhead, num_encoder_layers, num_decoder_layers)
# criterion = nn.CrossEntropyLoss()
# optimizer = optim.Adam(model.parameters(), lr=0.001)

# # Training loop
# def train(input_tensor, target_tensor):
#     optimizer.zero_grad()
#     output = model(input_tensor, target_tensor)
#     output = output.view(-1, output_vocab_size)
#     target_tensor = target_tensor.view(-1)
#     loss = criterion(output, target_tensor)
#     loss.backward()
#     optimizer.step()
#     return loss.item()


In [None]:

# train()

In [None]:
# # Example usage
# input_hindi_tensor = encode_sentence("भारत", hindi_char_to_index)
# input_english_tensor = encode_sentence("bharat", english_char_to_index)
# input_tensor = torch.cat((input_hindi_tensor, input_english_tensor), dim=0).unsqueeze(1)

# target_english_tensor = encode_sentence("india", english_char_to_index)  # Only English output
# target_tensor = target_english_tensor.unsqueeze(1)

# loss = train(input_tensor, target_tensor)