In [1]:
from transformer import Transformer # this is the transformer.py file
import torch
import numpy as np
     

In [2]:
# Parallel corpus files: line i of the English file is paired with line i of the Hindi file
# (the filtering cell reads both lists with the same index).
# NOTE(review): these are absolute, machine-specific paths.
english_file = '/teamspace/studios/this_studio/train.en' # replace this path with appropriate one
hindi_file = '/teamspace/studios/this_studio/train.hi' # replace this path with appropriate one

# Generated this by filtering Appendix code
# Special tokens; each one is a single entry in both vocabulary lists.
START_TOKEN = '<START>'
PADDING_TOKEN = '<PAD>'
END_TOKEN = '<END>'


# Character-level vocabularies. A token's position in its list is the integer id it gets
# in the enumerate-based lookup tables, so inserting, removing or reordering entries
# changes every id after that point.
# NOTE(review): ';' is in neither list, and '@' and the danda '।' are not in the Hindi list.
# Hindi sentences containing any character not listed here are rejected by is_valid_tokens.
# The English list has no upper-case letters; English sentences are lower-cased when loaded.
hindi_vocabulary = [START_TOKEN, ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', 
                    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '<', '=', '>', '?', 'ˌ', 
                    'ँ', 'ं', 'ः', 'अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ऋ', 'ऌ', 'ऍ', 'ऎ', 'ए', 'ऐ', 'ऑ', 'ऒ', 'ओ', 'औ', 
                    'क', 'ख', 'ग', 'घ', 'ङ', 
                    'च', 'छ', 'ज', 'झ', 'ञ', 
                    'ट', 'ठ', 'ड', 'ढ', 'ण', 
                    'त', 'थ', 'द', 'ध', 'न', 
                    'प', 'फ', 'ब', 'भ', 'म', 
                    'य', 'र', 'ल', 'ळ', 'व', 'श', 'ष', 'स', 'ह', 
                    '़', 'ऽ', 'ा', 'ि', 'ी', 'ु', 'ू', 'ृ', 'ॄ', 'ॅ', 'ॆ', 'े', 'ै', 'ॉ', 'ॊ', 'ो', 'ौ', '्', 'ॎ', 'ॏ', 'ॐ', 
                    '०', '१', '२', '३', '४', '५', '६', '७', '८', '९', PADDING_TOKEN, END_TOKEN]
english_vocabulary = [START_TOKEN, ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', 
                      '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
                      ':', '<', '=', '>', '?', '@',
                      '[', '\\', ']', '^', '_', '`', 
                      'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
                      'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 
                      'y', 'z', 
                      '{', '|', '}', '~', PADDING_TOKEN, END_TOKEN]


In [3]:
# Lookup tables in both directions (integer id <-> character); a character's id is its
# position in the corresponding vocabulary list.
index_to_hindi = dict(enumerate(hindi_vocabulary))
hindi_to_index = {char: position for position, char in enumerate(hindi_vocabulary)}
index_to_english = dict(enumerate(english_vocabulary))
english_to_index = {char: position for position, char in enumerate(english_vocabulary)}

In [4]:
# Sanity check: show the ids of the three special tokens in each vocabulary.
# START_TOKEN is the first list entry; PADDING_TOKEN and END_TOKEN are the last two.
print(f"Index for START_TOKEN in Hindi vocabulary: {hindi_to_index[START_TOKEN]}")
print(f"Index for END_TOKEN in Hindi vocabulary: {hindi_to_index[END_TOKEN]}")
print(f"Index for PADDING_TOKEN in Hindi vocabulary: {hindi_to_index[PADDING_TOKEN]}")

print(f"Index for START_TOKEN in English vocabulary: {english_to_index[START_TOKEN]}")
print(f"Index for END_TOKEN in English vocabulary: {english_to_index[END_TOKEN]}")
print(f"Index for PADDING_TOKEN in English vocabulary: {english_to_index[PADDING_TOKEN]}")


Index for START_TOKEN in Hindi vocabulary: 0
Index for END_TOKEN in Hindi vocabulary: 118
Index for PADDING_TOKEN in Hindi vocabulary: 117
Index for START_TOKEN in English vocabulary: 0
Index for END_TOKEN in English vocabulary: 70
Index for PADDING_TOKEN in English vocabulary: 69


In [5]:
# Read the parallel corpus. The encoding is stated explicitly because the Hindi file is
# Devanagari text: without it, open() decodes with the platform's locale encoding, which
# is not UTF-8 everywhere and would either raise or silently produce mojibake.
with open(english_file, 'r', encoding='utf-8') as file:
    english_sentences = file.readlines()
with open(hindi_file, 'r', encoding='utf-8') as file:
    hindi_sentences = file.readlines()

# Limit Number of sentences
TOTAL_SENTENCES = 200000
english_sentences = english_sentences[:TOTAL_SENTENCES]
hindi_sentences = hindi_sentences[:TOTAL_SENTENCES]
# Strip only the trailing newline. English is lower-cased because english_vocabulary
# contains no upper-case letters; Hindi text is kept as read.
english_sentences = [sentence.rstrip('\n').lower() for sentence in english_sentences]
hindi_sentences = [sentence.rstrip('\n') for sentence in hindi_sentences]

In [6]:
import numpy as np
PERCENTILE = 97
# Sentence lengths are measured in characters, matching the character-level vocabularies.
hindi_cutoff = np.percentile([len(sentence) for sentence in hindi_sentences], PERCENTILE)
print( f"{PERCENTILE}th percentile length hindi: {hindi_cutoff}" )
english_cutoff = np.percentile([len(sentence) for sentence in english_sentences], PERCENTILE)
print( f"{PERCENTILE}th percentile length English: {english_cutoff}" )

     

97th percentile length hindi: 258.0
97th percentile length English: 267.0


In [7]:
# Upper bound on sequence length, counted in characters. The same value is assigned
# again in the cell that builds the model.
max_sequence_length = 500

def is_valid_tokens(sentence, vocab):
    """Return True when every distinct element of `sentence` is contained in `vocab`.

    An empty sentence is valid; a single unknown element makes the sentence invalid.
    """
    return all(char in vocab for char in set(sentence))

def is_valid_length(sentence, max_sequence_length):
    """Return True when `sentence` plus one reserved slot is shorter than `max_sequence_length`.

    The reserved slot is kept free so the end token can be re-added later.
    """
    element_count = sum(1 for _ in sentence)
    return element_count + 1 < max_sequence_length

# Keep only the sentence pairs the model can consume: both sides must be short enough,
# and both sides must consist solely of characters present in their own vocabulary.
# The English side is checked too, because english_to_index is what the model is given
# to tokenise the source text and english_vocabulary omits characters such as ';'.
# NOTE(review): presumably the tokenizer indexes english_to_index per character and would
# raise KeyError on an unknown one - confirm against transformer.py.
valid_sentence_indicies = []
for index, (hindi_sentence, english_sentence) in enumerate(zip(hindi_sentences, english_sentences)):
    if is_valid_length(hindi_sentence, max_sequence_length) \
      and is_valid_length(english_sentence, max_sequence_length) \
      and is_valid_tokens(hindi_sentence, hindi_vocabulary) \
      and is_valid_tokens(english_sentence, english_vocabulary):
        valid_sentence_indicies.append(index)

print(f"Number of sentences: {len(hindi_sentences)}")
print(f"Number of valid sentences: {len(valid_sentence_indicies)}")

Number of sentences: 200000
Number of valid sentences: 54716


In [8]:
# Narrow both corpora to the surviving pairs so they stay index-aligned.
# This rebinds the very names it reads, so the cell must run exactly once after filtering.
hindi_sentences, english_sentences = (
    [hindi_sentences[position] for position in valid_sentence_indicies],
    [english_sentences[position] for position in valid_sentence_indicies],
)

In [9]:
len(hindi_sentences),len(english_sentences)

(54716, 54716)

In [10]:
import torch

# Model and training hyperparameters. The printed module repr shows where they land:
# d_model is the embedding / attention width, ffn_hidden the feed-forward hidden width,
# drop_prob the Dropout probability, and num_layers = 1 gives a single encoder layer.
d_model = 512
batch_size = 30  # used by the DataLoader, not by the model
ffn_hidden = 2048
num_heads = 8
drop_prob = 0.1
num_layers = 1
max_sequence_length = 500  # same value as the one used by the sentence filter
hi_vocab_size = len(hindi_vocabulary)

# Arguments are passed positionally, so their order has to match Transformer.__init__ in
# transformer.py (not visible from this notebook - verify before reordering anything).
transformer = Transformer(d_model, 
                          ffn_hidden,
                          num_heads, 
                          drop_prob, 
                          num_layers, 
                          max_sequence_length,
                          hi_vocab_size,
                          english_to_index,
                          hindi_to_index,
                          START_TOKEN, 
                          END_TOKEN, 
                          PADDING_TOKEN)

In [11]:
transformer

Transformer(
  (encoder): Encoder(
    (sentence_embedding): SentenceEmbedding(
      (embedding): Embedding(71, 512)
      (position_encoder): PositionalEncoding()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (layers): SequentialEncoder(
      (0): EncoderLayer(
        (attention): MultiHeadAttention(
          (qkv_layer): Linear(in_features=512, out_features=1536, bias=True)
          (linear_layer): Linear(in_features=512, out_features=512, bias=True)
        )
        (norm1): LayerNormalization()
        (dropout1): Dropout(p=0.1, inplace=False)
        (ffn): PositionwiseFeedForward(
          (linear1): Linear(in_features=512, out_features=2048, bias=True)
          (linear2): Linear(in_features=2048, out_features=512, bias=True)
          (relu): ReLU()
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (norm2): LayerNormalization()
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (decoder): Decoder(
    (sentence_embedding):

In [12]:
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    """Index-aligned (English, Hindi) sentence pairs exposed through the Dataset protocol."""

    def __init__(self, english_sentences, hindi_sentences):
        # Both sequences are stored as given: no copy and no length validation.
        self.english_sentences, self.hindi_sentences = english_sentences, hindi_sentences

    def __len__(self):
        # The English side alone determines the reported size.
        return len(self.english_sentences)

    def __getitem__(self, idx):
        """Return the `(english, hindi)` tuple stored at position `idx`."""
        source_text = self.english_sentences[idx]
        target_text = self.hindi_sentences[idx]
        return source_text, target_text

In [13]:
dataset = TextDataset(english_sentences, hindi_sentences)

In [14]:
len(dataset)

54716

In [15]:
dataset[1]

('mithali to anchor indian team against australia in odis',
 'आस्ट्रेलिया के खिलाफ वनडे टीम की कमान मिताली को')

In [16]:
# Batches of `batch_size` sentence pairs. No shuffle argument is passed, so DataLoader's
# default applies and batches arrive in corpus order on every epoch.
# NOTE(review): consider shuffle=True for training - confirm the fixed order is intended.
train_loader = DataLoader(dataset, batch_size)
iterator = iter(train_loader)


In [17]:
# Peek at the first five batches (batch_num 0 through 4) to see what the loader yields.
# This consumes `iterator`; the training loop creates a fresh one per epoch.
for batch_num, batch in enumerate(iterator):
    print(batch)
    if batch_num > 3:
        break

[('whosoever desires the reward of the world, with allah is the reward of the world and of the everlasting life. allah is the hearer, the seer.', 'mithali to anchor indian team against australia in odis', 'the court has fixed a hearing for february 12', 'please select the position where the track should be split.', 'jharkhand chief minister hemant soren', '"jesus responded, as he taught in the temple, ""how is it that the scribes say that the christ is the son of david?"', 'he does this because he is angry at the alleged desecration of the indian flag', 'share videos', '"""wunderlich and his wife, petra, are both gardeners, who themselves graduated from a normal high school, which they attended """"not reluctantly,"""" dirk said."""', 'hence, we must make sure we ourselves dont become careless, not allow anyone else do so', 'mayawati, akhilesh yadav seal pact for 2019 polls: bsp-sp not to contest in amethi, raebareli', 'they praise night and day, without ever tiring.', 'im fine by the 

In [18]:
from torch import nn

# Per-position loss (reduction='none'). Positions whose label is the padding token are
# ignored, i.e. they contribute zero to the loss.
criterian = nn.CrossEntropyLoss(
    reduction='none',
    ignore_index=hindi_to_index[PADDING_TOKEN],
)

# Xavier-uniform initialisation for every parameter with more than one dimension;
# one-dimensional parameters keep whatever value they were created with.
for params in transformer.parameters():
    if params.dim() <= 1:
        continue
    nn.init.xavier_uniform_(params)

optim = torch.optim.Adam(transformer.parameters(), lr=1e-4)
# Prefer the GPU when one is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [19]:
device

device(type='cuda')

In [20]:
NEG_INFTY = -1e9

def create_masks(eng_batch, hi_batch):
    """Build the additive attention masks for a batch of sentence pairs.

    Each mask has shape (len(eng_batch), max_sequence_length, max_sequence_length) and
    holds NEG_INFTY where attention is blocked and 0 elsewhere. A position is treated as
    padding when its index is greater than the sentence's character count, so the index
    equal to the character count is deliberately left unmasked.
    NOTE(review): presumably that extra slot leaves room for a special token - confirm
    against transformer.py.

    Returns a tuple, with every tensor moved to `device`:
      - encoder self-attention mask: blocks English padding rows and columns,
      - decoder self-attention mask: blocks Hindi padding rows and columns plus every
        position to the right of the diagonal (look-ahead),
      - decoder cross-attention mask: blocks Hindi padding rows and English padding columns.
    """
    num_sentences = len(eng_batch)
    positions = torch.arange(max_sequence_length)

    eng_lengths = torch.tensor([len(eng_batch[i]) for i in range(num_sentences)], dtype=torch.long)
    hi_lengths = torch.tensor([len(hi_batch[i]) for i in range(num_sentences)], dtype=torch.long)

    # (batch, seq) flags: True where the position index lies beyond the sentence length.
    eng_is_padding = positions.unsqueeze(0) > eng_lengths.unsqueeze(1)
    hi_is_padding = positions.unsqueeze(0) > hi_lengths.unsqueeze(1)

    # Broadcast the flags along the column axis (dim 2) and the row axis (dim 1).
    eng_padded_rows, eng_padded_cols = eng_is_padding.unsqueeze(2), eng_is_padding.unsqueeze(1)
    hi_padded_rows, hi_padded_cols = hi_is_padding.unsqueeze(2), hi_is_padding.unsqueeze(1)

    encoder_blocked = eng_padded_rows | eng_padded_cols
    decoder_self_blocked = hi_padded_rows | hi_padded_cols
    decoder_cross_blocked = hi_padded_rows | eng_padded_cols

    # True strictly above the diagonal: a position may not attend to later positions.
    look_ahead_blocked = positions.unsqueeze(0) > positions.unsqueeze(1)

    encoder_self_attention_mask = torch.where(encoder_blocked, NEG_INFTY, 0)
    decoder_self_attention_mask = torch.where(look_ahead_blocked | decoder_self_blocked, NEG_INFTY, 0)
    decoder_cross_attention_mask = torch.where(decoder_cross_blocked, NEG_INFTY, 0)
    return (
        encoder_self_attention_mask.to(device),
        decoder_self_attention_mask.to(device),
        decoder_cross_attention_mask.to(device),
    )

In [21]:
# Training loop with a progress report every 100 batches. The report prints the current
# loss, the model's argmax output for the first sentence of the batch, and a greedy
# translation of one fixed probe sentence.
transformer.train()
transformer.to(device)
total_loss = 0
num_epochs = 10

for epoch in range(num_epochs):
    print(f"Epoch {epoch}")
    iterator = iter(train_loader)
    for batch_num, batch in enumerate(iterator):
        # The progress report below switches the model to eval mode, so training mode is
        # restored at the start of every step.
        transformer.train()
        eng_batch, hi_batch = batch
        encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask = create_masks(eng_batch, hi_batch)

        optim.zero_grad()
        hi_predictions = transformer(eng_batch,
                                     hi_batch,
                                     encoder_self_attention_mask.to(device),
                                     decoder_self_attention_mask.to(device),
                                     decoder_cross_attention_mask.to(device),
                                     enc_start_token=False,
                                     enc_end_token=False,
                                     dec_start_token=True,
                                     dec_end_token=True)
        # Labels are tokenised without the start token but with the end token.
        labels = transformer.decoder.sentence_embedding.batch_tokenize(hi_batch, start_token=False, end_token=True)
        loss = criterian(
            hi_predictions.view(-1, hi_vocab_size).to(device),
            labels.view(-1).to(device)
        ).to(device)
        # The criterion returns one value per position (zero where the label is padding);
        # average over the non-padding positions only.
        valid_indicies = torch.where(labels.view(-1) == hindi_to_index[PADDING_TOKEN], False, True)
        loss = loss.sum() / valid_indicies.sum()
        loss.backward()
        optim.step()
        if batch_num % 100 == 0:
            print(f"Iteration {batch_num} : {loss.item()}")
            print(f"English: {eng_batch[0]}")
            print(f"Hindi Translation: {hi_batch[0]}")
            hi_sentence_predicted = torch.argmax(hi_predictions[0], axis=1)
            predicted_sentence = ""
            for idx in hi_sentence_predicted:
                if idx == hindi_to_index[END_TOKEN]:
                    break
                predicted_sentence += index_to_hindi[idx.item()]
            print(f"Hindi Prediction: {predicted_sentence}")

            # Greedy decoding of a fixed probe sentence, one character per forward pass.
            # Run under no_grad: nothing here is back-propagated, and without it each of
            # the up-to max_sequence_length passes would record an autograd graph.
            transformer.eval()
            hi_sentence = ("",)
            eng_sentence = ("should we go to the mall?",)
            with torch.no_grad():
                for word_counter in range(max_sequence_length):
                    encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask = create_masks(eng_sentence, hi_sentence)

                    predictions = transformer(eng_sentence,
                                              hi_sentence,
                                              encoder_self_attention_mask.to(device),
                                              decoder_self_attention_mask.to(device),
                                              decoder_cross_attention_mask.to(device),
                                              enc_start_token=False,
                                              enc_end_token=False,
                                              dec_start_token=True,
                                              dec_end_token=False)
                    next_token_prob_distribution = predictions[0][word_counter] # not actual probs
                    next_token_index = torch.argmax(next_token_prob_distribution).item()
                    next_token = index_to_hindi[next_token_index]
                    hi_sentence = (hi_sentence[0] + next_token, )
                    if next_token == END_TOKEN:
                        break

            print(f"Evaluation translation (should we go to the mall?) : {hi_sentence}")
            print("-------------------------------------------")

Epoch 0


Iteration 0 : 5.406650066375732
English: whosoever desires the reward of the world, with allah is the reward of the world and of the everlasting life. allah is the hearer, the seer.
Hindi Translation: और जो शख्स (अपने आमाल का) बदला दुनिया ही में चाहता है तो ख़ुदा के पास दुनिया व आख़िरत दोनों का अज्र मौजूद है और ख़ुदा तो हर शख्स की सुनता और सबको देखता है
Hindi Prediction: औऎाऌऌ८५५न०नमणमॊॊऔमऔ?ॊ?ॊॊन0४ओन?0?औऔ?औऔ४४४(नऔऔऔऔघॊ५५घॊ?औ?औऔ??ॊघॊऔ:औ:( ॊॅ४५:५ॊॊ८८?न???५?५५औॅ५ॅ५५न५औऌऌन?न५5<औ5नन55+++ॊ४:ॅॊ:5५ँॐ५५ख5ॐ50ॅ555+ॅ<ई<<555५ॅ५ॅॆमॊ:ॅॅ४5+%%%:4औ+ॊऔॊ%4घॊॊ?<ॊॅॅ5ॅॅ४:ॅॊॅ:ॊॅॊॊ?????((१ख5?खख?%5न+?घ?औऎॊऔ(घ(((((((घऎआ(ॊऽऽॊ?४ॊ5ॊॊ?ॊ%ई%(ॊघॊ((घऎ((ॊ(ॆॊ(ॊ(:(ख<खॊ::<<::ॐ:<<:ॊ(??ईतऎईईॊ5((::ई<(ॊ७घघ(((((+नसऎ(घ(((घ:घ(घॐ(घ((((((+(((((न((%५((((+(++::+::(स+(ई(::ॊ=ण:घ::::(::घ::::::::5घ?5घ(५५+55घ+?५५घ+५<+::५घ५नघ५घघघन::५५५घ?५::?५५ॐननन:ॐ५:५५५ॊननॊन५ैननन५नघ५५:घैन७५ै५५५५५:<४ै:ै<ै(ॐ७:::::५५५सॐससॊ::घ::थथैॊॊस५:
Evaluation translation (should we go to the mall?) : ('                                                                      

In [22]:
transformer.eval()
def translate(eng_sentence):
  """Greedily decode a Hindi translation of `eng_sentence`, one character at a time.

  Every step re-runs the model on the English sentence and the Hindi text produced so
  far, then appends the highest-scoring vocabulary entry for the next position. Decoding
  stops once END_TOKEN is produced or after max_sequence_length steps.

  Returns the decoded string. When decoding stopped on END_TOKEN, the literal END_TOKEN
  text is the last part of the returned string.

  The function does not switch the model mode itself; the caller is expected to have
  called transformer.eval() beforehand.
  """
  eng_sentence = (eng_sentence,)
  hi_sentence = ("",)
  # Inference only: disable autograd so the repeated forward passes do not each record
  # a computation graph.
  with torch.no_grad():
    for word_counter in range(max_sequence_length):
      encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask = create_masks(eng_sentence, hi_sentence)
      predictions = transformer(eng_sentence,
                                hi_sentence,
                                encoder_self_attention_mask.to(device),
                                decoder_self_attention_mask.to(device),
                                decoder_cross_attention_mask.to(device),
                                enc_start_token=False,
                                enc_end_token=False,
                                dec_start_token=True,
                                dec_end_token=False)
      # Scores for the position being generated in this step (raw scores, not probabilities).
      next_token_prob_distribution = predictions[0][word_counter]
      next_token_index = torch.argmax(next_token_prob_distribution).item()
      next_token = index_to_hindi[next_token_index]
      hi_sentence = (hi_sentence[0] + next_token, )
      if next_token == END_TOKEN:
        break
  return hi_sentence[0]

In [23]:
translation = translate("what should we do when the day starts?")
print(translation)

क्या हम क्या है कि क्या है?<END>


In [25]:
translation = translate("the world is a large place with different people")
print(translation)

यह भी का कारण के लिए से पहले के लिए हैं<END>


In [26]:
translation = translate("my name is kartik")
print(translation)

मुझे में कहा कि मुख्यमंत्री<END>


In [27]:
translation = translate("i cannot stand this smell")
print(translation)

मैं में से कारण करने की मौत<END>


In [28]:
translation = translate("this is the best thing ever")
print(translation)

यह भी कोई संख्या है<END>


In [29]:
translation = translate("i am here")
print(translation)

मैं किया है<END>


In [30]:
translation = translate("how are you")
print(translation)

क्यों कोई करना चाहिए<END>


In [37]:
transformer.eval()

Transformer(
  (encoder): Encoder(
    (sentence_embedding): SentenceEmbedding(
      (embedding): Embedding(71, 512)
      (position_encoder): PositionalEncoding()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (layers): SequentialEncoder(
      (0): EncoderLayer(
        (attention): MultiHeadAttention(
          (qkv_layer): Linear(in_features=512, out_features=1536, bias=True)
          (linear_layer): Linear(in_features=512, out_features=512, bias=True)
        )
        (norm1): LayerNormalization()
        (dropout1): Dropout(p=0.1, inplace=False)
        (ffn): PositionwiseFeedForward(
          (linear1): Linear(in_features=512, out_features=2048, bias=True)
          (linear2): Linear(in_features=2048, out_features=512, bias=True)
          (relu): ReLU()
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (norm2): LayerNormalization()
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (decoder): Decoder(
    (sentence_embedding):

In [38]:
import torch

# Save the model's state dictionary
torch.save(transformer.state_dict(), 'TransformerTranslationModel.pth')


In [39]:
import json

# Define the model configuration
# Every value is read from the variable the model was actually built with, so the saved
# config cannot drift from the saved weights when a hyperparameter or vocabulary changes.
model_config = {
    "d_model": d_model,
    "batch_size": batch_size,
    "ffn_hidden": ffn_hidden,
    "num_heads": num_heads,
    "drop_prob": drop_prob,
    "num_layers": num_layers,
    "max_sequence_length": max_sequence_length,
    "vocab_size_hindi": len(hindi_vocabulary),
    "vocab_size_english": len(english_vocabulary),
}

# Save the configuration to a JSON file
with open('model_config.json', 'w', encoding='utf-8') as f:
    json.dump(model_config, f, indent=4)


In [43]:
from huggingface_hub import HfApi

import os

# Read the Hugging Face token from the HF_TOKEN environment variable instead of typing it
# into the notebook, where it would be saved and shared together with the code.
# When the variable is unset this is None, and huggingface_hub falls back to the token
# stored by a previous login on this machine.
token = os.environ.get('HF_TOKEN')

# Initialize API with token
api = HfApi(token=token)


In [47]:
from huggingface_hub import HfApi, Repository
import shutil
import os

# Define your repository ID (replace with your username/repo_name)
repo_id = 'Kartik12/TransformerTranslationModel'


# Initialize API with token
api = HfApi(token=token)

# Create the repository, or reuse it when it already exists. Without exist_ok=True a
# second run of this cell fails, because the first run has already created the repo.
api.create_repo(repo_id, repo_type='model', exist_ok=True)

# Clone the repository locally
# NOTE(review): the output of this cell points to the git-vs-http guide; the HTTP-based
# HfApi.upload_file is the alternative to this git-based Repository workflow.
repo = Repository(local_dir='my_model_repo', clone_from=repo_id, token=token)

# Copy the saved model and config files to the repository directory
shutil.copy('TransformerTranslationModel.pth', 'my_model_repo/')
shutil.copy('model_config.json', 'my_model_repo/')

# The working directory is intentionally left unchanged: Repository runs its git commands
# inside its own local_dir, and an os.chdir() here would make every relative path in the
# notebook (including the ones in this cell) resolve differently on a re-run.

# Add files to the repository
repo.git_add()

# Commit files
repo.git_commit('Add model and configuration files')

# Push files to Hugging Face Hub
repo.git_push()


For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
Cloning https://huggingface.co/Kartik12/TransformerTranslationModel into local empty directory.


Upload file TransformerTranslationModel.pth:   0%|          | 1.00/28.7M [00:00<?, ?B/s]

To https://huggingface.co/Kartik12/TransformerTranslationModel
   b0f1f6d..2a5a98b  main -> main



'https://huggingface.co/Kartik12/TransformerTranslationModel/commit/2a5a98b2a29f2b6254d9ca9080efc4ca1e907c43'