In [3]:
from transformer import Transformer
import torch
import numpy as np
import pandas as pd

In [4]:
file_name="./cleaned.csv"

In [5]:
df_eng_mar=pd.read_csv(file_name)

In [6]:
df_eng_mar.head()

Unnamed: 0,English,Marathi
0,go,जा
1,run,पळ
2,run,धाव
3,run,पळा
4,run,धावा


In [7]:
START_TOKEN = ''
PADDING_TOKEN = ''
END_TOKEN = ''

marathi_vocabulary = [START_TOKEN, ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/',
                      '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '<', '=', '>', '?', 'ˌ',
                      'अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ऋ', 'ए', 'ऐ', 'ओ', 'औ',
                      'क', 'ख', 'ग', 'घ', 'ङ',
                      'च', 'छ', 'ज', 'झ', 'ञ',
                      'ट', 'ठ', 'ड', 'ढ', 'ण',
                      'त', 'थ', 'द', 'ध', 'न',
                      'प', 'फ', 'ब', 'भ', 'म',
                      'य', 'र', 'ल', 'ळ', 'व', 'श', 'ष', 'स', 'ह',
                      'ऽ', 'ा', 'ि', 'ी', 'ु', 'ू', 'ृ', 'ॄ', 'े', 'ै', 'ो', 'ौ', '्', 'य़', 'ॠ', 'ॡ', 'ं', 'ः',
                      '०', '१', '२', '३', '४', '५', '६', '७', '८', '९', PADDING_TOKEN, END_TOKEN]

english_vocabulary = [START_TOKEN, ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/',
                        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
                        ':', '<', '=', '>', '?', '@',
                        '[', '\\', ']', '^', '_', '`',
                        'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
                        'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x',
                        'y', 'z',
                        '{', '|', '}', '~', PADDING_TOKEN, END_TOKEN]

In [8]:
index_to_marathi_word = {index: word for index, word in enumerate(marathi_vocabulary)}
index_to_english_word = {index: word for index, word in enumerate(english_vocabulary)}
marathi_to_index = {word: index for index, word in enumerate(marathi_vocabulary)}
english_to_index = {word: index for index, word in enumerate(english_vocabulary)}
#

In [9]:
english_sentences = df_eng_mar['English'].values.tolist()
marathi_sentences = df_eng_mar['Marathi'].values.tolist()

In [10]:
english_sentences[:3],marathi_sentences[:3]

(['go', 'run', 'run'], ['जा', 'पळ', 'धाव'])

In [11]:
np.max([len(x) for x in marathi_sentences])

183

In [12]:
np.max([len(x) for x in english_sentences])

176

In [13]:
max_sequence_length=200

def is_valid_tokens(sentence, vocabulary):
    for token in list(set(sentence)):
        if token not in vocabulary:
            return False
    return True

def is_valid_length(sentence, max_sequence_length):
    return len(list(sentence)) < (max_sequence_length-1)


valid_sentence_indices = []

for index in range(len(marathi_sentences)):
    marathi_sentence = marathi_sentences[index]
    english_sentence = english_sentences[index]
    if is_valid_length(marathi_sentence, max_sequence_length) \
    and is_valid_length(english_sentence, max_sequence_length) \
    and is_valid_tokens(marathi_sentence, marathi_vocabulary) :
        valid_sentence_indices.append(index)

print("Number of sentences : ",len(marathi_sentences))
print("Number of valid sentences : ",len(valid_sentence_indices))

Number of sentences :  41028
Number of valid sentences :  30226


In [14]:
marathi_sentences = [marathi_sentences[index] for index in valid_sentence_indices]
english_sentences = [english_sentences[index] for index in valid_sentence_indices]

In [15]:
marathi_sentences[-3:],english_sentences[-3:]

(['जेव्हा मी बनवलेला तेम्पुरा थंड होतो तेव्हा त्याचा कुरकुरीतपणा लगेच जातो आणि मग त्याची चव तेवढी चांगली लागत नाही',
  'जर धर्म व नीतिमत्ता समानार्थी शब्द असते तर ब्राजील जगातला सर्वात अभ्रष्ट देश असता',
  'हड्डींमुळे मासे आवडत नाही असं म्हणणं हे काय मासे नावडण्यासाठी चांगलं कारण नाहीये'],
 ['when the tempura i make cools down it immediately loses its crispiness and does not taste very good',
  'if religion were synonymous with morality brazil would be the most uncorrupted country in the world',
  'just saying you do not like fish because of the bones is not really a good reason for not liking fish'])

In [16]:
import random
list1 = marathi_sentences
list2 = english_sentences
# Step 1: Combine the lists into a list of tuples
combined = list(zip(list1, list2))

# Step 2: Shuffle the combined list
random.shuffle(combined)

# Step 3: Unzip the shuffled list into two lists
shuffled_list1, shuffled_list2 = zip(*combined)

# Convert the tuples back to lists if needed
shuffled_list1 = list(shuffled_list1)
shuffled_list2 = list(shuffled_list2)

print("Shuffled List 1:", shuffled_list1[-3:])
print("Shuffled List 2:", shuffled_list2[-3:])


Shuffled List 1: ['ती कोण आहे', 'तू फ्रेंच बोलतेस का', 'मी त्यालाच फोन केलेला']
Shuffled List 2: ['who is she', 'do you speak french', 'he is the one i called']


In [17]:
english_sentences=shuffled_list2
marathi_sentences=shuffled_list1

In [18]:
d_model=512
batch_size=30
ffn_hidden=2048
num_heads=8
num_layers=6
dropout=0.1
max_sequence_length=200
marathi_vocabulary_size=len(marathi_vocabulary)
english_vocabulary_size=len(english_vocabulary)


In [19]:
transformer=Transformer(d_model,
                        ffn_hidden,
                        num_heads
                        ,dropout,
                        num_layers,
                        max_sequence_length,
                        marathi_vocabulary_size,
                        english_to_index,
                        marathi_to_index,
                        START_TOKEN,
                        END_TOKEN,
                        PADDING_TOKEN)

In [20]:
transformer

Transformer(
  (encoder): Encoder(
    (sentence_embedding): SentenceEmbedding(
      (embedding): Embedding(69, 512)
      (position_encoder): PositionalEncoding()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (layers): SequentialEncoder(
      (0): EncoderLayer(
        (attention): MultiHeadAttention(
          (qkv_layer): Linear(in_features=512, out_features=1536, bias=True)
          (linear_layer): Linear(in_features=512, out_features=512, bias=True)
        )
        (norm1): LayerNormalization()
        (dropout1): Dropout(p=0.1, inplace=False)
        (ffn): PositionwiseFeedForward(
          (linear1): Linear(in_features=512, out_features=2048, bias=True)
          (linear2): Linear(in_features=2048, out_features=512, bias=True)
          (relu): ReLU()
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (norm2): LayerNormalization()
        (dropout2): Dropout(p=0.1, inplace=False)
      )
      (1): EncoderLayer(
        (attention): MultiHeadAt

In [21]:
from torch.utils.data import DataLoader, Dataset

class TextDataset(Dataset):
  def __init__(self, english_sentences, marathi_sentences):
    self.english_sentences = english_sentences
    self.marathi_sentences = marathi_sentences
  def __len__(self):
    return len(self.english_sentences)

  def __getitem__(self, idx):
    english_sentence = self.english_sentences[idx]
    marathi_sentence = self.marathi_sentences[idx]
    return english_sentence, marathi_sentence


In [22]:
dataset=TextDataset(english_sentences,marathi_sentences)

In [23]:
len(dataset)

30226

In [24]:
dataset[1]

('take my name off the list', 'माझं नाव यादीवरून काढून टाक')

In [25]:
train_loader = DataLoader(dataset, batch_size=batch_size)

In [26]:
iterator=iter(train_loader)

In [27]:
for batch_num,batch in enumerate(iterator):
    print(batch)
    if batch_num>2:
        break

[('i am getting tired of losing', 'take my name off the list', 'do not talk to me', 'who is the manager', 'what time do you go to bed', 'has anybody come', 'i just do not understand', 'this makes me very angry', 'law and politics are two different things', 'some of the dogs are alive', 'i was not mad at you', 'she committed suicide by taking poison', 'what comes first', 'what did bell invent', 'ah how beautiful the taj mahal is', 'we drove them out', 'i am eating noodles', 'have you ever been to mexico', 'how does your brother', 'criminals should be punished', 'they got married', 'those are my books', 'what are you playing', 'the meeting ended at  pm', 'they gave it to me', 'give me another example', 'i have been married three times', 'is it dangerous', 'it burned', 'the bridge was built by the romans'), ('मला सतत हरण्याचा कंटाळा येतोय', 'माझं नाव यादीवरून काढून टाक', 'माझ्याशी बोलू नका', 'व्यवस्थापक कोण आहे', 'किती वाजता झोपायला जातोस', 'कोणी आलं आहे का', 'मला तर कळतच नाहीये', 'याने म

In [28]:
from torch import nn
criterian=nn.CrossEntropyLoss(ignore_index=marathi_to_index[PADDING_TOKEN],reduction='none')

In [29]:
for params in transformer.parameters():
    if params.dim()>1:
      nn.init.xavier_uniform_(params)
optim=torch.optim.Adam(transformer.parameters(),lr=1e-4)
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [30]:
NEG_INFTY=-1e9

def create_masks(eng_batch,mar_batch):
    num_senrences=len(eng_batch)
    look_ahead_mask=torch.full([max_sequence_length,max_sequence_length],True)
    look_ahead_mask=torch.triu(look_ahead_mask,diagonal=1)
    encoder_padding_mask=torch.full([num_senrences,max_sequence_length,max_sequence_length],False)
    decoder_padding_mask_self_attention=torch.full([num_senrences,max_sequence_length,max_sequence_length],False)
    decoder_padding_mask_cross_attention=torch.full([num_senrences,max_sequence_length,max_sequence_length],False)
    for idx in range(num_senrences):
      eng_sentence_length, mar_sentence_length = len(eng_batch[idx]), len(mar_batch[idx])
      eng_chars_to_padding_mask = np.arange(eng_sentence_length + 1, max_sequence_length)
      mar_chars_to_padding_mask = np.arange(mar_sentence_length + 1, max_sequence_length)
      encoder_padding_mask[idx, :, eng_chars_to_padding_mask] = True
      encoder_padding_mask[idx, eng_chars_to_padding_mask, :] = True
      decoder_padding_mask_self_attention[idx, :, mar_chars_to_padding_mask] = True
      decoder_padding_mask_self_attention[idx, mar_chars_to_padding_mask, :] = True
      decoder_padding_mask_cross_attention[idx, :, eng_chars_to_padding_mask] = True
      decoder_padding_mask_cross_attention[idx, mar_chars_to_padding_mask, :] = True

    encoder_self_attention_mask = torch.where(encoder_padding_mask, NEG_INFTY, 0)
    decoder_self_attention_mask =  torch.where(look_ahead_mask + decoder_padding_mask_self_attention, NEG_INFTY, 0)
    decoder_cross_attention_mask = torch.where(decoder_padding_mask_cross_attention, NEG_INFTY, 0)
    return encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask


In [25]:
transformer.train()
transformer.to(device)
total_loss = 0
num_epochs = 10


for epoch in range(num_epochs):
  print("Epoch: ",epoch)
  for batch_num,batch in enumerate(train_loader):
    eng_batch,mar_batch=batch
    transformer.train()
    encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask = create_masks(eng_batch,mar_batch)
    optim.zero_grad()
    mar_preds=transformer(eng_batch,mar_batch,
                          encoder_self_attention_mask.to(device),\
                          decoder_self_attention_mask.to(device),\
                          decoder_cross_attention_mask.to(device),\
                          enc_start_token=False,\
                          enc_end_token=False,\
                          dec_start_token=True,\
                          dec_end_token=True)
    labels=transformer.decoder.sentence_embedding.batch_tokenize(mar_batch,start_token=False,end_token=True)
    loss=criterian(mar_preds.view(-1,marathi_vocabulary_size).to(device),
                   labels.view(-1).to(device)).to(device)
    valid_indicies=torch.where(labels.view(-1)==marathi_to_index[PADDING_TOKEN],False,True)

    loss = loss.sum() / valid_indicies.sum()
    loss.backward()
    optim.step()

    if batch_num % 100 == 0:
      print(f"Iteration {batch_num} : {loss.item()}")
      print(f"English: {eng_batch[0]}")
      print(f"Kannada Translation: {mar_batch[0]}")
      mar_sentence_predicted = torch.argmax(mar_preds[0], axis=1)
      predicted_sentence = ""
      for idx in mar_sentence_predicted:
        if idx == marathi_to_index[END_TOKEN]:
          break
        predicted_sentence += index_to_marathi_word[idx.item()]
      print(f"Kannada Prediction: {predicted_sentence}")


      transformer.eval()
      mar_sentence = ("",)
      eng_sentence = ("should we go to the mall?",)
      for word_counter in range(max_sequence_length):
          encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask= create_masks(eng_sentence, mar_sentence)
          predictions = transformer(eng_sentence,
                                    mar_sentence,
                                    encoder_self_attention_mask.to(device),
                                    decoder_self_attention_mask.to(device),
                                    decoder_cross_attention_mask.to(device),
                                    enc_start_token=False,
                                    enc_end_token=False,
                                    dec_start_token=True,
                                    dec_end_token=False)
          next_token_prob_distribution = predictions[0][word_counter] # not actual probs
          next_token_index = torch.argmax(next_token_prob_distribution).item()
          next_token = index_to_marathi_word[next_token_index]
          kn_sentence = (mar_sentence[0] + next_token, )
          if next_token == END_TOKEN:
            break

      print(f"Evaluation translation (should we go to the mall?) : {kn_sentence}")
      print("-------------------------------------------")

Epoch:  0


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
