In [1]:
from datasets import load_dataset

dataset = load_dataset('csv', data_files='tamil_sindhi_cleaned.tsv', delimiter='\t', column_names=['source', 'target'])


In [2]:
from transformers import MBart50TokenizerFast

tokenizer = MBart50TokenizerFast.from_pretrained('facebook/mbart-large-50-many-to-many-mmt')
tokenizer.src_lang = 'ta_IN'  # Tamil
tokenizer.tgt_lang = 'sd_PK'  # Sindhi


In [3]:
def preprocess_function(examples):
    inputs = [ex for ex in examples['source']]
    targets = [ex for ex in examples['target']]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, truncation=True)

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True)


In [4]:
from transformers import MBartForConditionalGeneration

model = MBartForConditionalGeneration.from_pretrained('facebook/mbart-large-50-many-to-many-mmt')


2025-03-29 13:24:27.586271: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-03-29 13:24:27.586314: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-03-29 13:24:27.587370: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-29 13:24:27.593230: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=2,  # Reduce batch size
    per_device_eval_batch_size=2,
    num_train_epochs=1,  # Train for only 1 epoch
    max_steps=50,  # Train for only 50 steps (quicker training)
    save_steps=10,  # Save checkpoints every 10 steps
    logging_steps=5,  # Log progress every 5 steps
    fp16=True,  # Faster training on GPU
)




In [7]:
from datasets import load_dataset, DatasetDict

# Load the dataset
dataset = load_dataset('csv', data_files='tamil_sindhi_cleaned.tsv', delimiter='\t', column_names=['source', 'target'])

# Split dataset into 90% training and 10% testing
dataset = dataset['train'].train_test_split(test_size=0.1)

# Check available splits
print(dataset)


DatasetDict({
    train: Dataset({
        features: ['source', 'target'],
        num_rows: 1672
    })
    test: Dataset({
        features: ['source', 'target'],
        num_rows: 186
    })
})


In [8]:
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/1672 [00:00<?, ? examples/s]

Map:   0%|          | 0/186 [00:00<?, ? examples/s]

In [9]:
def preprocess_function(examples):
    inputs = [ex for ex in examples['source']]
    targets = [ex for ex in examples['target']]
    
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length")

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/1672 [00:00<?, ? examples/s]

Map:   0%|          | 0/186 [00:00<?, ? examples/s]

In [10]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    tokenizer=tokenizer,
)


  trainer = Seq2SeqTrainer(


In [5]:
small_dataset = dataset['train'].select(range(100))  # Use only 100 samples
tokenized_dataset = small_dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/100 [00:00<?, ? examples/s]



In [11]:
trainer.train()

Step,Training Loss
5,10.696
10,9.4424
15,8.6602
20,7.8968
25,7.0112
30,6.5002
35,5.7797
40,5.9308
45,5.1341
50,5.1685




TrainOutput(global_step=50, training_loss=7.221979103088379, metrics={'train_runtime': 828.8413, 'train_samples_per_second': 0.121, 'train_steps_per_second': 0.06, 'total_flos': 27089122099200.0, 'train_loss': 7.221979103088379, 'epoch': 0.05980861244019139})

In [12]:
model.eval()
sample = "உங்கள் தமிழ் உரை இங்கே"  # Replace with your Tamil text
inputs = tokenizer(sample, return_tensors="pt")
translated_tokens = model.generate(**inputs)
translation = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
print(translation)


Your Tamil Speech Here


In [13]:
model.eval()
sample = "உங்கள் தமிழ் உரை இங்கே"  # Replace with your Tamil text

# Tokenize the input
inputs = tokenizer(sample, return_tensors="pt")

# Set target language to Sindhi (sd_PK) if using mBART
tokenizer.tgt_lang = "sd_PK"

# Generate translation
translated_tokens = model.generate(**inputs)
translation = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]

# Print the translated Sindhi text
print("Sindhi Translation:", translation)


Sindhi Translation: Your Tamil Speech Here


In [14]:
model.eval()
sample = "உங்கள் தமிழ் உரை இங்கே"  # Replace with your Tamil text

# Tokenize the input
inputs = tokenizer(sample, return_tensors="pt")

# Ensure target language is Sindhi
tokenizer.tgt_lang = "sd_PK"

# Force the model to output Sindhi
forced_decoder_ids = tokenizer.convert_tokens_to_ids(["sd_PK"])
translated_tokens = model.generate(**inputs, forced_bos_token_id=forced_decoder_ids[0])

# Decode and print translation
translation = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
print("Sindhi Translation:", translation)


Sindhi Translation: اُن تُن تُن تُن تُن اُن


In [15]:
print(tokenizer.lang_code_to_id)


{'ar_AR': 250001, 'cs_CZ': 250002, 'de_DE': 250003, 'en_XX': 250004, 'es_XX': 250005, 'et_EE': 250006, 'fi_FI': 250007, 'fr_XX': 250008, 'gu_IN': 250009, 'hi_IN': 250010, 'it_IT': 250011, 'ja_XX': 250012, 'kk_KZ': 250013, 'ko_KR': 250014, 'lt_LT': 250015, 'lv_LV': 250016, 'my_MM': 250017, 'ne_NP': 250018, 'nl_XX': 250019, 'ro_RO': 250020, 'ru_RU': 250021, 'si_LK': 250022, 'tr_TR': 250023, 'vi_VN': 250024, 'zh_CN': 250025, 'af_ZA': 250026, 'az_AZ': 250027, 'bn_IN': 250028, 'fa_IR': 250029, 'he_IL': 250030, 'hr_HR': 250031, 'id_ID': 250032, 'ka_GE': 250033, 'km_KH': 250034, 'mk_MK': 250035, 'ml_IN': 250036, 'mn_MN': 250037, 'mr_IN': 250038, 'pl_PL': 250039, 'ps_AF': 250040, 'pt_XX': 250041, 'sv_SE': 250042, 'sw_KE': 250043, 'ta_IN': 250044, 'te_IN': 250045, 'th_TH': 250046, 'tl_XX': 250047, 'uk_UA': 250048, 'ur_PK': 250049, 'xh_ZA': 250050, 'gl_ES': 250051, 'sl_SI': 250052}


In [21]:
import evaluate

bleu = evaluate.load("bleu")
results = bleu.compute(predictions=[translation], references=[reference_translation])
print(results)


Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

NameError: name 'reference_translation' is not defined

In [22]:
import evaluate

bleu = evaluate.load("bleu")

# Example Tamil text and expected Sindhi translation
translation = "اُن تُن تُن تُن تُن اُن" # Replace with your model output
reference_translation = ["توهان ڪيئن آهيو؟"]  # Expected Sindhi translation

results = bleu.compute(predictions=[translation], references=[reference_translation])
print(results)


{'bleu': 0.0, 'precisions': [0.0, 0.0, 0.0, 0.0], 'brevity_penalty': 1.0, 'length_ratio': 2.0, 'translation_length': 6, 'reference_length': 3}


In [23]:
import evaluate

bleu = evaluate.load("bleu")

# Example Tamil text and expected Sindhi translation
translation = "توهان ڪيئن آهيو؟"  # Replace with your model output
reference_translation = ["توهان ڪيئن آهيو؟"]  # Expected Sindhi translation

results = bleu.compute(predictions=[translation], references=[reference_translation])
print(results)


{'bleu': 0.0, 'precisions': [1.0, 1.0, 1.0, 0.0], 'brevity_penalty': 1.0, 'length_ratio': 1.0, 'translation_length': 3, 'reference_length': 3}
