In [1]:
!pip install sacremoses
!pip install nltk



In [2]:
import pandas as pd
import numpy as np
import os


#for models and tokenizers
from transformers import NllbTokenizer
from transformers import AutoModelForSeq2SeqLM
from tqdm.auto import tqdm, trange

#for preprocessing
import re
import sys
import typing as tp
import unicodedata
from sacremoses import MosesPunctNormalizer

#for training
import gc
import random
import numpy as np
import torch
from tqdm.auto import tqdm, trange
from transformers.optimization import AdamW
from transformers import get_constant_schedule_with_warmup

#matric calculation
import nltk.translate.bleu_score as bleu
from datasets import load_metric

In [3]:
def load_data(eng_path,tel_path,size=50000):
    #loading the data

    with open(eng_path , 'r', encoding='utf-8') as file:
        english_sentences = file.readlines()

    with open(tel_path , 'r', encoding='utf-8') as file:
        telugu_sentences = file.readlines()

    # Create a dataframe
    df = pd.DataFrame({
        'eng': english_sentences,
        'tel': telugu_sentences
    })
    
    #random sampling  data 
    len(df['eng']), len(df['tel'])
    df_sub = df.sample(n=size, random_state=42)
    
    return df_sub    

In [4]:
def split_data(df_sub,train_size=0.8,dev_size=0.0,test_size=0.2):
    # Calculate the sizes of each split
    total_size = len(df_sub)
    train_size = int(train_size* total_size)
    dev_size = int(dev_size* total_size)

    # Split the dataframe
    train_df = df_sub[:train_size]
    dev_df = df_sub[train_size:train_size + dev_size]
    test_df = df_sub[train_size + dev_size  :]

    # Verify the shapes of the resulting dataframes
    print("Train set shape:", train_df.shape)
    print("Development set shape:", dev_df.shape)
    print("Test set shape:", test_df.shape)
    
    return train_df,dev_df,test_df

In [5]:
def load_model_and_tokinizer():
    #loading NLLB predefined model and tokinizeer
    tokenizer = NllbTokenizer.from_pretrained('facebook/nllb-200-distilled-600M')
    model = AutoModelForSeq2SeqLM.from_pretrained('facebook/nllb-200-distilled-600M')
    return model,tokenizer

In [6]:
df_sub = load_data('nlp/train.en','nlp/train.te',60000)
train_df, dev_df, test_df = split_data(df_sub, 0.8, 0, 0.2)
model,tokenizer = load_model_and_tokinizer()

Train set shape: (48000, 2)
Development set shape: (0, 2)
Test set shape: (12000, 2)


In [7]:
'''Code for cleaning telugu text'''
# this code is adapted from  the Stopes repo of the NLLB team
# https://github.com/facebookresearch/stopes/blob/main/stopes/pipelines/monolingual/monolingual_line_processor.py#L214

mpn = MosesPunctNormalizer(lang="en")
mpn.substitutions = [
    (re.compile(r), sub) for r, sub in mpn.substitutions
]


def get_non_printing_char_replacer(replace_by: str = " ") -> tp.Callable[[str], str]:
    non_printable_map = {
        ord(c): replace_by
        for c in (chr(i) for i in range(sys.maxunicode + 1))
        # same as \p{C} in perl
        # see https://www.unicode.org/reports/tr44/#General_Category_Values
        if unicodedata.category(c) in {"C", "Cc", "Cf", "Cs", "Co", "Cn"}
    }

    def replace_non_printing_char(line) -> str:
        return line.translate(non_printable_map)

    return replace_non_printing_char

replace_nonprint = get_non_printing_char_replacer(" ")

def preproc(text):
    clean = mpn.normalize(text)
    clean = replace_nonprint(clean)
    # replace 𝓕𝔯𝔞𝔫𝔠𝔢𝔰𝔠𝔞 by Francesca
    clean = unicodedata.normalize("NFKC", clean)
    return clean

In [8]:
def get_batch_pairs(batch_size=8, data=train_df):
    '''Get a batch of sentence pairs from the data.'''
    (l1,long1) =('eng', 'eng_Latn')
    (l2, long2) = ('tel', 'tel_Telu')

    xx, yy = [], []
    for _ in range(batch_size):
        item = data.iloc[random.randint(0, len(data)-1)]
        xx.append(preproc(item[l1]))
        yy.append(preproc(item[l2]))
    return xx, yy, long1, long2

In [9]:
#training start

In [10]:
def cleanup():
    """Try to free GPU memory"""
    gc.collect()
    torch.cuda.empty_cache()

cleanup() #cleaning up the memory

model.cuda(); #move model to gpu

#training parameters
optimizer = AdamW(
    [p for p in model.parameters() if p.requires_grad],
    # scale_parameter=False,
    # relative_step=False,
    lr=2e-4,
    # clip_threshold=1.0,
    weight_decay=1e-3,
)

losses = []
batch_size = 8
max_length = 128
warmup_steps = 1_000
training_steps = 6001
scheduler = get_constant_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps)


# Get the absolute path to the current directory
current_directory = os.getcwd()
# Specify the absolute path to the directory where you want to save the model
MODEL_SAVE_PATH = os.path.join(current_directory, 'nlp/NLLB-model')



In [11]:
#training loop
model.train()
x, y, loss = None, None, None
cleanup()

tq = trange(len(losses), training_steps)
for i in tq:
    xx, yy, lang1, lang2 = get_batch_pairs(batch_size)
    try:
        tokenizer.src_lang = lang1

        x = tokenizer(xx, return_tensors='pt', padding=True, truncation=True, max_length=max_length).to(model.device)
        tokenizer.src_lang = lang2
        
        y = tokenizer(yy, return_tensors='pt', padding=True, truncation=True, max_length=max_length).to(model.device)
        y.input_ids[y.input_ids == tokenizer.pad_token_id] = -100

        loss = model(**x, labels=y.input_ids).loss
        loss.backward()
        losses.append(loss.item())

        optimizer.step()
        optimizer.zero_grad(set_to_none=True)
        scheduler.step()

    except RuntimeError as e:
        optimizer.zero_grad(set_to_none=True)
        x, y, loss = None, None, None
        cleanup()
        print('error', max(len(s) for s in xx + yy), e)
        continue

    if i % 500 == 0:
        print(i, np.mean(losses[-10:]))

    if i % 2000 == 0 and i > 0:

        # Create the directory if it doesn't exist
        if not os.path.exists(MODEL_SAVE_PATH):
            os.makedirs(MODEL_SAVE_PATH)

        # Save the model
        try:
            print("Initilized saving models.")
            model.save_pretrained(MODEL_SAVE_PATH)
            tokenizer.save_pretrained(MODEL_SAVE_PATH)
            print("Model and tokinizer saved successfully.")
        except Exception as e:
            print("An error occurred while saving the model:", e)


  0%|          | 0/6001 [00:00<?, ?it/s]

0 2.2076892852783203
500 1.8892095565795899
1000 1.892972755432129
1500 1.8390894532203674
2000 1.6848982572555542
Initilized saving models.
Model and tokinizer saved successfully.
2500 1.6783949375152587
3000 1.741808795928955
3500 1.5335420727729798
4000 1.6552438020706177
Initilized saving models.
Model and tokinizer saved successfully.
4500 1.6685574889183044
5000 1.6354098677635194
5500 1.6064859747886657
6000 1.4995213508605958
Initilized saving models.
Model and tokinizer saved successfully.


In [12]:
def translate(text, src_lang='eng_Latn', tgt_lang='tel_Telu', a=32, b=3, max_input_length=1024, num_beams=4, **kwargs):
    '''translate given english text to telugu'''
    
    tokenizer.src_lang = src_lang
    tokenizer.tgt_lang = tgt_lang
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=max_input_length)
    result = model.generate(
        **inputs.to(model.device),
        forced_bos_token_id=tokenizer.convert_tokens_to_ids(tgt_lang),
        max_new_tokens=int(a + b * inputs.input_ids.shape[1]),
        num_beams=num_beams,
        **kwargs
    )
    return tokenizer.batch_decode(result, skip_special_tokens=True)

In [13]:
def batched_translate(texts, batch_size=8, **kwargs):
    """Translate texts in batches of similar length"""
    idxs, texts2 = zip(*sorted(enumerate(texts), key=lambda p: len(p[1]), reverse=True))
    results = []
    for i in trange(0, len(texts2), batch_size):
        results.extend(translate(texts2[i: i+batch_size], **kwargs))
    return [p for i, p in sorted(zip(idxs, results))]

In [14]:
#translate all the test data from english to telugu
test_df['tel_translated'] = batched_translate(test_df.eng, batch_size=8,src_lang='eng_Latn', tgt_lang='tel_Telu')

  0%|          | 0/1500 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['tel_translated'] = batched_translate(test_df.eng, batch_size=8,src_lang='eng_Latn', tgt_lang='tel_Telu')


In [15]:
def calc_bleu(li):
    #calculate bleu score
    references = li
    references = [[pred.strip()] for pred in references]
    # Convert the translated sentences into lists of words
    candidates = test_df['tel_translated']
    candidates = [pred.strip() for pred in candidates]
    # Calculate the BLEU score
    metric = load_metric("sacrebleu")
    bleu_score = metric.compute(predictions=candidates, references=references)
    # Print the BLEU score
    return bleu_score

#prints the bleu score along with other stats
calc_bleu(test_df['tel'])

  metric = load_metric("sacrebleu")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'score': 7.80236414242101,
 'counts': [36987, 10340, 3582, 1360],
 'totals': [95904, 83904, 71924, 60467],
 'precisions': [38.566691691691695,
  12.323607932875667,
  4.980256937878872,
  2.2491606992243702],
 'bp': 0.9134201579757216,
 'sys_len': 95904,
 'ref_len': 104589}

In [16]:
#see tranalations
test_df[:10][['tel', 'eng', 'tel_translated']]

Unnamed: 0,tel,eng,tel_translated
753582,అమెరికా యుద్ధంలో పాల్గొంటుంది\n,USA prepares for war\n,యుద్ధానికి అమెరికా సిద్ధం
1431786,జీవితం అంటే ఎంతో విలువైనది కదా.\n,Life is indeed very precious.\n,జీవితం చాలా విలువైనదే.
2144213,ఉదయం నుంచి సెర్చ్ ఆపరేషన్ జరిగింది.\n,The operation was carried out since morning.\n,ఉదయం నుంచి ఈ ఆపరేషన్ కొనసాగుతోంది.
1075894,పరిశోధన సహాయకుడు\n,Research Assistant\n,రీసెర్చ్ అసిస్టెంట్
3508084,గుజరాత్ రాష్ట్ర జిల్లా హోలోల్\n,District Holol of State of Gujrat\n,గుజరాత్ జిల్లా హోలోల్
650375,అంతా ఢిల్లీ నుంచి నడిపిస్తున్నారు.\n,All are from Delhi.\n,వారంతా ఢిల్లీకి చెందిన వారు.
768196,మేమిద్దరం కలిసి డిన్నర్ చేశాం.\n,Weve had dinner together.\n,మేము కలిసి భోజనం చేశాం.
1809763,కేంద్ర–రాష్ట్ర ప్రభుత్వాలు సంయుక్తంగా దీన్ని చ...,The project is being implemented jointly by th...,"ఈ ప్రాజెక్టును కేంద్ర, రాష్ట్ర ప్రభుత్వాలు సంయ..."
1914719,కార్పొరేట్లకు ప్రధాని మోడీ పిలుపు\n,CEO calls on PM Modi\n,మోడీపై సీఈవో విచారణ
3976185,ఈ ఘటనపై జమ్మూకాశ్మీర్‌ పోలీసులు విచారణ చేపట్టి...,CRPF (Operations) Inspector General Zulfiqar H...,జమ్ముకశ్మీర్ పోలీసులు దర్యాప్తు చేపట్టారని సీఆ...


In [17]:
#done