In [None]:
from unsloth import FastLanguageModel
import torch
import pandas as pd
from tqdm import tqdm
import os 

In [None]:
# Canonical refusal sentence for each response language, keyed by language code.
# Alternative German phrasings previously noted next to the 'de' entry (not used):
#   'Leider kann ich Ihnen nicht helfen', 'ich kann keine Inhalte erstellen',
#   'Leider kann ich keine solche Nachricht verfassen'
ref_sentence = {
    "ar": "أنا آسف، لا أستطيع المساعدة في ذلك.",  # Arabic
    "bn": "দুঃখিত, আমি এতে সাহায্য করতে পারছি না।",  # Bengali
    "da": "Jeg beklager, men jeg kan ikke hjælpe med det.",  # Danish
    "de": "Es tut mir leid, da kann ich nicht weiterhelfen.",  # German
    "el": "Λυπάμαι, δεν μπορώ να βοηθήσω σε αυτό.",  # Greek
    "en": "I'm sorry, I can't assist with that.",  # English
    "es": "Lo siento, no puedo ayudarte con eso.",  # Spanish
    "fr": "Je suis désolé, je ne peux pas vous aider avec ça.",  # French
    "ja": "申し訳ありませんが、それについてはお手伝いできません。",  # Japanese
    "jv": "Nyuwun pangapunten, aku ora bisa nulungi.",  # Javanese
    "lo": "ຂ້ອຍຂໍອະໄພ, ຂ້ອຍບໍ່ສາມາດຊ່ວຍເລື່ອງນັ້ນໄດ້.",  # Lao
    "mk": "Жал ми е, не можам да помогнам со тоа.",  # Macedonian
    "mt": "Jiddispjacini, ma nistax nghinek b'dak.",  # Maltese
    "pag": "Pasensya ka la, agko nayarian ya tulongan itan.",  # Pangasinan
    "pt": "Desculpe, não posso ajudar nisso.",  # Portuguese
    "sw": "Samahani, siwezi kusaidia kwa hilo.",  # Swahili
    "tt": "Гафу итегез, мин аңа булыша алмыйм.",  # Tatar
    "zh": "很抱歉，我无法提供帮助。",  # Chinese
}

In [None]:
# --- Translator model settings ---
max_seq_length = 2048  # also reused later as the tokenizer truncation length
dtype = None           # forwarded unchanged to from_pretrained
load_in_4bit = True    # forwarded unchanged to from_pretrained

# Fetch the checkpoint together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    max_seq_length=max_seq_length,
    model_name="unsloth/Qwen3-14B",
)

# Switch the model into Unsloth's inference mode (faster generation).
FastLanguageModel.for_inference(model)

In [None]:
# Fall back to the EOS token for padding when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Pad on the left so that, in a padded batch, every prompt ends at the same
# position and generated tokens can be sliced off by prompt length.
tokenizer.padding_side = "left"

In [None]:
# System message sent with every translation request.
SYSTEM_PROMPT_TRANSLATION = (
    "You are a translation chatbot. "
    "Translate the input into English. "
    "You should not generate any content other than the pure translation of the input. "
    "Repeat the user's input if you can't translate."
)

In [None]:
def process_batch_translation(outputs, max_new_tokens=50):
    """Translate a batch of texts into English with the loaded chat model.

    Each text becomes the user turn of a two-message chat (system prompt
    ``SYSTEM_PROMPT_TRANSLATION`` + the text), rendered with the tokenizer's
    chat template with thinking disabled, then the whole batch is generated
    greedily in one ``model.generate`` call.

    Args:
        outputs: Iterable of texts to translate. Each item is interpolated
            with an f-string, so non-string values are stringified.
        max_new_tokens: Maximum number of tokens generated per text
            (default 50, the value previously hard-coded).

    Returns:
        list[str]: One decoded completion per input, in input order. An empty
        input yields an empty list without touching the model.
    """
    texts = list(outputs)
    if not texts:
        # Nothing to translate; the tokenizer cannot pad an empty batch.
        return []

    # Render one chat prompt per text.
    prompts = [
        tokenizer.apply_chat_template(
            [
                {"role": "system", "content": SYSTEM_PROMPT_TRANSLATION},
                {"role": "user", "content": f"{text}"},
            ],
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=False,
        )
        for text in texts
    ]

    # Tokenize as a single padded batch and move it to the model's device.
    inputs = tokenizer(
        prompts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_seq_length,
    ).to(model.device)

    # Greedy decoding. `temperature` is intentionally not passed: it has no
    # effect when do_sample=False and only produces a configuration warning.
    with torch.no_grad():
        generated = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
        )

    # All rows of the padded batch share one prompt length, so the completion
    # is everything after that many tokens.
    prompt_len = inputs.input_ids.shape[1]
    completions = generated[:, prompt_len:]
    return tokenizer.batch_decode(completions, skip_special_tokens=True)

In [None]:
import unicodedata
import html

# Single-character substitutions applied by normalize_text. Written with
# explicit escapes so the typographic quotes cannot be silently flattened to
# ASCII by an editor or export step.
_NORMALIZE_CHAR_TABLE = str.maketrans({
    "\u2018": "'",  # left single quotation mark
    "\u2019": "'",  # right single quotation mark
    "`": "'",       # backtick
    "\u201c": '"',  # left double quotation mark
    "\u201d": '"',  # right double quotation mark
    "\u010b": "c",  # Maltese c with dot above
    "\u0127": "h",  # Maltese h with stroke
})


def normalize_text(text):
    """Canonicalise a string so sentences can be compared by plain equality.

    Steps, in order:
      1. Convert to ``str`` and decode HTML entities.
      2. Remove every ``<|`` sequence.
      3. NFC-normalise, then drop all characters of Unicode category ``Mn``
         (non-spacing marks left over after composition).
      4. Map curly single quotes and the backtick to ``'``, curly double
         quotes to ``"``, and Maltese ``\u010b``/``\u0127`` to ``c``/``h``.
      5. NFC-normalise again.

    Args:
        text: Value to normalise; non-strings are converted with ``str()``
            (so a float NaN becomes the string ``"nan"``).

    Returns:
        str: The normalised text.
    """
    text = html.unescape(str(text))
    text = text.replace('<|', '')

    text = unicodedata.normalize('NFC', text)
    text = ''.join(ch for ch in text if unicodedata.category(ch) != 'Mn')

    # Quote and Maltese-letter folding in a single pass.
    text = text.translate(_NORMALIZE_CHAR_TABLE)

    return unicodedata.normalize('NFC', text)

In [None]:
# Same keys as ref_sentence, with each sentence passed through normalize_text.
ref_sentence_normalized = dict(
    zip(ref_sentence.keys(), map(normalize_text, ref_sentence.values()))
)

In [None]:
# Root folder; each sub-directory is expected to contain a `dist/` folder of CSVs.
path_dir = "data_results/Qwen3-8B"
# Number of texts sent to the translator per generate() call.
BATCH_SIZE = 64

list_model = [f for f in os.listdir(path_dir) if os.path.isdir(os.path.join(path_dir, f))]

# Normalised reference refusals; rows whose normalised output equals one of
# these are not translated. Built once instead of once per file.
known_refusals = list(ref_sentence_normalized.values())

for model_lg in tqdm(list_model):
    path_dist = f"{path_dir}/{model_lg}/dist"
    if not os.path.isdir(path_dist):
        # A run folder without results should not abort the whole sweep.
        print(f"Skipping {model_lg}: no 'dist' directory")
        continue

    # Only CSV files are inputs; this also excludes .DS_Store and similar.
    list_file = [
        f for f in os.listdir(path_dist)
        if os.path.isfile(os.path.join(path_dist, f)) and f.endswith(".csv")
    ]
    path_translated = f"{path_dist}/translated"
    os.makedirs(path_translated, exist_ok=True)

    for file in list_file:
        # splitext avoids reusing double quotes inside a double-quoted
        # f-string, which is a SyntaxError before Python 3.12.
        name_to_save = f"{os.path.splitext(file)[0]}_with_translation.csv"
        path_to_save = f"{path_translated}/{name_to_save}"
        print(path_to_save)

        df = pd.read_csv(f"{path_dist}/{file}")
        df['output_normalized'] = df['output_clean'].apply(normalize_text)

        # Always create the target column so every saved file has the same
        # schema, even when no row qualifies for translation.
        if 'output_translated' not in df.columns:
            df['output_translated'] = None

        # Translate only low/middle-resource rows that are not already in
        # English, whose refusal label is not 3, and whose output is not one
        # of the canonical refusal sentences.
        needs_translation = (
            df["language_category"].isin(["low", "middle"])
            & (df["language_response"] != "en")
            & (df["refusal"] != 3)
            & ~df["output_normalized"].isin(known_refusals)
        )
        rows_to_process = df[needs_translation]

        print(f"Total rows in dataframe: {len(df)}")
        print(f"Rows to process: {len(rows_to_process)}")
        print(f"Processing {len(rows_to_process)} rows in batches of {BATCH_SIZE}...")

        batch_starts = range(0, len(rows_to_process), BATCH_SIZE)
        for i, start_idx in enumerate(tqdm(batch_starts, desc="Processing batches")):
            # Index labels of the rows in this batch (positional slice of the index).
            batch_indices = rows_to_process.index[start_idx:start_idx + BATCH_SIZE]
            batch_outputs = df.loc[batch_indices, 'output_clean'].tolist()

            batch_results = process_batch_translation(batch_outputs)
            assert len(batch_results) == len(batch_indices), "translator returned a different number of rows"

            # Write translations back to their original rows.
            df.loc[batch_indices, 'output_translated'] = batch_results

            # Clear cache periodically to prevent memory issues
            if i % 10 == 0:
                torch.cuda.empty_cache()

        # NOTE: the DataFrame index is written as the first (unnamed) column,
        # as before; pass index=False here if downstream readers do not need it.
        df.to_csv(path_to_save)