In [2]:
#!pip install transformers pandas torch
!pip install sentencepiece


Defaulting to user installation because normal site-packages is not writeable


In [2]:
# Ensure necessary packages are installed
# !pip install transformers pandas torch sentencepiece

import pandas as pd
from transformers import MarianMTModel, MarianTokenizer

def load_model_and_tokenizer(model_name):
    """Load a Marian tokenizer and translation model by name.

    Args:
        model_name: Identifier passed to ``from_pretrained`` for both the
            tokenizer and the model (e.g. 'Helsinki-NLP/opus-mt-en-es').

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    # Tuple elements are evaluated left to right, so the tokenizer is
    # loaded before the model.
    return (
        MarianTokenizer.from_pretrained(model_name),
        MarianMTModel.from_pretrained(model_name),
    )

def translate_text(texts, tokenizer, model):
    """Translate each entry of ``texts`` one at a time.

    Every entry is converted to ``str``, encoded with ``tokenizer.encode``
    (truncating at ``max_length=512``), passed through ``model.generate``
    with 4 beams, and the first generated sequence is decoded.

    Translation is best-effort: if any step raises, the error is printed and
    the untranslated entry is kept in that position, so the returned list
    always has one item per input.

    Args:
        texts: Iterable of values to translate.
        tokenizer: Object exposing ``encode`` and ``decode``.
        model: Object exposing ``generate``.

    Returns:
        List of translated strings (or the original entry where translation
        failed), in input order.
    """
    results = []
    for entry in texts:
        # Tracks the value to report / fall back to: the raw entry until the
        # str() conversion succeeds, the string form afterwards.
        current = entry
        try:
            current = str(entry)

            token_ids = tokenizer.encode(
                current, return_tensors='pt', max_length=512, truncation=True
            )
            generated = model.generate(
                token_ids, max_length=512, num_beams=4, early_stopping=True
            )

            # Only the first returned sequence is used.
            results.append(tokenizer.decode(generated[0], skip_special_tokens=True))
        except Exception as err:
            print(f"Error translating text: {current}. Error: {err}")
            results.append(current)
    return results

def translate_file(input_file, output_file, model_name):
    """Translate the 'en' column of a TSV file and write the result to a new TSV.

    The file is read with tab separators, every non-missing value in the
    'en' column is translated with the model named by ``model_name``, the
    column is overwritten with the translations, and the full table is
    written to ``output_file`` (tab-separated, without the index).

    Missing values in 'en' are left missing rather than translated. Casting
    the whole column to ``str`` would turn them into the literal text 'nan',
    which would then be fed to the model and saved as if it were real data.

    Args:
        input_file: Path of the tab-separated input file; must contain an
            'en' column.
        output_file: Path the translated tab-separated file is written to.
        model_name: Name handed to ``load_model_and_tokenizer``.

    Raises:
        KeyError: If the input has no 'en' column.
    """
    # Load the data
    data = pd.read_csv(input_file, sep='\t')

    # Only rows that actually hold text are sent to the translator
    has_text = data['en'].notna()
    source_texts = data.loc[has_text, 'en'].astype(str).tolist()

    # Load the pre-trained model and tokenizer
    tokenizer, model = load_model_and_tokenizer(model_name)

    # translate_text returns exactly one output per input, so the result
    # lines up positionally with the rows selected by has_text
    translated_texts = translate_text(source_texts, tokenizer, model)

    # Replace the original text with the translated text; the column is made
    # object-typed first so strings can be stored whatever dtype was inferred
    data['en'] = data['en'].astype(object)
    if has_text.any():
        data.loc[has_text, 'en'] = translated_texts

    # Save the translated data to a new TSV file
    data.to_csv(output_file, sep='\t', index=False)
    print(f"Translation completed and saved to {output_file}")
# Source (English) and destination (Spanish) TSV files for this run
input_file_path = '/home/u23/afraaalshammari/Transl/SpaD/New_IR_Data/Round2/TransTask1/Transformers/en_annotations_binary_complete.tsv'
output_file_path = '/home/u23/afraaalshammari/Transl/SpaD/New_IR_Data/Round2/TransTask1/Transformers/Transformers_En_ES_complete.tsv'

# Pre-trained model used for English -> Spanish translation
english_to_spanish_model = 'Helsinki-NLP/opus-mt-en-es'

# Run the full read -> translate -> write pipeline
translate_file(
    input_file=input_file_path,
    output_file=output_file_path,
    model_name=english_to_spanish_model,
)




Translation completed and saved to /home/u23/afraaalshammari/Transl/SpaD/New_IR_Data/Round2/TransTask1/Transformers/Transformers_En_ES_complete.tsv


In [6]:
# Sanity check: confirm that sentencepiece can be imported in this kernel
# and print the version that was picked up.
import sentencepiece as spm
print(spm.__version__)


0.2.0
