In [None]:
# Install and upgrade necessary libraries
!pip install --upgrade transformers datasets huggingface_hub

In [None]:
# Mount Google Drive; the fine-tuned model is saved under this mount point later
from google.colab import drive

MOUNT_POINT = 'language_translation'
drive.mount(MOUNT_POINT)

In [None]:
# Select the compute device: the GPU when CUDA is available, otherwise the CPU
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Authenticate this notebook session with the Hugging Face Hub.
# NOTE(review): presumably needed because the meta-llama checkpoint loaded later
# is access-gated — confirm.

from huggingface_hub import notebook_login
notebook_login()

In [None]:
# Load the opus_books dataset for English-French translation
from datasets import load_dataset

books = load_dataset("opus_books", "en-fr")

In [None]:
books

In [None]:
# Prepare test data by formatting English sentences
test_data = ['###en: ' + books['train'][i]['translation']['en'] +  ' ###fr:'  for i in range(10000,10100)]

In [None]:
# Prepare training data by formatting English and French sentence pairs
train_data = ['###en: ' + books['train'][i]['translation']['en'] +  ' ###fr: ' + books['train'][i]['translation']['fr'] for i in range(len(books['train']))]

In [None]:
# Import train_test_split for splitting data
from sklearn.model_selection import train_test_split

In [None]:
# Split the data into training and validation sets
train_data, valid_data = train_test_split(train_data, test_size=0.1, random_state=42)

In [None]:
# Limit the training data size
train_data = train_data[:10000]

In [None]:
# Limit the validation data size
valid_data = valid_data[:2000]

In [None]:
# Import AutoTokenizer and AutoModelForCausalLM from transformers
from transformers import AutoTokenizer, AutoModelForCausalLM

# Specify the pre-trained model name
model_name = "meta-llama/Llama-3.2-1B"  # or the actual model path

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)



In [None]:
# Set the padding token to the end-of-sentence token
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Import necessary PyTorch and Dataset classes
from torch.utils.data import TensorDataset, DataLoader, Dataset

In [None]:
# Define a custom dataset class for llama model training
class translationDataset(Dataset):
    """Tokenized '###en: ... ###fr: ...' examples for causal-LM fine-tuning.

    Every kept example yields two equally long id lists:

    * input ids: the tokenized text followed by the tokenizer's EOS id;
    * labels: the same ids, with everything up to and including the first
      occurrence of the target marker replaced by -100, so that only the
      tokens after the marker (and the EOS id) remain as training targets.

    Examples longer than ``max_length`` tokens, or in which the marker does not
    occur, are skipped entirely so inputs and labels always stay aligned.

    Args:
        data: iterable of formatted training strings.
        tokenizer: callable returning a mapping with an 'input_ids' list, and
            exposing ``eos_token_id``.
        max_length: maximum number of tokens (before EOS) for an example to be kept.
        marker_ids: token ids that mark the start of the target text. The
            default is the id sequence this notebook relies on for the
            '###fr:' marker; it is specific to the tokenizer in use.
    """

    def __init__(self, data, tokenizer, max_length=650, marker_ids=(17010, 1658, 25)):
        self.all_input_ids = []
        self.all_labels = []
        marker = list(marker_ids)

        for text in data:
            token_ids = list(tokenizer(text)['input_ids'])
            if len(token_ids) > max_length:
                continue

            target_start = self._find_target_start(token_ids, marker)
            if target_start is None:
                # Without the marker there is nothing to mask; keeping the input
                # without a label would shift every later (input, label) pair.
                continue

            self.all_input_ids.append(token_ids + [tokenizer.eos_token_id])
            self.all_labels.append(
                [-100] * target_start + token_ids[target_start:] + [tokenizer.eos_token_id]
            )

        # Report the longest kept sequence (0 when nothing was kept)
        print(max((len(ids) for ids in self.all_input_ids), default=0))

    @staticmethod
    def _find_target_start(token_ids, marker):
        """Return the index just past the first occurrence of marker, or None."""
        width = len(marker)
        for start in range(len(token_ids) - width + 1):
            if token_ids[start:start + width] == marker:
                return start + width
        return None

    def __len__(self):
        return len(self.all_input_ids)

    def __getitem__(self, idx):
        return torch.tensor(self.all_input_ids[idx]), torch.tensor(self.all_labels[idx])

In [None]:
# Create a training dataset instance
train_dataset = translationDataset(train_data, tokenizer)

In [None]:
# Create a validation dataset instance
valid_dataset = translationDataset(valid_data, tokenizer)

In [None]:
# Define a Data Collator class for padding tensors
class DataCollator:
    """Turn a list of (input_ids, labels) tensor pairs into one padded batch."""

    def __init__(self, tokenizer):
        # The tokenizer supplies the pad token id used for the inputs
        self.tokenizer = tokenizer

    def pad_tensors(self, tensors, padding_value=0):
        """Stack variable-length tensors batch-first, filling with padding_value."""
        padded = torch.nn.utils.rnn.pad_sequence(
            tensors,
            batch_first=True,
            padding_value=padding_value,
        )
        return padded

    def __call__(self, data):
        """Return {'input_ids': ..., 'labels': ...} for the given examples.

        Inputs are padded with the tokenizer's pad token id; labels are padded
        with -100.
        """
        input_seqs = [example[0] for example in data]
        label_seqs = [example[1] for example in data]
        return {
            'input_ids': self.pad_tensors(input_seqs, padding_value=self.tokenizer.pad_token_id),
            'labels': self.pad_tensors(label_seqs, padding_value=-100),
        }

In [None]:
# Build the collator that pads each batch using the tokenizer's pad token
data_collator = DataCollator(tokenizer)

In [None]:
# Import TrainingArguments from transformers
from transformers import TrainingArguments

In [None]:
# Define training arguments for the Trainer.
# bf16 mixed precision is requested only when a CUDA device that supports it is
# present; an unconditional bf16=True makes TrainingArguments fail on hardware
# without bf16 support.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
training_args = TrainingArguments(report_to = "none",
                                  per_device_train_batch_size = 3,
                                  gradient_checkpointing = True,
                                  num_train_epochs = 1,
                                  eval_strategy = 'epoch',
                                  per_device_eval_batch_size= 3,
                                  overwrite_output_dir=True,
                                  save_steps=1000,
                                  bf16=use_bf16,
                                  gradient_accumulation_steps=2,
                                  logging_steps = 1,
                                  logging_strategy="steps"
                                  )

In [None]:
# Import Trainer from transformers
from transformers import Trainer

In [None]:
# Initialize the Trainer with model, training arguments, datasets, data collator, and tokenizer.
# The tokenizer is passed as `processing_class`: the older `tokenizer=` keyword is
# deprecated in current transformers releases, and this notebook installs the
# latest release in its first cell.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=data_collator,
    processing_class=tokenizer,
)

In [None]:
# Start training the model
trainer.train()

In [None]:
# Save the trained model
trainer.save_model("language_translation/MyDrive/Colab notebooks/language_translation")

# Validation

In [None]:
# Generate a French continuation for each English test prompt
MAX_NEW_TOKENS = 256  # upper bound on tokens generated per prompt

# Switch to inference mode so generation does not run with training-time behavior
model.eval()

generated_text = []
for i in test_data:
  # Tokenize the prompt. The attention mask is kept and passed to generate(),
  # because it cannot be inferred when the pad token equals the EOS token.
  encoded = tokenizer(i, return_tensors='pt').to(model.device)  # `pt` for PyTorch
  generated_ids = model.generate(
      **encoded,                      # input_ids + attention_mask
      max_new_tokens=MAX_NEW_TOKENS,  # without this the default length cap cuts translations short
      num_return_sequences=1,         # Number of sequences to generate
      no_repeat_ngram_size=2,         # Prevent repetition of n-grams of size 2
      do_sample=False,                # Greedy decoding (deterministic); sampling options such as top_k do not apply
      pad_token_id=tokenizer.eos_token_id  # Pad with EOS, matching tokenizer.pad_token
  )
  # Decode prompt + continuation back to text; the prompt is kept because the
  # next cell splits the string on the '###fr:' marker.
  generated_text.append(tokenizer.decode(generated_ids[0], skip_special_tokens=True))


In [None]:
# Process generated text to extract actual and predicted sentences
FR_MARKER = '###fr:'
predictions = []
for i in generated_text:
  # partition() never raises: if the model produced no text after the marker
  # (or the exact ' ###fr: ' spacing is absent) the prediction is simply empty,
  # instead of failing with an IndexError.
  english_part, _, remainder = i.partition(FR_MARKER)
  # Keep only the text up to a possible repeated marker, as before
  french_part = remainder.split(FR_MARKER, 1)[0]
  predictions.append({
      "actual_english_sentence": english_part.replace('###en: ', "").strip(),
      "predicted_french_sentence": french_part.strip(),
  })


In [None]:
# Import necessary libraries for using Google Generative AI
import time
import os
from getpass import getpass
from pprint import pprint
from google import genai
from google.genai import types

# SECURITY: the API key must never be stored in the notebook. It is read from the
# environment, or prompted for interactively when no variable is set. Any key that
# was previously committed in this cell should be treated as leaked and revoked.
api_key = (
    os.environ.get("GEMINI_API_KEY")
    or os.environ.get("GOOGLE_API_KEY")
    or getpass("Gemini API key: ")
)
client = genai.Client(api_key=api_key)
questions = []  # raw judge responses, one per prediction
count = 0

# Define the generation configuration: ask the judge for a JSON response
generation_config = types.GenerateContentConfig(
    response_mime_type="application/json"
)
# Use the Generative AI model to evaluate translations
for i in predictions:
  prompt = f"""You are a very critical judge of one of the translation task where you will be given a pair of actual english sentences and predicted french sentences in the format {{"actual_english_sentence": ..., "predicted_french_sentence": ...}},  now your job is to predict whether the predicted french sentences corresponding to english sentences are correct or not.\nYou output should be in the format of list like this - [{{"correct_translation": "Yes/No", 'reason':'...'}}]". Pair of text: {i}"""
  response = client.models.generate_content(model='gemini-2.5-flash-preview-05-20', contents=prompt, config=generation_config)
  questions.append(response.text)
  # Pause between requests to stay under the API rate limit
  time.sleep(5)


In [None]:
# Parse each judge response into Python objects.
# The judge is asked for JSON, so json.loads is the primary parser:
# ast.literal_eval rejects valid JSON tokens such as true/false/null.
# literal_eval is kept only as a fallback for replies written as Python literals.
import ast
import json

result = []
for i in questions:
  try:
    parsed = json.loads(i)
  except ValueError:
    parsed = ast.literal_eval(i)
  # The following cell flattens a list of lists, so wrap a bare object in a list
  result.append(parsed if isinstance(parsed, list) else [parsed])

# Print the result
print(result)

In [None]:
# Tally the judge's 'Yes' / 'No' verdicts over all evaluated translations
flattened = []
for sublist in result:
    flattened.extend(sublist)

# Collect the verdict of every entry, then count each outcome
verdicts = [entry['correct_translation'] for entry in flattened]
yes_count = sum(1 for verdict in verdicts if verdict == 'Yes')
no_count = sum(1 for verdict in verdicts if verdict == 'No')

print(yes_count, no_count)