# AI Translation
adapted from src/research/georg/008_ai_translation.ipynb

In [None]:
import pandas as pd

In [None]:
# Restore the variable `df_extr_text` from IPython's %store persistence
# (`-r` reloads a value that was stored in an earlier session or notebook).
%store -r df_extr_text

In [None]:
# Work on the restored data under the short name `df`. This is an alias, not a
# copy: columns added to `df` below are also added to `df_extr_text`.
df = df_extr_text

### What languages do we have?

In [None]:
from langdetect import detect, LangDetectException


def detect_language(text):
    """Return the language code that langdetect assigns to ``text``.

    Falls back to the placeholder string ``"unknown"`` when langdetect raises
    ``LangDetectException``; any other exception propagates to the caller.
    """
    try:
        language_code = detect(text)
    except LangDetectException:
        # Detection failed: hand back a placeholder instead of raising
        return "unknown"
    return language_code


# Apply the language detection function to the 'extr_text' column and store
# the detected code (or "unknown") per row in a new 'language' column.
# NOTE(review): langdetect's README describes detection as non-deterministic
# unless DetectorFactory.seed is set — confirm whether reproducible counts
# matter here.
df["language"] = df["extr_text"].apply(detect_language)

# Create a count table for the languages (number of rows per detected code)
language_counts = df["language"].value_counts()

# Display the language count table as the cell output
language_counts

In [None]:
# switch to gpu if possible
import torch

# Use the first CUDA device when torch reports CUDA as available,
# otherwise fall back to the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Show the selected device as the cell output
device

In [None]:
# Get the number of GPUs available (as reported by torch.cuda.device_count)
num_gpus = torch.cuda.device_count()

print(f"Number of CUDA GPUs available: {num_gpus}")

# List each GPU's name, one line per device index
for i in range(num_gpus):
    print(f"GPU {i}: {torch.cuda.get_device_name(i)}")

In [None]:
# Report CUDA memory usage for the selected device.
# These torch.cuda queries only make sense for a CUDA device (and
# get_device_properties raises without one), so the report is skipped when
# the notebook fell back to the CPU instead of failing the cell.
if torch.device(device).type == "cuda":
    # Get the current memory usage and the maximum memory usage in bytes
    current_memory_allocated = torch.cuda.memory_allocated(device)
    max_memory_allocated = torch.cuda.max_memory_allocated(device)

    # Convert bytes to a more readable format, like MB
    current_memory_mb = current_memory_allocated / (1024**2)
    max_memory_mb = max_memory_allocated / (1024**2)

    print(f"Current Memory Allocated: {current_memory_mb} MB")
    print(f"Max Memory Allocated: {max_memory_mb} MB")

    # Get overall memory usage statistics
    total_memory = torch.cuda.get_device_properties(device).total_memory
    # Memory reserved by PyTorch that is not currently allocated to tensors.
    # This is NOT the free memory of the GPU as a whole; the variable name is
    # kept so later cells that may read it keep working.
    free_memory = torch.cuda.memory_reserved(device) - torch.cuda.memory_allocated(device)

    print(f"Total Memory: {total_memory / (1024**2)} MB")
    print(f"Reserved but Unallocated Memory: {free_memory / (1024**2)} MB")
else:
    print("CUDA device not selected; skipping GPU memory report.")

In [None]:
import gc

# Collect unreachable Python objects first so that any GPU tensors they hold
# are released, then ask PyTorch to hand its now-unused cached blocks back to
# the driver. Doing it in the opposite order leaves those blocks cached.
unreachable_count = gc.collect()
torch.cuda.empty_cache()

# Show the number of unreachable objects found by the collector
unreachable_count

In [None]:
# Print PyTorch's full (non-abbreviated) CUDA memory summary for `device`
print(torch.cuda.memory_summary(device=device, abbreviated=False))

In [None]:
# prepare hugging face
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Get Hugging Face API key from environment
# NOTE(review): apart from a commented-out pipeline call, no later code visible
# in this notebook passes hf_api_key anywhere — confirm the key is still needed.
hf_api_key = os.getenv("HUGGINGFACE_API_KEY")

# Ensure the API key is loaded; fail early with a clear message otherwise
if hf_api_key is None:
    raise ValueError(
        "Hugging Face API key not found. Make sure it's set in your .env file as HUGGINGFACE_API_KEY"
    )

In [None]:
# from transformers import pipeline

# WARNING: the first run downloads the model weights

# Alternative kept for reference: initialize a translation pipeline instead.
# That approach gives less flexibility but is more straightforward.
# translator = pipeline('translation', model='facebook/seamless-m4t-v2-large', device=device, use_auth_token=hf_api_key)


from transformers import AutoProcessor, SeamlessM4Tv2Model

# Initialize processor and model from the same pretrained checkpoint
processor = AutoProcessor.from_pretrained("facebook/seamless-m4t-v2-large")
model = SeamlessM4Tv2Model.from_pretrained("facebook/seamless-m4t-v2-large")

In [None]:
# Move the model to the selected device ("cuda:0" or "cpu")
model = model.to(device)

In [None]:
# The model needs other codes: map the language codes produced by langdetect
# (the values of the "language" column) to the three-letter codes passed to
# the processor as `src_lang`. Codes missing from this map are passed through
# unchanged by translate_row.
lang_code_map = {
    "zh-cn": "cmn",
    "ja": "jpn",
    "ko": "kor",
    "de": "deu",
    "fr": "fra",
    "sv": "swe",  # Swedish (ISO 639-1 "sv")
    "sw": "swh",  # Swahili (ISO 639-1 "sw"); was wrongly mapped to Swedish
    "vi": "vie",
}

In [None]:
from IPython.display import display, Markdown


# function to display long string nicer
def display_text_as_markdown(text):
    """Show ``text`` in the notebook output as a preformatted Markdown block.

    Args:
        text: The (possibly long, multi-line) string to display.
    """
    # Put the triple-backtick fences on their own lines. With the fences glued
    # to the text, the first line of a multi-line text is parsed as the fence's
    # info string and not shown, and a single-line text is rendered as an
    # inline code span instead of a block.
    formatted_text = f"```\n{text}\n```"
    # Display the text as Markdown, which will show it in a preformatted style
    display(Markdown(formatted_text))

In [None]:
# chunk text meaningfully

from nltk.tokenize import sent_tokenize


def chunk_text_by_sentence(text, max_length=512):
    """Split ``text`` into chunks made of whole sentences.

    Sentences come from NLTK's ``sent_tokenize`` and are joined with single
    spaces. Sentences are packed into a chunk as long as the joined chunk
    stays within ``max_length`` characters. A sentence is never split, so a
    single sentence longer than ``max_length`` becomes a chunk of its own
    that exceeds the limit.

    Args:
        text: The text to split.
        max_length: Maximum chunk length in characters (not model tokens).

    Returns:
        A list of chunk strings; empty when ``sent_tokenize`` yields no
        sentences.
    """
    # NOTE(review): sent_tokenize is called with its default language setting;
    # verify that it actually splits the CJK texts this notebook translates.
    chunks = []
    current_chunk = []
    current_length = 0  # always equals len(" ".join(current_chunk))

    for sentence in sent_tokenize(text):
        # Flush only when there is something to flush. Previously an oversized
        # first sentence caused an empty string to be emitted as a chunk.
        if current_chunk and current_length + 1 + len(sentence) > max_length:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = 0

        # Track the joined length incrementally (one extra character for the
        # separating space) instead of re-joining the chunk on every sentence.
        current_length += len(sentence) + (1 if current_chunk else 0)
        current_chunk.append(sentence)

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

In [None]:
def translate(text, src_lang):
    """Translate ``text`` from ``src_lang`` into English.

    Uses the module-level ``processor``, ``model`` and ``device``.

    Args:
        text: The text to translate.
        src_lang: Source-language code handed to the processor.

    Returns:
        The decoded English text with special tokens removed.
    """
    encoded_inputs = processor(text=text, src_lang=src_lang, return_tensors="pt")
    encoded_inputs = encoded_inputs.to(device)

    # Generation settings chosen to prevent repetitive patterns in the output
    generation_options = {
        "tgt_lang": "eng",  # always translate to English
        "generate_speech": False,  # text output only
        "num_beams": 5,  # beam search with 5 beams
        "early_stopping": True,  # stop when all beams reach the EOS token
        "no_repeat_ngram_size": 2,  # forbid repeated 2-grams
    }
    generated = model.generate(**encoded_inputs, **generation_options)

    # First element of the generate() result, first sequence in it
    first_sequence_ids = generated[0].tolist()[0]
    return processor.decode(first_sequence_ids, skip_special_tokens=True)

In [None]:
# Small hand-made fixture for a smoke test of the translation: one Japanese,
# one Chinese, one Korean and one English row, with the language codes given
# explicitly in the same format as the "language" column of df.
df_test = pd.DataFrame(
    {
        "ascii_id_company": ["001", "002", "003", "004"],
        "extr_text": [
            "こんにちは、私の犬はかわいいです",
            "这是一段中文文本",
            "여기에 한국어 텍스트가 있습니다",
            "hi baby",
        ],
        "language": ["ja", "zh-cn", "ko", "en"],
    }
)

In [None]:
# Function to apply on each row of the DataFrame
def translate_row(row):
    """Translate the ``extr_text`` of one DataFrame row into English.

    Args:
        row: A row as passed by ``DataFrame.apply(..., axis=1)``; its
            "extr_text" and "language" entries are read.

    Returns:
        The English translation (translated chunks joined with spaces), or
        ``None`` when nothing is translated: the text is already English, or
        its language is the "unknown" placeholder.
    """
    language = row["language"]
    # Map the detected code to the model's code; unmapped codes pass through
    src_lang = lang_code_map.get(language, language)

    # Nothing to translate for English text. "unknown" is the placeholder
    # written by detect_language when detection fails; it is not a language
    # code, so it must not be handed to the model as a source language.
    if src_lang == "en" or language == "unknown":
        return None

    # Translate sentence-aligned chunks one by one and stitch them together
    chunks = chunk_text_by_sentence(row["extr_text"])
    return " ".join(translate(chunk, src_lang) for chunk in chunks)


# Apply the function to each row and create a new column with the translated texts
# (rows for which translate_row returns None get a missing value).
df_test["translated"] = df_test.apply(translate_row, axis=1)
# Show the fixture with its translations as the cell output
df_test

In [None]:
# test on real df
# Take an independent copy of the first 20 rows: a "translated" column is
# assigned on df_test further down, and assigning to a slice of df triggers
# pandas' SettingWithCopyWarning.
df_test = df.head(20).copy()

In [None]:
# Show the 20-row sample as the cell output
df_test

In [None]:
# Preview the row at position 10 of the sample.
# NOTE(review): column position 1 is presumably "extr_text" — confirm against
# the column order of df.
display_text_as_markdown(
    df_test.iloc[10, 1]
)  # show what the text to be translated looks like

In [None]:
# Preview the row at position 15 of the sample (same positional column 1,
# presumably "extr_text" — confirm against the column order of df).
display_text_as_markdown(df_test.iloc[15, 1])

In [None]:
# Translate every row of the 20-row sample and store the result in a new
# "translated" column (translate_row returns None for rows it skips).
df_test["translated"] = df_test.apply(translate_row, axis=1)
# Show the sample with its translations as the cell output
df_test