<a href="https://colab.research.google.com/github/k-dinakaran/automation-of-wordpress-post-publication-using-AI-tools/blob/main/build_a_content_variation_generator_that_adjusts_for_local_dialects_and_language_nuances.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
pip install langid


Collecting langid
  Downloading langid-1.1.6.tar.gz (1.9 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.9 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.9/1.9 MB[0m [31m62.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m35.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langid
  Building wheel for langid (setup.py) ... [?25l[?25hdone
  Created wheel for langid: filename=langid-1.1.6-py3-none-any.whl size=1941171 sha256=fb2b7dec965b684e588d81c66341891c267719d4ee73c7710d2361fd00828011
  Stored in directory: /root/.cache/pip/wheels/23/c8/c6/eed80894918490a175677414d40bd7c851413bbe03d4856c3c
Successfully built langid
Installing collected packages: langid
Successfully installed langid-1.1.6


In [3]:
from transformers import MarianMTModel, MarianTokenizer
import langid
import json

# Define language codes for the MarianMT model
LANG_CODES = {
    "en_US": "en",  # English (US)
    "en_UK": "en",  # English (UK)
    "es_MX": "es",  # Spanish (Mexico)
    "es_ES": "es",  # Spanish (Spain)
    "fr_FR": "fr",  # French (France)
    "hi_IN": "hi",  # Hindi (India)
}

# Custom dialect adjustments
DIALECT_ADJUSTMENTS = {
    "en_UK": {"support": "support", "see you soon": "see you again soon"},
    "es_MX": {"pronto": "pronto"},
    "es_ES": {"pronto": "en breve"},
}

def load_translation_model(lang_code):
    """Load the MarianMT model for the specified language."""
    model_name = f"Helsinki-NLP/opus-mt-{lang_code}"
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)
    return model, tokenizer

def translate_text(text, src_lang, tgt_lang):
    """Translate text from source language to target language."""
    model, tokenizer = load_translation_model(f"{src_lang}-{tgt_lang}")
    tokenized_text = tokenizer.prepare_seq2seq_batch([text], return_tensors="pt")
    translation = model.generate(**tokenized_text)
    translated_text = tokenizer.decode(translation[0], skip_special_tokens=True)
    return translated_text

def adjust_dialect(text, dialect):
    """Adjust the translated text for specific dialect nuances."""
    adjustments = DIALECT_ADJUSTMENTS.get(dialect, {})
    for key, value in adjustments.items():
        text = text.replace(key, value)
    return text

def generate_variations(input_text):
    """Generate localized content variations."""
    detected_lang, _ = langid.classify(input_text)
    variations = {}

    for dialect, lang_code in LANG_CODES.items():
        if lang_code != detected_lang:
            translated_text = translate_text(input_text, detected_lang, lang_code)
        else:
            translated_text = input_text  # No translation needed for the same language

        # Apply dialect-specific adjustments
        localized_text = adjust_dialect(translated_text, dialect)
        variations[dialect] = localized_text

    return variations

# User Input for Dynamic Content Variation
if __name__ == "__main__":
    print("Welcome to the Content Variation Generator!")
    input_text = input("Please enter the content you'd like to generate variations for: ")

    localized_variations = generate_variations(input_text)
    print("\nLocalized Variations:")
    print(json.dumps(localized_variations, indent=4, ensure_ascii=False))


Welcome to the Content Variation Generator!
Please enter the content you'd like to generate variations for: Thank you for your support! We hope to see you soon.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/826k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.59M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and targets.

Here is a short example:

model_inputs = tokenizer(src_texts, text_target=tgt_texts, ...)

If you either need to use different keyword arguments for the source and target texts, you should do two calls like
this:

model_inputs = tokenizer(src_texts, ...)
labels = tokenizer(text_target=tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/812k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/306M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Localized Variations:
{
    "en_US": "Thank you for your support! We hope to see you soon.",
    "en_UK": "Thank you for your support! We hope to see you again soon.",
    "es_MX": "¡Gracias por su apoyo! Esperamos verle pronto.",
    "es_ES": "¡Gracias por su apoyo! Esperamos verle en breve.",
    "fr_FR": "Merci pour votre soutien! Nous espérons vous voir bientôt.",
    "hi_IN": "आपकी मदद के लिए शुक्रिया!"
}
