# Install Required Dependencies

In [12]:
!pip install --upgrade gradio
!pip install groq
!pip install git+https://github.com/nipponjo/tts_arabic.git

Collecting git+https://github.com/nipponjo/tts_arabic.git
  Cloning https://github.com/nipponjo/tts_arabic.git to /tmp/pip-req-build-_3bwziqk
  Running command git clone --filter=blob:none --quiet https://github.com/nipponjo/tts_arabic.git /tmp/pip-req-build-_3bwziqk
  Resolved https://github.com/nipponjo/tts_arabic.git to commit fb83576a30dd7da32a477300f87f6c532338f2f2
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting onnxruntime-gpu (from tts_arabic==0.0.1)
  Downloading onnxruntime_gpu-1.19.2-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting coloredlogs (from onnxruntime-gpu->tts_arabic==0.0.1)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime-gpu->tts_arabic==0.0.1)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Downloading onnxruntime_gpu-1.19.2-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (226.2 MB)
[2K   

# Generate Image Caption

In [13]:
from PIL import Image
import requests
from transformers import AutoModelForCausalLM, AutoProcessor
import torch
import gc
class ImageCaptionGenerator():
    """Generate text captions for images with ``microsoft/Phi-3.5-vision-instruct``.

    The model and its processor are loaded once in ``__init__``. Call
    ``cleanup()`` when the instance is no longer needed to drop both
    references and empty the CUDA cache.
    """

    def __init__(self, device: str = "cuda"):
        """Load the caption model and processor.

        Args:
            device: Value forwarded as ``device_map`` when loading the model.
        """
        self.device = device
        self.caption_model = "microsoft/Phi-3.5-vision-instruct"
        self.model = AutoModelForCausalLM.from_pretrained(
            self.caption_model,
            device_map=device,
            trust_remote_code=True,
            torch_dtype='auto',
            _attn_implementation='eager'
        )
        self.processor = AutoProcessor.from_pretrained(self.caption_model, trust_remote_code=True)

    def load_image(self, image_url: str, timeout: float = 30.0):
        """Download an image over HTTP and return it as an RGB PIL image.

        Args:
            image_url: URL of the image to fetch.
            timeout: Seconds to wait for the server before giving up.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        from io import BytesIO

        # A timeout prevents the cell from hanging forever on an unresponsive host.
        response = requests.get(image_url, timeout=timeout)
        # Fail loudly on 4xx/5xx instead of handing an HTML error page to PIL.
        response.raise_for_status()
        # `.content` is the fully decoded body; `.raw` would skip content decoding.
        return Image.open(BytesIO(response.content)).convert("RGB")

    def generate_caption(self, image: "Image.Image", prompt: str, max_new_tokens: int = 1000):
        """Generate a caption for ``image`` following ``prompt``.

        Args:
            image: The picture to describe.
            prompt: Instruction appended after the ``<|image_1|>`` placeholder.
            max_new_tokens: Upper bound on the number of generated tokens.

        Returns:
            The decoded model response with special tokens stripped.

        Raises:
            RuntimeError: If ``cleanup()`` has already released the model.
        """
        if getattr(self, "model", None) is None or getattr(self, "processor", None) is None:
            raise RuntimeError(
                "ImageCaptionGenerator has been cleaned up; create a new instance to caption again."
            )

        messages = [
            {"role": "user", "content": f"<|image_1|> {prompt}"}
        ]
        chat_prompt = self.processor.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        # Follow the model's actual placement instead of a hard-coded "cuda:0",
        # so the `device` passed to __init__ is honoured.
        inputs = self.processor(chat_prompt, image, return_tensors="pt").to(self.model.device)
        # Greedy decoding: `temperature` has no effect when do_sample=False, so it is omitted.
        generate_ids = self.model.generate(
            **inputs,
            eos_token_id=self.processor.tokenizer.eos_token_id,
            max_new_tokens=max_new_tokens,
            do_sample=False
        )
        # Keep only the newly generated tokens (drop the echoed prompt).
        generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
        response = self.processor.batch_decode(
            generate_ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False
        )[0]
        del inputs
        del generate_ids
        torch.cuda.empty_cache()
        return response

    def cleanup(self):
        """Release the model and processor; safe to call more than once."""
        # Assign None rather than `del`, so a repeated call does not raise AttributeError.
        self.model = None
        self.processor = None
        # Collect garbage first so the cache flush can return the freed blocks.
        gc.collect()
        torch.cuda.empty_cache()

    def run(self, image_url: str, prompt: str):
        """Download the image at ``image_url`` and return its caption."""
        image = self.load_image(image_url)
        caption = self.generate_caption(image, prompt)
        return caption

# Demo: build a caption generator and describe one sample image fetched by URL.
# NOTE: `caption_generator` and `prompt` remain in the kernel namespace and are
# referenced again by later cells (the cleanup cell and the pipeline example).
caption_generator = ImageCaptionGenerator("cuda")
image_url = "https://t4.ftcdn.net/jpg/04/15/79/09/360_F_415790935_7va5lMHOmyhvAcdskXbSx7lDJUp0cfja.jpg"
prompt = "Describe the picture in detail"
caption = caption_generator.run(image_url, prompt)
print(f"Generated Caption: {caption}")


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Generated Caption: The image captures a vibrant scene from the African savanna. Dominating the foreground is a majestic elephant, its skin a textured tapestry of gray and brown. The elephant is not alone; it is part of a larger gathering of wildlife. A group of zebras, their black and white stripes contrasting sharply with the golden grass, are huddled together, perhaps seeking warmth or protection. A few lions, their tawny coats blending with the surroundings, are also part of this diverse assembly.

In the background, a giraffe stands tall, its long neck reaching towards the sky. Its spotted coat is a beautiful mosaic of browns and whites. The giraffe's height is a stark contrast to the other animals, yet it is an integral part of this wildlife community.

The savanna itself is a sea of golden grass, swaying gently in the breeze. The grassland extends into the distance, where a mountain looms under a clear blue sky. The mountain's peak is shrouded in clouds, adding a sense of mystery

In [14]:
# Drop the demo generator's model and processor and empty the CUDA cache before the next stages.
caption_generator.cleanup()

# Translate Caption into Arabic

In [15]:
from groq import Groq
class ArabianKingTranslator:
    """Translate English text into formal Arabic through the Groq chat API.

    SECURITY: an API key used to be hard-coded in this class. It must be treated
    as leaked and revoked; the key is now taken from the ``api_key`` argument or
    the ``GROQ_API_KEY`` environment variable.
    """

    # Instruction prepended to every text that is sent for translation.
    INSTRUCTION = """You serve as the professional and trusted translator for an Arabian king. Your duty is to translate everything Americans say from English into the king’s language—formal, precise, and elegant Arabic. You must ensure that no English words or terms remain in the translation. The Arabic you provide should be clear, culturally respectful, and fully aligned with the royal standards of communication. Provide only the Arabic translation without any additional explanation or text."""

    def __init__(self, use_groq: bool = False, api_key: str = None, model: str = "gemma2-9b-it"):
        """Create the Groq client.

        Args:
            use_groq: Must be True; no other backend is configured.
            api_key: Groq API key. Falls back to the ``GROQ_API_KEY``
                environment variable when omitted.
            model: Name of the Groq-hosted chat model to call.

        Raises:
            ValueError: If ``use_groq`` is False or no API key can be found.
        """
        import os

        self.use_groq = use_groq
        self.model = model
        if not self.use_groq:
            raise ValueError("Please use Groq for this translation. Hugging Face is not configured.")

        resolved_key = api_key or os.environ.get("GROQ_API_KEY")
        if not resolved_key:
            raise ValueError(
                "No Groq API key found. Pass api_key=... or set the GROQ_API_KEY environment variable."
            )
        self.client = Groq(api_key=resolved_key)

    def generate_translation(self, messages: list) -> str:
        """Translate the content of the last entry in ``messages``.

        Args:
            messages: Chat-style list of ``{"role", "content"}`` dicts. Only the
                ``content`` of the final entry is sent; earlier entries
                (including any system message) are ignored.

        Returns:
            The text content of the first completion choice.

        Raises:
            ValueError: If Groq usage is disabled or ``messages`` is empty.
        """
        if not self.use_groq:
            raise ValueError("Groq usage is disabled")
        if not messages:
            raise ValueError("messages must contain at least one entry to translate")

        chat_completion = self.client.chat.completions.create(
            messages=[
                {
                    # The instruction travels inside the user turn, not as a system message.
                    "role": "user",
                    "content": f"{self.INSTRUCTION} text to translate: {messages[-1]['content']}",
                }
            ],
            model=self.model
        )
        return chat_completion.choices[0].message.content

    def run(self, messages: list) -> str:
        """Convenience wrapper around ``generate_translation``."""
        translated_text = self.generate_translation(messages=messages)
        return translated_text
# Example usage
# NOTE: the translator only sends the content of the LAST message, so the system
# entry below is not transmitted; the class prepends its own instruction text.
messages = [
    {"role": "system", "content": "You serve as the professional and trusted translator for an Arabian king. Your duty is to translate everything Americans say from English into the king’s language—formal, precise, and elegant Arabic. You must ensure that no English words or terms remain in the translation. The Arabic you provide should be clear, culturally respectful, and fully aligned with the royal standards of communication, Provide only the Arabic translation without any additional explanation or text."},
    {"role": "user", "content": """In the background, a giraffe stands tall, its long neck reaching towards the sky. Its spotted coat is a beautiful mosaic of browns and whites. The giraffe's height is a stark contrast to the other animals, yet it is an integral part of this wildlife community.
                                The savanna itself is a sea of golden grass, swaying gently in the breeze. The grassland extends into the distance, where a mountain looms under a clear blue sky. The mountain's peak is shrouded in clouds, adding a sense of mystery to the scene.
                                Above it all, the sky is a canvas of blue, dotted with a few clouds. The clouds are far away, their details lost in the distance. They add a sense of depth to the image, creating a sense of vastness and openness.
                                Overall, the image is a snapshot of life in the African savanna, a moment frozen in time that showcases the beauty and diversity of wildlife. It's a testament to the harmony that exists in nature, where different species coexist and thrive together."""},
]
# With Groq
groq_translator = ArabianKingTranslator(use_groq=True)
groq_translation = groq_translator.run(messages)
print("Translation:", groq_translation)

Translation: في الخلفية، يقف زرافة شَطِمة، رقبةُها الطويلة تميل إلى السماء. فَصْلَةُها المُرصَّعة هي لوحة جميلة من البُرونز والأبيض. يبرز طول الزرافة من بين الحيوانات الأخرى، وكِنه جزء لا غُنى عنه من هذه المجتمع حيّة النّادرة.

تتحوَّل السافانا نفسها إلى بحر من العشب الذهبي، يهزّه النّفسُ بهدوء. تَنسّب الع草 إلى المسافة البعيدة، حيث تَرْتَمِس جَبَلٌ تحت سماء زرقاء صافية. غُيَّظ قمة الجبل بِالسِّحَابِ، مما يَضْفُ إلى المشهدُ حِسّاً من الغَموض.

وَفوق كلّ ذلك، السّماءُ هي لوحةٌ زَرْقاء، مزخرفة بِعددٍ قليلٍ من السِّحَاب. السِّحَاب بعيدة، تفاصيلُها تُخفي بالنّظَر البعيد، وتُضْفُ حِسّاً من العمق إلى الصورة، مُنشِئَة حِسّاً اتّساعٍ والهَوال.

بِنصفِ الكل، تُمثِّلُ الصورة آنيةً من الحياة في السافانا الافريقية، مَوْقِعٌ مُجمَّد في الزّمان الذي يُسَطِّرُ جمَال التّنوع الحيّ. إنَّها شهادةٌ على التَّوازُنِ الذي يوجد في الطبيعة، حيث يعيش العديد من المُصنّفات الحيّّة معاً و ينافُسُون في النّمو.  



# Arabic Text To Speech

In [16]:
import numpy as np
from IPython.display import Audio
from tts_arabic import tts  # Ensure you have this module available

class ArabicTextToSpeech:
    """Thin wrapper that sends text to ``tts`` and returns a playable ``Audio`` object."""

    def __init__(self, speed, path):
        """Store the synthesis settings.

        Args:
            speed: Value passed to ``tts`` as ``pace``.
            path: File path passed to ``tts`` as ``save_to`` and later opened by ``Audio``.
        """
        self.path = path
        self.speed = speed
        # The speaker is fixed to 1 for every instance.
        self.speaker = 1

    def run(self, text):
        """Synthesize ``text`` to ``self.path`` and return an ``Audio`` for that file."""
        synthesis_options = dict(
            vowelizer='shakkelha',
            speaker=self.speaker,
            save_to=self.path,
            pace=self.speed,
        )
        tts(text, **synthesis_options)
        player = Audio(self.path)
        return player

# Create an instance of the ArabicTextToSpeech class (pace 1.2, output written to "text.mp3")
tts_instance = ArabicTextToSpeech(1.2, "text.mp3")

# Arabic text to be synthesized (a fixed sample, independent of the captions generated above)
text_Arabic = """
بالصورة يتم التقاط شخص في خضم التقاط لحظة. يقف في حديقة، محاطًا بهدوء الطبيعة.
يرتدي الفرد سترة صفراء زاهية، تبرز بشكل واضح على خلفية الخضرة الكثيفة.
في يديه، يحمل كاميرا، مستعدًا لالتقاط صورة للمنظر الطبيعي الهادئ أمامه.
الأشجار في الخلفية، بأوراقها التي تتراوح بين الأخضر والأصفر، تشير إلى أن الصورة التقطت خلال فصل الخريف.
وضعية الشخص في الإطار واتجاه نظره يشيران إلى أنه ينظر نحو الجانب الأيمن من الصورة.
المشهد العام هو من العزلة الهادئة والفرح البسيط للتصوير الفوتوغرافي.
"""

# Run the TTS process and get the audio output
audio_output = tts_instance.run(text_Arabic)

# Display the audio output (`display` is provided by the IPython kernel; it is not imported here)
display(audio_output)


Downloading...
From (original): https://drive.google.com/uc?id=1pD210QTN1IL3CTA1D65ldKB7ooZ2hANl
From (redirected): https://drive.google.com/uc?id=1pD210QTN1IL3CTA1D65ldKB7ooZ2hANl&confirm=t&uuid=f97914c3-5f46-441b-9ff9-d52a71519c44
To: /usr/local/lib/python3.10/dist-packages/tts_arabic/data/fp_ms.onnx
100%|██████████| 187M/187M [00:01<00:00, 101MB/s]
Downloading...
From: https://drive.google.com/uc?id=1rZxulMhjrlQDheoGy7xnlWGjFYyjF9Gz
To: /usr/local/lib/python3.10/dist-packages/tts_arabic/data/hifigan.onnx
100%|██████████| 56.4M/56.4M [00:00<00:00, 127MB/s]
Downloading...
From: https://drive.google.com/uc?id=1XWgV7F7eQdRy-KTvCteyXVXAQoNIRa7z
To: /usr/local/lib/python3.10/dist-packages/tts_arabic/data/denoiser.onnx
100%|██████████| 8.42M/8.42M [00:00<00:00, 61.6MB/s]
Downloading...
From: https://drive.google.com/uc?id=1scpaMnVLjrDkGBL239pWeb7QW76b15W1
To: /usr/local/lib/python3.10/dist-packages/tts_arabic/data/shakkelha.onnx
100%|██████████| 10.8M/10.8M [00:00<00:00, 161MB/s]


# Integrated Pipeline
## Image To Speech

In [17]:
class ImageToSpeechPipeline:
    """Chain captioning, English-to-Arabic translation and Arabic text-to-speech.

    The pipeline is reusable: by default the caption model stays loaded between
    calls. Pass ``release_captioner_after_use=True`` to free it after every
    caption (it is then reloaded on the next call, which is slow).
    """

    # Prompt used when the caller does not supply one.
    DEFAULT_PROMPT = "Describe the picture in detail"

    # Kept for parity with the chat format; the translator only sends the
    # content of the last message.
    SYSTEM_PROMPT = "You serve as the professional and trusted translator for an Arabian king. Your duty is to translate everything Americans say from English into the king’s language—formal, precise, and elegant Arabic. You must ensure that no English words or terms remain in the translation. The Arabic you provide should be clear, culturally respectful, and fully aligned with the royal standards of communication, Provide only the Arabic translation without any additional explanation or text."

    def __init__(self, release_captioner_after_use: bool = False):
        """Build the three stages.

        Args:
            release_captioner_after_use: When True, the caption model is cleaned
                up after each caption and lazily recreated on the next call.
        """
        self.release_captioner_after_use = release_captioner_after_use
        # Initialize the caption generator
        self.caption_generator = ImageCaptionGenerator("cuda")
        # Initialize the translator
        self.translator = ArabianKingTranslator(use_groq=True)
        # Initialize the text-to-speech generator
        self.tts = ArabicTextToSpeech(speed=1.2, path="text.mp3")

    def _caption(self, image_source: str, prompt: str) -> str:
        """Return a caption for an image given as a URL or as a local file path."""
        import os

        if self.caption_generator is None:
            # The previous call released the model; load it again.
            self.caption_generator = ImageCaptionGenerator("cuda")

        if os.path.isfile(image_source):
            # Local files (e.g. Gradio uploads) cannot be fetched with an HTTP GET.
            with Image.open(image_source) as handle:
                picture = handle.convert("RGB")
            return self.caption_generator.generate_caption(picture, prompt)
        return self.caption_generator.run(image_source, prompt)

    def process_image_to_speech(self, image_url: str, prompt: str = DEFAULT_PROMPT):
        """Describe an image in spoken Arabic.

        Args:
            image_url: URL of the image, or a path to a local image file.
            prompt: Instruction given to the caption model.

        Returns:
            Whatever the text-to-speech stage's ``run`` returns.
        """
        # Step 1: Generate the caption from the image
        caption = self._caption(image_url, prompt)
        if self.release_captioner_after_use:
            self.caption_generator.cleanup()
            # Mark as released so the next call reloads instead of using a dead model.
            self.caption_generator = None

        # Step 2: Translate the caption to Arabic
        messages = [
            {"role": "system", "content": self.SYSTEM_PROMPT},
            {"role": "user", "content": caption},
        ]
        arabic_translation = self.translator.run(messages)

        # Step 3: Convert the Arabic text to speech
        audio = self.tts.run(arabic_translation)
        return audio
# Create an instance of the pipeline.
# The constructor builds a new ImageCaptionGenerator("cuda"), so the caption model is loaded here.
pipeline = ImageToSpeechPipeline()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [18]:
# Example usage:
image_url = "https://cdn.pixabay.com/photo/2024/05/26/10/15/bird-8788491_1280.jpg"
# Process the image and generate speech
# NOTE: `prompt` is not defined in this cell; it is the global assigned in the
# caption demo cell ("Describe the picture in detail"), so that cell must have run first.
audio = pipeline.process_image_to_speech(image_url, prompt)
display(audio)

# Web Deployment

In [21]:
import gradio as gr
# Define Gradio function
def gradio_process_image(image):
    """Gradio callback: turn an uploaded image into an Arabic audio description.

    Args:
        image: Path of the uploaded file (the input component uses ``type="filepath"``).

    Returns:
        Path of the audio file written by the text-to-speech stage, which is
        the form the ``gr.Audio`` output component accepts.
    """
    if not image:
        raise gr.Error("Please upload an image first.")

    # The upload is a local file, so open it directly instead of fetching it as a URL.
    with Image.open(image) as uploaded:
        picture = uploaded.convert("RGB")

    # Drive the pipeline stages individually: the pipeline entry point expects a
    # URL plus a prompt and returns a notebook Audio object rather than a file path.
    caption = pipeline.caption_generator.generate_caption(picture, "Describe the picture in detail")
    arabic_text = pipeline.translator.run([{"role": "user", "content": caption}])
    pipeline.tts.run(arabic_text)
    return pipeline.tts.path

# Gradio Interface
# Create the Gradio interface
iface = gr.Interface(
    fn=gradio_process_image,
    inputs=gr.Image(type="filepath", label="Upload Image"),
    outputs=gr.Audio(label="Generated Audio"),
    title="Image to Speech Converter",
    description="Upload an image, and the model will generate a speech description in Arabic.",
)

# Launch the Gradio app
iface.launch()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://fd8fc01622d622a2ab.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


