## **SHORT DESCRIPTION AND STORY GENERATOR IN BOTH TEXT AND AUDIO (IN ENGLISH AND DESIRED LANGUAGE)**

In [11]:
from PIL import Image
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration, AutoTokenizer, AutoModelForCausalLM
from deep_translator import GoogleTranslator
from gtts import gTTS
from IPython.display import Audio, display
import tempfile

# Load BLIP model
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# Load GPT model for story generation
story_tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
story_model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B")

def generate_caption(image_path):
    raw_image = Image.open(image_path).convert('RGB')
    inputs = processor(raw_image, return_tensors="pt")
    out = blip_model.generate(**inputs)
    caption = processor.decode(out[0], skip_special_tokens=True)
    return caption

def expand_caption_to_story(caption, max_words=150):
    prompt = f"Write a short, vivid story of about {max_words} words based on this description: '{caption}'\n\nStory:"
    input_ids = story_tokenizer(prompt, return_tensors="pt").input_ids

    output_ids = story_model.generate(
        input_ids,
        max_length=300,
        temperature=0.8,
        top_p=0.95,
        do_sample=True,
        num_return_sequences=1
    )

    story = story_tokenizer.decode(output_ids[0], skip_special_tokens=True)
    story = story.split("Story:")[-1].strip()

    return format_text_into_lines(story)

def translate_story(story, target_language):
    translated = GoogleTranslator(source='auto', target=target_language).translate(story)
    return format_text_into_lines(translated)

def format_text_into_lines(text, words_per_line=10):
    words = text.split()
    lines = [" ".join(words[i:i + words_per_line]) for i in range(0, len(words), words_per_line)]
    return "\n".join(lines)

def text_to_audio(text, language_code):
    tts = gTTS(text=text, lang=language_code)
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as fp:
        tts.save(fp.name)
        display(Audio(fp.name, autoplay=True))

def process_image_to_story(image_path, target_language_code):
    caption = generate_caption(image_path)
    print("🖼️ Scene Caption:", caption)

    story = expand_caption_to_story(caption)
    print("📖 Generated Story (EN):\n", story)

    translated_story = translate_story(story, target_language_code)
    print(f"🌍 Translated Story ({target_language_code}):\n", translated_story)

    text_to_audio(translated_story, target_language_code)
if __name__ == "__main__":
    image_path = input("Enter the path to your image file: ")
    target_lang = input("Enter the target language code (e.g., 'fr' for French, 'es' for Spanish, 'hi' for Hindi): ")
    process_image_to_story(image_path, target_lang)


Enter the path to your image file: /28.jpg
Enter the target language code (e.g., 'fr' for French, 'es' for Spanish, 'hi' for Hindi): ta


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


🖼️ Scene Caption: a cobbed street with a red rose growing in the middle
📖 Generated Story (EN):
 I am sitting by my pool. It is late. The
sun has set. I am lying on my bed. My
head is in my hands. A dream comes to me.
I am in a room with a door that opens
and closes. I am trying to push the door open
with my hand, but it doesn't move. It is locked.
I can't see the door. I can't see anything. I
close my eyes. I breathe in and out. I wait
for the dream to leave me. I am back in
the street. I am standing in front of the house.
There are two red roses growing in the middle of
the street. The dream is gone. I am back in
the house. My mother is there. She has a red
rose in her hands. I am standing next to her.
My father is standing in the doorway. He is looking
at me. I am in the hallway. I am standing
in front of my bedroom door. The door is open.
The red roses are everywhere. The dream is still with
me. I am standing in the hallway. My mother is
in the doorway. My father is standing next t

In [12]:
from PIL import Image
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration, AutoTokenizer, AutoModelForCausalLM
from deep_translator import GoogleTranslator
from gtts import gTTS
from IPython.display import Audio, display
import tempfile

# Load BLIP model
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# Load GPT model for story generation
story_tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
story_model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B")

def generate_caption(image_path):
    raw_image = Image.open(image_path).convert('RGB')
    inputs = processor(raw_image, return_tensors="pt")
    out = blip_model.generate(**inputs)
    caption = processor.decode(out[0], skip_special_tokens=True)
    return caption

def expand_caption_to_story(caption, max_words=150):
    prompt = f"Write a short, vivid story of about {max_words} words based on this description: '{caption}'\n\nStory:"
    input_ids = story_tokenizer(prompt, return_tensors="pt").input_ids

    output_ids = story_model.generate(
        input_ids,
        max_length=300,
        temperature=0.8,
        top_p=0.95,
        do_sample=True,
        num_return_sequences=1
    )

    story = story_tokenizer.decode(output_ids[0], skip_special_tokens=True)
    story = story.split("Story:")[-1].strip()

    return format_text_into_lines(story)

def translate_story(story, target_language):
    translated = GoogleTranslator(source='auto', target=target_language).translate(story)
    return format_text_into_lines(translated)

def format_text_into_lines(text, words_per_line=10):
    words = text.split()
    lines = [" ".join(words[i:i + words_per_line]) for i in range(0, len(words), words_per_line)]
    return "\n".join(lines)

def text_to_audio(text, language_code):
    tts = gTTS(text=text, lang=language_code)
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as fp:
        tts.save(fp.name)
        display(Audio(fp.name, autoplay=True))

def process_image_to_story(image_path, target_language_code):
    caption = generate_caption(image_path)
    print("🖼️ Scene Caption:", caption)

    story = expand_caption_to_story(caption)
    print("📖 Generated Story (EN):\n", story)

    translated_story = translate_story(story, target_language_code)
    print(f"🌍 Translated Story ({target_language_code}):\n", translated_story)

    text_to_audio(translated_story, target_language_code)
if __name__ == "__main__":
    image_path = input("Enter the path to your image file: ")
    target_lang = input("Enter the target language code (e.g., 'fr' for French, 'es' for Spanish, 'hi' for Hindi): ")
    process_image_to_story(image_path, target_lang)


Enter the path to your image file: /27.jpg
Enter the target language code (e.g., 'fr' for French, 'es' for Spanish, 'hi' for Hindi): hi


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


🖼️ Scene Caption: a kitchen with a large island and a bar stool
📖 Generated Story (EN):
 A kitchen with a large island and a bar stool.
It’s an island that has a huge window on its
back wall, allowing the ocean to pour in on the
other side. On the other side, the ocean is at
least six feet deep, and the kitchen is the size
of a small swimming pool. The kitchen is located on
the ocean side of the home, and you can see
the waves crash into the kitchen from the ocean side.
On the other side, there is the ocean, which is
about two feet deep, and the kitchen is about six
feet deep. The kitchen is one of the smallest in
the house. Its island is about two feet wide, and
its base is about five feet deep. The kitchen has
no stove, and it’s located in the backyard. There is
no window on the kitchen’s front wall. The kitchen has
no floor. The kitchen’s floor is cement or masonry. The
kitchen’s kitchen island has a large hole in it, which
is about five inches in diameter. The kitchen is about
one 