In [None]:
!pip install gradio langchain langchain-community transformers torch bitsandbytes pyht requests sentencepiece

Collecting gradio
  Downloading gradio-5.7.1-py3-none-any.whl.metadata (16 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.8-py3-none-any.whl.metadata (2.9 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting pyht
  Downloading pyht-0.1.6-py3-none-any.whl.metadata (8.6 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.5-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.5.0 (from gradio)
  Downloading gradio_client-1.5.0-py3-none-any.whl.metadata (7.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub

In [None]:

import os
import torch
import requests
import random
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
from langchain import HuggingFacePipeline, LLMChain
from langchain.prompts import PromptTemplate
from pyht import Client
from pyht.client import TTSOptions
from PIL import Image
from io import BytesIO
import gradio as gr

# Hugging Face access token. It is taken from the HF_TOKEN environment variable
# when that is set, so a real token never has to be saved inside the notebook;
# the placeholder stays as the fallback for anyone who edits the value in place.
access_token = os.environ.get("HF_TOKEN", "YOUR-ACCESS-TOKEN")

# Checkpoint used for story generation.
model_name = "mistralai/Mistral-7B-Instruct-v0.1"

# Quantization settings passed to the model loader below:
# 4-bit weights, "nf4" quantization type, double quantization enabled.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# Tokenizer for `model_name`, fetched with the access token.
tokenizer = AutoTokenizer.from_pretrained(model_name, token=access_token)

# Causal language model for `model_name`, loaded with the quantization config
# above and automatic device placement.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
    trust_remote_code=True,
    token=access_token,
)

# Set up the text generation pipeline.
# do_sample=True is what makes temperature / top_k / top_p take effect: without it
# generation is greedy and those sampling settings are ignored, so every run with
# the same prompt would produce the same story.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    max_new_tokens=500,
    pad_token_id=tokenizer.eos_token_id,
    device_map="auto",
    do_sample=True,
    temperature=1.0,
    top_k=50,
    top_p=0.9
)

# Wrap the pipeline so LangChain chains can use it as their LLM.
llm = HuggingFacePipeline(pipeline=pipe)

# Captioning pipeline shared by all calls; built lazily on first use so the
# captioning model is loaded once instead of on every request.
_image_captioner = None


def _get_image_captioner():
    """Return the shared image-captioning pipeline, creating it on first use."""
    global _image_captioner
    if _image_captioner is None:
        # First CUDA device when one is available, otherwise CPU (-1).
        device = 0 if torch.cuda.is_available() else -1
        _image_captioner = pipeline(
            "image-to-text",
            model="Salesforce/blip-image-captioning-base",
            device=device,
        )
    return _image_captioner


# Function to generate text from an image
def generate_text_from_image(url: str) -> str:
    """Caption an image and return the generated caption text.

    Args:
        url: Location of the image, handed to the captioning pipeline as-is
            (surrounding whitespace is removed when it is a string).

    Returns:
        The ``generated_text`` of the first result produced by the pipeline.

    Raises:
        ValueError: If ``url`` is empty, blank or ``None``.
    """
    if isinstance(url, str):
        url = url.strip()
    if not url:
        raise ValueError("An image URL is required to generate a caption.")

    image_to_text = _get_image_captioner()
    return image_to_text(url)[0]["generated_text"]

# Plot twists; one is picked at random for every story.
_STORY_TWISTS = (
    "a hidden treasure is discovered in the forest",
    "a magical bird helps the characters solve a riddle",
    "a secret door leads to a wonderful land of sweets",
    "a wise owl teaches them the value of friendship",
    "a sparkling rainbow lights up the sky and grants a wish",
)

# The prompt is assembled from these pieces. The optional USER INPUT line is the
# only difference between the two prompt variants, so it lives in its own piece.
# The leading indentation inside the strings is part of the prompt text.
_STORY_PROMPT_HEAD = """
        You are a creative storyteller specializing in engaging children's stories. Generate a complete story with the following requirements:
        - The story must have a clear beginning, middle, and end.
        - The characters should face an exciting challenge or adventure.
        - The ending must resolve all conflicts and leave the reader with a happy or thoughtful conclusion.

        CONTEXT: {scenario}
"""
_STORY_PROMPT_USER_LINE = "        USER INPUT: {user_input}\n"
_STORY_PROMPT_TAIL = """        RANDOM TWIST: {random_twist}

        Create the story below:
        BEGINNING:
        Start by introducing the characters and setting.

        MIDDLE:
        Describe their adventure or problem they encounter.

        END:
        Conclude with how the problem is resolved or how the adventure ends positively.

        STORY:
        """


# Function to generate a complete children's story
def generate_story(image_url: str, user_input: str = "") -> str:
    """Generate a children's story from an image and an optional user request.

    The image is captioned with ``generate_text_from_image``; the caption, a
    randomly chosen twist and (when given) the user's request are filled into
    the prompt, which is run through an ``LLMChain`` on the module-level ``llm``.

    Args:
        image_url: Location of the image to base the story on.
        user_input: Optional extra instructions. ``None``, empty and
            whitespace-only values are treated as "no user input".

    Returns:
        The chain's ``text`` output after the last ``STORY:`` marker, stripped
        of surrounding whitespace.
    """
    image_text = generate_text_from_image(image_url)
    random_twist = random.choice(_STORY_TWISTS)
    user_input = (user_input or "").strip()

    chain_inputs = {"scenario": image_text, "random_twist": random_twist}
    if user_input:
        template = _STORY_PROMPT_HEAD + _STORY_PROMPT_USER_LINE + _STORY_PROMPT_TAIL
        input_variables = ['scenario', 'user_input', 'random_twist']
        chain_inputs["user_input"] = user_input
    else:
        template = _STORY_PROMPT_HEAD + _STORY_PROMPT_TAIL
        input_variables = ['scenario', 'random_twist']

    prompt = PromptTemplate(template=template, input_variables=input_variables)
    chain = LLMChain(llm=llm, prompt=prompt, verbose=True)
    # invoke() replaces the deprecated direct call on the chain object.
    story = chain.invoke(chain_inputs)['text']

    # Keep only what follows the final "STORY:" marker, in case the prompt is
    # echoed back in front of the generated text.
    return story.split('STORY:')[-1].strip()


# Set up the TTS client.
# Credentials come from the PLAY_HT_USER_ID / PLAY_HT_API_KEY environment variables
# when they are set, so real keys do not have to be stored in the notebook; the
# placeholders remain as the fallback for in-place editing.
client = Client(
    user_id=os.environ.get("PLAY_HT_USER_ID", "YOUR-USER-ID"),
    api_key=os.environ.get("PLAY_HT_API_KEY", "YOUR-API-KEY"),
)

# Voice used for every narration.
tts_options = TTSOptions(voice="s3://voice-cloning-zero-shot/d9ff78ba-d016-47f6-b0ef-dd630f59414e/female-cs/manifest.json")

# Function to generate and save speech from story
def generate_speech_from_story(story: str, audio_path: "str | None" = None) -> str:
    """Synthesize narration for ``story`` and save it to a file.

    The audio chunks yielded by ``client.tts(story, tts_options)`` are written
    to the target file in the order they arrive.

    Args:
        story: Text to narrate.
        audio_path: Where to write the audio. When omitted, a new uniquely
            named ``story_audio_*.mp3`` file is created in the system temp
            directory, so simultaneous requests do not overwrite each other's
            narration. The file is not deleted by this function.

    Returns:
        Path of the written audio file.

    Raises:
        ValueError: If ``story`` is empty or blank.
    """
    import tempfile  # only needed here

    if not story or not story.strip():
        raise ValueError("Cannot generate speech from an empty story.")

    if audio_path is None:
        # Reserve a unique file name; the file is reopened for writing below.
        with tempfile.NamedTemporaryFile(
            prefix="story_audio_", suffix=".mp3", delete=False
        ) as reserved:
            audio_path = reserved.name

    with open(audio_path, "wb") as f:
        for chunk in client.tts(story, tts_options):
            f.write(chunk)
    return audio_path

# Gradio callback: turns the form inputs into a story and its narration
def process_story(image_url, user_input):
    """Handle a click on the "Generate Story" button.

    Args:
        image_url: Content of the image URL textbox.
        user_input: Content of the optional prompt textbox.

    Returns:
        A ``(story, audio_file)`` tuple: the story text and the path of the
        narration file produced for it.

    Raises:
        gr.Error: If no image URL was entered, so the UI shows a readable
            message instead of a failure from deeper in the pipeline.
    """
    image_url = (image_url or "").strip()
    if not image_url:
        raise gr.Error("Please enter an image URL.")

    story = generate_story(image_url, (user_input or "").strip())
    audio_file = generate_speech_from_story(story)
    return story, audio_file




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
import gradio as gr

theme = gr.themes.Soft()

# Page layout: heading, a row with the two inputs, a row with the two outputs,
# then the button that triggers generation.
with gr.Blocks(theme=theme) as demo:
    gr.Markdown("# VisionVoice 🎨📜")
    gr.Markdown("Generate a creative story and audio narration from an image URL!")

    with gr.Row():
        image_url = gr.Textbox(
            placeholder="Enter image URL...",
            label="Image URL",
        )
        user_input = gr.Textbox(
            placeholder="Enter a custom prompt for the story...",
            label="User Input (Optional)",
        )

    with gr.Row():
        story_output = gr.Textbox(
            interactive=False,
            lines=10,
            label="Generated Story",
        )
        audio_output = gr.Audio(type="filepath", label="Generated Audio")

    generate_button = gr.Button("Generate Story")
    # Clicking the button runs process_story on both textboxes and fills both outputs.
    generate_button.click(
        fn=process_story,
        inputs=[image_url, user_input],
        outputs=[story_output, audio_output],
    )

# Launch the app
demo.launch(debug=True)

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://4ba8740a8901f89ca8.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)






[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
        You are a creative storyteller specializing in engaging children's stories. Generate a complete story with the following requirements:
        - The story must have a clear beginning, middle, and end.
        - The characters should face an exciting challenge or adventure.
        - The ending must resolve all conflicts and leave the reader with a happy or thoughtful conclusion.

        CONTEXT: cartoon scene of a garden with a boy and girl illustration for children stock illustration
        USER INPUT: Name the characters Tom and Lily
        RANDOM TWIST: a magical bird helps the characters solve a riddle

        Create the story below:
        BEGINNING:
        Start by introducing the characters and setting.

        MIDDLE:
        Describe their adventure or problem they encounter.

        END:
        Conclude with how the problem is resolved or how the adventure ends positively.

    




[1m> Finished chain.[0m
Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://4ba8740a8901f89ca8.gradio.live


