<a href="https://colab.research.google.com/github/leaoskr/AIexperimentLand/blob/main/ImageToStory.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# @title Installation
# Hugging Face Transformers: provides the `pipeline` API imported by imageToStory.py.
!pip install transformers
# NOTE(review): neither openai nor langchain is imported by the imageToStory.py
# cell in this notebook -- confirm whether these packages are still needed.
!pip install openai langchain

# Audio: fairseq supplies the text-to-speech hub interface imported by imageToStory.py.
!pip install fairseq
# NOTE(review): g2p_en is not imported directly by the app; presumably a runtime
# dependency of the fairseq text-to-speech model -- confirm.
!pip install g2p_en

In [None]:
# @title Generate Application Python File
%%writefile imageToStory.py
import streamlit as st
import urllib.request
from PIL import Image
from transformers import pipeline

from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
from fairseq.models.text_to_speech.hub_interface import TTSHubInterface
import soundfile as sf

#Method
def imageTotext(url):
  """Caption an image with the BLIP base image-captioning model.

  Args:
    url: Image reference passed unchanged to the Hugging Face
      ``image-to-text`` pipeline.

  Returns:
    The ``generated_text`` value of the first result produced by the pipeline.
  """
  # NOTE(review): the pipeline object is constructed on every call, so the
  # model is set up again each time this function runs.
  blip = pipeline("image-to-text",model="Salesforce/blip-image-captioning-base")
  results = blip(url)
  first_result = results[0]
  return first_result['generated_text']

def textTostory(short_text):
  """Expand a short caption into a longer story using GPT-2.

  Args:
    short_text: Prompt text that the generated story continues from.

  Returns:
    The ``generated_text`` value of the single generated sequence.
  """
  generator = pipeline('text-generation', model='gpt2')
  # Only the first candidate was ever read, so ask for exactly one sequence
  # instead of generating three and discarding two of them.
  outputs = generator(short_text, max_length=100, num_return_sequences=1)
  return outputs[0]['generated_text']

def storyToaudio(story):
  """Synthesize speech for a story with the FastSpeech 2 LJSpeech model.

  Args:
    story: Text to convert to speech.

  Returns:
    A ``(wav, rate)`` tuple exactly as produced by
    ``TTSHubInterface.get_prediction``; the caller passes both values to
    ``soundfile.write``.
  """
  # Fetch the model ensemble, its config and the task from the Hugging Face hub.
  hub_id = "facebook/fastspeech2-en-ljspeech"
  overrides = {"vocoder": "hifigan", "fp16": False}
  ensemble, cfg, task = load_model_ensemble_and_task_from_hf_hub(
      hub_id,
      arg_overrides=overrides,
  )
  tts_model = ensemble[0]

  # Fold the task's data config into cfg before building the generator.
  TTSHubInterface.update_cfg_with_data_cfg(cfg, task.data_cfg)
  synthesizer = task.build_generator([tts_model], cfg)

  model_input = TTSHubInterface.get_model_input(task, story)
  waveform, sample_rate = TTSHubInterface.get_prediction(
      task, tts_model, synthesizer, model_input
  )
  return waveform, sample_rate

#Title
st.title('AI: Turn Image into Audio Story')

#Display Image
st.header('Input Image')
st.text_input("Image URL", key="image")

# Validate the entered URL by downloading it and checking that it is an image.
image_url = st.session_state.image
if not image_url:
  # Test emptiness by value: identity comparison against a string literal
  # (`is ""`) depends on interpreter string interning and raises a SyntaxWarning.
  st.write("Enter a valid image URL")
elif not image_url.lower().startswith(("http://", "https://")):
  # urlretrieve also accepts non-web schemes such as file://, which would read
  # files local to the machine running the app; only web URLs are allowed here.
  st.error('The URL is not valid', icon="🚨")
else:
  try:
    urllib.request.urlretrieve(image_url, "test.jpg")
    # Opening the file is enough for Pillow to reject non-image content; the
    # context manager closes the file handle that was previously left open.
    with Image.open("test.jpg"):
      pass
    st.image(image_url)
  except Exception:
    # Any download or decode failure is reported as an invalid URL. Catching
    # Exception rather than using a bare except lets BaseException-derived
    # signals (e.g. KeyboardInterrupt) propagate.
    st.error('The URL is not valid', icon="🚨")

#Generate
# Seed the flag only when it is absent so an existing value is preserved.
if 'clicked' not in st.session_state:
    st.session_state['clicked'] = False

def click_button():
    """Button callback: record in session state that Generate was pressed."""
    st.session_state['clicked'] = True

st.button(label='Generate', on_click=click_button)

# Run the image -> caption -> story -> audio pipeline once Generate was pressed.
image_url = st.session_state.image
if st.session_state.clicked and not image_url:
  # Without this guard an empty text box would be handed to the captioner
  # and the run would fail inside the model call.
  st.warning('Enter an image URL before generating')
elif st.session_state.clicked:
  with st.spinner('Wait for it...'):
    ##1.Image to text
    text = imageTotext(image_url)
    st.toast('Image to Text: successful', icon='🎉')
    ##2.text to story
    story = textTostory(text)
    st.toast('Text to Story: successful', icon='🎉')
    ##3.story to audio
    wav, rate = storyToaudio(story)
    # The waveform is written to disk so st.audio can load it by path.
    sf.write('audio.flac', wav, rate)
    st.toast('Story to Audio: successful', icon='🎉')

    #Display Audio
    st.header('Output Audio Story')
    st.audio('audio.flac')

    #Display Text
    st.write(story)

In [None]:
# @title Run Streamlit
!pip install -q streamlit
# localtunnel is used in the last command to expose the locally running app.
!npm install localtunnel
# Start the app in the background (trailing `&`), sending its output to log.txt.
!streamlit run imageToStory.py &>log.txt &
# Tunnel port 8501. NOTE(review): presumably Streamlit's default port, since
# the run command above does not set one -- confirm.
!npx localtunnel --port 8501