In [None]:
!pip install streamlit pyngrok transformers gtts

Collecting streamlit
  Downloading streamlit-1.45.1-py3-none-any.whl.metadata (8.9 kB)
Collecting pyngrok
  Downloading pyngrok-7.2.8-py3-none-any.whl.metadata (10 kB)
Collecting gtts
  Downloading gTTS-2.5.4-py3-none-any.whl.metadata (4.1 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m44.3/44.3 kB[0m [31m933.1 kB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Collecting click<9,>=7.0 (from streamlit)
  Downloading click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Downloading streamlit-1.45.1-py3-none-any.whl (9.9 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m9.9/9.9 MB[0m [

In [None]:
%%writefile app1.py
import streamlit as st
from PIL import Image
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration
from transformers import MarianMTModel, MarianTokenizer
from gtts import gTTS
import tempfile
import base64

# setting up device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# loading transformer model
# Streamlit re-executes this script on every widget interaction, so the BLIP
# weights must be cached or they are reloaded on each rerun.
# show_spinner=False keeps this call from drawing anything before
# st.set_page_config() runs further down the script.
@st.cache_resource(show_spinner=False)
def load_caption_model():
    """Load the BLIP captioning processor and model once per server process.

    Returns:
        tuple: ``(processor, model)`` where ``model`` has been moved to ``device``.
    """
    checkpoint = "Salesforce/blip-image-captioning-base"
    blip_processor = BlipProcessor.from_pretrained(checkpoint)
    blip_model = BlipForConditionalGeneration.from_pretrained(checkpoint).to(device)
    return blip_processor, blip_model


processor, model = load_caption_model()


# Display name (shown in the language selectbox) -> language code.
# The code is used both to pick the translation checkpoint and as the
# text-to-speech language.
# NOTE(review): confirm a translation checkpoint actually exists for every
# non-English code listed here.
LANGUAGES = dict(
    [
        ("English", "en"),
        ("French", "fr"),
        ("German", "de"),
        ("Hindi", "hi"),
        ("Spanish", "es"),
        ("Tamil", "ta"),
        ("Bengali", "bn"),
        ("Telugu", "te"),
        ("Gujarati", "gu"),
    ]
)

# translating captions
@st.cache_resource
def load_translation_model(lang_code):
    """Load the English-to-``lang_code`` Marian tokenizer and model.

    The result is cached by Streamlit, so each language's checkpoint is
    loaded at most once per server process.

    Args:
        lang_code: Target language code appended to ``Helsinki-NLP/opus-mt-en-``.

    Returns:
        tuple: ``(tokenizer, translator)`` for the requested language pair.
    """
    checkpoint = f"Helsinki-NLP/opus-mt-en-{lang_code}"
    # Tuple elements are evaluated left to right: tokenizer first, then model.
    return (
        MarianTokenizer.from_pretrained(checkpoint),
        MarianMTModel.from_pretrained(checkpoint),
    )

def translate_caption(text, target_lang_code):
    """Translate ``text`` from English into ``target_lang_code``.

    Args:
        text: English text to translate.
        target_lang_code: Language code passed to ``load_translation_model``.

    Returns:
        str: The first generated sequence, decoded with special tokens removed.
    """
    marian_tokenizer, marian_model = load_translation_model(target_lang_code)
    encoded = marian_tokenizer(text, return_tensors="pt", padding=True)
    generated_ids = marian_model.generate(**encoded)
    return marian_tokenizer.decode(generated_ids[0], skip_special_tokens=True)

# generate captions
def predict_step(image_paths):
    """Generate one caption per image.

    Args:
        image_paths: Iterable of values accepted by ``Image.open``.

    Returns:
        list[str]: Whitespace-stripped captions, in the same order as the input.
    """

    def _caption_one(source):
        # The captioning processor is fed RGB input, so convert other modes first.
        picture = Image.open(source)
        if picture.mode != "RGB":
            picture = picture.convert("RGB")

        encoded = processor(images=picture, return_tensors="pt").to(device)
        token_ids = model.generate(**encoded)
        return processor.decode(token_ids[0], skip_special_tokens=True).strip()

    return [_caption_one(source) for source in image_paths]

# ui
# Page flow: upload an image -> caption it in English -> optionally translate
# -> optionally synthesize speech for the final caption.
st.set_page_config(page_title="Auto Caption AI", layout="centered")
st.title("üåçüñºÔ∏è Auto Caption AI")

uploaded_file = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])
language = st.selectbox("Choose output language", list(LANGUAGES.keys()))

if uploaded_file is not None:
    target_code = LANGUAGES[language]

    image = Image.open(uploaded_file)
    if image.mode != "RGB":
        image = image.convert("RGB")
    # use_column_width is deprecated for st.image in the Streamlit 1.45.x
    # release this notebook installs; use_container_width is its replacement.
    st.image(image, caption="Uploaded Image", use_container_width=True)

    with st.spinner("Loading..."):
        caption_en = predict_step([uploaded_file])[0]

        if target_code == "en":
            final_caption = caption_en
        else:
            try:
                final_caption = translate_caption(caption_en, target_code)
            except OSError as exc:
                # from_pretrained raises OSError when the checkpoint cannot be
                # found or downloaded; show a readable message instead of a
                # raw traceback and end this script run.
                st.error(f"Could not load a translation model for {language}: {exc}")
                st.stop()

        st.success("Generated Caption:")
        st.write(f"üìú **{final_caption}**")

    # App language code -> gTTS language code. Currently an identity mapping;
    # kept as the single place to remap if the two code sets ever diverge.
    gtts_lang_map = {
        "en": "en", "fr": "fr", "de": "de", "hi": "hi", "es": "es",
        "ta": "ta", "bn": "bn", "te": "te", "gu": "gu"
    }

    # text to speech
    if st.button("üîä Listen to Caption"):
        lang_code = gtts_lang_map[target_code]
        tts = gTTS(text=final_caption, lang=lang_code)
        # Write the MP3 into an auto-deleting temp file and read it back, so
        # no file or open handle is left behind after each click.
        with tempfile.NamedTemporaryFile(suffix=".mp3") as tmpfile:
            tts.write_to_fp(tmpfile)
            tmpfile.seek(0)
            audio_bytes = tmpfile.read()
        st.audio(audio_bytes, format="audio/mp3")

Writing app1.py


In [None]:
from pyngrok import ngrok
import time

ngrok.kill()

!streamlit run app1.py &>/content/logs.txt &
time.sleep(2)

!ngrok authtoken 2wTkv6SA2uoQgDLIvjBHLHManMa_3mjW8EhA5kEqKJVbvZcKZ

public_url = ngrok.connect(addr=8501)

print(f"üåêStreamlit app is live at: {public_url}")


Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml
üåêStreamlit app is live at: NgrokTunnel: "https://663e-34-139-151-227.ngrok-free.app" -> "http://localhost:8501"


In [None]:
# IMPORTANT: run this when finished to disconnect this working session's tunnel.
ngrok.kill()