In [18]:
import os
import torch
import torchvision.models as models
from torchvision import transforms
from dotenv import load_dotenv
from PIL import Image
import requests
import json
import google.generativeai as genai
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
from groq import Groq
import re

In [19]:
# load_dotenv() loads variables from a .env file into the process environment;
# the two API keys used by the story-generation cells are then read from it.
# A key that is not set comes back as None.
load_dotenv()
gemini_api_key = os.environ.get("GEMINI_API_KEY")
deepseek_api_key = os.environ.get("DEEPSEEK_API_KEY")

### Extracting Image captions

In [20]:
# Load the BLIP processor and model from the same checkpoint.
# `processor` and `model` are read as globals by generate_captions().
BLIP_CHECKPOINT = "Sof22/image-caption-large-copy"
processor = BlipProcessor.from_pretrained(BLIP_CHECKPOINT)
model = BlipForConditionalGeneration.from_pretrained(BLIP_CHECKPOINT)

In [21]:
# Function to generate captions for multiple images
def generate_captions(image_paths):
    """Generate one caption per image using the notebook's BLIP model.

    Relies on the module-level ``processor`` and ``model`` objects.

    Args:
        image_paths: Iterable of image locations accepted by ``Image.open``.

    Returns:
        A list of caption strings in the same order as ``image_paths``.
        An empty input returns an empty list without calling the model.
    """
    # Materialise once so generators and other non-sized iterables work too.
    paths = list(image_paths)
    if not paths:
        # Nothing to caption; skip the processor/model call entirely.
        return []

    images = []
    for img_path in paths:
        # The context manager closes the underlying file once convert() has
        # produced an independent RGB copy of the pixel data.
        with Image.open(img_path) as img:
            images.append(img.convert("RGB"))

    # Preprocess all images as a single batch.
    inputs = processor(images=images, return_tensors="pt", padding=True)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        caption_ids = model.generate(**inputs)

    # One decoded caption per generated sequence in the batch.
    return [processor.decode(ids, skip_special_tokens=True) for ids in caption_ids]



In [22]:
# Example usage: caption a few sample images (replace with actual image paths)
image_paths = ["hi.jpg", "img.jpg", "imgg.jpg"]
captions = generate_captions(image_paths)

# Pair every path with its caption once, print the pairs, and keep the
# captions in `caption`, the list interpolated into the story prompts below.
path_caption_pairs = list(zip(image_paths, captions))
for img, cap in path_caption_pairs:
    print(f"Image: {img} -> Caption: {cap}")
caption = [text for _, text in path_caption_pairs]


Image: hi.jpg -> Caption: there is a man sitting on a chair in a room with a red tie and glasses on his head
Image: img.jpg -> Caption: araffe standing in front of a church with a sky background and a person holding a cell phone
Image: imgg.jpg -> Caption: arafed man in a button down shirt and black pants standing in front of a curtain and smiling


# Integrating an LLM for story generation

### Google Gemini

In [25]:
genai.configure(api_key=gemini_api_key)
# Use a dedicated name for the Gemini client: `model` is the BLIP captioning
# model that generate_captions() reads as a global, so rebinding it here would
# break any later re-run of the captioning cells.
gemini_model = genai.GenerativeModel("models/gemini-1.5-flash")

# Settings of the story
# Length (interpolated into the prompt as a word count)
n = input("Enter the length of story to generate: ")
# Theme
theme = input("Enter the theme of the story: ")

# Build the prompt. Adjacent string literals are concatenated as-is, so each
# sentence carries its own trailing space to avoid "theme.Provide ..." joins.
prompt = (
    f"Write a minimum {n}-word long story about the following context {caption} and give it a touch of {theme}-theme. "
    "Provide a response with the combined context of all the captions that make sense. Be creative and imaginative. "
    "Start your response from the beginning of the story and conclude it at the end."
)

# Generate text
response = gemini_model.generate_content(prompt)

# `result` holds the story text taken from the first part of the first candidate.
result = response.candidates[0].content.parts[0].text
print(result)

The man in the red tie, perched on the worn armchair, adjusted his glasses perched precariously on his head.  The room, dim and dusty, reflected the twilight of his life.  He traced the faded floral pattern on the armchair fabric, a stark contrast to the vibrant red of his tie – a tie he’d worn to his daughter's wedding, a wedding he’d barely remembered, the details blurred by the creeping fog of his dementia.  A photograph sat on the table beside him, a faded image of a giraffe, impossibly graceful, standing before a church. It was a picture his daughter had taken on a trip, a trip he'd longed to join but couldn't.  He remembered her laughter, bright and clear as the summer sky in the background of the photo, a sky he no longer truly saw.  Then, another memory flickered – a younger him, a man in a crisp button-down shirt and black pants, standing before a curtain, smiling a genuine, heartfelt smile.  He was about to go on stage, to play the clarinet, a melody he could no longer recall

### DeepSeek

In [27]:
# Settings of the story
# Length (interpolated into the prompt as a word count)
n = input("Enter the length of story to generate: ")
# Theme
theme = input("Enter the theme of the story: ")

# NOTE(review): the key is read from DEEPSEEK_API_KEY but is handed to the
# Groq client, so it presumably has to be a Groq API key — confirm.
client = Groq(api_key=deepseek_api_key)
completion = client.chat.completions.create(
    model="deepseek-r1-distill-llama-70b",
    messages=[
        {
            "role": "user",
            "content": (f"""Write a complete story of exactly {n} words about {caption} with a {theme} theme.
    Story structure: Beginning, Middle, Conclusion
    Omit any <think> tags or internal commentary""")
        }
    ],
    # DeepSeek's usage notes for R1 models recommend a temperature of 0.5-0.7
    # (0.6 suggested); much higher values tend to produce incoherent text.
    temperature=0.6,
    max_completion_tokens=1024,
    top_p=0.95,
    stream=True,
    reasoning_format="raw"
)

# Collect the streamed deltas and join them once at the end.
story_chunks = []
for chunk in completion:
    # Defensive: skip any chunk that carries no choices.
    if not chunk.choices:
        continue
    story_chunks.append(chunk.choices[0].delta.content or "")
story_output = "".join(story_chunks)

# Remove text between <think> and </think>. The `\Z` alternative also drops an
# unterminated <think> block, e.g. when generation stops at the token limit
# before the closing tag is emitted.
clean_story = re.sub(r"<think>.*?(?:</think>|\Z)", "", story_output, flags=re.DOTALL)

# Print the final story
print(clean_story.strip())

Once, there was a man named Harold, who found himself sitting in a dimly lit room, dressed in his usual red tie and glasses, though neither/detail seemed important that day. The room felt particularly empty, save for the quiet creak of the chair he sat on. There was a sense of weight in the air, a gravity he couldn’t escape. Across town, a giraffe stood in front of a church, its elegant neck turned as if scanning the horizon. The sky was pale, a soft pink blending into its grayish tones, an artist’s rendition of neither night nor day. In the foreground, a man stood with a phone in his hand, eternally paused in his pursuit of capturing the moment. 

As the day stretched, the space between purpose and drift widensedened. How anyone or anywhere arrived-there was not so much a mystery as an impossibility in such an intricate design. Behind a closed curtain at an assembly hall nearby—one not so much as approached by daylight—there he was, a buttoned-down figure with darkened teeth speaking.