In [None]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import json
import matplotlib.pyplot as plt
from PIL import Image
from types import SimpleNamespace
import random
import textwrap
import ipywidgets as widgets
from IPython.display import display
import uuid
from datetime import datetime

# Dataset locations (relative to the notebook's working directory).
filename = "sis/val.story-in-sequence.json"
image_folder = "images/val"
output_folder = "story_plots_val_horizontal/"


def _to_namespace(value):
    """Recursively convert JSON-decoded data so that every dict becomes a SimpleNamespace.

    Lists are converted element by element; any other value is returned unchanged.
    This yields the same objects as a ``json.dumps`` / ``json.loads(object_hook=...)``
    round trip, without re-serialising every record.
    """
    if isinstance(value, dict):
        return SimpleNamespace(**{key: _to_namespace(item) for key, item in value.items()})
    if isinstance(value, list):
        return [_to_namespace(item) for item in value]
    return value


# JSON text is UTF-8; do not depend on the platform's default encoding.
with open(filename, 'r', encoding='utf-8') as file:
    data = json.load(file)

images_data = data['images']
albums_data = data.get('albums', [])  # the 'albums' section is treated as optional
annotations_data = data['annotations']

# Attribute-style views of the raw records (e.g. img.id, ann.story_id).
images = [_to_namespace(image) for image in images_data]
albums = [_to_namespace(album) for album in albums_data]

# 'annotations' is a list of lists; keep that nesting.
annotations = [[_to_namespace(item) for item in annotation_list] for annotation_list in annotations_data]

# Quick preview of the first few records of each kind.
for img in images[:5]:
    print(f"Title: {img.title}, ID: {img.id}, URL: {img.url_o}")

for album in albums[:5]:
    print(f"Description: {album.description}, ID: {album.id}, Title: {album.title}")

for annotation_group in annotations[:5]:
    # Skip empty groups instead of failing on annotation_group[0].
    if not annotation_group:
        continue
    annotation = annotation_group[0]
    print(f"Original Text: {annotation.original_text}, Story ID: {annotation.story_id}, Photo ID: {annotation.photo_flickr_id}")

# Group every annotation by the story it belongs to: story_id -> [annotation, ...].
annotations_dict = {}
for annotationArray in annotations:
    for annotation in annotationArray:
        annotations_dict.setdefault(annotation.story_id, []).append(annotation)

story_ids = list(annotations_dict.keys())

In [9]:
# import os
# import textwrap
# import matplotlib.pyplot as plt
# from PIL import Image

# def plot_story_images_and_annotations(story_id, save_plot=True):
#     story_annotations = sorted(
#         annotations_dict.get(story_id, []),
#         key=lambda ann: ann.worker_arranged_photo_order
#     )

#     story_images = [
#         img for img in images if img.id in [ann.photo_flickr_id for ann in story_annotations]
#     ]

#     print

#     if story_annotations:
#         story_tier = story_annotations[0].tier
#         story_setting = story_annotations[0].setting
#         story_id = story_annotations[0].story_id
#         plot_title = f"{story_id}\nTier: {story_tier} | Setting: {story_setting}\n\n"
#     else:
#         plot_title = f"{story_id}\nTier: N/A | Setting: N/A"

#     num_images = len(story_images)
#     # Set up the figure with 1 row and num_images columns
#     fig, axes = plt.subplots(nrows=1, ncols=num_images, figsize=(num_images * 4, 6))

#     # Ensure axes is always iterable
#     if num_images == 1:
#         axes = [axes]

#     for ax, annotation in zip(axes, story_annotations):
#         image = next(
#             (img for img in story_images if img.id == annotation.photo_flickr_id),
#             None
#         )

#         if image:
#             image_path = os.path.join(image_folder, f"{image.id}.jpg")
#             if os.path.exists(image_path):
#                 img_data = Image.open(image_path)
#                 ax.imshow(img_data)
#             else:
#                 print(f"Image file {image_path} not found.")
#                 continue
#         else:
#             ax.text(
#                 0.5, 0.5, "Image not found",
#                 horizontalalignment='center', verticalalignment='center',
#                 transform=ax.transAxes
#             )

#         ax.axis('off')
#         # Removed individual captions to avoid overlap
#         # wrapped_text = textwrap.fill(annotation.original_text, width=30)
#         # ax.set_xlabel(wrapped_text, fontsize=10)
    
#     # Collect all captions into one paragraph
#     captions = " ".join([annotation.original_text for annotation in story_annotations])
#     # Wrap the text to fit within the figure width
#     wrapped_captions = textwrap.fill(captions, width=100)

#     fig.suptitle(plot_title, fontsize=14)
#     plt.tight_layout()
#     # Adjust the spacing to make room for the paragraph at the bottom
#     plt.subplots_adjust(bottom=0.2)

#     # Add the paragraph below the images
#     fig.text(
#         0.5,   # X location (centered)
#         0.02,  # Y location (2% from the bottom)
#         wrapped_captions,
#         ha='center',
#         va='bottom',
#         fontsize=16
#     )

#     if save_plot:
#         output_path = os.path.join(output_folder, f"{story_id}.png")
#         plt.savefig(output_path, bbox_inches='tight')
#         print(f"Plot saved for Story ID {story_id} at {output_path}")

#     plt.close(fig)  # Close the figure to avoid display in the notebook

# plot_story_images_and_annotations("40470")
# # Loop through each story_id and save the plot
# # for story_id in story_ids:
# #     print(story_ids)
# #     plot_story_images_and_annotations(story_id)


In [10]:
import os
import torch
import matplotlib.pyplot as plt
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from PIL import Image

def get_story_images(story_id, annotations_dict, images, image_folder):
    """Return the JPEG filenames of a story's photos, in story order.

    Args:
        story_id: Key into ``annotations_dict``.
        annotations_dict: Mapping of story id -> list of annotation objects that
            expose ``photo_flickr_id`` and ``worker_arranged_photo_order``.
        images: Iterable of image metadata objects that expose ``id``.
        image_folder: Directory expected to contain ``<photo id>.jpg`` files.

    Returns:
        A list of ``"<photo id>.jpg"`` names (relative to ``image_folder``) ordered
        by ``worker_arranged_photo_order``. Photos without a metadata entry in
        ``images`` are skipped silently; photos whose file is missing on disk are
        reported with a printed message and skipped. An unknown ``story_id``
        yields an empty list.
    """
    story_annotations = sorted(
        annotations_dict.get(story_id, []),
        key=lambda ann: ann.worker_arranged_photo_order
    )
    # Build the id lookup once instead of rebuilding a list for every image.
    known_image_ids = {img.id for img in images}

    image_filenames = []
    # Walk the *sorted annotations* so the result follows the story order rather
    # than the arbitrary order of the image metadata list.
    for ann in story_annotations:
        if ann.photo_flickr_id not in known_image_ids:
            continue
        image_name = f"{ann.photo_flickr_id}.jpg"
        image_path = os.path.join(image_folder, image_name)
        if os.path.exists(image_path):
            image_filenames.append(image_name)
        else:
            print("Files not exist", image_path)
    return image_filenames

def qwen_test_all_in_one(story_images, model_name="Qwen/Qwen2-VL-7B-Instruct", image_dir="images/val/", resize_to=(224, 224), device_index=0, query_tmp="print error message"):
    """Generate text for a sequence of images with a Qwen2-VL chat model.

    Args:
        story_images: Image filenames, resolved relative to ``image_dir``.
        model_name: Checkpoint name passed to ``from_pretrained`` for both the
            model and the processor.
        image_dir: Directory that contains the image files.
        resize_to: Size every image is resized to before it is given to the processor.
        device_index: CUDA device index the inputs are moved to when CUDA is
            available; otherwise the inputs stay on the CPU.
        query_tmp: Text prompt appended after the image placeholders.

    Returns:
        ``(processed_images, story_description)`` where ``processed_images`` is the
        list of resized RGB images that were loaded and ``story_description`` is the
        decoded generation. If no image could be loaded, returns
        ``([], "No valid images were processed.")`` without loading the model.

    Note:
        The model and processor are loaded from scratch on every call.
    """
    # Load the images first: if none is usable we can return before paying for
    # the model load.
    processed_images = []
    for img_filename in story_images:
        img_path = os.path.join(image_dir, img_filename)
        try:
            # The context manager releases the file handle; convert() returns an
            # independent in-memory image.
            with Image.open(img_path) as source_image:
                raw_image = source_image.convert('RGB')
            raw_image = raw_image.resize(resize_to, Image.LANCZOS)
        except Exception as exc:
            # Best effort: keep going with the remaining images, but say what was skipped.
            print(f"Skipping image {img_path}: {exc}")
            continue
        processed_images.append(raw_image)
    if not processed_images:
        return [], "No valid images were processed."

    use_cuda = torch.cuda.is_available()
    device = torch.device(f"cuda:{device_index}" if use_cuda else "cpu")
    if use_cuda:
        # Only touch the CUDA runtime when it exists; synchronize() fails otherwise.
        torch.cuda.empty_cache()
        torch.cuda.synchronize()

    model = Qwen2VLForConditionalGeneration.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
        device_map="sequential",
        use_cache=False
    ).eval()
    # Inference only: no gradients are needed for any parameter.
    for param in model.parameters():
        param.requires_grad = False
    processor = AutoProcessor.from_pretrained(model_name)

    # One image placeholder per loaded image, followed by the text prompt.
    conversation_content = [{"type": "image"} for _ in processed_images]
    conversation_content.append({"type": "text", "text": query_tmp})
    conversation = [
        {
            "role": "user",
            "content": conversation_content,
        }
    ]
    text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
    # NOTE(review): max_length is passed without truncation=True -- confirm whether
    # truncating the prompt was intended.
    inputs = processor(
        text=[text_prompt],
        images=processed_images,
        return_tensors="pt",
        padding=True,
        max_length=256
    )
    inputs = {k: v.to(device, non_blocking=True) for k, v in inputs.items()}
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=256,
            num_return_sequences=1,
            do_sample=False
        )
    # Drop the prompt tokens so that only the newly generated tokens are decoded.
    generated_ids = [
        output_id[len(input_id):]
        for input_id, output_id in zip(inputs['input_ids'], output_ids)
    ]
    story_description = processor.batch_decode(
        generated_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True
    )[0]
    return processed_images, story_description

In [11]:
from transformers import AutoModelForVision2Seq, AutoTokenizer, AutoImageProcessor, StoppingCriteria
import torch
from PIL import Image
import os

def apply_prompt_template(prompt):
    """Wrap ``prompt`` in the chat template: system turn, user turn with one
    ``<image>`` placeholder, and an open assistant turn."""
    system_turn = (
        '<|system|>\nA chat between a curious user and an artificial intelligence assistant. '
        "The assistant gives helpful, detailed, and polite answers to the user's questions.<|end|>\n"
    )
    user_turn = f'<|user|>\n<image>\n{prompt}<|end|>\n'
    assistant_header = '<|assistant|>\n'
    return system_turn + user_turn + assistant_header

class EosListStoppingCriteria(StoppingCriteria):
    """Stopping criterion that fires when a sequence ends with ``eos_sequence``.

    Args:
        eos_sequence: List of token ids marking the end of generation. Defaults to
            ``[32007]`` (presumably the id of the ``<|end|>`` token for the
            tokenizer used here -- TODO confirm).
    """

    def __init__(self, eos_sequence=None):
        # Create the default list per instance rather than sharing one mutable
        # default object between all instances.
        self.eos_sequence = [32007] if eos_sequence is None else eos_sequence

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        # Compare the trailing len(eos_sequence) token ids of every row against
        # the stop sequence; stop as soon as any row matches.
        last_ids = input_ids[:,-len(self.eos_sequence):].tolist()
        return self.eos_sequence in last_ids

def generate_story_from_image_file(image_filename, query, image_dir="images/val/"):
    """Answer ``query`` about one or more images with the xgen-mm model.

    Args:
        image_filename: A filename or a list of filenames, resolved relative to
            ``image_dir``.
        query: User prompt; it is wrapped with ``apply_prompt_template``.
        image_dir: Directory that contains the image files.

    Returns:
        The decoded model output, cut at the first ``"<|end|>"`` marker.

    Raises:
        ValueError: If no image filename is given. The prompt template always
            contains an ``<image>`` placeholder, so at least one image is required.

    Note:
        The model, tokenizer and image processor are loaded on every call, and
        the model is moved to CUDA unconditionally.
    """
    # Accept a single filename as well as a list; iterating a bare string would
    # otherwise yield one "filename" per character.
    if isinstance(image_filename, str):
        image_filename = [image_filename]
    image_filename = list(image_filename)
    if not image_filename:
        raise ValueError("generate_story_from_image_file requires at least one image filename")

    # Open the images before loading the model so that a bad path fails fast.
    raw_images = []
    for filename in image_filename:
        print("image_dir:" + image_dir)
        print("filename:" + filename)
        image_path = os.path.join(image_dir, filename)
        # The context manager releases the file handle; convert() returns an
        # independent in-memory image.
        with Image.open(image_path) as source_image:
            img = source_image.convert('RGB')
        raw_images.append(img)

    model_name_or_path = "Salesforce/xgen-mm-phi3-mini-instruct-r-v1"
    with torch.inference_mode():
        model = AutoModelForVision2Seq.from_pretrained(model_name_or_path, trust_remote_code=True, torch_dtype=torch.float16)
        tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True, use_fast=False, legacy=False)
        image_processor = AutoImageProcessor.from_pretrained(model_name_or_path, trust_remote_code=True)
        tokenizer = model.update_special_tokens(tokenizer)

        model = model.cuda()

        inputs = image_processor(raw_images, return_tensors="pt", image_aspect_ratio='anyres')
        # NOTE(review): the template inserts a single <image> placeholder no matter
        # how many images are passed -- confirm this is what the model expects
        # for multi-image input.
        prompt = apply_prompt_template(query)
        language_inputs = tokenizer([prompt], return_tensors="pt")
        inputs.update(language_inputs)
        inputs = {name: tensor.cuda() for name, tensor in inputs.items()}
        image_sizes = [img.size for img in raw_images]
        generated_text = model.generate(
            **inputs,
            image_size=image_sizes,
            pad_token_id=tokenizer.pad_token_id,
            do_sample=False,
            max_new_tokens=128,
            top_p=None,
            num_beams=1,
            stopping_criteria=[EosListStoppingCriteria()],
        )
        prediction = tokenizer.decode(generated_text[0], skip_special_tokens=True).split("<|end|>")[0]
        return prediction
        


In [None]:
# Generate a story for each of the first 20 stories with Qwen2-VL, then ask the
# xgen-mm model to judge the immersion of the generated story.
# Earlier prompt variants and the disabled figure-saving code were removed from
# this cell as dead code.
temp_story_ids = story_ids[:20]

# The generation prompt is the same for every story, so build it once.
query_generate = 'You will be given a sequence of images. Generate one sentence for each image input, resulting a 5 sentence story where each sentence must correlate with their respected image input. Make sure the story is a 5 sentence story. The story must be humanlike with a proper immersion and structure aspect. A proper immersion is a story that has a consistent World Building of the story. The world must have its own rules and logic. Ensure that the world feels real to the reader while they are within it. A proper structure aspect and organized the story using a clear beginning, middle, and end structure. Variate between a first person and third person story. You can generate a named entity for the entities detected in the image.'

for story_id in temp_story_ids:
    story_images = get_story_images(story_id, annotations_dict, images, image_folder)
    if not story_images:
        # Neither model can do anything useful without images.
        print(f"Story {story_id}: no image files found, skipping.")
        continue

    story_annotations = sorted(
        annotations_dict.get(story_id, []),
        key=lambda ann: ann.worker_arranged_photo_order
    )
    # Human-written reference story for this image sequence.
    captions = " ".join([annotation.original_text for annotation in story_annotations])

    # The judge prompt needs an actual story: generate it first.
    processed_images, generated_story = qwen_test_all_in_one(story_images, image_dir=image_folder, query_tmp=query_generate)

    # f-string so the generated story is interpolated; previously the literal
    # text "{generated_story}" was sent to the judge model.
    blip_query = f'Given this story "{generated_story}", do you think it contains a good immersion? A good immersion is a story with a consistent World Building of the story. The world must have its own rules and logic. Ensure that the world feels real to the reader while they are within it. Give me your judgement and reason'
    # Pass the story's images and the configured image folder instead of an
    # empty image list and the function's default directory.
    blipjudgement = generate_story_from_image_file(image_filename=story_images, query=blip_query, image_dir=image_folder)
    print(blipjudgement)

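Reference: unmodified template code from the source website for the `Salesforce/xgen-mm-phi3-mini-instruct-r-v1` model, kept commented out for comparison with the adapted version above.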

In [None]:
# from transformers import AutoModelForVision2Seq, AutoTokenizer, AutoImageProcessor, StoppingCriteria
# import torch
# import requests
# from PIL import Image

# # define the prompt template
# def apply_prompt_template(prompt):
#     s = (
#             '<|system|>\nA chat between a curious user and an artificial intelligence assistant. '
#             "The assistant gives helpful, detailed, and polite answers to the user's questions.<|end|>\n"
#             f'<|user|>\n<image>\n{prompt}<|end|>\n<|assistant|>\n'
#         )
#     return s 
# class EosListStoppingCriteria(StoppingCriteria):
#     def __init__(self, eos_sequence = [32007]):
#         self.eos_sequence = eos_sequence

#     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
#         last_ids = input_ids[:,-len(self.eos_sequence):].tolist()
#         return self.eos_sequence in last_ids      

# # load models
# model_name_or_path = "Salesforce/xgen-mm-phi3-mini-instruct-r-v1"
# model = AutoModelForVision2Seq.from_pretrained(model_name_or_path, trust_remote_code=True)
# tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True, use_fast=False, legacy=False)
# image_processor = AutoImageProcessor.from_pretrained(model_name_or_path, trust_remote_code=True)
# tokenizer = model.update_special_tokens(tokenizer)

# # craft a test sample
# img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg'
# raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')
# query = "how many images that you take as an input?"

# model = model.cuda()
# inputs = image_processor([raw_image,raw_image,raw_image,raw_image,raw_image], return_tensors="pt")
# prompt = apply_prompt_template(query)
# language_inputs = tokenizer([prompt], return_tensors="pt")
# inputs.update(language_inputs)
# inputs = {name: tensor.cuda() for name, tensor in inputs.items()}
# generated_text = model.generate(**inputs, image_size=[raw_image.size],
#                                 pad_token_id=tokenizer.pad_token_id,
#                                 do_sample=False, max_new_tokens=768, top_p=None, num_beams=1,
#                                 stopping_criteria = [EosListStoppingCriteria()],
#                                 )
# prediction = tokenizer.decode(generated_text[0], skip_special_tokens=True).split("<|end|>")[0]
# print("==> prediction: ", prediction)
# # output: ==> prediction: There is one dog in the picture.
