**updates:**

- added function to ignore tiny images (less than 50x50 pixels) as the Qwen model cannot read them anyway
- function to entirely delete duplicate images
- now uses `nodes.json` files as input instead of `.md` files
- added function to extract singular images: `process_single_image`


**issues and possible improvements/changes:**
- limitation in `find_and_remove_duplicates` function: sometimes visually identical images cannot be considered as 'duplicates' if they have slightly different dimensions, formats, or metadata

# Requirements

In [2]:
!pip install transformers accelerate numpy Requests torch torchvision qwen-vl-utils av ipython reportlab fpdf python-docx pillow huggingface_hub --quiet
!pip install git+https://github.com/huggingface/transformers accelerate --quiet

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [3]:
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, TextIteratorStreamer
from qwen_vl_utils import process_vision_info
from PIL import Image
import uuid
import io
from threading import Thread
# from reportlab.lib.pagesizes import A4
# from reportlab.lib.styles import getSampleStyleSheet
# from reportlab.platypus import SimpleDocTemplate, Image as RLImage, Paragraph, Spacer
# from reportlab.lib.units import inch
# from reportlab.pdfbase import pdfmetrics
# from reportlab.pdfbase.ttfonts import TTFont
import docx
# from docx.enum.text import WD_ALIGN_PARAGRAPH
import json

# Qwen functions

In [4]:
# Load the model and processor
model_name = "Qwen/Qwen2-VL-2B-Instruct"

model = Qwen2VLForConditionalGeneration.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float32).to("cpu").eval()

processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)

image_extensions = Image.registered_extensions()


The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
import os

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

torch.cuda.empty_cache()


In [6]:
import base64
def identify_and_save_blob(blob_path):
    """Identifies if the blob is an image and saves it."""
    try:
        with open(blob_path, 'rb') as file:
            blob_content = file.read()
            try:
                Image.open(io.BytesIO(blob_content)).verify()  # Check if it's a valid image
                extension = ".png"  # Default to PNG for saving
                media_type = "image"
            except (IOError, SyntaxError):
                raise ValueError("Unsupported media type. Please upload a valid image.")

            filename = f"temp_{uuid.uuid4()}_media{extension}"
            with open(filename, "wb") as f:
                f.write(blob_content)

            return filename, media_type

    except FileNotFoundError:
        raise ValueError(f"The file {blob_path} was not found.")
    except Exception as e:
        raise ValueError(f"An error occurred while processing the file: {e}")

def decode_base64_to_image(base64_string):
    """Decodes a base64 string and saves it as a temporary image file."""
    image_data = base64.b64decode(base64_string)
    image = Image.open(io.BytesIO(image_data))

    filename = f"temp_{uuid.uuid4()}.png"  # Always save as PNG
    image.save(filename)

    return filename

def qwen_inference(media_input, text_input=None):
    torch.cuda.empty_cache()
    # torch.cuda.memory_summary(device=None, abbreviated=False)
    """Handles inference for the selected model."""
    if media_input.startswith("data:image"):  # Detect base64 input
            base64_str = media_input.split(",")[1]  # Remove the base64 header
            media_path = decode_base64_to_image(base64_str)
    elif media_input.endswith(tuple(image_extensions.keys())):  # If it's a file path
            media_path = media_input
    else:
            raise ValueError("Unsupported media type. Please provide a valid image path or base64 string.")
    media_type = "image"
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": media_type,
                    media_type: media_path
                },
                {"type": "text", "text": text_input},
            ],
        }
    ]

    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, _ = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        padding=True,
        return_tensors="pt",
    ).to("cpu")

    # Run the model in a separate thread
    streamer = TextIteratorStreamer(
        processor.tokenizer, skip_prompt=True, skip_special_tokens=True
    )
    generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)

    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    buffer = ""
    for new_text in streamer:
        buffer += new_text
        # Remove <|im_end|> or similar tokens from the output
        buffer = buffer.replace("<|im_end|>", "")
        yield buffer

def format_plain_text(output_text):
    """Formats the output text as plain text without LaTeX delimiters."""
    # Remove LaTeX delimiters and convert to plain text
    plain_text = output_text.replace("\\(", "").replace("\\)", "").replace("\\[", "").replace("\\]", "")
    return plain_text


In [7]:
torch.cuda.empty_cache()

# Image handling

In [8]:
# deleting duplicate images
import hashlib
import os
import re
import glob


def calculate_hash(image_path):
    """Calculate the hash of an image."""
    try:
        with Image.open(image_path) as img:
            img = img.resize((256, 256)).convert("RGB")  # Resize and standardize
            return hashlib.md5(img.tobytes()).hexdigest()
    except Exception as e:
        print(f"Error processing {image_path}: {e}")
        return None

def find_and_remove_duplicates(folder_path):
    """Find and remove duplicate images in the given folder."""
    if not os.path.exists(folder_path):
        print(f"The folder '{folder_path}' does not exist.")
        return

    hashes = {}
    duplicates = []

    for root, _, files in os.walk(folder_path):
        for file in files:
            file_path = os.path.join(root, file)
            if file.lower().endswith((".png", ".jpg", ".jpeg", ".bmp", ".gif", ".tiff")):
                img_hash = calculate_hash(file_path)
                if img_hash:
                    if img_hash in hashes:
                        duplicates.append(file_path)
                    else:
                        hashes[img_hash] = file_path

    # Remove duplicates
    for duplicate in duplicates:
        try:
            os.remove(duplicate)
            print(f"Deleted: {duplicate}")
        except Exception as e:
            print(f"Could not delete {duplicate}: {e}")

    print(f"Total duplicates removed: {len(duplicates)}")

def remove_small_images(image_folder, min_size=50):
    """Deletes images smaller than min_size x min_size pixels."""
    len = 0
    for img_file in glob.glob(os.path.join(image_folder, "*.*")):
        try:
            with Image.open(img_file) as img:
                if img.size[0] < min_size or img.size[1] < min_size:
                    os.remove(img_file)
                    print(f"Deleted: {img_file}")
                    len = len+1
        except Exception:
            continue
    print(f"Total small images removed: {len}")

# Image extraction from nodes + Qwen processing functions

In [9]:
import os
import json
import re
from PIL import Image

def update_json(json_file, json_folder):
    """Updates JSON by removing nodes with missing images and renewing indexes."""

    # Load JSON file
    with open(json_file, "r", encoding="utf-8") as file:
        data = json.load(file)

    filename = os.path.basename(json_file).replace("_all_nodes_with_images.json", "")
    # Get list of actual image files in the same folder
    artifacts_folder = filename + "_artifacts"
    img_folder = os.path.join(json_folder, artifacts_folder)
    valid_images = set(os.listdir(img_folder))  # Images that exist in json_folder


    new_nodes = []
    index_counter = 1

    for node in data["nodes"]:
        if node.get("image_path"):  # If node has an image
            image_name = os.path.basename(node["image_path"])
            if image_name not in valid_images:
                continue  # Skip nodes where the image is missing

        # Trim text field to max 3000 characters
        if "text" in node and isinstance(node["text"], str):
            node["text"] = node["text"][:3000]  # Keep only the first 3000 characters

        node["index"] = index_counter  # Update index sequentially
        new_nodes.append(node)
        index_counter += 1

    # Update JSON metadata
    data["nodes"] = new_nodes
    data["number_of_nodes"] = len(new_nodes)

    # Save new JSON file
    new_json_file = os.path.join(json_folder, os.path.basename(json_file).replace(".json", "_updated.json"))
    with open(new_json_file, "w", encoding="utf-8") as file:
        json.dump(data, file, indent=4, ensure_ascii=False)

    print(f"Updated JSON saved as: {new_json_file}")
    return new_json_file



def process_image(image_path, context_text, json_folder):
    """Generate explanation for an image using context from surrounding nodes."""
    media_input = os.path.join(json_folder, image_path)  # Construct full image path

    # Check if the image file exists
    if not os.path.exists(media_input):
        return ""

    if not context_text.strip():
        context_text = ""
        text_input = f"Explain the content of this image."
    else:
        text_input = f"Explain the content of this image. The following context may help: {context_text}"

    print(f"Processing: {image_path} with context.")

    output = list(qwen_inference(media_input, text_input))  # Call inference function
    explanation = format_plain_text(output[-1])

    return explanation

In [10]:
#split json file to max 50 nodes each file

def split_nodes(json_file):
    """Splits a JSON file into smaller JSON files, each containing a max of 50 nodes."""

    # Load the original JSON file
    with open(json_file, "r", encoding="utf-8") as file:
        data = json.load(file)

    if "nodes" not in data or not isinstance(data["nodes"], list):
        raise ValueError("Invalid JSON structure: 'nodes' list not found.")

    base_name = os.path.basename(json_file).replace(".json", "")
    json_folder = os.path.dirname(json_file)
    nodes = data["nodes"]
    output_files = []

    # Split nodes into chunks of 50
    for i in range(0, len(nodes), 50):
        chunk = nodes[i:i+50]

        # Re-index nodes (1 to 50, 51 to 100, etc.)
        for j, node in enumerate(chunk):
            node["index"] = i + j + 1  # Keeps original order but ensures sequential numbering per file

        # Create smaller JSON with updated metadata
        small_json = {
            "file_name": data["file_name"],
            "number_of_nodes": len(chunk),
            "nodes": chunk
        }

        small_json_file = os.path.join(json_folder, f"{base_name}_part_{(i//50) + 1}.json")

        # Save smaller JSON file
        with open(small_json_file, "w", encoding="utf-8") as file:
            json.dump(small_json, file, indent=4, ensure_ascii=False)

        output_files.append(small_json_file)
        print(f"Created: {small_json_file}")

    return output_files  # Return list of created JSON files


def get_context(image_path, nodes):
    """Finds the surrounding text (previous & next nodes) for an image node."""

    for i, node in enumerate(nodes):
        if "image_path" in node and node["image_path"] == image_path:  # Locate image node
            prev_text = nodes[i - 1]["text"] if i > 0 else ""  # Get previous text
            next_text = nodes[i + 1]["text"] if i < len(nodes) - 1 else ""  # Get next text
            return f"{prev_text} {next_text}".strip()  # Combine context

    return ""

In [11]:
def process_split_files(split_files, json_folder):
    """Processes each split JSON file, finds images, and adds explanations."""

    processed_files = []

    for split_file in split_files:
        # Load the split JSON file
        with open(split_file, "r", encoding="utf-8") as file:
            data = json.load(file)

        updated = False  # Track if changes were made

        for node in data["nodes"]:
            if "image_path" in node and node["image_path"]:  # If node has an image
                image_path = os.path.join(json_folder, node["image_path"])

                if os.path.exists(image_path):  # Ensure image exists
                    context = get_context(image_path, data["nodes"])  # Get context
                    explanation = process_image(image_path, context, json_folder)
                    node["explanation"] = explanation  # Update node
                    updated = True  # Mark JSON as updated

        if updated:
            # Save the updated JSON with explanations
            processed_file = split_file.replace(".json", "_processed.json")
            with open(processed_file, "w", encoding="utf-8") as file:
                json.dump(data, file, indent=4, ensure_ascii=False)

            processed_files.append(processed_file)
            print(f"Processed and saved: {processed_file}")

    return processed_files

In [12]:

def process_json_file(json_file):
    """Processes images from a JSON file and updates the JSON with explanations."""
    json_folder = os.path.dirname(json_file)

    # Load JSON file
    with open(json_file, "r", encoding="utf-8") as file:
        data = json.load(file)
    filename = os.path.basename(json_file).replace("_all_nodes_with_images.json", "")

    # image handling
    artifacts_folder = filename + "_artifacts"
    img_folder = os.path.join(json_folder, artifacts_folder)
    find_and_remove_duplicates(img_folder)
    remove_small_images(img_folder)

    # new json file with updated images & indexes
    updated_json = update_json(json_file, json_folder)

    torch.cuda.empty_cache()

    split_json = split_nodes(updated_json)
    processed_split = process_split_files(split_json, json_folder)

    merged_output_file = filename + "_qwen_processed.json"

    # Initialize the merged data structure
    merged_data = {"file_name": "merged_output", "number_of_nodes": 0, "nodes": []}

    # Iterate through each processed JSON file
    for processed_file in processed_split:
        with open(processed_file, "r", encoding="utf-8") as file:
            data = json.load(file)
            merged_data["nodes"].extend(data["nodes"])  # Append nodes

    # Update the total number of nodes
    merged_data["number_of_nodes"] = len(merged_data["nodes"])

    # Save the merged JSON file
    with open(merged_output_file, "w", encoding="utf-8") as file:
        json.dump(merged_data, file, indent=4, ensure_ascii=False)

    print(f"Merged JSON file saved as: {merged_output_file}")

    # Delete processed split files after merging
    for processed_file in processed_split:
        os.remove(processed_file)
        print(f"Deleted: {processed_file}")

    for processed_file in split_json:
        os.remove(processed_file)
        print(f"Deleted: {processed_file}")


In [13]:
# process singular image
def process_single_image(image_path):
    """Generate explanation for an image using context from surrounding nodes."""
    media_input = image_path  # Construct full image path

    # Check if the image file exists
    if not os.path.exists(media_input):
      print(f"The file '{media_input}' does not exist.")
      return ""

    # Open image and check resolution
    with Image.open(media_input) as img:
        width, height = img.size
        if width < 50 and height < 50:  # Skip very small images
            print(f"The file '{media_input}' is too small to process.")
            return ""

    text_input = f"Explain the content of this image."
    print(f"Processing: {image_path}")

    output = list(qwen_inference(media_input, text_input))  # Call inference function
    explanation = format_plain_text(output[-1])

    return explanation

# Begin processing files (nodes.json or singular image files)





In [14]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [15]:
# Example usage: nodes.json file

process_json_file("/content/drive/MyDrive/output-imagenodes/2024_11_05_Ferrari_Q3_2024_Results_Press_Release_all_nodes_with_images.json")


Deleted: /content/drive/MyDrive/output-imagenodes/2024_11_05_Ferrari_Q3_2024_Results_Press_Release_artifacts/image_000008_7a233a57531d7d54d98523821046bcaf2c26cafbe76faf12e86523665031d4fc.png
Deleted: /content/drive/MyDrive/output-imagenodes/2024_11_05_Ferrari_Q3_2024_Results_Press_Release_artifacts/image_000003_7a233a57531d7d54d98523821046bcaf2c26cafbe76faf12e86523665031d4fc.png
Deleted: /content/drive/MyDrive/output-imagenodes/2024_11_05_Ferrari_Q3_2024_Results_Press_Release_artifacts/image_000002_7a233a57531d7d54d98523821046bcaf2c26cafbe76faf12e86523665031d4fc.png
Deleted: /content/drive/MyDrive/output-imagenodes/2024_11_05_Ferrari_Q3_2024_Results_Press_Release_artifacts/image_000011_7a233a57531d7d54d98523821046bcaf2c26cafbe76faf12e86523665031d4fc.png
Deleted: /content/drive/MyDrive/output-imagenodes/2024_11_05_Ferrari_Q3_2024_Results_Press_Release_artifacts/image_000012_7a233a57531d7d54d98523821046bcaf2c26cafbe76faf12e86523665031d4fc.png
Deleted: /content/drive/MyDrive/output-imagen

In [None]:
files = ["/content/drive/MyDrive/output-imagenodes/17_all_nodes_with_images.json",
        "/content/drive/MyDrive/output-imagenodes/231161_OperationsMaintenanceManual_all_nodes_with_images.json",
        "/content/drive/MyDrive/output-imagenodes/PDF1_all_nodes_with_images.json"
]


for file in files:
  process_json_file(file)

Total duplicates removed: 0
Deleted: /content/drive/MyDrive/output-imagenodes/17_artifacts/image_000005_5a0955a531f0009997d3cb2659e8288d4ce5999bfae8a5da04359a9f74b455a0.png
Deleted: /content/drive/MyDrive/output-imagenodes/17_artifacts/image_000004_4623b4124fe5eb97219b501a4673b832f019b8eb43298a00779ffbd8a836c0e4.png
Deleted: /content/drive/MyDrive/output-imagenodes/17_artifacts/image_000001_26f31da1da5cef304ea0d1b264f18dee0f281b9c70603e1dac0c9e3f3e051c25.png
Deleted: /content/drive/MyDrive/output-imagenodes/17_artifacts/image_000000_88f21e2a2e3bdd78e1156c89d591de7a7a5ed7822ff3537eac06f453963e412f.png
Deleted: /content/drive/MyDrive/output-imagenodes/17_artifacts/image_000003_7db414d3c060c2ade9b68f739bdfbb4064919afda0f9b3b42eb89af4ff3311b9.png
Total small images removed: 5
Updated JSON saved as: /content/drive/MyDrive/output-imagenodes/17_all_nodes_with_images_updated.json
Created: /content/drive/MyDrive/output-imagenodes/17_all_nodes_with_images_updated_part_1.json
Processing: /content

In [None]:
files = ["/content/drive/MyDrive/output-imagenodes/ai-in-america-oai-economic-blueprint-20250113_all_nodes_with_images.json",
        "/content/drive/MyDrive/output-imagenodes/creatingsystem_all_nodes_with_images.json"
]


for file in files:
  process_json_file(file)
