In [6]:
pip install tqdm

Defaulting to user installation because normal site-packages is not writeable
Collecting tqdm
  Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
[K     |████████████████████████████████| 78 kB 266 kB/s eta 0:00:01
[?25hInstalling collected packages: tqdm
Successfully installed tqdm-4.67.1
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [1]:
import json
import os
import requests
from PIL import Image, UnidentifiedImageError
from io import BytesIO
from tqdm import tqdm  # Import tqdm for progress bars

# Directory to save images
image_dir = './LlaVA_img_train'
os.makedirs(image_dir, exist_ok=True)

# Load the original JSON data
with open('/workspace/First_turn_vlt5/test_dataset_bert_first_turn.json', 'r') as f:
    data = json.load(f)

# Prepare the JSONL output
output_lines = []
mew= []

def download_and_resize_images(img_ids):
    local_image_paths = []
    for idx, img_id in enumerate(tqdm(img_ids, desc="Downloading/Resizing Images", leave=False), start=1):
        local_path = os.path.join(image_dir, img_id)

        if os.path.exists(local_path):
            # Check size to skip already resized images
            try:
                with Image.open(local_path) as image:
                    if image.size == (384, 384):
                        # print(f"Already resized: {img_id}")
                        local_image_paths.append(local_path)
                        continue
                    # print(f"Resizing already downloaded: {img_id}")
                    image = image.convert("RGB")
                    image = image.resize((384, 384))
                    image.save(local_path)
                    local_image_paths.append(local_path)
            except UnidentifiedImageError:
                print(f"Skipping unidentifiable image file: {local_path}")
            continue

        url = f"https://xmrec.github.io/mturk_images/all_images/{img_id}"
        
        # Download and save the image
        try:
            response = requests.get(url)
            response.raise_for_status()
            
            # Open the image and resize it
            image = Image.open(BytesIO(response.content)).convert("RGB")
            image = image.resize((384, 384))
            image.save(local_path)
            
            local_image_paths.append(local_path)
            # print(f"Downloaded and resized {idx}/{len(img_ids)}: {img_id}")
        except requests.HTTPError as e:
            # print(f"Skipping failed download {url}: {e}")
            # raise RuntimeError
            continue
        except UnidentifiedImageError:
            # print(f"Skipping unidentifiable downloaded image: {img_id}")
            # raise RuntimeError
            continue
    return local_image_paths

def concatenate_images(image_paths, mew):
    images = [Image.open(p) for p in image_paths]
    widths, heights = zip(*(i.size for i in images))
    image_width, image_height = images[0].size  # Assuming all images have the same size after resizing (384x384)

    # Pad the images list to have 12 images
    if len(images) < 12:
        padding_image = Image.new("RGB", (image_width, image_height), (0, 0, 0))  # Black padding image
        for _ in range(12 - len(images)):
            images.append(padding_image)

    # Update dimensions after padding
    widths, heights = zip(*(i.size for i in images))
    total_width = sum(widths)
    max_height = max(heights)

    new_image = Image.new("RGB", (total_width, max_height), (0, 0, 0))  # Create a blank canvas

    # Paste images onto the canvas
    x_offset = 0
    for im in images:
        new_image.paste(im, (x_offset, 0))
        x_offset += im.size[0]

    # Generate the concatenated file name
    base_names = os.path.basename(image_paths[0])
    concatenated_path = os.path.join("", f"/workspace/LlaVA_img_train/concatenated_{base_names}")

    # Check if a file with the same name exists
    if os.path.exists(concatenated_path):
        mew.append(concatenated_path)
        # If the file exists, return its name
        return f"File already exists: {concatenated_path}"
    else:
        # Save the new image
        new_image.save(concatenated_path)
        mew.append(concatenated_path)
        return concatenated_path

# Wrap the main loop with tqdm to show progress
for entry in tqdm(data, desc="Processing Entries"):
    conversation_parts = [f"Topic: {entry['topic']}"]
    all_images = []

    # Collect all images from the conversation
    for i in range(1, 5):  # Assuming a maximum of 4 turns
        img_ids_key = f"img_ids{i}"
        img_ids = entry.get(img_ids_key, [])
        downloaded_images = download_and_resize_images(img_ids)
        all_images.extend(downloaded_images)

    # Concatenate all images into one
    if all_images:
        concatenated_image = concatenate_images(all_images, mew)
      
        all_images = [concatenated_image]

    for i in range(1, 5):
        question_key = f"question{i}"
        answer_key = f"answer{i}"
        
        if question_key in entry and answer_key in entry:
            conversation_parts.append(f"GPT: {entry[question_key]}")
            if i == 1 and all_images:
                conversation_parts.append('<image>')
            conversation_parts.append(f"USER: {entry[answer_key]}")
        else:
            break

    # Construct the full conversation
    conversation = ' '.join(conversation_parts)

    # Prepare the documents
    documents = entry.get("related_dict", [])
    
    # Create the new JSONL entry
    new_entry = {
        "facet_id": entry.get("facet_id", ""),
        "images": all_images,
        "conversation": conversation,
        "documents": documents
    }
    
    # Append the JSONL line
    output_lines.append(json.dumps(new_entry))

print(len(mew))

# Write to the JSONL file
with open('./newer/Full_turn/train_dataset_LLaVA_full_turn.jsonl', 'w') as f:
    for line in tqdm(output_lines, desc="Writing to JSONL"):
        f.write(line + '\n')

Processing Entries:   0%|          | 0/1345 [00:00<?, ?it/s]


Processing Entries: 100%|██████████| 1345/1345 [00:38<00:00, 35.00it/s]


1345


FileNotFoundError: [Errno 2] No such file or directory: './newer/Full_turn/train_dataset_LLaVA_full_turn.jsonl'

In [None]:
import json

# Load the JSON dataset
with open("./First_turn/test_dataset_bert_first_turn.json", "r") as json_file:
    dataset = json.load(json_file)

# Load the qrels file
qrels = []
with open("../../Bert_datas/qrels", "r") as qrels_file:
    for line in qrels_file:
        parts = line.strip().split()
        if len(parts) == 4:
            facet_id, _, clueweb_id, relevance = parts
            qrels.append({
                "facet_id": facet_id,
                "clueweb_id": clueweb_id,
                "relevance": int(relevance)
            })

# Process the dataset
for entry in dataset:
    # Extract the facet_id and remove the "F" prefix for matching
    facet_id = entry["facet_id"].lstrip("F")
    
    # Filter qrels for this facet_id with relevance > 0
    relevant_clueweb_ids = [
        qrel["clueweb_id"] for qrel in qrels
        if qrel["facet_id"] == facet_id and qrel["relevance"] > 0
    ]
    
    # Update the related_dict in the dataset entry
    entry["related_dict"] = relevant_clueweb_ids

# Save the updated dataset back to a JSON file
with open("./First_turn/test_dataset_bert_first_turn.json", "w") as output_file:
    json.dump(dataset, output_file, indent=4)

print("Updated dataset saved to 'test_dataset_bert_singleturn.json'")