In [17]:
import os
import json
import numpy as np
from tqdm import tqdm

In [21]:
def convert_mmc4_shard(jsonl_filepath, embed_filepath, shard_index, dataset):
    """Interleave image placeholder tokens into the text of one MMC4 shard.

    Every document (one JSON object per line of ``jsonl_filepath``) gets a new
    ``'text'`` field: its ``'text_list'`` sentences joined by single spaces,
    with one placeholder of the form
    ``<image><<shard_index,row,dataset,image>></image>`` placed in front of
    the sentence each image is matched to. ``row`` is the row of the returned
    embedding matrix that holds that image's feature vector.

    Args:
        jsonl_filepath: Path to the shard's JSONL file. Each record must have
            a ``'text_list'`` list and an ``'image_info'`` list whose entries
            carry ``'image_name'`` and ``'matched_index'``.
        embed_filepath: Path to the feature file, loaded with ``np.load``;
            expected to hold a mapping from image name to a 1-D vector.
        shard_index: Shard identifier written into every placeholder.
        dataset: Dataset name written into every placeholder.

    Returns:
        ``(jsonl_list, embeddings)``. ``jsonl_list`` holds the parsed records
        with ``'text'`` added; note that each record's ``'text_list'`` is
        modified in place and therefore also contains the placeholders.
        ``embeddings`` is a float64 array with one row per entry of the
        feature mapping; rows are filled in order of first appearance of an
        image in the JSONL file, and rows for images that never appear stay
        zero.

    Raises:
        Whatever the feature mapping raises for a missing key (``KeyError``
        for a dict) when a document references an image without features.
    """
    # NOTE(security): allow_pickle=True unpickles the file, which can execute
    # arbitrary code. Only load feature files from a trusted source.
    embed_dict = np.load(embed_filepath, allow_pickle=True)

    # Pre-allocate one row per known image. An empty mapping has no vector to
    # take the width from, so fall back to zero columns instead of crashing.
    num_rows = len(embed_dict)
    num_cols = next(iter(embed_dict.values())).shape[0] if num_rows else 0
    embeddings = np.zeros((num_rows, num_cols))

    # Image name -> row in `embeddings`, shared across the whole shard so a
    # repeated image reuses its row.
    image_id_to_index = {}

    jsonl_list = []
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(jsonl_filepath, 'r', encoding='utf-8') as f:
        # Stream the file line by line rather than materialising it first.
        for line in tqdm(f):
            data = json.loads(line)
            text_list = data['text_list']

            # Pass 1: assign embedding rows in document order, so row
            # numbering does not depend on where the images are placed.
            placements = []
            for image in data['image_info']:
                image_name = image['image_name']
                image_index = image_id_to_index.get(image_name)
                if image_index is None:
                    image_index = len(image_id_to_index)
                    image_id_to_index[image_name] = image_index
                    embeddings[image_index] = embed_dict[image_name]
                placements.append((image['matched_index'], image_index))

            # Pass 2: insert from the highest sentence index down. Inserting
            # in arbitrary order shifts the sentences after each insert, so
            # later images would land in front of the wrong sentence. The
            # sort is stable, so images sharing an index keep their relative
            # order of insertion.
            placements.sort(key=lambda placement: placement[0], reverse=True)
            for matched_index, image_index in placements:
                text_list.insert(
                    matched_index,
                    f"<image><<{shard_index},{image_index},{dataset},image>></image>",
                )

            data['text'] = " ".join(text_list)
            jsonl_list.append(data)

    # Return the augmented records and the packed embedding matrix
    return jsonl_list, embeddings

In [22]:
# Directory holding the MMC4 shard files (placeholder path; set before running).
mmc4_data_dir = '/path/to/mmc4/'

# Shard 0: the document JSONL and its matching pickled image-feature file.
jsonl_filepath, embed_filepath = (
    os.path.join(mmc4_data_dir, shard_file)
    for shard_file in (
        'docs_no_face_shard_0_v2.jsonl',
        "clip_vitl14_shard_0_features.pkl",
    )
)

In [26]:
# Convert shard 0: returns the augmented records and the packed embedding matrix.
new_jsonl_list, embed = convert_mmc4_shard(
    jsonl_filepath=jsonl_filepath,
    embed_filepath=embed_filepath,
    shard_index=0,
    dataset='mmc4',
)

3001it [00:00, 14339.46it/s]


In [32]:
# Sanity-check the conversion against the raw inputs.
# Read the raw lines inside a context manager so the file handle is closed.
with open(jsonl_filepath, 'r', encoding='utf-8') as original_file:
    original_jsonl = original_file.readlines()

# NOTE(security): allow_pickle=True unpickles the file; only load trusted data.
original_embed = np.load(embed_filepath, allow_pickle=True)

# One output record per input line, one embedding row per stored feature.
assert len(new_jsonl_list) == len(original_jsonl), "document count changed during conversion"
assert embed.shape[0] == len(original_embed), "embedding row count does not match the feature file"

In [34]:
# Spot-check the generated 'text' field of one converted document (index 10).
new_jsonl_list[10]['text']

'Fun with soon-to-be famous sheep! It started with an unusual email asking if we were free on Wednesday evening to help herd some sheep. OK. Where from, to and most of all, Why? Apparently actress Liz Hurley will be filming scenes for a new TV show "The Royals" in our village this week. The show which will air on E! sometime next year is about a fictional Royal family. Liz plays the Queen - but apparently a boozy table dancing monarch which is in no way based on our Queen! So where do the sheep come into this? Hmm not entirely sure but the location people had scouted out my friend\'s small field and booked it for filming along with the sheep in it. In between scouting and filming, the sheep had been moved and this week we needed to move them back. I replied to the email - yes of course we would love to help. Then wondered what I had let myself in for. I wondered what one should wear for sheep herding -shepherding rather - but in the absence of a Bo Peep costume decided it didn\'t reall