### > Setup

In [1]:
import os
import sys
import json
import uuid
import shutil

from helper import (
    get_text_response,
    calc_total_cost,
    load_jsonl,
    random_split,
    download_file_from_s3,
    _encode
)
import sagemaker
import boto3


# SageMaker session; its default bucket is the upload target for the JSONL files
# produced later in this notebook.
sess = sagemaker.Session()
bucket = sess.default_bucket() # Set a default S3 bucket

# Key prefix prepended to every S3 object key this notebook uploads.
s3_prefix ="titan-finetuning/multi-modal-embedding"

# corpus data directory
directory_path = "data"

# Start from a clean slate: if the directory already exists, delete it and
# everything inside it (outputs of a previous run are lost).
if os.path.exists(directory_path):
    shutil.rmtree(directory_path)

# Create the new directory
os.mkdir(directory_path)
print(f"Created new directory: {directory_path}")

# initialize S3 client
s3 = boto3.client('s3')

# Anthropic Claude 3 Sonnet pricing, stored per single token
# (the literals are divided by 1000 — presumably USD per 1K tokens; verify against current pricing).
llm_price = {'input_tokens': 0.003/1000, 'output_tokens': 0.015/1000}

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
Created new directory: data


In [2]:
# Load the image records from the JSONL file in the sibling image-generator
# directory, then split them into a training corpus and a validation corpus.
# NOTE(review): no seed is passed to random_split here — confirm in helper
# whether the split is reproducible across runs.
data_file = "../image-generator/image_data.jsonl"
data = load_jsonl(data_file)
train_corpus, valid_corpus = random_split(data)

### Generate synthetic queries

Now, we use Claude 3 from Amazon Bedrock to generate search queries, using each image in the corpus as context.

Each (generated query, image) pair becomes a datapoint in the finetuning dataset (either for training or evaluation); the generated query is stored as the image's caption.

In [3]:
# Prompt sent to the model together with each image.
# Every occurrence of the {QUERY_NUMBER} placeholder is substituted with the
# requested number of queries (via str.replace) before the prompt is sent, so
# the count must never be hard-coded anywhere in the text.
template="""
Your task is to generate relevant search queries based on this image. Your goal is to generate {QUERY_NUMBER} search
queries that a user might enter into a search engine to find images similar to the one described in
the caption.

To generate the search queries, carefully analyze the image. Identify the key
objects, scenes, actions, and concepts present in the description. Then, construct search queries
that capture these elements in a concise and relevant manner.

When formulating the search queries, consider the following:

- Generate relevant keywords and phrases from the image
- Vary the length and specificity of the queries (e.g., some queries can be broad, while others can
be more specific)
- Include queries that focus on different aspects of the image (objects, actions, scenes, etc.)
- Ensure that the queries are grammatically correct and make sense in the context of a search engine

Once you have generated the {QUERY_NUMBER} search queries, format them as a JSON list, with each query as a
separate string element within the list.

Your output should look like this:

[
    "query 1",
    "query 2",
    ...
    "query {QUERY_NUMBER}"
]

Please note that you should only provide the JSON list of search queries. Do not include any additional text or explanations.
"""

In [4]:
from tqdm import tqdm

def generate_queries(corpus, template, number_of_qs=2):
    """Generate synthetic search queries for every image in ``corpus``.

    For each corpus entry the referenced image is downloaded, encoded, and sent
    to the model together with the prompt; the model's reply is parsed as a
    JSON list of query strings.

    Args:
        corpus: Mapping of node id -> record. Each record must contain an
            ``"image-ref"`` entry, which is passed to ``download_file_from_s3``.
        template: Prompt text; every ``{QUERY_NUMBER}`` placeholder is replaced
            with ``number_of_qs``.
        number_of_qs: Number of queries to request per image.

    Returns:
        A tuple ``(queries, mapping, bedrock_data)`` where
        ``queries`` maps a freshly generated UUID string to a query,
        ``mapping`` maps the same UUID to a one-element list holding the node id
        of the image the query was generated from, and
        ``bedrock_data`` is a list of ``{"image-ref": ..., "caption": query}``
        records, one per generated query.

    Raises:
        ValueError: If the model's reply is not valid JSON. The underlying
            ``json.JSONDecodeError`` is attached as the cause.

    Side effects:
        Prints the accumulated cost estimate, computed with the notebook-level
        ``llm_price`` table and the token usage reported in each response.
    """
    # The prompt does not depend on the image, so build it once up front.
    prompt = template.replace("{QUERY_NUMBER}", str(number_of_qs))

    cost = 0
    queries = {}
    mapping = {}
    bedrock_data = []

    for node_id, data in tqdm(corpus.items()):
        image = download_file_from_s3(data["image-ref"])
        image_base64 = _encode(image)

        model_response = get_text_response(image_base64=image_base64, text_query=prompt)
        try:
            questions = json.loads(model_response["content"][0]["text"])
        except json.JSONDecodeError as e:
            # Raising a bare string is invalid in Python 3 (it triggers a
            # TypeError that hides the real problem); raise a proper exception
            # and chain the decode error so the offending position is visible.
            raise ValueError("Unable to generate valid JSON, please try again...") from e
        tokens = model_response["usage"]

        for q in questions:
            q_id = str(uuid.uuid4())
            queries[q_id] = q
            mapping[q_id] = [node_id]
            bedrock_data.append({
                "image-ref": data["image-ref"],
                "caption": q
            })

        cost += calc_total_cost(llm_price, tokens)

    print(f"Estimated cost: ${cost:.2f}")
    return queries, mapping, bedrock_data

### > building training dataset

In [5]:
# Generate 20 queries per training image (one model call per image, so this cell is slow).
# NOTE(review): `train_maping` is a misspelling of "mapping"; renaming it also
# requires updating the cell that assembles `train_dataset`.
train_queries, train_maping, bedrock_train = generate_queries(train_corpus, template, number_of_qs=20)

100%|██████████| 64/64 [06:25<00:00,  6.03s/it]

Estimated cost: $0.52





In [6]:
# Generate 20 queries per validation image, using the same prompt as for training.
valid_queries, valid_maping, bedrock_valid = generate_queries(valid_corpus, template, number_of_qs=20)

100%|██████████| 17/17 [01:47<00:00,  6.35s/it]

Estimated cost: $0.14





### > create the final training and validation dataset

In [7]:
# Local output files for the combined (queries, corpus, relevant_docs) datasets.
# They live inside the ./data directory that the setup cell recreates on every run.
train_data_path = './data/train.json'
valid_data_path = './data/valid.json'

In [8]:
# Bundle, for each split, the generated queries, the corpus they were generated
# from, and the query-id -> [node-id] relevance mapping.
train_dataset = dict(
    queries=train_queries,
    corpus=train_corpus,
    relevant_docs=train_maping,
)

val_dataset = dict(
    queries=valid_queries,
    corpus=valid_corpus,
    relevant_docs=valid_maping,
)

In [9]:
# Serialize each split to its JSON file (training first, then validation).
for _path, _dataset in (
    (train_data_path, train_dataset),
    (valid_data_path, val_dataset),
):
    with open(_path, 'w+') as f:
        json.dump(_dataset, f)

### > Upload bedrock training data to S3

In [10]:
# Path to the JSONL file
jsonl_file_path = "data/training-embedding-data.jsonl"

# Write one JSON object per line; ensure_ascii=False keeps non-ASCII characters
# readable in the UTF-8 encoded file.
try:
    with open(jsonl_file_path, "w", encoding="utf-8") as file:
        for item in bedrock_train:
            json_line = json.dumps(item, ensure_ascii=False)
            file.write(json_line + "\n")
    print(f"JSONL file '{jsonl_file_path}' created successfully.")
except Exception as e:
    print(f"Error creating JSONL file: {e}")
    # Do not continue silently: the following cell uploads this file, and a
    # missing or truncated file must stop the notebook here.
    raise

JSONL file 'data/training-embedding-data.jsonl' created successfully.


In [11]:
# Build the S3 object key. The local relative path is appended as-is, so the
# key keeps the leading "data/" directory component after ".../metadata/".
metadata_key = f"{s3_prefix}/metadata/{jsonl_file_path}"
# upload file to S3
s3.upload_file(jsonl_file_path, bucket, metadata_key)
print(f"Image data file '{jsonl_file_path}' uploaded to S3")

# Full S3 URI of the uploaded training file, persisted with %store so that
# other notebooks can restore it with `%store -r`.
train_jsonl_path = f"s3://{bucket}/{metadata_key}"
%store train_jsonl_path

Image data file 'data/training-embedding-data.jsonl' uploaded to S3
Stored 'train_jsonl_path' (str)


### > Upload bedrock validation data to S3

In [12]:
# Path to the JSONL file
jsonl_file_path = "data/validation-embedding-data.jsonl"

# Write one JSON object per line; ensure_ascii=False keeps non-ASCII characters
# readable in the UTF-8 encoded file.
try:
    with open(jsonl_file_path, "w", encoding="utf-8") as file:
        for item in bedrock_valid:
            json_line = json.dumps(item, ensure_ascii=False)
            file.write(json_line + "\n")
    print(f"JSONL file '{jsonl_file_path}' created successfully.")
except Exception as e:
    print(f"Error creating JSONL file: {e}")
    # Do not continue silently: the following cell uploads this file, and a
    # missing or truncated file must stop the notebook here.
    raise

JSONL file 'data/validation-embedding-data.jsonl' created successfully.


In [13]:
# Build the S3 object key. The local relative path is appended as-is, so the
# key keeps the leading "data/" directory component after ".../metadata/".
metadata_key = f"{s3_prefix}/metadata/{jsonl_file_path}"
# upload file to S3
s3.upload_file(jsonl_file_path, bucket, metadata_key)
print(f"Image data file '{jsonl_file_path}' uploaded to S3")

# Full S3 URI of the uploaded validation file, persisted with %store so that
# other notebooks can restore it with `%store -r`.
valid_jsonl_path = f"s3://{bucket}/{metadata_key}"
%store valid_jsonl_path

Image data file 'data/validation-embedding-data.jsonl' uploaded to S3
Stored 'valid_jsonl_path' (str)


In [14]:
# Persist the local dataset paths with IPython's %store magic so that other
# notebooks can restore them with `%store -r`.
%store train_data_path
%store valid_data_path

Stored 'train_data_path' (str)
Stored 'valid_data_path' (str)
