In [1]:
import json 
from collections import OrderedDict
import glob
import webdataset as wds
from torch.utils.data import DataLoader
import pprint
import os
from tqdm import tqdm
import pandas as pd

# Allow DataFrame displays to show up to 100 rows before pandas truncates them.
pd.set_option('display.max_rows', 100)

In [2]:
def read_json(file):
    """Load a JSON file, preserving the key order of every JSON object.

    Args:
        file: Path of the JSON file to read.

    Returns:
        The decoded document. JSON objects come back as ``OrderedDict``
        instances; other values use their usual Python equivalents.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform/locale
    # default, which can fail or mis-decode non-ASCII content.
    with open(file, 'r', encoding='utf-8') as f:
        data = json.load(f, object_pairs_hook=OrderedDict)

    return data

def read_text_lines(file):
    """Return the cleaned, non-empty lines of a text file.

    Each line is stripped of surrounding whitespace first, then every single
    and double quote character is removed; lines that end up empty are
    dropped.

    Args:
        file: Path of the text file to read.

    Returns:
        List of cleaned lines, in file order.
    """
    # Translation table that deletes both quote characters in one pass.
    quote_remover = str.maketrans('', '', '"\'')
    with open(file, 'r') as handle:
        cleaned = (raw.strip().translate(quote_remover) for raw in handle)
        return [entry for entry in cleaned if entry]



In [3]:
def load_sample_simple(sample):
    """Flatten one raw sample into a dict of its non-null JSON fields.

    Prints the raw sample and its key (with the key's MD5 hex digest) as a
    debugging aid.

    Args:
        sample: Mapping for one sample. Must provide ``"__key__"`` (str) and
            ``"json"`` (JSON text as ``str`` or ``bytes``). Other entries are
            only checked for membership against known image extensions.

    Returns:
        dict holding every JSON field whose value is not ``None``, plus
        ``"image_key"``: the first of jpg/png/jpeg/bmp/tif/tiff present in
        ``sample``, or ``None`` when none is present. A JSON field named
        ``"image_key"`` is overwritten by this detected value.
    """
    # Imported locally so the function works on a fresh kernel: the notebook
    # only imports hashlib in a much later cell, and the resulting NameError
    # would otherwise be swallowed by scan_wd_path and reported as
    # "not iterable".
    import hashlib

    print("sample:", sample)
    sample_key = sample['__key__']

    image_hash = hashlib.md5(sample_key.encode('utf-8')).hexdigest()
    print("sample_key:", sample_key, image_hash)

    json_data = json.loads(sample["json"])

    # Keep only fields that actually carry a value.
    data = {k: v for k, v in json_data.items() if v is not None}

    # The order of this list decides which extension wins if several exist.
    image_keys = ['jpg', 'png', 'jpeg', 'bmp', 'tif', 'tiff']
    data["image_key"] = next((k for k in image_keys if k in sample), None)

    return data


def load_sample_instruction_tune(sample):
    """Convert a raw sample into the fixed instruction-tuning field set.

    Adapted from https://fburl.com/code/hnm16mp7.

    Args:
        sample: Raw sample mapping, as accepted by ``load_sample_simple``.

    Returns:
        dict with exactly the keys ``image_key``, ``text``, ``question``,
        ``response`` and ``instruction``. Missing fields default to ``""``,
        except ``question`` (defaults to "What am I looking at?") and
        ``response`` (falls back to ``text`` when the sample has no response).
    """
    fields = load_sample_simple(sample)

    # Prefer an explicit response; otherwise reuse the text field if present.
    if "response" in fields:
        response = fields["response"]
    elif "text" in fields:
        response = fields["text"]
    else:
        response = ""

    return {
        "image_key": fields["image_key"],
        "text": fields.get("text", ""),
        "question": fields.get("question", "What am I looking at?"),
        "response": response,
        "instruction": fields.get("instruction", ""),
    }


def _read_first_batch(train_urls, sample_loader):
    """Read the first batch (batch_size=1) from the shards, mapping each sample through ``sample_loader``."""
    dataset = wds.DataPipeline(
        wds.SimpleShardList(train_urls),
        wds.tariterators.tarfile_to_samples(),
        wds.map(sample_loader),
    )
    dataloader = DataLoader(dataset, batch_size=1, num_workers=0)
    return next(iter(dataloader))


def scan_wd_path(wd_path: str):
    """Summarise a directory of WebDataset ``.tar`` shards by probing one sample.

    Args:
        wd_path: Directory searched recursively for ``*.tar`` files.

    Returns:
        dict with, in order:
          * ``train_urls``: number of ``.tar`` files found.
          * ``iterable``: ``False`` when no shard exists or when either probe
            read raised; ``True`` otherwise.
          * ``instruction_tune_compatible``: ``None`` unless the
            instruction-tune loader could read a sample. Since
            ``load_sample_instruction_tune`` always returns the three
            instruction keys, this is ``True`` whenever that read succeeds.
          * ``text``/``caption``/``instruction``/``question``/``response``:
            the first sample's value for that field, or ``''`` if absent.
          * ``image_ext``: image extension key of the first sample, or
            ``None`` when unavailable (no shards, or the probe failed).
          * ``wd_path``: the input path.
    """
    search_keys = ['text', 'caption', 'instruction', 'question', 'response']
    instruction_tune_keys = ['instruction', 'question', 'response']
    output = {
        "train_urls": 0,
        "iterable": True,
        "instruction_tune_compatible": None
    }
    for k in search_keys:
        output[k] = ''

    # Count number of tar files
    search_path = os.path.join(wd_path, "**", "*.tar")
    train_urls = glob.glob(search_path, recursive=True)
    output["train_urls"] = len(train_urls)

    if len(train_urls) > 0:
        # Probe 1: record which text-like fields the first sample carries.
        try:
            one_sample = _read_first_batch(train_urls, load_sample_simple)
            print("one_sample", one_sample)
            for k in one_sample.keys():
                if k in search_keys:
                    # Values arrive batched (batch_size=1); unwrap the single item.
                    output[k] = one_sample[k][0]
            print(one_sample)
            print(output)
            output["image_ext"] = one_sample["image_key"][0]

        except Exception as e:
            print(e)
            output["iterable"] = False
            # Keep the result schema stable: without this, a failed probe
            # returned a dict with no "image_ext" key at all.
            output["image_ext"] = None

        # Probe 2: check that the instruction-tune loader can read a sample.
        try:
            one_sample = _read_first_batch(train_urls, load_sample_instruction_tune)
            output["instruction_tune_compatible"] = all(
                k in one_sample.keys() for k in instruction_tune_keys
            )

        except Exception as e:
            print(e)
            output["iterable"] = False
    else:
        output["image_ext"] = None
        output["iterable"] = False

    output["wd_path"] = wd_path

    return output


In [100]:
# Smoke test: run scan_wd_path on a single dataset directory.
# The commented-out assignments are alternative directories to try.
# wd_path = '/fsx_1/data_curation_output/cyprien/output/caption_llama11b_fair_m2c2_clip_filtered_0.37985'
# wd_path = '/fsx_1/datasets_30days/sg_production_data_test_n_training/training_wds/20240722_20241018/dense_captions'
wd_path = '/fsx_1/datasets_30days/datarecipe_source/sg_mmllm_stage2_compliant_cap_qa_exp28_kosher_v2/20240827v4/sg_pdf_summarization_unicode'
# wd_path = '/fsx_1/datasets_30days/datarecipe_source/sg_mmllm_stage2_compliant_cap_qa_exp28_kosher_v2/20240827v4/tripple_ocr_data_v1_update'
scan_wd_path(wd_path)

sample: {'__key__': '0057_0000047400000', '__url__': '/fsx_1/datasets_30days/datarecipe_source/sg_mmllm_stage2_compliant_cap_qa_exp28_kosher_v2/20240827v4/sg_pdf_summarization_unicode/sg_pdf_summarization_unicode_id_0057_00000474.tar', 'json': b'{\n    "everstore_handle": null,\n    "image_manifold_path": "manifold://sg_scene_ai/tree/data/OCR_pdf/yymm0204_dpi200_rank19/images/yymm0204_rank19_paperID0086_chunkID0001.png",\n    "instruction": "Summarize the text in the image in 1-2 bullet points.",\n    "question": null,\n    "response": "\\u2022 Alice\'s most general strategy is to prepare a state | \\u03c8\'\\u27e9=\\u2211_i=1^k \\u03bc_i | \\u03c6\'_i\\u27e9 | \\u03d5_i\\u27e9, which can be split into two parts | \\u03c8\'_0\\u27e9 and | \\u03c8\'_1\\u27e9 after Bob\'s measurement.\\n\\u2022 Bob tests the state | \\u03c8\'_1\\u27e9 against the state | \\u03c8\\u0303_\\u03031\\u0303\\u27e9, which would have resulted if Alice had prepared the honest state | \\u03c8\\u0303\\u27e9. The in

{'train_urls': 1470,
 'iterable': True,
 'instruction_tune_compatible': True,
 'text': '',
 'caption': '',
 'instruction': 'Summarize the text in the image in 1-2 bullet points.',
 'question': '',
 'response': "• Alice's most general strategy is to prepare a state | ψ'⟩=∑_i=1^k μ_i | φ'_i⟩ | ϕ_i⟩, which can be split into two parts | ψ'_0⟩ and | ψ'_1⟩ after Bob's measurement.\n• Bob tests the state | ψ'_1⟩ against the state | ψ̃_̃1̃⟩, which would have resulted if Alice had prepared the honest state | ψ̃⟩. The inner product between | ψ'_1⟩ and | ψ_1⟩ is maximized if | φ'_i⟩=| i⟩ for all i, and sending | ψ”⟩=∑_i=1^k μ_i | i⟩ | ϕ_i⟩ instead of | ψ'⟩ only increases Alice's success probability.",
 'image_ext': 'png',
 'wd_path': '/fsx_1/datasets_30days/datarecipe_source/sg_mmllm_stage2_compliant_cap_qa_exp28_kosher_v2/20240827v4/sg_pdf_summarization_unicode'}

In [14]:
# Scan every dataset directory listed (one per line) in the recipe text file
# and collect one summary dict per directory.
wd_paths = read_text_lines("/fsx_0/user/tranx/experiments/aligner/recipe/mm10.1_stage2_easy_med.txt")

results = []
# Uncomment to debug against a single directory instead of the full list.
# wd_paths = ["/fsx_3/dataset01/cauldron/tallyqa"]
for wd_path in tqdm(wd_paths):
    print(wd_path)
    output = scan_wd_path(wd_path)
    results.append(output)
    
# One row per scanned directory; columns are the keys returned by scan_wd_path.
df = pd.DataFrame(data=results)
df

  0%|          | 0/80 [00:00<?, ?it/s]

/fsx_1/data_curation_output/cyprien/output/wds_caption_llama11b_fair_m2c2_clip_filtered_0.37985/all/


  8%|▊         | 6/80 [00:00<00:04, 16.68it/s]

{'image': ['/fsx_1/data_curation_output/cyprien/output/wds_caption_llama11b_fair_m2c2_clip_filtered_0.37985/all/0096/0096_00000169.tar/0096_0000016900000.jpeg'], 'image_manifold_path': ['/fsx_1/data_curation_output/cyprien/output/wds_caption_llama11b_fair_m2c2_clip_filtered_0.37985/all/0096/0096_00000169.tar/0096_0000016900000.jpeg'], 'question': ['Write a detailed caption for this image.'], 'synthetic_detailed_caption': ['The image shows the cover of a book titled "4,5-Dihydro and Disubstituted Imidazolidin Derivatives" by Rafet Kilincarslan, with a serene ocean scene at the top featuring a bright sun and sea. The image also features a logo for Lambert Academic Publishing and a red stripe at the bottom.'], 'text': ['The image shows the cover of a book titled "4,5-Dihydro and Disubstituted Imidazolidin Derivatives" by Rafet Kilincarslan, with a serene ocean scene at the top featuring a bright sun and sea. The image also features a logo for Lambert Academic Publishing and a red stripe a

 16%|█▋        | 13/80 [00:00<00:03, 21.87it/s]

{'image_manifold_path': ['manifold://sg_scene_ai/tree/data/arxiv/2109/2109.01753v1/06.png'], 'response': ['additional features related to learning context, interaction times and hint usage provided by the ASSISTment 2009 [Feng et al. (2009)] and Junyi15 [Chang et al. (2015)] datasets. While that work employs most information contained in the Junyi15 dataset, it fails to utilize the prerequisite structure among topics in the curriculum, and does not evaluate the potential benefit of those features for logistic regression models. EKT [Liu et al. (2019)] uses the question text to learn exercise embeddings which are then used for downstream performance predictions. MVKM [Zhao et al. (2020)] uses a multi-view tensor factorization to model knowledge acquisition from different types of learning materials (quizzes, videos, ...). SAINT+ [Shin et al. (2021)] and MUSE [Zhang et al. (2021)] augment transformer models with interaction time features to capture short-term memorization and forgetting.

 20%|██        | 16/80 [00:00<00:03, 16.90it/s]

{'ds': ['2024-09-09'], 'keep': tensor([True]), 'shard_id': tensor([84]), 'everstore_handle': ['GAe3PBCvWiurBbcCAEkCkeBe9ykqbqZyAAAB'], 'instruction': ['Answer the question using a single word or a continuous phrase from the image.\nThe phase can span multiple lines.\nThe phrase is less than 15 words.'], 'question': ["How much did the world's largest cheese weigh?"], 'response': ['34,591 lbs (15,723 kg)'], 'source_table': ['mmai_ig_human_texts_curated_v1_0909/ds=2024-09-09'], 'image_key': ['png'], '__key__': ['0083_0000000000000']}
{'train_urls': 100, 'iterable': True, 'instruction_tune_compatible': None, 'text': '', 'caption': '', 'instruction': 'Answer the question using a single word or a continuous phrase from the image.\nThe phase can span multiple lines.\nThe phrase is less than 15 words.', 'question': "How much did the world's largest cheese weigh?", 'response': '34,591 lbs (15,723 kg)'}
/fsx_0/datasets_30days/flywheel/sg_mmllm_sft_sg_data_curation_flywheel_1p3k_0917
{'ds': ['202

 24%|██▍       | 19/80 [00:01<00:05, 10.23it/s]

{'ds': ['2024-09-29'], 'keep': tensor([True]), 'shard_id': tensor([26]), 'everstore_handle': ['GA3BgBvVn2qrjRwEAI9KbyU4KqoNbt4aAQAD'], 'instruction': ['Answer the question using a single word or a continuous phrase from the image. The phase can span multiple lines. The phrase is less than 15 words.'], 'question': ['What is the Total deferred revenue and deposits for June 30, 2018?'], 'response': ['$91 million'], 'source_table': ['syn_img_human_txt_3k_v1_instruction_0927/ds=2024-09-27'], 'image_key': ['jpg'], '__key__': ['0026_0000000000000']}
{'train_urls': 100, 'iterable': True, 'instruction_tune_compatible': None, 'text': '', 'caption': '', 'instruction': 'Answer the question using a single word or a continuous phrase from the image. The phase can span multiple lines. The phrase is less than 15 words.', 'question': 'What is the Total deferred revenue and deposits for June 30, 2018?', 'response': '$91 million'}
/fsx_0/datasets_30days/flywheel/sg_mmllm_sft_syn_img_flywheel_2k_rotated_0

 26%|██▋       | 21/80 [00:01<00:07,  8.37it/s]

{'ds': ['2024-10-10'], 'index': tensor([69]), 'everstore_handle': ['GA6ZpQDW21nCwuACAIBqsn4AAAAAbkULAAAB'], 'question': ['What is the total amount due on this receipt?'], 'response': ['$19.60'], 'instruction': ['Answer the question using a single word or a continuous phrase from the image. The phase can span multiple lines. The phrase is less than 15 words.'], 'image_key': ['jpg'], '__key__': ['0069_0000000000000']}
{'train_urls': 100, 'iterable': True, 'instruction_tune_compatible': None, 'text': '', 'caption': '', 'instruction': 'Answer the question using a single word or a continuous phrase from the image. The phase can span multiple lines. The phrase is less than 15 words.', 'question': 'What is the total amount due on this receipt?', 'response': '$19.60'}
/fsx_0/datasets_30days/flywheel/data_flywheel_5k_ig_no_val_4p7k_rotated_1011
{'ds': ['2024-10-11'], 'index': tensor([120]), 'image': ['GCZR0ACCrfRPwRsCALoNijQAAAAAbkULAAAB'], 'updated_path': ['manifold://sg_scene_ai/tree/docvqa_s

 30%|███       | 24/80 [00:02<00:05,  9.71it/s]

{'everstore_handle': ['FksuDgA2Z-C8Rd8AAFj3UzJuRQsAAAE:'], 'annotation_order': tensor([3]), 'response': ['23.1'], 'question': ['What is the percentage of families with an average yearly income of 10-14,999?'], 'image_key': ['jpg'], '__key__': ['0003_0000000000000']}
{'train_urls': 6, 'iterable': True, 'instruction_tune_compatible': None, 'text': '', 'caption': '', 'instruction': '', 'question': 'What is the percentage of families with an average yearly income of 10-14,999?', 'response': '23.1'}
/fsx_0/datasets_30days/data_flywheel_5k_ig_prod_batch_2_1015/202401016
{'everstore_handle': ['GJwlNRDKnDFPQMUAAK0pN7bmcS9xbqZyAAAB'], 'annotation_order': tensor([1]), 'question': ['What is the code at the top right side of the page?'], 'response': ['090741'], 'image_key': ['jpg'], '__key__': ['0001_0000000000000']}
{'train_urls': 6, 'iterable': True, 'instruction_tune_compatible': None, 'text': '', 'caption': '', 'instruction': '', 'question': 'What is the code at the top right side of the page?

 48%|████▊     | 38/80 [00:02<00:01, 26.94it/s]

{'image_manifold_path': ['manifold://sg_scene_ai/tree/llm_mm_aligner/coco/train2017/000000289591.jpg'], 'instruction': ['\nAnswer with the options letter from the given choices directly.\n'], 'question': ['Question: What state is this city in?\nA: illinois\nB: new zealand\nC: new york\nD: ohio'], 'response': ['\nAnswer: D'], 'image_key': ['png'], '__key__': ['0009_0000008300000']}
{'train_urls': 114, 'iterable': True, 'instruction_tune_compatible': None, 'text': '', 'caption': '', 'instruction': '\nAnswer with the options letter from the given choices directly.\n', 'question': 'Question: What state is this city in?\nA: illinois\nB: new zealand\nC: new york\nD: ohio', 'response': '\nAnswer: D'}
/fsx_1/datasets_30days/datarecipe_source/sg_mmllm_stage2_compliant_cap_qa_exp28_kosher_v2/20240827v4/mmllm_sft_clevr_math_train_diverse_instr
{'image_manifold_path': ['manifold://sg_scene_ai/tree/llm_mm_aligner/datasets/cler-math/CLEVR_v1.0/images/train/CLEVR_train_055653.png'], 'instruction': ['

 60%|██████    | 48/80 [00:02<00:00, 37.05it/s]

{'image_manifold_path': ['manifold://sg_scene_ai/tree/llm_mm_aligner/datasets/figureqa/images/67714.png'], 'instruction': ['\nAnswer the question using either Yes or No.'], 'question': ['Does Deep Sky Blue have the maximum area under the curve?'], 'response': ['Yes'], 'image_key': ['png'], '__key__': ['0044_0000208800000']}
{'train_urls': 4425, 'iterable': True, 'instruction_tune_compatible': None, 'text': '', 'caption': '', 'instruction': '\nAnswer the question using either Yes or No.', 'question': 'Does Deep Sky Blue have the maximum area under the curve?', 'response': 'Yes'}
/fsx_1/datasets_30days/datarecipe_source/sg_mmllm_stage2_compliant_cap_qa_exp28_kosher_v2/20240827v4/s2_ott_train_with_context_and_diversified_instr
{'image_manifold_path': ['manifold://sg_scene_ai/tree/llm_mm_aligner/datasets/OTT_QA/table_images_long_text/Keke_Palmer_1.png'], 'instruction': ["\nIdentify the solution to the question posed in the table. Lauren Keyana Keke Palmer (born August 26, 1993) is an Ameri

 76%|███████▋  | 61/80 [00:02<00:00, 44.86it/s]

{'dataset': ['train'], 'image': ['/fsx_1/datasets_30days/sg_production_data_test_n_training/wds/20240722_20241018/0033/0033_00000003/0033_0000000300331.jpg'], 'caption': ['A variety of shower poufs in different colors are displayed on a rack.'], 'image_key': ['jpg'], '__key__': ['0159_1594_0000']}
{'train_urls': 1648, 'iterable': True, 'instruction_tune_compatible': None, 'text': '', 'caption': 'A variety of shower poufs in different colors are displayed on a rack.', 'instruction': '', 'question': '', 'response': ''}
/fsx_1/datasets_30days/sg_production_data_test_n_training/training_wds/20240722_20241018/dense_captions
{'dataset': ['dev'], 'image': ['/fsx_1/datasets_30days/sg_production_data_test_n_training/wds/20240722_20241018/0142/0142_00000004/0142_0000000400370.jpg'], 'caption': ["The image shows a person's hand holding a smartphone with a black screen displaying text in Spanish. The phone appears to be turned on, and the text is visible on the screen. The person is wearing a blue

100%|██████████| 80/80 [00:03<00:00, 26.37it/s]

{'dataset': ['dev'], 'image': ['/fsx_1/datasets_30days/sg_production_data_test_n_training/wds/20240722_20241018/0090/0090_00000005/0090_0000000500443.jpg'], 'question': ['What type of plant is this?'], 'response': ['It appears to be a Calathea plant.'], 'instruction': [''], 'image_key': ['jpg'], '__key__': ['0049_2469_0000']}
{'train_urls': 8238, 'iterable': True, 'instruction_tune_compatible': None, 'text': '', 'caption': '', 'instruction': '', 'question': 'What type of plant is this?', 'response': 'It appears to be a Calathea plant.'}
/fsx_3/bucket/cyprien/data/wds_plants_wiki_qa1_p
{'everstore_handle': ['GPh1Ywecc0dpaMEAAPEqKcKz2gFjbihaAAAB'], 'instruction': ['Answer the question using a single word or a continuous phrase from the image.\nThe phrase can span multiple lines.'], 'question': ['What is the name of this plant?'], 'response': ['Gardenia gummifera'], 'caption': ['Gardenia gummifera is a species of plant in the family Rubiaceae. It is endemic to India.'], 'source_table': ['




Unnamed: 0,train_urls,iterable,instruction_tune_compatible,text,caption,instruction,question,response,image_ext,wd_path
0,30104,True,True,"The image shows the cover of a book titled ""4,...",,"You are my helpful, respectful and honest voic...",Write a detailed caption for this image.,"The image shows the cover of a book titled ""4,...",jpeg,/fsx_1/data_curation_output/cyprien/output/wds...
1,3000,True,True,BB10001 Fraum: - whose fiancee Sandra- Edge le...,,,,,jpg,/fsx_3/dataset01/idl-wds_v2
2,145,True,True,Get Free Tasting Beer An Insiders Guide To The...,,Answer the question using a single word or a c...,What is the title of the document?,Tasting Beer An Insiders Guide To The Worlds G...,png,/fsx_0/user/qyh/data/idl_qa_filtered_tar_only/
3,1800,True,True,TENDER BSGEE 2021-008 INSURANCES OF THE EUROPE...,,,,,png,/fsx_3/dataset01/pdfa-eng-wds-converted
4,108,True,True,,,\nAnswer the question using a single word or p...,Who is the author of this book?,Zondervan,png,/fsx_1/datasets_30days/datarecipe_source/sg_mm...
5,4603,True,True,,,,,\begin{remark} It is possible to define Weyl g...,png,/fsx_1/datasets_30days/datarecipe_source/sg_mm...
6,4447,True,True,,,,,becomes predominant for negative \(\lambda_{a}...,png,/fsx_1/datasets_30days/datarecipe_source/sg_mm...
7,13337,True,True,,,,,additional features related to learning contex...,png,/fsx_1/datasets_30days/datarecipe_source/sg_mm...
8,11637,True,True,,,,,The limited-knowledge adversary Charlie eavesd...,png,/fsx_1/datasets_30days/datarecipe_source/sg_mm...
9,7752,True,True,,,\nAnswer the question using a single word.,What is the value of role?,5,png,/fsx_1/datasets_30days/datarecipe_source/sg_mm...


In [15]:
# Persist the scan summary as CSV (the DataFrame index is not written).
df.to_csv("/fsx_0/user/tranx/experiments/aligner/recipe/mm10.1_stage2_easy_med_verification.csv", index=False)

In [93]:
# NOTE(review): the saved output of this cell has no `instruction_tune_compatible`
# column and shows "YES" flags, so it appears to come from an earlier version of
# scan_wd_path — re-run to refresh.
df.head()

Unnamed: 0,train_urls,iterable,text,caption,instruction,question,response,image_ext,wd_path
0,30104,True,YES,,YES,YES,YES,jpeg,/fsx_1/data_curation_output/cyprien/output/wds...
1,3000,True,YES,,YES,YES,YES,jpg,/fsx_3/dataset01/idl-wds_v2
2,1800,True,YES,,YES,YES,YES,png,/fsx_3/dataset01/pdfa-eng-wds-converted
3,108,True,YES,,YES,YES,YES,png,/fsx_1/datasets_30days/datarecipe_source/sg_mm...
4,4603,True,YES,,YES,YES,YES,png,/fsx_1/datasets_30days/datarecipe_source/sg_mm...


# Create wd_recipe

In [9]:
# Load the recipe table that drives the recipe generation further down.
# Despite the .csv extension, the file is parsed as tab-separated.
# NOTE(review): this rebinds `df`, which earlier cells use for the scan summary.
# file = "/fsx_0/user/tranx/rsync/llm_mm_aligner/experiments/aws/mm10.1/stage2/recipes/stage2_cutoff_20241030.csv"
file = "/fsx_0/user/tranx/experiments/aligner/recipe/mm10.1_stage2_r2a.csv"
df = pd.read_csv(file, delimiter='\t')
df.head(2)

Unnamed: 0,train_urls,iterable,instruction_tune_compatible,text,caption,instruction,question,response,image_ext,wd_path,vol,multiplier,total (M)
0,30104,True,True,YES,,YES,YES,YES,jpeg,/fsx_1/data_curation_output/cyprien/output/wds...,30000000,1,30.0
1,3000,True,True,YES,,,,,jpg,/fsx_3/dataset01/idl-wds_v2,20000000,1,20.0


In [10]:
# Pretty-printer used to dump each generated recipe entry.
pp = pprint.PrettyPrinter(indent=4)

def get_name_from_path(path):
    """Derive a short dataset name from a '/'-separated path.

    Components are examined from last to first. The first one containing
    ``_`` or ``-`` is returned; if no component qualifies, the last
    non-empty component is returned instead.

    Args:
        path: Path string split on ``/``.

    Returns:
        The selected path component.

    Raises:
        ValueError: If the path has no non-empty component.
    """
    parts_from_end = path.split("/")[::-1]

    # First choice: a component that looks like a meaningful name.
    named = next((part for part in parts_from_end if "_" in part or "-" in part), None)
    if named is not None:
        return named

    # Fallback: any non-empty component of the path.
    fallback = next((part for part in parts_from_end if len(part) > 0), None)
    if fallback is not None:
        return fallback

    raise ValueError(f"Unable to get_name_from_path: {path}")
    
# Build the recipe: one entry per row of `df`, keyed by a name derived from
# the row's wd_path.
# NOTE(review): two paths that yield the same name overwrite each other
# silently in `recipe` — confirm names are unique for this table.
recipe = {}
# NOTE(review): DEFAULT_QUESTION is not used by the active code in this cell.
DEFAULT_QUESTION = "What am I looking at?"

for i, row in df.iterrows():

    name = get_name_from_path(row.wd_path)
    # NOTE(review): the key is spelled "multipler" (no second "i") — confirm
    # whether the recipe consumer expects this exact spelling.
    r = {
        "paths": [row.wd_path],
        "multipler": row.multiplier
    }
    
    # Handle missing 'response': when the table does not flag a response
    # column with "YES", map 'text' (preferred) or 'caption' onto 'response';
    # fail if neither is flagged.
    if row.response != "YES":
        if row.text == "YES":
            r["rename_map"] = {"text": "response"}
        elif row.caption == "YES":
            r["rename_map"] = {"caption": "response"}
        else:
            raise ValueError(f"Unable to find a suitable field for 'response' in {row.wd_path}")
        
    # Disabled: injection of default 'question' / 'instruction' values.
    # # Handle missing 'question'. If missing question, instruction is always missing also
    # if row.question != "YES":
    #     r["new_keys"] = {
    #         "question": "What am I looking at?",
    #         "instruction": "Look at the image and answer the following question."
    #     }
    # # Handle having 'question' but not 'instruction'
    # elif row.instruction != "YES":
    #     r["new_keys"] = {
    #         "instruction": "Look at the image and answer the following question."
    #     }
           
    recipe[name] = r 
    
    # Echo each entry for visual inspection.
    print("-"*20)
    print(row.wd_path)
    print(name)
    pp.pprint(r)
        
    
# pp.pprint(recipe)

--------------------
/fsx_1/data_curation_output/cyprien/output/wds_caption_llama11b_fair_m2c2_clip_filtered_0.37985/all/
wds_caption_llama11b_fair_m2c2_clip_filtered_0.37985
{   'multipler': 1,
    'paths': [   '/fsx_1/data_curation_output/cyprien/output/wds_caption_llama11b_fair_m2c2_clip_filtered_0.37985/all/']}
--------------------
/fsx_3/dataset01/idl-wds_v2
idl-wds_v2
{   'multipler': 1,
    'paths': ['/fsx_3/dataset01/idl-wds_v2'],
    'rename_map': {'text': 'response'}}
--------------------
/fsx_0/user/qyh/data/idl_qa_filtered_tar_only/
idl_qa_filtered_tar_only
{'multipler': 10, 'paths': ['/fsx_0/user/qyh/data/idl_qa_filtered_tar_only/']}
--------------------
/fsx_1/datasets_30days/datarecipe_source/sg_mmllm_stage2_compliant_cap_qa_exp28_kosher_v2/20240827v4/mmllm_sft_ocr_vqa
mmllm_sft_ocr_vqa
{   'multipler': 40,
    'paths': [   '/fsx_1/datasets_30days/datarecipe_source/sg_mmllm_stage2_compliant_cap_qa_exp28_kosher_v2/20240827v4/mmllm_sft_ocr_vqa']}
--------------------
/fsx_

In [11]:
# Write the generated recipe dict to disk as indented JSON.
with open("/fsx_0/user/tranx/experiments/aligner/recipe/mm10.1_stage2_r2a.json", "w") as f:
    json.dump(recipe, f, indent=4)

In [None]:
# Paraphrased instruction strings.
# NOTE(review): not referenced by any other cell shown here — presumably a
# pool of alternatives to sample from; confirm the intended use.
instructions = [
    "Look at the image and answer the following question.",
    "Examine the image carefully and respond to the question.",
    "Observe the picture and provide an answer to the question.",
    "Analyze the image and answer the question that follows.",
    "Study the photograph and respond to the inquiry.",
    "Look closely at the image and answer the subsequent question.",
    "Review the picture and provide an answer to the question.",
    "Inspect the image and answer the question that is posed.",
    "Consider the image and respond to the question provided.",
    "View the image and answer the question that follows.",
    "Focus on the image and provide an answer to the question.",
    "Pay attention to the details in the image and answer the question.",
    "Scrutinize the image and provide a thoughtful response to the question.",
    "Evaluate the elements in the picture and answer the question.",
    "Interpret the image and respond to the question accordingly.",
    "Look at the image from different angles and answer the question.",
    "Contemplate the scene depicted in the image and provide an answer.",
    "Reflect on the image and answer the question that is asked.",
    "Delve into the image and respond to the question with insight.",
    "Survey the image and provide a comprehensive answer to the question.",
    "Glance over the image and answer the question with clarity."
]

# Paraphrases of a generic "what is in this image?" question.
# NOTE(review): also unused in the visible cells — confirm the intended use.
questions = [
    "What am I observing here?",
    "What is this image showing me?",
    "What do I see in this picture?",
    "What is depicted in this image?",
    "What is presented in this photograph?",
    "What is visible in this scene?",
    "What am I viewing in this image?",
    "What is captured in this picture?",
    "What is illustrated in this image?",
    "What am I witnessing in this photo?",
    "What is this image illustrating?",
    "What is portrayed in this picture?",
    "What am I seeing in this image?",
    "What is this scene showing?",
    "What is featured in this photograph?",
    "What is this picture displaying?",
    "What am I gazing at in this image?",
    "What is this image capturing?",
    "What is this photo depicting?",
    "What am I examining in this picture?"
]

In [48]:
import hashlib

In [76]:
# NOTE(review): scratch cell — `im` is not defined in any visible cell, and the
# saved output is an AttributeError ('bytes' object has no attribute 'to_string').
im.to_string()

AttributeError: 'bytes' object has no attribute 'to_string'

In [52]:
# Scratch check: MD5 hex digest of a UTF-8 encoded test string.
h = hashlib.md5(("hahae").encode('utf-8')).hexdigest()
h

'8cb6a8f3f0059f2ea4c940608d6efb0b'

In [54]:
import random

In [53]:
# Scratch check: interpret a hex digest string as a base-16 integer.
hash_string = '8cb6a8f3f0059f2ea4c940608d6efb0b'
hash_integer = int(hash_string, 16)
print(hash_integer)

187040344209681663984706783120584538891


In [70]:
# Scratch check: nested dict.get falls back response -> text -> 4, so with
# both keys absent (commented out below) the result is 4.
data = {
    
    "name": 1,
    # "response": 3,
    # "text": 2
}

x = data.get("response", data.get("text", 4))
print(x)

4


In [62]:
# Scratch check: a Random instance seeded with the hash-derived integer, so
# the choice is reproducible for a given hash.
r = random.Random(hash_integer)
r.choice([1,2,3])

2

In [71]:
x = {
    "text": "Friends of Sturt Gorge Friends of Sturt Gorge Annual Membership Renewal Membership Renewal Dear Member Name 1: \u2026\u2026\u2026\u2026\u2026\u2026\u2026\u2026\u2026\u2026\u2026\u2026\u2026\u2026 Your annual membership renewal is due as at 1st January and remains at $10.00 per member or family living at the same address. \u2026\u2026\u2026\u2026\u2026\u2026\u2026\u2026\u2026\u2026\u2026\u2026\u2026\u2026 Name 2 : Address: \u2026\u2026\u2026\u2026\u2026\u2026\u2026\u2026\u2026\u2026\u2026\u2026\u2026\u2026 The preferred method of payment is by direct or electronic debit to our bank account as detailed below. Ensure your name is quoted in the \u2026\u2026\u2026\u2026\u2026\u2026\u2026\u2026\u2026\u2026. Postcode\u2026\u2026\u2026.. reference panel so that it will appear on our bank statement, and also Email: \u2026\u2026\u2026\u2026\u2026\u2026\u2026\u2026\u2026\u2026\u2026 request your bank to advise us by email of the transaction (if this service is available). Telephone: (day) \u2026\u2026\u2026\u2026\u2026\u2026\u2026\u2026\u2026\u2026.. (evening) \u2026\u2026\u2026\u2026\u2026\u2026\u2026\u2026\u2026\u2026.. Bank Account details: Name: Friends of Sturt Gorge BSB No: 105-078 Membership Fees: 1yr 2yrs 3yrs Account No. 054419440 Reference YOUR NAME o Single & Family $10.00 $20.00 $30.00 bobgrant@adam.com.au e mail o Donation $\u2026\u2026\u2026\u2026\u2026\u2026............... You may also pay in person at any meeting or working bee. If remitting by the postal service please do not send cash. Have your Total Enclosed $\u2026\u2026\u2026\u2026.......................... cheque made payable to Friends of Sturt Gorge and post to \u201cThe Treasurer c/o 14 Gorge Road, Bellevue Heights 5050\u201d Date:................................. 
If remitting by the postal service OR if you have changed any of your details it would be helpful if you could complete the attached Send to: renewal notice and e-mail it to bobgrant@adam.com.au or hand it to the treasurer or send it to the above address with your remittance. A receipt The Treasurer will be issued and an acknowledgement sent by e-mail where Friends of Sturt Gorge appropriate. c/o 14 Gorge Road Bellevue Heights SA 5050 Yours sincerely Bob Grant Payment may be made by electronic or direct credit through your Membership Officer bank. Phone: 7329 8296 Please show your name on the Direct Credit Entry. e-mail: bobgrant@adam.com.au Bank Account details: Name: Friends of Sturt Gorge BSB No: 105-078 Account No. 054419440 Reference YOUR NAME"
}

In [19]:
# Scratch check: ast.literal_eval turns the text of a dict literal into a dict,
# while the original value remains a str.
import ast
s = "{1:2, 3:4}"
sd = ast.literal_eval(s)
isinstance(s, str), isinstance(sd, dict), isinstance(sd, str)

(True, True, False)

In [20]:
sd

{1: 2, 3: 4}