## Prepare dependencies

In [None]:
import sagemaker

# Ask the SageMaker session for its default bucket.
sess = sagemaker.Session()
sagemaker_default_bucket = sess.default_bucket()

# Root S3 prefix that the later cells append their sub-folders to.
# use default bucket or change to your own s3 path
s3_destination_path = f's3://{sagemaker_default_bucket}/blip-flickr/'

print(s3_destination_path)

In [None]:
# Download s5cmd for fast download
# -L follows the GitHub release redirect. The archive is streamed straight into tar,
# which extracts only the `s5cmd` member into the current working directory.
!curl -L https://github.com/peak/s5cmd/releases/download/v2.0.0/s5cmd_2.0.0_Linux-64bit.tar.gz | tar -xz s5cmd

#### Download checkpoints

In [None]:
# Fetch model_base_retrieval_flickr.pth into the working directory.
# -c makes wget resume a partially downloaded file instead of starting over.
!wget -c https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_retrieval_flickr.pth

In [None]:
# S3 prefix that receives the checkpoint, nested under the notebook's destination path.
s3_ckpt = s3_destination_path + 'model/'
# IPython substitutes {s3_ckpt} with the Python value before handing the line to the shell.
!aws s3 cp model_base_retrieval_flickr.pth {s3_ckpt}

#### Download annotations

In [None]:
!wget https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_val.json
!wget https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_test.json
!wget https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_train.json

In [None]:
# Sample a sub dataset
import json, random

# Fixed seed so the same subsets are drawn on every Restart & Run All.
SAMPLE_SEED = 42


def sample_annotations(src_path, dst_path, k, rng):
    """Draw `k` random entries from a JSON annotation file and save them.

    Args:
        src_path: Path of the JSON file to read; it must contain a list.
        dst_path: Path the sampled subset is written to as JSON.
        k: Number of entries to draw (without replacement).
        rng: `random.Random` instance used for the draw.

    Returns:
        A `(annotations, subset)` tuple: the full loaded list and the sample.

    Raises:
        ValueError: If `k` is larger than the number of entries in the file.
    """
    # Context managers close the handles, which `json.load(open(...))` did not.
    with open(src_path, 'r') as f:
        annotations = json.load(f)
    subset = rng.sample(annotations, k)
    with open(dst_path, 'w') as f:
        json.dump(subset, f)
    return annotations, subset


# A private generator keeps the draw reproducible without touching the global
# `random` state that other cells may rely on.
_rng = random.Random(SAMPLE_SEED)

val_anno, sub_val = sample_annotations('flickr30k_val.json', 'flickr30k_val_sub.json', 500, _rng)
test_anno, sub_test = sample_annotations('flickr30k_test.json', 'flickr30k_test_sub.json', 500, _rng)
train_anno, sub_train = sample_annotations('flickr30k_train.json', 'flickr30k_train_sub.json', 2000, _rng)

In [None]:
s3_anno = s3_destination_path + 'sub-anno/'
!aws s3 cp flickr30k_val_sub.json {s3_anno}
!aws s3 cp flickr30k_test_sub.json {s3_anno}
!aws s3 cp flickr30k_train_sub.json {s3_anno}

In [None]:
# One combined list of every sampled annotation (val, then test, then train);
# the image-upload cell iterates over it.
sub_anno = [*sub_val, *sub_test, *sub_train]

#### Download images & sample subsets

In [None]:
!pip install kaggle
!cp kaggle.json /home/ec2-user/.kaggle
!kaggle datasets download -d hsankesara/flickr-image-dataset
!unzip flickr-image-dataset.zip

In [None]:
# Image should be sampled based on annotation
import os
import subprocess

from joblib import Parallel, delayed

# Upload next to the checkpoint and annotations, under the notebook's own
# destination path, rather than into a hardcoded bucket.
s3_images = s3_destination_path + 'sub-images/'


def copy_single_image(d):
    """Upload the image referenced by one annotation entry to S3.

    Args:
        d: Annotation dict; its 'image' value is appended to the local
            image directory to form the source path.

    Returns:
        The exit code of `aws s3 cp` (0 on success).
    """
    # NOTE(review): the local directory must match where the Kaggle archive
    # was extracted — confirm against the unzip output.
    img = 'flickr30k-dataset/flickr30k_images/' + d['image']
    # An argument list avoids passing file names through a shell.
    return subprocess.run(['aws', 's3', 'cp', img, s3_images]).returncode


# Several annotations may point at the same image; upload each image once.
unique_by_image = {d['image']: d for d in sub_anno}

return_codes = Parallel(n_jobs=5)(
    delayed(copy_single_image)(d) for d in unique_by_image.values()
)

# Surface failed uploads instead of silently discarding the exit codes.
failed = sum(rc != 0 for rc in return_codes)
print(f'{len(return_codes) - failed} images uploaded, {failed} failed')