# Create Dataset

1. Install clip-server and clip-client from `https://clip-as-service.jina.ai/`
2. Run clip-server in a terminal
3. Run the code below

### Preprocess data

In [5]:
import json
from clip_client import Client
import os

# Prefix for the image folders and for the generated JSON / MindRecord files.
# It is joined to relative paths by plain string concatenation, so it must end
# with a path separator.
data_dir = './'

┌────────── 🆕 New version available! ───────────┐
│ You are using clip_client 0.4.12, but 0.4.20   │
│ is available. You may upgrade it via pip       │
│ install -U clip_client. Read Changelog here.   │
└────────────────────────────────────────────────┘


In [8]:
# clean the data, check images' existence


def _list_image_files(directory):
    """Return the names of the files located directly inside ``directory``.

    Only the top level is listed, because the cells below build image paths as
    ``<directory>/<file name>``; a file in a nested sub-folder could never be
    resolved that way. Taking just the first ``os.walk`` entry also prevents a
    sub-folder's listing from replacing the top-level one.

    Returns an empty list when ``directory`` does not exist.
    """
    # The first tuple yielded by a top-down os.walk describes ``directory``
    # itself; os.walk yields nothing at all for a missing directory.
    _root, _subdirs, file_names = next(os.walk(directory), (directory, [], []))
    return list(file_names)


test_filelist = _list_image_files(data_dir + 'images/test')
val_filelist = _list_image_files(data_dir + 'images/val')
train_filelist = _list_image_files(data_dir + 'images/train')

print(len(train_filelist), len(val_filelist), len(test_filelist))

32013 24160 15718


In [9]:
# create labels


def _load_annotations(path):
    """Load a JSON file and return the list stored under its "annotations" key."""
    # The context manager closes the file handle as soon as parsing is done.
    with open(path, "r") as annotation_file:
        return json.load(annotation_file)["annotations"]


def _assign_labels(annotations, label2id, id2label):
    """Map each question id in ``annotations`` to the integer id of its answer.

    An answer that is not yet a key of ``label2id`` receives the current size
    of ``label2id`` as its id. Both lookup tables are updated in place.

    Args:
        annotations: iterable of dicts with "multiple_choice_answer" and
            "question_id" entries.
        label2id: dict answer -> id, extended in place.
        id2label: dict id -> answer, extended in place.

    Returns:
        dict mapping question_id -> label id.
    """
    questionid2label = {}
    for item in annotations:
        answer = item["multiple_choice_answer"]
        if answer not in label2id:
            new_id = len(label2id)
            label2id[answer] = new_id
            id2label[new_id] = answer
        questionid2label[item["question_id"]] = label2id[answer]
    return questionid2label


label_dict = {}
label_dict_reverse = {}

test_annotations_json = _load_annotations("annotations/test.json")
val_annotations_json = _load_annotations("annotations/val.json")
train_annotations_json = _load_annotations("annotations/train.json")
trainval_json = val_annotations_json + train_annotations_json

# Order matters: trainval answers are numbered first, so an answer that occurs
# only in the test annotations gets an id after every trainval answer.
trainval_questionid2label = _assign_labels(trainval_json, label_dict, label_dict_reverse)
test_questionid2label = _assign_labels(test_annotations_json, label_dict, label_dict_reverse)


In [10]:
# store labels
# Each file is written inside a `with` block so it is flushed and closed
# before any later cell reads it back.
# NOTE: JSON object keys are always strings, so the integer keys of
# label_dict_reverse are stored as "0", "1", ...
with open(data_dir + "label2id_dict.json", "w") as label2id_file:
    json.dump(label_dict, label2id_file, indent=2)
with open(data_dir + "id2label_dict.json", "w") as id2label_file:
    json.dump(label_dict_reverse, id2label_file, indent=2)

### Get embeddings

In [None]:
# get embeddings
# Load the question lists; `with` closes every file handle right after parsing.
with open("questions/test.json") as question_file:
    test_question_json = json.load(question_file)["questions"]
with open("questions/val.json") as question_file:
    val_question_json = json.load(question_file)["questions"]
with open("questions/train.json") as question_file:
    train_question_json = json.load(question_file)["questions"]

# Reload the label tables written by the "store labels" cell.
# NOTE: after this reload the keys of label_dict_reverse are strings
# ("0", "1", ...) because JSON object keys are always strings.
with open(data_dir + "label2id_dict.json", "r") as label2id_file:
    label_dict = json.load(label2id_file)
with open(data_dir + "id2label_dict.json", "r") as id2label_file:
    label_dict_reverse = json.load(id2label_file)

# Connect to the locally running clip-server and profile the connection.
c = Client('grpc://127.0.0.1:51000')
c.profile("")

In [12]:

def _collect_samples(question_items, available_files, image_name_template,
                     image_dir, questionid2label, dataset, removed_indexes):
    """Append every question whose image file is available to ``dataset``.

    Args:
        question_items: list of dicts with "image_id", "question" and
            "question_id" entries.
        available_files: collection of image file names known to exist.
        image_name_template: format string that receives the image id
            zero-padded to 12 characters.
        image_dir: prefix (ending with a path separator) prepended to the
            image file name.
        questionid2label: dict mapping integer question id -> label id.
        dataset: dict with "questions", "images" and "answer" lists, extended
            in place.
        removed_indexes: list that receives the position (within
            ``question_items``) of every skipped question.
    """
    # A set gives constant-time membership tests; scanning a list for every
    # question is quadratic in practice.
    available = set(available_files)
    for index, item in enumerate(question_items):
        image_name = image_name_template.format(str(item["image_id"]).zfill(12))
        if image_name not in available:
            removed_indexes.append(index)
            continue
        dataset["questions"].append(item["question"])
        dataset["images"].append(image_dir + image_name)
        dataset["answer"].append(questionid2label[int(item["question_id"])])


test = {"questions": [], "images": [], "answer": []}
trainval = {"questions": [], "images": [], "answer": []}

# test
# These two lists are never filled in this cell; they are kept so that any
# other cell referring to the names still finds them.
test_files = []
test_questions = []
test_rm_index = []
_collect_samples(test_question_json, test_filelist, "COCO_val2014_{}.jpg",
                 data_dir + "images/test/", test_questionid2label,
                 test, test_rm_index)

# trainval
# Same as above: defined but never filled here.
trainval_files = []
trainval_questions = []
# NOTE: this list mixes positions from the val and the train question lists,
# so its entries are only meaningful as a count.
trainval_rm_index = []
_collect_samples(val_question_json, val_filelist, "COCO_val2014_{}.jpg",
                 data_dir + "images/val/", trainval_questionid2label,
                 trainval, trainval_rm_index)
_collect_samples(train_question_json, train_filelist, "COCO_train2014_{}.jpg",
                 data_dir + "images/train/", trainval_questionid2label,
                 trainval, trainval_rm_index)

print(len(test_rm_index), len(trainval_rm_index))


0 21304


In [None]:
# embedding, this will take a long time


def _save_dataset(dataset, file_name):
    """Write ``dataset`` as indented JSON to ``data_dir + file_name``.

    The file is closed (and therefore flushed) before the function returns, so
    each intermediate save is complete on disk before the next encoding starts.
    """
    with open(data_dir + file_name, "w") as out_file:
        json.dump(dataset, out_file, indent=2)


# The dataset is saved after every encoding step so that a failure in a later
# step does not lose the embeddings already computed.
test["ques_emb"] = c.encode(test["questions"], show_progress=True).tolist()
_save_dataset(test, "test.json")
print("finish 1")

test["img_emb"] = c.encode(test["images"], show_progress=True).tolist()
_save_dataset(test, "test.json")
print("finish 2")

trainval["ques_emb"] = c.encode(trainval["questions"], show_progress=True).tolist()
_save_dataset(trainval, "trainval.json")
print("finish 3")

trainval["img_emb"] = c.encode(trainval["images"], show_progress=True).tolist()
_save_dataset(trainval, "trainval.json")
print("finish 4")

print("finish all")

### Create Mindrecord

In [2]:
from mindspore.mindrecord import FileWriter
from mindspore import context
import json
import numpy as np

# Configure MindSpore: PyNative execution mode, CPU device target, no graph saving.
context.set_context(mode=context.PYNATIVE_MODE, save_graphs=False, device_target='CPU')

In [3]:
def create_mindrecord(name:str):
    """Convert ``<data_dir><name>.json`` into ``<data_dir><name>.mindrecord``.

    The JSON file must contain the parallel lists "ques_emb", "img_emb" and
    "answer"; one record is written per position. An existing MindRecord file
    with the same name is overwritten.

    Args:
        name: base file name without extension (e.g. "test" or "trainval").
    """
    # Load the data; the context manager closes the file once it is parsed.
    with open(data_dir + name + ".json", "r") as json_file:
        dataset = json.load(json_file)
    print("load done")

    writer = FileWriter(data_dir + name + ".mindrecord", shard_num=1, overwrite=True)
    # Set the schema
    schema_json = {
        "ques_emb": {"type": "float32", "shape": [512]},
        "img_emb": {"type": "float32", "shape": [512]},
        "label": {"type": "int32"},
    }
    writer.add_schema(schema_json, name + "_schema")

    # Build the records. np.array() on a list of Python floats defaults to
    # float64, so the dtype is set explicitly to match the float32 schema.
    data_list = [
        {
            "ques_emb": np.array(ques_emb, dtype=np.float32),
            "img_emb": np.array(img_emb, dtype=np.float32),
            "label": int(label),
        }
        for ques_emb, img_emb, label in zip(
            dataset["ques_emb"], dataset["img_emb"], dataset["answer"]
        )
    ]

    writer.write_raw_data(data_list)
    writer.commit()
    print("finish", len(data_list))

In [6]:
# Convert <data_dir>test.json into <data_dir>test.mindrecord.
create_mindrecord("test")

load done
finish 21435


In [22]:
# Convert <data_dir>trainval.json into <data_dir>trainval.mindrecord.
create_mindrecord("trainval")

load done
finish 44506
