In [55]:
import os
import collections
import shutil
import json
import numpy as np

In [73]:
# Root folder holding the product files; it is scanned recursively.
path = "CN/A001"
# Seed handed to the train/test split so the sampling can be repeated.
seed = 42
# Fraction of each product's files that goes to training; with 1 every file
# is used for training and the test split stays empty.
train_ratio = 1
# Prefix prepended to the names of the generated JSON dataset files.
dst_path_prefix = "CN_A001_"

## Count the total number of files

Given the path to a folder which may contain subfolders, count the total number of files in this folder.

In [74]:
def get_number_of_files(path):
    """Count every non-directory entry below `path`, subfolders included.

    params:
    path: folder to scan

    returns:
    the number of entries under `path` that are not directories
    """
    total = 0
    pending = [path]  # folders that still have to be listed
    while pending:
        folder = pending.pop()
        for name in os.listdir(folder):
            child = os.path.join(folder, name)
            if os.path.isdir(child):
                pending.append(child)
            else:
                total += 1
    return total

In [75]:
# Total number of files under `path`, subfolders included.
print(get_number_of_files(path))

8129


## Get unique IDs (i.e. different products)

Assume each filename starts with a unique ID (which could be an SKU, an EAN, etc.) followed by an underscore. Given the path of a folder which may contain subfolders, get the unique IDs (i.e., the different products) in this folder.

In [76]:
def get_unique_ids(path):
    """Return the set of unique IDs of all files below `path`.

    The ID of a file is the part of its name in front of the first underscore
    (the whole name when it contains no underscore).
    """
    collected = set()
    get_unique_ids_help(path, collected)
    return collected

def get_unique_ids_help(path, uids):
    """Add the ID of every file below `path` (subfolders included) to the set `uids`."""
    pending = [path]  # folders that still have to be listed
    while pending:
        folder = pending.pop()
        for name in os.listdir(folder):
            child = os.path.join(folder, name)
            if os.path.isdir(child):
                pending.append(child)
                continue
            # Text before the first underscore, or the full name without one.
            uids.add(name.partition("_")[0])

In [77]:
# Number of distinct product IDs found under `path`.
print(len(get_unique_ids(path)))

127


## Get mapping from uid + description to file location

When products are differentiated by both EAN and description, we map each uid + description pair to the locations of its files. Assume uid and description are the first two underscore-separated parts of the filename.

In [78]:
def get_mapping_from_uid_description_to_location(path):
    """Map every "<uid>_<description>" key to the paths of its files below `path`.

    returns:
    a collections.defaultdict(list) whose keys join the first two
    underscore-separated parts of a filename and whose values list the
    matching file paths
    """
    mapping = collections.defaultdict(list)
    get_mapping_from_uid_description_to_location_help(path, mapping)
    return mapping

def get_mapping_from_uid_description_to_location_help(path, uid_description_to_location):
    """Fill `uid_description_to_location` in place with the files below `path`.

    A filename without an underscore has no description part and raises
    IndexError.
    """
    for name in os.listdir(path):
        child = os.path.join(path, name)
        if os.path.isdir(child):
            # Descend right away so that entries keep the listing order.
            get_mapping_from_uid_description_to_location_help(child, uid_description_to_location)
            continue
        parts = name.split("_")
        uid, description = parts[0], parts[1]
        uid_description_to_location[f"{uid}_{description}"].append(child)

In [79]:
# Build the "<uid>_<description>" -> file paths mapping for the configured folder.
uid_description_to_location = get_mapping_from_uid_description_to_location(path)
# One uid may come with several descriptions, so there are at least as many keys as uids.
assert len(uid_description_to_location) >= len(get_unique_ids(path))
# The per-key lists together must account for every file in the folder.
assert sum(map(len, uid_description_to_location.values())) == get_number_of_files(path)

## Train test split

Given the path of a folder which may contain subfolders, split files of each product into two parts, one for training and the other for testing, with a specified ratio. Assume that products with the same EAN may have different descriptions. We differentiate products by both EAN and description.

In [80]:
def train_test_split_ean_description(path, train_ratio=0.9, seed=43):
    """
    Split the files of every (uid, description) product into a train and a test part.

    params:
    path: folder (may contain subfolders) holding the product files
    train_ratio: the proportion of training data, a number in [0, 1]; a product
        with n files contributes int(n * train_ratio) of them to training
    seed: seed of the random generator that picks the training files

    returns:
    (train_uid, train_description, train_file_location,
     test_uid, test_description, test_file_location): six lists; the three
    train lists are aligned element by element, and so are the three test lists

    raises:
    ValueError: if train_ratio is not within [0, 1]
    """
    if not 0 <= train_ratio <= 1:
        raise ValueError("train_ratio must be within [0, 1], got {!r}".format(train_ratio))
    # A private generator draws the same sequence as np.random.seed(seed)
    # followed by np.random.choice, but leaves the global NumPy random state
    # of the notebook untouched.
    rng = np.random.RandomState(seed)
    uid_description_to_location = get_mapping_from_uid_description_to_location(path)
    train_uid, train_description, train_file_location = [], [], []
    test_uid, test_description, test_file_location = [], [], []
    for uid_description, locations in uid_description_to_location.items():
        # Keys are "<uid>_<description>" and neither part contains an underscore.
        uid, description = uid_description.split("_", 1)
        n = len(locations)
        # int() truncates, so float noise can cost one sample
        # (e.g. 0.29 * 100 == 28.999999999999996 gives k == 28).
        k = int(n * train_ratio)
        picked = rng.choice(n, k, replace=False)
        train_uid.extend([uid] * k)
        train_description.extend([description] * k)
        train_file_location.extend(locations[i] for i in picked)
        # The files that were not drawn go to the test part, in ascending index
        # order (explicit, instead of depending on set iteration order).
        picked_indices = set(picked.tolist())
        test_uid.extend([uid] * (n - k))
        test_description.extend([description] * (n - k))
        test_file_location.extend(locations[i] for i in range(n) if i not in picked_indices)
    return train_uid, train_description, train_file_location, test_uid, test_description, test_file_location

In [81]:
# Split every product's files with the ratio and seed from the configuration cell.
(
    train_uid,
    train_description,
    train_file_location,
    test_uid,
    test_description,
    test_file_location,
) = train_test_split_ean_description(path, train_ratio, seed)

In [82]:
# Number of training samples (one uid entry per training file).
len(train_uid)

8129

In [83]:
# Number of test samples (one uid entry per test file).
len(test_uid)

0

In [84]:
# One description entry per training file, so this matches len(train_uid).
len(train_description)

8129

In [85]:
# Number of distinct descriptions among the training samples.
len(set(train_description))

134

## Prepare datasets for fine-tuning GLM-4V-9B

The dataset has the format List[Dict[str, Any]]: each record holds the keys `query`, `response`, and `images`. Refer to https://github.com/modelscope/swift/blob/main/swift/utils/np_utils.py

In [91]:
def create_json_file_glm_4v_9b(dst, uid, description, file_location, query, mode):
    """Write a dataset as a JSON list of {"query", "response", "images"} records.

    params:
    dst: path of the JSON file to write (UTF-8, non-ASCII characters kept as-is)
    uid: list of product IDs, one per sample
    description: list of product descriptions, indexed like uid
    file_location: list of image paths, indexed like uid
    query: the question stored in every record
    mode: content of the response: "ean" (the uid), "description", or
        "description_ean" (a sentence holding both)

    raises:
    ValueError: if mode is not one of the three supported values
    """
    valid_modes = ("ean", "description", "description_ean")
    # Check the mode before anything else so that a wrong value is reported
    # even when there are no samples, and before any file is created.
    if mode not in valid_modes:
        raise ValueError("Invalid choice: {!r}; expected one of {}".format(mode, valid_modes))

    def build_response(i):
        # Only the lists that the selected mode needs are indexed.
        if mode == "ean":
            return uid[i]
        if mode == "description":
            return description[i]
        return "商品名称是" + description[i] + ", 编号是" + uid[i]

    data = [
        {"query": query, "response": build_response(i), "images": [file_location[i]]}
        for i in range(len(uid))
    ]
    with open(dst, 'w', encoding='utf-8') as out_file:
        json.dump(data, out_file, ensure_ascii=False)

In [88]:
# Datasets whose response is the product description.
mode = "description"
query = "图中商品的名称是什么"
# File name: <prefix><split>_glm_4v_9b_<mode>_<seed>_<train ratio in percent>.json
filename = f"{dst_path_prefix}train_glm_4v_9b_{mode}_{seed}_{int(train_ratio * 100)}.json"
create_json_file_glm_4v_9b(
    dst=filename,
    uid=train_uid,
    description=train_description,
    file_location=train_file_location,
    query=query,
    mode=mode,
)
filename = f"{dst_path_prefix}test_glm_4v_9b_{mode}_{seed}_{int(train_ratio * 100)}.json"
create_json_file_glm_4v_9b(
    dst=filename,
    uid=test_uid,
    description=test_description,
    file_location=test_file_location,
    query=query,
    mode=mode,
)

In [89]:
# Datasets whose response is the product ID.
mode = "ean"
query = "图中商品的编号是什么"
# File name: <prefix><split>_glm_4v_9b_<mode>_<seed>_<train ratio in percent>.json
filename = f"{dst_path_prefix}train_glm_4v_9b_{mode}_{seed}_{int(train_ratio * 100)}.json"
create_json_file_glm_4v_9b(
    dst=filename,
    uid=train_uid,
    description=train_description,
    file_location=train_file_location,
    query=query,
    mode=mode,
)
filename = f"{dst_path_prefix}test_glm_4v_9b_{mode}_{seed}_{int(train_ratio * 100)}.json"
create_json_file_glm_4v_9b(
    dst=filename,
    uid=test_uid,
    description=test_description,
    file_location=test_file_location,
    query=query,
    mode=mode,
)

In [None]:
# Datasets whose response holds both the product description and the product ID.
mode = "description_ean"
query = "图中商品的名称和编号是什么"
# File name: <prefix><split>_glm_4v_9b_<mode>_<seed>_<train ratio in percent>.json
filename = f"{dst_path_prefix}train_glm_4v_9b_{mode}_{seed}_{int(train_ratio * 100)}.json"
create_json_file_glm_4v_9b(
    dst=filename,
    uid=train_uid,
    description=train_description,
    file_location=train_file_location,
    query=query,
    mode=mode,
)
filename = f"{dst_path_prefix}test_glm_4v_9b_{mode}_{seed}_{int(train_ratio * 100)}.json"
create_json_file_glm_4v_9b(
    dst=filename,
    uid=test_uid,
    description=test_description,
    file_location=test_file_location,
    query=query,
    mode=mode,
)

In [92]:
# Description + ID datasets, written to files that carry a "_v2" suffix.
mode = "description_ean"
query = "图中商品的名称和编号是什么"
# File name: <prefix><split>_glm_4v_9b_<mode>_<seed>_<train ratio in percent>_v2.json
filename = f"{dst_path_prefix}train_glm_4v_9b_{mode}_{seed}_{int(train_ratio * 100)}_v2.json"
create_json_file_glm_4v_9b(
    dst=filename,
    uid=train_uid,
    description=train_description,
    file_location=train_file_location,
    query=query,
    mode=mode,
)
filename = f"{dst_path_prefix}test_glm_4v_9b_{mode}_{seed}_{int(train_ratio * 100)}_v2.json"
create_json_file_glm_4v_9b(
    dst=filename,
    uid=test_uid,
    description=test_description,
    file_location=test_file_location,
    query=query,
    mode=mode,
)