In [21]:
import os
import collections
import shutil
import json
import numpy as np

## Count the total number of files

Given the path to a folder which may contain subfolders, count the total number of files in this folder.

In [2]:
def get_number_of_files(path):
    """Count the files stored under ``path``, including those in nested subfolders.

    Every entry for which ``os.path.isdir`` is true is descended into; every
    other entry counts as one file.
    """
    children = (os.path.join(path, name) for name in os.listdir(path))
    return sum(
        get_number_of_files(child) if os.path.isdir(child) else 1
        for child in children
    )

In [3]:
print(get_number_of_files("CN"))

596457


In [4]:
print(get_number_of_files("CN/A001"))

8129


## Get the name of all files

Given the path to a folder which may contain subfolders, get the name of all files in this folder.

In [5]:
def get_filename(path):
    """Return the base names of all files under ``path``, subfolders included."""
    collected = []
    get_filename_help(path, collected)
    return collected


def get_filename_help(path, filenames):
    """Append to ``filenames`` the base name of every file found under ``path``.

    Directories are walked recursively; ``filenames`` is modified in place.
    """
    for entry in os.listdir(path):
        full_path = os.path.join(path, entry)
        if not os.path.isdir(full_path):
            filenames.append(entry)
            continue
        get_filename_help(full_path, filenames)

In [6]:
filenames_A001 = get_filename("CN/A001")
num_of_files_A001 = len(filenames_A001)
num_of_files_A001

8129

In [None]:
filenames_A001[:10]

In [8]:
filenames_CN = get_filename("CN")
num_of_files_CN = len(filenames_CN)
num_of_files_CN

596457

## Get unique IDs (i.e. different products)

Assume the filename starts with a unique ID (which could be SKU, EAN, etc) followed by underscore. Given the path of a folder which may contain subfolders, get unique IDs (i.e., different products) in this folder.

In [9]:
def get_unique_ids(path):
    """Return the set of unique IDs of all files under ``path``, subfolders included.

    The unique ID of a file is the part of its name before the first underscore
    (the whole name when it contains no underscore).
    """
    found = set()
    get_unique_ids_help(path, found)
    return found


def get_unique_ids_help(path, uids):
    """Add to ``uids`` (in place) the unique ID of every file found under ``path``."""
    for entry in os.listdir(path):
        full_path = os.path.join(path, entry)
        if os.path.isdir(full_path):
            get_unique_ids_help(full_path, uids)
        else:
            # Text before the first underscore; the full name if there is none.
            uids.add(entry.partition("_")[0])

In [10]:
uids_CN = get_unique_ids("CN")
number_of_products_CN = len(uids_CN)
number_of_products_CN

22475

In [11]:
uids_A001 = get_unique_ids("CN/A001")
number_of_products_A001 = len(uids_A001)
number_of_products_A001

127

## Get the mapping from unique ID to product description

Assume the filename starts with a unique ID (which could be an SKU, an EAN, etc.), followed by an underscore and the product description. Given the path of a folder which may contain subfolders, get the mapping from unique IDs to product descriptions. We need this mapping because the unique ID may be arbitrary (neither an SKU nor an EAN). Also assume that products with the same EAN have the same description.

In [12]:
def get_mapping_from_uid_to_description(path):
    """Map each unique ID found under ``path`` to a product description.

    Filenames are expected to look like ``<uid>_<description>_...``; subfolders
    are scanned recursively. When several files share a unique ID, the
    description of the first file encountered is kept.

    params:
    path: folder to scan

    returns:
    dict from unique ID to description; the description is "" for a filename
    that contains no underscore
    """
    uid_to_description = {}
    get_mapping_from_uid_to_description_help(path, uid_to_description)
    return uid_to_description


def get_mapping_from_uid_to_description_help(path, uid_to_description):
    """Recursively fill ``uid_to_description`` (in place) from the files under ``path``."""
    for file in os.listdir(path):
        path_to_file = os.path.join(path, file)
        if os.path.isdir(path_to_file):
            get_mapping_from_uid_to_description_help(path_to_file, uid_to_description)
            continue
        uid, _, remainder = file.partition("_")
        # A filename without an underscore has no description part. Indexing the
        # split result would raise IndexError for it, so use "" instead; the file
        # still gets an entry, matching the IDs reported by get_unique_ids.
        description = remainder.partition("_")[0]
        # setdefault keeps the first description seen for a unique ID.
        uid_to_description.setdefault(uid, description)

In [13]:
# Build the uid -> description mapping for the whole CN folder and check that
# it has one entry per unique ID counted earlier.
uid_to_description_CN = get_mapping_from_uid_to_description("CN")
assert number_of_products_CN == len(uid_to_description_CN)

In [14]:
# Same check restricted to the CN/A001 subfolder.
uid_to_description_A001 = get_mapping_from_uid_to_description("CN/A001")
assert number_of_products_A001 == len(uid_to_description_A001)

In [None]:
# Preview the first five uid -> description pairs.
first_pairs = list(uid_to_description_A001.items())[:5]
for k, v in first_pairs:
    print(k, v, sep=": ")

## Get mapping from unique ID to file location

Assume the filename starts with a unique ID (which could be SKU, EAN, etc) followed by underscore. Given the path of a folder which may contain subfolders, get the mapping from unique ID to a list of locations of the files containing the same product, so that we can split the files between training and testing for each product.

In [16]:
def get_mapping_from_uid_to_file_location(path):
    """Group the paths of all files under ``path`` by unique ID.

    Returns a ``collections.defaultdict(list)`` whose keys are unique IDs (the
    part of the filename before the first underscore) and whose values are the
    paths of the files carrying that ID. Subfolders are scanned recursively.
    """
    locations_by_uid = collections.defaultdict(list)
    get_mapping_from_uid_to_file_location_help(path, locations_by_uid)
    return locations_by_uid


def get_mapping_from_uid_to_file_location_help(path, uid_to_file_locations):
    """Recursively append each file path under ``path`` to its unique ID's list (in place)."""
    for entry in os.listdir(path):
        full_path = os.path.join(path, entry)
        if os.path.isdir(full_path):
            get_mapping_from_uid_to_file_location_help(full_path, uid_to_file_locations)
        else:
            uid = entry.partition("_")[0]
            uid_to_file_locations[uid].append(full_path)

In [20]:
# Group the CN/A001 files by unique ID, then check that every product and
# every file is accounted for.
uid_to_file_locations_A001 = get_mapping_from_uid_to_file_location("CN/A001")
assert number_of_products_A001 == len(uid_to_file_locations_A001)
assert num_of_files_A001 == sum(map(len, uid_to_file_locations_A001.values()))

In [None]:
# Preview the first two unique IDs, each followed by its file paths, one per line.
for k, v in list(uid_to_file_locations_A001.items())[:2]:
    print(k + ": ", *v, sep="\n")

## Issue of the data

#### Same EAN but different description

The following two items have the same EAN but are in two different subfolders and have different product descriptions.

- CN/A001/6959764600817_依能苏打水饮料/6959764600817_依能苏打水饮料_A_A001_15545.jpg
- CN/A001/6959764600817_依能加锌苏打水500毫升/6959764600817_依能加锌苏打水500毫升_A_A001_107505006.jpg

之前提到的油漆，不同颜色的油漆是否具有相同的EAN？

Should we treat products with the same EAN but different descriptions differently? They are currently treated as the same.

#### Fake EAN

Some photos are annotated with a fake EAN rather than a real one.

#### Undefined description

Some photos have an undefined description. They are currently treated the same as any other photo.

## Train/test split

Given the path of a folder which may contain subfolders, split files of each product into two parts, one for training and the other for testing, with a specified ratio. Assume that products with the same EAN have the same description. That is, products are only differentiated by EAN.

In [164]:
def train_test_split_ean(path, train_ratio=0.7, seed=43):
    """Split the files of every product under ``path`` into a training and a testing part.

    Products are told apart by unique ID only. For a product with n files,
    int(n * train_ratio) randomly chosen files go to training and the rest to
    testing.

    params:
    path: folder to scan (subfolders included)
    train_ratio: the proportion of each product's files used for training, in [0, 1]
    seed: seed of the random generator that picks the training files

    returns:
    train_uid, train_description, train_file_location,
    test_uid, test_description, test_file_location -- the three train lists are
    parallel (one entry per training file), and so are the three test lists

    raises:
    ValueError: if train_ratio is outside [0, 1]
    """
    if not 0 <= train_ratio <= 1:
        raise ValueError("train_ratio must be between 0 and 1, got {}".format(train_ratio))
    # A private legacy generator gives the same draws as np.random.seed(seed)
    # would, without resetting NumPy's global random state for other cells.
    rng = np.random.RandomState(seed)
    uid_to_file_location = get_mapping_from_uid_to_file_location(path)
    uid_to_description = get_mapping_from_uid_to_description(path)
    train_uid, train_description, train_file_location = [], [], []
    test_uid, test_description, test_file_location = [], [], []
    for uid, locations in uid_to_file_location.items():
        description = uid_to_description[uid]
        n = len(locations)
        # int() truncates, so a product with very few files (e.g. a single one
        # at the default ratio) contributes nothing to the training part.
        k = int(n * train_ratio)
        train_index = rng.choice(n, k, replace=False).tolist()
        chosen = set(train_index)
        # Ascending order makes the test list independent of set iteration order.
        test_index = [i for i in range(n) if i not in chosen]
        train_uid.extend([uid] * k)
        train_description.extend([description] * k)
        train_file_location.extend(locations[i] for i in train_index)
        test_uid.extend([uid] * (n - k))
        test_description.extend([description] * (n - k))
        test_file_location.extend(locations[i] for i in test_index)
    return train_uid, train_description, train_file_location, test_uid, test_description, test_file_location

In [165]:
def test_train_test_split(path, train_uid, train_description, train_file_location, test_uid, test_description, test_file_location):
    """Sanity-check the six lists produced by a train/test split of ``path``.

    Asserts that each train/test pair of lists together has one entry per file
    under ``path``, that the three train lists have equal length, and that the
    three test lists have equal length.
    """
    expected_total = get_number_of_files(path)
    train_columns = (train_uid, train_description, train_file_location)
    test_columns = (test_uid, test_description, test_file_location)
    for train_column, test_column in zip(train_columns, test_columns):
        assert len(train_column) + len(test_column) == expected_total
    assert len({len(column) for column in train_columns}) == 1
    assert len({len(column) for column in test_columns}) == 1

In [185]:
# Split CN/A001 by unique ID only (default train_ratio and seed), then check
# that the six resulting lists are consistent with the number of files.
train_uid, train_description, train_file_location, test_uid, test_description, test_file_location = train_test_split_ean("CN/A001")
test_train_test_split("CN/A001", train_uid, train_description, train_file_location, test_uid, test_description, test_file_location)

In [186]:
len(train_uid)

5627

In [187]:
len(test_uid)

2502

In [188]:
len(set(train_description))

118

## Get mapping from uid + description to file location

When products are differentiated by both EAN and description, we map the combination of uid and description to file locations. Assume the uid and the description are the first two underscore-separated parts of the filename.

In [170]:
def get_mapping_from_uid_description_to_location(path):
    """Group the paths of all files under ``path`` by unique ID and description.

    Filenames are expected to look like ``<uid>_<description>_...``; subfolders
    are scanned recursively.

    params:
    path: folder to scan

    returns:
    collections.defaultdict(list) from "<uid>_<description>" to the paths of
    the files with that uid and description; the description is "" (key
    "<uid>_") for a filename that contains no underscore
    """
    uid_description_to_location = collections.defaultdict(list)
    get_mapping_from_uid_description_to_location_help(path, uid_description_to_location)
    return uid_description_to_location


def get_mapping_from_uid_description_to_location_help(path, uid_description_to_location):
    """Recursively append each file path under ``path`` to its "<uid>_<description>" list (in place)."""
    for file in os.listdir(path):
        path_to_file = os.path.join(path, file)
        if os.path.isdir(path_to_file):
            get_mapping_from_uid_description_to_location_help(path_to_file, uid_description_to_location)
            continue
        uid, _, remainder = file.partition("_")
        # A filename without an underscore has no description part. Indexing the
        # split result would raise IndexError for it, so use "" instead; the file
        # is still counted.
        description = remainder.partition("_")[0]
        uid_description_to_location[uid + "_" + description].append(path_to_file)

In [173]:
# Group the CN/A001 files by uid + description. There are at least as many
# groups as unique IDs, and every file must belong to exactly one group.
uid_description_to_location_A001 = get_mapping_from_uid_description_to_location("CN/A001")
assert number_of_products_A001 <= len(uid_description_to_location_A001)
assert num_of_files_A001 == sum(map(len, uid_description_to_location_A001.values()))

In [None]:
# Preview the first two uid_description keys, each followed by its file paths, one per line.
for k, v in list(uid_description_to_location_A001.items())[:2]:
    print(k + ": ", *v, sep="\n")

## Train/test split (by EAN and description)

Given the path of a folder which may contain subfolders, split files of each product into two parts, one for training and the other for testing, with a specified ratio. Assume that products with the same EAN may have different descriptions. We differentiate products by both EAN and description.

In [175]:
def train_test_split_ean_description(path, train_ratio=0.7, seed=43):
    """Split the files of every product under ``path`` into a training and a testing part.

    Products are told apart by unique ID and description together. For a
    product with n files, int(n * train_ratio) randomly chosen files go to
    training and the rest to testing.

    params:
    path: folder to scan (subfolders included)
    train_ratio: the proportion of each product's files used for training, in [0, 1]
    seed: seed of the random generator that picks the training files

    returns:
    train_uid, train_description, train_file_location,
    test_uid, test_description, test_file_location -- the three train lists are
    parallel (one entry per training file), and so are the three test lists

    raises:
    ValueError: if train_ratio is outside [0, 1]
    """
    if not 0 <= train_ratio <= 1:
        raise ValueError("train_ratio must be between 0 and 1, got {}".format(train_ratio))
    # A private legacy generator gives the same draws as np.random.seed(seed)
    # would, without resetting NumPy's global random state for other cells.
    rng = np.random.RandomState(seed)
    uid_description_to_location = get_mapping_from_uid_description_to_location(path)
    train_uid, train_description, train_file_location = [], [], []
    test_uid, test_description, test_file_location = [], [], []
    for uid_description, locations in uid_description_to_location.items():
        # Keys are built as "<uid>_<description>", so split them back apart.
        s = uid_description.split("_")
        uid = s[0]
        description = s[1]
        n = len(locations)
        # int() truncates, so a product with very few files (e.g. a single one
        # at the default ratio) contributes nothing to the training part.
        k = int(n * train_ratio)
        train_index = rng.choice(n, k, replace=False).tolist()
        chosen = set(train_index)
        # Ascending order makes the test list independent of set iteration order.
        test_index = [i for i in range(n) if i not in chosen]
        train_uid.extend([uid] * k)
        train_description.extend([description] * k)
        train_file_location.extend(locations[i] for i in train_index)
        test_uid.extend([uid] * (n - k))
        test_description.extend([description] * (n - k))
        test_file_location.extend(locations[i] for i in test_index)
    return train_uid, train_description, train_file_location, test_uid, test_description, test_file_location

In [189]:
# Split CN/A001 by unique ID + description and validate the result.
# NOTE: this rebinds train_uid, train_description, ... -- the same names the
# EAN-only split cell assigned -- so every later cell sees this split's lists.
train_uid, train_description, train_file_location, test_uid, test_description, test_file_location = train_test_split_ean_description("CN/A001")
test_train_test_split("CN/A001", train_uid, train_description, train_file_location, test_uid, test_description, test_file_location)

In [190]:
len(train_uid)

5620

In [191]:
len(test_uid)

2509

In [192]:
len(train_description)

5620

In [193]:
len(set(train_description))

133

## Save to json

In [194]:
def create_json_file(dst, uid, description, file_location):
    """Write the three lists to ``dst`` as one UTF-8 encoded JSON object.

    The object has the keys "uid", "description" and "file_location".
    Non-ASCII characters are written as-is rather than escaped.
    """
    payload = {
        "uid": uid,
        "description": description,
        "file_location": file_location,
    }
    with open(dst, 'w', encoding='utf-8') as out_file:
        json.dump(payload, out_file, ensure_ascii=False)

In [195]:
create_json_file("A001_train_ean_desc.json", train_uid, train_description, train_file_location)

In [196]:
create_json_file("A001_test_ean_desc.json", test_uid, test_description, test_file_location)

## Load json file

In [199]:
# Load the train/test split that was saved as JSON.
# NOTE(review): no visible cell writes A001_train.json / A001_test.json (only the
# *_ean_desc.json files are written) -- presumably they were saved from the
# EAN-only split in an earlier session; confirm.
# The `with` blocks close the file handles, which the bare open() calls never did.
with open('A001_train.json', 'r', encoding='utf-8') as file:
    train_ean_A001 = json.load(file)
path = "A001_test.json"
with open(path, 'r', encoding='utf-8') as file:
    test_ean_A001 = json.load(file)

In [200]:
test_ean_A001.keys()

dict_keys(['uid', 'description', 'file_location'])

In [201]:
train_ean_A001.keys()

dict_keys(['uid', 'description', 'file_location'])

In [202]:
len(test_ean_A001['uid'])

2502

In [203]:
# Load the EAN + description split saved by create_json_file.
# The `with` blocks close the file handles, which the bare open() calls never did.
path = "A001_train_ean_desc.json"
with open(path, 'r', encoding='utf-8') as file:
    train_ean_desc_A001 = json.load(file)
path = "A001_test_ean_desc.json"
with open(path, 'r', encoding='utf-8') as file:
    test_ean_desc_A001 = json.load(file)

In [204]:
train_ean_desc_A001.keys()

dict_keys(['uid', 'description', 'file_location'])

In [206]:
len(train_ean_desc_A001['uid'])

5620

## Clean data


## 微调数据集

- https://zhuanlan.zhihu.com/p/701930953
- https://modelscope.cn/datasets/modelscope/coco_2014_caption/summary

In [30]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell at the top.
from modelscope.msdatasets import MsDataset
# Load the "train" split of the "coco_2014_caption" dataset from the "modelscope" namespace.
ds = MsDataset.load("coco_2014_caption", namespace="modelscope", split="train")
# Print the first record to inspect its fields.
print(ds[0])

2024-06-14 09:12:50,831 - modelscope - INFO - PyTorch version 2.2.2 Found.
2024-06-14 09:12:50,834 - modelscope - INFO - Loading ast index from /root/.cache/modelscope/ast_indexer
2024-06-14 09:12:50,981 - modelscope - INFO - Loading done! Current index file version is 1.15.0, with md5 137712c8ca5ebf0802c740a429d6cceb and a total number of 980 components indexed
  from .autonotebook import tqdm as notebook_tqdm


transformer is not installed, please install it if you want to use related modules


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
Downloading data: 100%|████████████████████████████████████████████████████████████████████████████████████████| 14.9M/14.9M [00:12<00:00, 1.17MB/s]
Downloading data: 100%|████████████████████████████████████████████████████████████████████████████████████████| 4.93M/4.93M [00:04<00:00, 1.16MB/s]
Generating train split: 414113 examples [00:13, 31531.41 examples/s]
Generating validation split: 40504 examples [00:01, 30367.29 examples/s]


{'uniq_id': '258768', 'image_id': '11195', 'caption': 'A snow skier assessing the mountain before starting to sky', 'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=640x480 at 0x791E21F79CF0>}


In [31]:
ds[:10]

{'uniq_id': ['258768',
  '725439',
  '580123',
  '283575',
  '392682',
  '154143',
  '142231',
  '569327',
  '434237',
  '480315'],
 'image_id': ['11195',
  '367065',
  '213997',
  '370736',
  '524048',
  '427137',
  '579907',
  '45468',
  '35230',
  '284155'],
 'caption': ['A snow skier assessing the mountain before starting to sky',
  'a guy that is brushing his teeth and a baby too',
  'A chair and a fireplace in a room.',
  'A red street sign showing Volt street and Mill street.',
  'A young man standing next to a pile of pallets.',
  'Some sheep walking in the field and having some fun.',
  'A living room complete with couches, television, and fireplace.',
  'a bath room with a toilet and a sink and a pile of trash',
  "Two pizza's salad's and drinks on a table with plates.",
  'A clean kitchen with white cabinets and a black oven'],
 'image': [<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=640x480>,
  <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=640x480>,
  <PIL