# Get data

In [None]:
import pandas as pd

# Load the cleaned training annotations; the first CSV column becomes the row index.
train_df = pd.read_csv("clean_train.csv", index_col=0)
# Preview the first rows.
train_df.head()

Unnamed: 0,image_name,BoxesString,domain
0,7b73239dfd89b06c03e1be81cc5074ec47ae048305ec63...,949 967 999 994;368 649 443 685;118 280 185 33...,0
1,0e37ccf64ccbd456f07dcfe110133018f324e5bdf63b08...,691 60 730 119;838 901 897 969;477 444 526 489...,0
2,89e0aa4148f0a9ff01e9e55c5a2bcbe294150444dbe43c...,953 178 1023 244;265 164 326 228;657 790 689 8...,0
3,fed8dde94c7a51de782b71f45c9115cdf7208cf7f33eee...,801 402 882 459;214 924 271 1023;299 726 360 7...,0
4,8a593fc7894c37145d302bce36822bf21725088cf17b23...,187 887 294 1003;631 343 679 386;337 719 402 7...,0


In [None]:
# Total number of rows (one per image_name entry) in the training frame.
len(train_df["domain"])

3655

In [None]:
# Number of rows per domain id, most frequent first.
train_df["domain"].value_counts()

12    747
15    588
5     448
17    432
14    401
10    204
2     176
6     160
16     98
9      82
13     66
1      60
7      60
8      32
11     30
0      27
4      24
3      20
Name: domain, dtype: int64

In [None]:
# Map each domain id to its number of rows; read as a global by get_split below.
count_dict = dict(train_df["domain"].value_counts())

In [None]:
# Number of distinct domain ids.
len(count_dict)

18

# Find a split strategy
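Randomly assign the 18 domains to 4 folds and keep the first assignment in which every fold's image count is within `bound` of the ideal fold size (3655 / 4).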

In [None]:
import random


def get_split(bound=10, counts=None, n_folds=4, seed=None, max_tries=None):
    """Randomly search for a partition of the domains into balanced folds.

    Domains are shuffled and cut into ``n_folds`` consecutive groups until the
    total image count of every group differs from the ideal fold size
    (``sum(counts) / n_folds``) by strictly less than ``bound``. The accepted
    folds and their deviations from the ideal size are printed.

    Args:
        bound: Maximum allowed absolute deviation (exclusive) of a fold's image
            count from the ideal fold size.
        counts: Mapping ``domain -> number of images``. Defaults to the
            notebook-level ``count_dict``.
        n_folds: Number of folds to build.
        seed: Optional seed for a private random generator, making the search
            reproducible. When ``None`` the global ``random`` state is used.
        max_tries: Optional cap on the number of random assignments to try.
            ``None`` (the default) searches until a valid split is found, which
            never terminates if ``bound`` cannot be satisfied.

    Returns:
        A list of ``n_folds`` lists of domain ids.

    Raises:
        ValueError: If ``n_folds`` is not positive.
        RuntimeError: If ``max_tries`` assignments were tried without success.
    """
    if n_folds < 1:
        raise ValueError("n_folds must be a positive integer")
    if counts is None:
        # Fall back to the per-domain image counts computed earlier in the notebook.
        counts = count_dict

    def unwrap(value):
        # dict(Series.value_counts()) yields numpy scalars, which expose .item();
        # converting them keeps the printed folds and deviations as plain numbers.
        return value.item() if hasattr(value, "item") else value

    counts = {unwrap(domain): unwrap(count) for domain, count in counts.items()}
    domains = list(counts)
    target = sum(counts.values()) / n_folds

    # Fold sizes differ by at most one; the larger folds come last
    # (18 domains over 4 folds -> 4, 4, 5, 5).
    base, extra = divmod(len(domains), n_folds)
    sizes = [base + (1 if fold >= n_folds - extra else 0) for fold in range(n_folds)]

    rng = random if seed is None else random.Random(seed)

    tries = 0
    while max_tries is None or tries < max_tries:
        tries += 1
        shuffled = rng.sample(domains, len(domains))

        folds = []
        start = 0
        for size in sizes:
            folds.append(shuffled[start:start + size])
            start += size

        # Signed distance of each fold from the ideal size (positive = too small).
        deviations = [
            target - sum(counts[domain] for domain in fold) for fold in folds
        ]
        if all(-bound < deviation < bound for deviation in deviations):
            print(*folds)
            print(*deviations)
            return folds

    raise RuntimeError(
        "no split within bound={} found after {} tries".format(bound, max_tries)
    )

In [None]:
# Search for a split in which every fold is fewer than 3 images away from the target size.
get_split(3)

[15, 13, 6, 16] [12, 9, 1, 0] [3, 17, 11, 14, 8] [2, 7, 5, 4, 10]
1.75 -2.25 -1.25 1.75


In [None]:
# Domain ids assigned to each of the four folds, copied from the split printed above.
domains_0 = [15, 13, 6, 16]
domains_1 = [12, 9, 1, 0]
domains_2 = [3, 17, 11, 14, 8]
domains_3 = [2, 7, 5, 4, 10]

In [51]:
# Row labels of all images in fold 0, grouped in the order of `domains_0`.
indexes_0 = []
for domain in domains_0:
    in_domain = train_df["domain"].eq(domain)
    indexes_0.extend(train_df.index[in_domain])
len(indexes_0)

912

In [52]:
# Row labels of all images in fold 1, grouped in the order of `domains_1`.
indexes_1 = []
for domain in domains_1:
    in_domain = train_df["domain"].eq(domain)
    indexes_1.extend(train_df.index[in_domain])

In [53]:
# Row labels of all images in fold 2, grouped in the order of `domains_2`.
indexes_2 = []
for domain in domains_2:
    in_domain = train_df["domain"].eq(domain)
    indexes_2.extend(train_df.index[in_domain])

In [54]:
# Row labels of all images in fold 3, grouped in the order of `domains_3`.
indexes_3 = []
for domain in domains_3:
    in_domain = train_df["domain"].eq(domain)
    indexes_3.extend(train_df.index[in_domain])

In [64]:
# Union of the row labels of all four folds.
all_indexes = {*indexes_0, *indexes_1, *indexes_2, *indexes_3}

In [67]:
# For each fold, the training rows are every row that is not in that fold.
train_indexes = [
    list(all_indexes - set(held_out))
    for held_out in (indexes_0, indexes_1, indexes_2, indexes_3)
]

In [73]:
# For each fold, the validation rows are exactly the rows of that fold.
val_indexes = [indexes_0, indexes_1, indexes_2, indexes_3]

In [74]:
# Sanity check: train and validation rows of fold 0 together account for every row.
len(train_indexes[0]) + len(val_indexes[0])

3655

# YOLOv5

In [76]:
def encode_boxes(entry, image_size=1024):
    """Convert an entry's ``BoxesString`` into YOLO-format label lines.

    ``BoxesString`` holds ``;``-separated boxes, each written as
    ``x1 y1 x2 y2`` in pixel coordinates. Every box becomes one line
    ``"0 x_center y_center width height"`` with all four values divided by
    ``image_size``. Duplicate lines are emitted only once, in order of first
    appearance.

    Args:
        entry: Mapping-like row providing the ``"BoxesString"`` key.
        image_size: Side length in pixels used for normalisation (the same
            value is applied to both axes).

    Returns:
        A list of label lines. The sentinel value ``"no_box"`` yields ``[""]``.

    Raises:
        ValueError: If a non-empty box does not consist of exactly four integers.
    """
    boxes = entry["BoxesString"]
    if boxes == "no_box":
        return [""]

    lines = []
    seen = set()  # constant-time duplicate detection instead of scanning `lines`
    for box in boxes.split(";"):
        # split() without arguments tolerates repeated or surrounding whitespace.
        coords = box.split()
        if not coords:
            # Empty segment, e.g. produced by a trailing ';' -- not a box.
            continue
        x1, y1, x2, y2 = map(int, coords)
        x_center = (x1 + x2) / (2 * image_size)
        y_center = (y1 + y2) / (2 * image_size)
        width = (x2 - x1) / image_size
        height = (y2 - y1) / image_size
        # The leading "0" is the class id column of the label line.
        line = " ".join(
            ["0", str(x_center), str(y_center), str(width), str(height)]
        )
        if line not in seen:
            seen.add(line)
            lines.append(line)
    return lines

In [77]:
import os
import shutil


def create_dir(dir_path):
    """Create an empty directory at ``dir_path``, replacing any existing tree.

    If ``dir_path`` already exists, it is removed recursively first, so every
    file below it is lost. Missing parent directories are created as needed.

    Args:
        dir_path: Path of the directory to (re)create.
    """
    if os.path.exists(dir_path):
        shutil.rmtree(dir_path)
    # makedirs rather than mkdir, so a nested path with missing parents also works.
    os.makedirs(dir_path)

In [79]:
def _write_split(fold_dir, split, row_labels):
    """Write the image list and per-image label files for one split of one fold.

    Creates ``<fold_dir>/<split>.txt`` with one ``<image_name>.png`` per line and
    one ``<fold_dir>/labels/<split>/<image_name>.txt`` label file per image.

    Args:
        fold_dir: Root directory of the fold.
        split: Either ``"train"`` or ``"val"``.
        row_labels: Index labels of ``train_df`` belonging to this split.
    """
    label_dir = os.path.join(fold_dir, "labels", split)
    # The index lists were built from `train_df.index`, i.e. they hold labels,
    # so select with label-based .loc rather than position-based .iloc.
    rows = train_df.loc[row_labels]
    with open(os.path.join(fold_dir, split + ".txt"), "w") as listing:
        for _, entry in rows.iterrows():
            listing.write(entry["image_name"] + ".png")
            listing.write("\n")
            label_path = os.path.join(label_dir, entry["image_name"] + ".txt")
            with open(label_path, "w") as label_file:
                for box in encode_boxes(entry):
                    label_file.write(box)
                    label_file.write("\n")


# Build one dataset directory per fold ("0", "1", ...), each with images/ and
# labels/ sub-trees for the train and val splits.
for index, (train_rows, val_rows) in enumerate(zip(train_indexes, val_indexes)):
    fold_dir = str(index)
    create_dir(fold_dir)  # wipes any previous export of this fold
    for kind in ("images", "labels"):
        for split in ("train", "val"):
            os.makedirs(os.path.join(fold_dir, kind, split))

    _write_split(fold_dir, "train", train_rows)
    _write_split(fold_dir, "val", val_rows)