# Download KITTI Dataset
#### Download KITTI Labels

In [None]:
# Fetch the KITTI object-detection label archive.
# -c resumes an interrupted download and skips the transfer when the file is
# already complete, so re-running this cell does not create a ".zip.1" copy.
!wget -c https://s3.eu-central-1.amazonaws.com/avg-kitti/data_object_label_2.zip

#### Download KITTI images

In [None]:
# Fetch the KITTI object-detection image archive (large download).
# -c resumes an interrupted download and skips the transfer when the file is
# already complete, so re-running this cell does not download it again.
!wget -c https://s3.eu-central-1.amazonaws.com/avg-kitti/data_object_image_2.zip

#### Unzip

In [None]:
# Extract both archives into ./KITTI/.
# -p : do not fail when the directory already exists (cell is re-runnable).
# -d : extraction directory. Without it unzip interprets "./KITTI/" as the name
#      of an archive member to extract and nothing ends up in ./KITTI/.
# -o : overwrite without prompting (a prompt would hang the notebook on re-run).
# -q : do not list every extracted file in the cell output.
!mkdir -p KITTI
!unzip -q -o data_object_label_2.zip -d ./KITTI/
!unzip -q -o data_object_image_2.zip -d ./KITTI/

Images are saved in `KITTI/training/image_2/` and labels in `KITTI/training/label_2/`. Every `.png` image has a matching `.txt` file containing its labels. The labels use the KITTI format (bounding-box corner coordinates); below they are converted to the YOLO format (normalized box center, width and height).

# Converting the dataset
We create two separate datasets: the 8-class dataset, which is unbalanced, and the 2-class dataset, which only contains pedestrians and cars.

In [None]:
# Number of shuffled training images listed in pretrain.txt.
PRETRAIN_SET = 200
# Number of simulated client devices the remaining 2-class training images are split over.
DEVICES = 9

### Converting the 2-Class dataset

In [None]:
# Root of the extracted KITTI training data; image_2/ and label_2/ are read from below it.
KITTI_DIRECTORY = './KITTI/training/'
# Output root of the converted 2-class dataset.
YOLO_DIRECTORY = './KITTI/yolo2/'
# KITTI class name -> YOLO class id (kept as a string because it is written verbatim
# into the label files). Objects of any class not listed here are skipped.
CLASS_MAPPING = {'Car': "0", 'Pedestrian': "1"}
# Fraction of the label files that is assigned to the validation set.
VALIDATION_SPLIT = 0.30

Create folders for 2-class dataset

In [None]:
import os

# Create the output tree of the converted dataset:
#   <YOLO_DIRECTORY>/clients/                 per-device file lists
#   <YOLO_DIRECTORY>/{train,val}/images/      converted JPEG images
#   <YOLO_DIRECTORY>/{train,val}/labels/      YOLO label files
# exist_ok=True keeps the cell re-runnable: without it os.makedirs raises
# FileExistsError as soon as the directories exist from a previous run.
for subdirectory in ('', 'clients/', 'train/images/', 'train/labels/', 'val/images/', 'val/labels/'):
    os.makedirs(YOLO_DIRECTORY + subdirectory, exist_ok=True)

Create the split into training and validation sets

In [None]:
import random
import math

# Seeded random split of the KITTI label files into validation and training.
# os.listdir returns entries in arbitrary (filesystem-dependent) order, so the
# names are sorted first; otherwise the fixed seed would not yield the same
# split on every machine or run.
random.seed(11111)
labels = sorted(os.listdir(KITTI_DIRECTORY + "label_2/"))
random.shuffle(labels)
# The first VALIDATION_SPLIT fraction (rounded down) becomes the validation set.
split_index = math.floor(len(labels) * VALIDATION_SPLIT)
validation = labels[:split_index]
train = labels[split_index:]
print('Split dataset into {} training items and {} validation items'.format(len(train), len(validation)))

Convert KITTI format to YOLO xywh format

In [None]:
def transform_label(label_file, image_width, image_height, class_mapping=None):
    """Convert one KITTI label file into YOLO-style labels.

    Args:
        label_file: Path of the KITTI label file. Each line is whitespace
            separated; field 0 is the class name and fields 4-7 are the
            bounding box as left, top, right, bottom pixel coordinates.
        image_width: Width of the matching image in pixels.
        image_height: Height of the matching image in pixels.
        class_mapping: Optional dict mapping KITTI class names to YOLO class
            ids. Defaults to the notebook-level ``CLASS_MAPPING``, looked up at
            call time so the same function serves the 2-class and the 8-class
            conversion.

    Returns:
        A list of ``(class_id, [x_center, y_center, width, height])`` tuples,
        one per object whose class is in the mapping. The four box values are
        divided by the image size. Blank lines and unknown classes are skipped.

    Raises:
        ValueError: If a bounding-box field is not numeric.
        IndexError: If a line with a known class has fewer than 8 fields.
    """
    if class_mapping is None:
        class_mapping = CLASS_MAPPING
    image_width = float(image_width)
    image_height = float(image_height)

    all_labels = []
    with open(label_file, 'r') as labelfile:
        for line in labelfile:
            fields = line.split()
            # If the class is unknown (or the line is blank), don't include it.
            if not fields or fields[0] not in class_mapping:
                continue

            # Pixel coordinates are truncated to whole pixels, exactly as the
            # original conversion did, so generated datasets stay identical.
            coords = [int(float(value)) for value in fields[4:8]]
            left, top, right, bottom = coords[0], coords[1], coords[2], coords[3]

            # Corner coordinates -> normalized center / size (YOLO xywh).
            x = (right + left) / 2.0 / image_width
            y = (bottom + top) / 2.0 / image_height
            width = (right - left) / image_width
            height = (bottom - top) / image_height
            all_labels.append((class_mapping[fields[0]], [x, y, width, height]))
    return all_labels

Use the Python Imaging Library (PIL) to convert the images from PNG to JPEG, which saves space and allows more images to be cached.

In [None]:
from PIL import Image
def transform_kitti_file(file, folder_prefix):
    """Convert one KITTI sample (image + label file) into the YOLO dataset.

    The PNG image matching ``file`` is re-encoded as JPEG into
    ``<YOLO_DIRECTORY><folder_prefix>/images/``. If the label file contains at
    least one object of a mapped class, the converted labels are written to
    ``<YOLO_DIRECTORY><folder_prefix>/labels/`` under the same file name.
    Images without any mapped object are still saved, but get no label file.

    Args:
        file: Name of the KITTI label file, e.g. ``"000123.txt"``.
        folder_prefix: Target subset folder below ``YOLO_DIRECTORY``
            (the notebook uses ``"train"`` and ``"val"``).

    Returns:
        None. A message is printed when the matching image does not exist.
    """
    stem = file.split(".txt")[0]
    fname = KITTI_DIRECTORY + "image_2/" + stem + ".png"
    if not os.path.isfile(fname):
        print('Image not found for {}'.format(fname))
        return

    # The context manager closes the source file handle; converting thousands
    # of images otherwise leaves that to the garbage collector.
    with Image.open(fname) as img:
        image_width, image_height = img.size
        img.save(YOLO_DIRECTORY + folder_prefix + "/images/" + stem + ".jpg", "jpeg")

    labels = transform_label(os.path.join(KITTI_DIRECTORY + "label_2/" + file), image_width, image_height)
    if len(labels) == 0:
        return
    # "w" rather than "a+": appending would duplicate every label when the
    # conversion is run a second time over an existing output directory.
    with open(YOLO_DIRECTORY + folder_prefix + "/labels/" + file, "w") as yolofile:
        for class_id, box in labels:
            yolofile.write(class_id + " " + str(box[0]) + " " + str(box[1]) + " " + str(box[2]) + " " + str(box[3]) + "\n")

### Transform Validation and Train sets:

In [None]:
# Convert every sample of the validation split, then every sample of the
# training split, into the matching subset folder of the YOLO dataset.
for subset_folder, subset_files in (('val', validation), ('train', train)):
    for label_name in subset_files:
        transform_kitti_file(label_name, subset_folder)

#### Create overview of all files

In [None]:
import glob

# Write one image path per line for every label file of a subset:
#   train_all.txt <- train/labels/*, val.txt <- val/labels/*
# Each label path is turned into its image path (labels -> images, .txt -> .jpg).
# glob returns matches in arbitrary order, so they are sorted to make the
# generated lists identical between runs and machines.
for list_name, subset_folder in (("train_all.txt", "train"), ("val.txt", "val")):
    with open(YOLO_DIRECTORY + list_name, "w") as list_file:
        for filename in sorted(glob.glob(os.path.join(YOLO_DIRECTORY + subset_folder + "/labels/", "*.*"))):
            list_file.write('%s\n' % (filename).replace('labels', 'images').replace('.txt', '.jpg'))

### Create pretrain set

In [None]:
# Draw the pretrain set: the first PRETRAIN_SET entries of a seeded shuffle of
# all training label files. glob returns matches in arbitrary order, so they
# are sorted before shuffling; otherwise the fixed seed would not select the
# same images on every run.
# train_all is reused further down for the per-device split.
random.seed(11111)
train_all = sorted(glob.glob(os.path.join(YOLO_DIRECTORY + "train/labels/", "*.*")))
random.shuffle(train_all)
with open(YOLO_DIRECTORY + "pretrain.txt", "w") as f_prefl:
    for filename in train_all[:PRETRAIN_SET]:
        f_prefl.write('%s\n' % (filename).replace('labels', 'images').replace('.txt', '.jpg'))

### Split files over artificial devices

In [None]:
def split_integer(num, parts):
    """Split ``num`` into ``parts`` integers that differ by at most one.

    The smaller shares come first: the list holds ``parts - num % parts``
    copies of ``num // parts`` followed by ``num % parts`` copies of
    ``num // parts + 1``, so the shares add up to ``num``.
    """
    share, leftover = divmod(num, parts)
    return [share] * (parts - leftover) + [share + 1] * leftover

In [None]:
# Hand out the training images that follow the pretrain slice in near-equal,
# consecutive chunks: device i gets the next `share` entries of train_all and
# its image paths are written to clients/<i>.txt.
cumulative = PRETRAIN_SET
remaining_total = len(train_all) - PRETRAIN_SET
for device_id, share in enumerate(split_integer(remaining_total, DEVICES)):
    # The running total printed here does not yet include this device's share.
    print("Device {} is receiving {} samples, total: {}/{}".format(device_id, share, cumulative-PRETRAIN_SET, remaining_total))
    chunk = train_all[cumulative:cumulative + share]
    with open(YOLO_DIRECTORY + "clients/{}.txt".format(device_id), "w") as f_prefl:
        for filename in chunk:
            f_prefl.write('%s\n' % (filename).replace('labels', 'images').replace('.txt', '.jpg'))
    cumulative += share
    

### Converting the 8-Class dataset

In [None]:
# Root of the extracted KITTI training data; image_2/ and label_2/ are read from below it.
KITTI_DIRECTORY = './KITTI/training/'
# Output root of the converted 8-class dataset.
YOLO_DIRECTORY = './KITTI/yolo8/'
# KITTI class name -> YOLO class id (kept as a string because it is written verbatim
# into the label files). Objects of any class not listed here are skipped.
# Re-assigning these globals re-targets transform_label / transform_kitti_file,
# which read them at call time.
CLASS_MAPPING = {'Car': "0",
 'Cyclist': "1",
 'Misc': "2",
 'Pedestrian': "3",
 'Person_sitting': "4",
 'Tram': "5",
 'Truck': "6",
 'Van': "7"}
# Fraction of the label files that is assigned to the validation set.
VALIDATION_SPLIT = 0.30

Create folders for 8-class dataset

In [None]:
import os

# Create the output tree of the converted dataset:
#   <YOLO_DIRECTORY>/clients/                 per-device file lists
#   <YOLO_DIRECTORY>/{train,val}/images/      converted JPEG images
#   <YOLO_DIRECTORY>/{train,val}/labels/      YOLO label files
# exist_ok=True keeps the cell re-runnable: without it os.makedirs raises
# FileExistsError as soon as the directories exist from a previous run.
for subdirectory in ('', 'clients/', 'train/images/', 'train/labels/', 'val/images/', 'val/labels/'):
    os.makedirs(YOLO_DIRECTORY + subdirectory, exist_ok=True)

Create the split into training and validation sets

In [None]:
import random
import math

# Seeded random split of the KITTI label files into validation and training.
# os.listdir returns entries in arbitrary (filesystem-dependent) order, so the
# names are sorted first; otherwise the fixed seed would not yield the same
# split on every machine or run.
random.seed(11111)
labels = sorted(os.listdir(KITTI_DIRECTORY + "label_2/"))
random.shuffle(labels)
# The first VALIDATION_SPLIT fraction (rounded down) becomes the validation set.
split_index = math.floor(len(labels) * VALIDATION_SPLIT)
validation = labels[:split_index]
train = labels[split_index:]
print('Split dataset into {} training items and {} validation items'.format(len(train), len(validation)))

#### Transform

In [None]:
# Convert every sample of the validation split, then every sample of the
# training split, into the matching subset folder of the YOLO dataset.
for subset_folder, subset_files in (('val', validation), ('train', train)):
    for label_name in subset_files:
        transform_kitti_file(label_name, subset_folder)

#### Create overview of all files

In [None]:
import glob

# Write one image path per line for every label file of a subset:
#   train_all.txt <- train/labels/*, val.txt <- val/labels/*
# Each label path is turned into its image path (labels -> images, .txt -> .jpg).
# glob returns matches in arbitrary order, so they are sorted to make the
# generated lists identical between runs and machines.
for list_name, subset_folder in (("train_all.txt", "train"), ("val.txt", "val")):
    with open(YOLO_DIRECTORY + list_name, "w") as list_file:
        for filename in sorted(glob.glob(os.path.join(YOLO_DIRECTORY + subset_folder + "/labels/", "*.*"))):
            list_file.write('%s\n' % (filename).replace('labels', 'images').replace('.txt', '.jpg'))

### Create pretrain set

In [None]:
# Draw the pretrain set: the first PRETRAIN_SET entries of a seeded shuffle of
# all training label files. glob returns matches in arbitrary order, so they
# are sorted before shuffling; otherwise the fixed seed would not select the
# same images on every run.
random.seed(11111)
train_all = sorted(glob.glob(os.path.join(YOLO_DIRECTORY + "train/labels/", "*.*")))
random.shuffle(train_all)
with open(YOLO_DIRECTORY + "pretrain.txt", "w") as f_prefl:
    for filename in train_all[:PRETRAIN_SET]:
        f_prefl.write('%s\n' % (filename).replace('labels', 'images').replace('.txt', '.jpg'))

### Split over devices
The device split is significantly more difficult due to the need for an unbalanced dataset. We define the deficiencies for each device below:

In [None]:
# Device id -> set of KITTI class names that device must never receive.
# A label file containing any of a device's deficient classes is not assigned
# to that device. The keys also define the set of devices of the 8-class split.
CLASS_DEFICIENCIES = {0: {'Cyclist'}, 1:{'Person_sitting'}, 2:{'Cyclist', 'Person_sitting'}, 3: {'Tram', 'Person_sitting'}, 
                      4: {'Pedestrian', 'Truck'}, 5: {'Truck', 'Cyclist'}, 6: {'Tram', 'Cyclist'}, 7: {'Pedestrian'}, 8: {'Pedestrian', 'Cyclist'}}

Utility functions to distribute the samples over the devices such that each device ends up with roughly the same number of images.

In [None]:
def distribute(samples, devices):
    """Split ``samples`` into ``devices`` near-equal shares.

    Returns a list of ``(share, index)`` tuples for ``index`` in
    ``0 .. devices - 1``. Every share is ``samples // devices``; the first
    ``samples % devices`` indices receive one extra sample.
    """
    base, extra = divmod(samples, devices)
    shares = []
    for index in range(devices):
        share = base + 1 if index < extra else base
        shares.append((share, index))
    return shares

def determine_how_many_samples_every_device_should_get(device_samples_original, samples_to_give_away):
    """Decide how many new samples each device receives so totals level out.

    The devices with the fewest samples are filled up first: the lowest
    device is raised to the level of the second lowest, then both to the level
    of the third, and so on, for as long as the samples last. Whatever cannot
    complete a full step is divided near-equally over the group being raised.

    Args:
        device_samples_original: Dict mapping device id to the number of
            samples the device already holds. It is not modified.
        samples_to_give_away: Number of new samples to hand out.

    Returns:
        Dict mapping every device id of the input to the number of samples it
        additionally receives (possibly 0). An empty input yields an empty
        dict, i.e. nothing is handed out.
    """
    totals = device_samples_original.copy()
    device_count = len(totals)
    level = 0  # index of the highest-ranked device in the group being raised
    while level < device_count:
        # Ascending by current total; ties keep the dict's insertion order.
        ranked = sorted(totals, key=totals.get)
        group_size = level + 1

        if group_size >= device_count:
            # Every device is on the same level: share out whatever is left.
            for share, position in distribute(samples_to_give_away, group_size):
                totals[ranked[position]] += share
            level += 1
            continue

        gap = totals[ranked[level + 1]] - totals[ranked[level]]
        if gap == 0:
            # The next device is already level with the group; absorb it.
            level += 1
            continue

        cost = gap * group_size
        if cost < samples_to_give_away:
            # Enough samples to lift the whole group to the next device's level.
            for device in ranked[:group_size]:
                totals[device] += gap
            samples_to_give_away -= cost
            level += 1
            continue

        # Not enough for a full step: split the remainder over the group and stop.
        for share, position in distribute(samples_to_give_away, group_size):
            totals[ranked[position]] += share
        break

    return {device: totals[device] - device_samples_original.get(device, 0) for device in totals}

For convenience, we reuse the original KITTI label files, as they still contain the class names as strings (alternatively, the converted labels could be used together with `CLASS_MAPPING`).

In [None]:
import glob

# All KITTI label files that are not part of the validation split.
# - A set makes each membership test O(1) instead of scanning the whole
#   validation list for every file.
# - os.listdir returns entries in arbitrary order; sorting makes the later
#   file-to-device assignment reproducible.
validation_names = set(validation)
labels_not_in_validation = sorted(x for x in os.listdir(KITTI_DIRECTORY + 'label_2/') if x not in validation_names)

Create a mapping of every label file to the classes it contains: e.g.
 ```'000000.txt': {'Pedestrian'},
 '000002.txt': {'Car', 'Misc'},
 '000003.txt': {'Car'},```

In [None]:
# Map every non-validation label file to the set of KITTI class names it
# contains (first field of each line); "DontCare" entries are ignored.
file_class_mapping = {}
for filename in labels_not_in_validation:
    with open(KITTI_DIRECTORY + 'label_2/' + filename, 'r') as file:
        class_names = {line.split(" ")[0] for line in file}
    class_names.discard("DontCare")
    file_class_mapping[filename] = class_names
len(file_class_mapping)

In [None]:
from collections import Counter

# Count how many label files share exactly the same combination of classes.
# frozenset makes each class combination hashable so it can be a Counter key.
c = Counter(frozenset(class_names) for class_names in file_class_mapping.values())

Use the mapping to distribute the class combinations over the devices. `given_classes` contains one key per device, each mapping to a dictionary. The keys of that dictionary are the sets of classes that occur together in a label file (the same set can occur in many label files); the values are the number of images with exactly that set which the device receives.
For example,
```0: {frozenset({'Car', 'Misc', 'Pedestrian', 'Truck', 'Van'}): 1,```
means that device 0 gets 1 image whose label file contains exactly the classes 'Car', 'Misc', 'Pedestrian', 'Truck' and 'Van'. Only combinations that actually occur in the label files are used.

In [None]:
class_set_mapping = {}
# Device id -> {class combination: number of images of that combination}.
given_classes = {device: {} for device in CLASS_DEFICIENCIES}

# Walk the class combinations from least to most frequent.
for classes, classes_count in reversed(c.most_common()):
    # A device may only receive a combination that contains none of its deficient classes.
    eligible_devices = [device for device, missing in CLASS_DEFICIENCIES.items() if not (missing & classes)]
    # Number of images each eligible device has been given so far.
    counts = {device: sum(given_classes[device].values()) for device in eligible_devices}
    res = determine_how_many_samples_every_device_should_get(counts, classes_count)
    for device, share in res.items():
        if share != 0:
            given_classes[device][classes] = share
given_classes

Now create a mapping back from the classes to the files they occur in:

In [None]:
# Invert file_class_mapping: class combination -> list of label files that
# contain exactly that combination, in file_class_mapping order.
set_to_filenr = {}
for file, class_names in file_class_mapping.items():
    set_to_filenr.setdefault(frozenset(class_names), []).append(file)

Use this mapping to determine which files go to which device IDs:

In [None]:
from random import Random
# Dedicated, seeded generator; it is used when the per-device lists are shuffled.
randgen = Random(11111)

# Take the files from a copy of set_to_filenr. Consuming set_to_filenr itself
# made this cell non-repeatable: a second run found the lists already emptied
# and assigned different (or no) files.
unassigned_files = {class_set: list(files) for class_set, files in set_to_filenr.items()}

# Device id -> list of image paths (each with a trailing newline).
device_files = {}
for device in given_classes:
    device_files[device] = []
    for class_samples, samples in given_classes[device].items():
        # The device takes the next `samples` unassigned files of this class
        # combination; the remainder stays available for later devices.
        taken = unassigned_files[class_samples][:samples]
        unassigned_files[class_samples] = unassigned_files[class_samples][samples:]
        for imgid in taken:
            device_files[device].append(YOLO_DIRECTORY + 'train/images/' + imgid.split(".txt")[0]+".jpg\n")

And distribute the files to the actual devices:

In [None]:
# Write each device's image list, in shuffled order, to clients/<device>.txt.
for device in device_files:
    # YOLO_DIRECTORY already ends with "/"; an extra separator would produce
    # ".../yolo8//clients/...", unlike the client paths of the 2-class split.
    with open(YOLO_DIRECTORY + "clients/{}.txt".format(device), "w") as f:
        randgen.shuffle(device_files[device])
        # Every entry already carries its trailing newline.
        f.writelines(device_files[device])

To see which devices got which labels:

In [None]:
# Device id -> {class name: number of assigned images that contain the class}.
dev_counts = {}
for device, assigned in given_classes.items():
    counts = {}
    for sett, image_count in assigned.items():
        # Every class of the combination occurs in each of its images.
        for clasz in sett:
            counts[clasz] = counts.get(clasz, 0) + image_count
    dev_counts[device] = counts
dev_counts

### We've now created the 2-class and the 8-class datasets and distributed them over virtual devices. To continue with the experiments, see the next notebook. You can delete the files in ./KITTI/training/ to save disk space.