In [3]:
# Dependencies (install once if missing): %pip install numpy Pillow tqdm

In [4]:
import os
import tqdm
import numpy as np
from PIL import Image
from collections import namedtuple
from PIL import Image
from collections import namedtuple
import shutil

# Download and Preprocess Cityscapes

You first need to register in order to download the Cityscapes data:

https://www.cityscapes-dataset.com/login/

We'll need the packages with ID 1 (`gtFine_trainvaltest.zip`, the fine annotations) and ID 3 (`leftImg8bit_trainvaltest.zip`, the images).

Insert your user name and password into the login command; it stores a session cookie in `cookies.txt` that the download commands reuse:


```
wget --keep-session-cookies --save-cookies=cookies.txt --post-data 'username=myusername&password=mypassword&submit=Login' https://www.cityscapes-dataset.com/login/
```

```
wget --load-cookies cookies.txt --content-disposition https://www.cityscapes-dataset.com/file-handling/?packageID=1
```

```
wget --load-cookies cookies.txt --content-disposition https://www.cityscapes-dataset.com/file-handling/?packageID=3
```

Next unzip both zipped folders:

```
unzip leftImg8bit_trainvaltest.zip
unzip gtFine_trainvaltest.zip 
```

We refer to the folder in which the data has been unzipped as `$RAW_DATAPATH`.

The target folder where we will store the preprocessed png files `$PROC_DATAPATH`.

After running this script we expect (a) **20 ground truth classes** and (b) `$PROC_DATAPATH` mapping to proc/, and a subdirectory structure as follows:

```

proc/
  data/
       train/
             -  file_name_1.png
             -  file_name_2.png
       val/
           - ...
       test/
           - ...
  labels/
       train/
             -  file_name_1.png
             -  file_name_2.png
       val/
           -...
       test/

```

Data and labels share the same file names; only the parent folder (`data/` vs. `labels/`) tells them apart.


## Cityscapes Conventions

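We'll make use of 19 non-void classes plus a single void class that collects everything ignored in evaluation (20 classes in total), following the label definitions here: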

https://github.com/mcordts/cityscapesScripts/blob/master/cityscapesscripts/helpers/labels.py

In [5]:
# NOTE: currently not optimized for speed since one-time cost; possible speed-ups are:
# (a) multiprocessing
# (b) not looping through images one by one when copying images from raw to meta
# (c) calculating class weights and normalization constant on the go (instead of re-reading them into memory)

################################
# GROUND TRUTH LABELS CITYSCAPES
################################

# (NOTE! this is taken from the official Cityscapes scripts:)
# Record type for one row of the Cityscapes label table; the field layout
# follows the official Cityscapes scripts.
Label = namedtuple(
    'Label',
    (
        # Identifier of the label, e.g. 'car' or 'person'. It uniquely names
        # a class.
        'name',

        # Integer ID that represents the label in the ground truth images.
        # An ID of -1 means the label has no ID of its own and is therefore
        # ignored when ground truth images are created (e.g. license plate).
        # Do not modify these IDs: the evaluation server expects exactly them.
        'id',

        # ID used for training; feel free to adapt it to your method and then
        # create ground truth images with train IDs. Results must still be
        # validated or submitted using the regular IDs above.
        # Several labels may share one train ID, in which case they collapse
        # into a single class in the ground truth images; for the inverse
        # mapping the label defined first in the table is used. Mapping all
        # void-type classes to the same train ID is a typical example.
        # The maximum value is 255.
        'trainId',

        # Name of the category this label belongs to.
        'category',

        # ID of that category; used for category-level ground truth images.
        'categoryId',

        # Whether the label distinguishes between single instances.
        'hasInstances',

        # Whether pixels with this ground truth class are ignored during
        # evaluation.
        'ignoreInEval',

        # Color of the label.
        'color',
    ),
)

# Cityscapes label table (NOTE: taken from the official Cityscapes scripts).
# Every label that is ignored in evaluation carries train ID 19 here; the
# remaining labels use train IDs 0..18.
labels = [
    #     name                    id  trainId  category      catId  hasInstances  ignoreInEval  color
    Label('unlabeled',             0, 19, 'void',         0, False, True,  (0, 0, 0)),
    Label('ego vehicle',           1, 19, 'void',         0, False, True,  (0, 0, 0)),
    Label('rectification border',  2, 19, 'void',         0, False, True,  (0, 0, 0)),
    Label('out of roi',            3, 19, 'void',         0, False, True,  (0, 0, 0)),
    Label('static',                4, 19, 'void',         0, False, True,  (0, 0, 0)),
    Label('dynamic',               5, 19, 'void',         0, False, True,  (111, 74, 0)),
    Label('ground',                6, 19, 'void',         0, False, True,  (81, 0, 81)),
    Label('road',                  7,  0, 'flat',         1, False, False, (128, 64, 128)),
    Label('sidewalk',              8,  1, 'flat',         1, False, False, (244, 35, 232)),
    Label('parking',               9, 19, 'flat',         1, False, True,  (250, 170, 160)),
    Label('rail track',           10, 19, 'flat',         1, False, True,  (230, 150, 140)),
    Label('building',             11,  2, 'construction', 2, False, False, (70, 70, 70)),
    Label('wall',                 12,  3, 'construction', 2, False, False, (102, 102, 156)),
    Label('fence',                13,  4, 'construction', 2, False, False, (190, 153, 153)),
    Label('guard rail',           14, 19, 'construction', 2, False, True,  (180, 165, 180)),
    Label('bridge',               15, 19, 'construction', 2, False, True,  (150, 100, 100)),
    Label('tunnel',               16, 19, 'construction', 2, False, True,  (150, 120, 90)),
    Label('pole',                 17,  5, 'object',       3, False, False, (153, 153, 153)),
    Label('polegroup',            18, 19, 'object',       3, False, True,  (153, 153, 153)),
    Label('traffic light',        19,  6, 'object',       3, False, False, (250, 170, 30)),
    Label('traffic sign',         20,  7, 'object',       3, False, False, (220, 220, 0)),
    Label('vegetation',           21,  8, 'nature',       4, False, False, (107, 142, 35)),
    Label('terrain',              22,  9, 'nature',       4, False, False, (152, 251, 152)),
    Label('sky',                  23, 10, 'sky',          5, False, False, (70, 130, 180)),
    Label('person',               24, 11, 'human',        6, True,  False, (220, 20, 60)),
    Label('rider',                25, 12, 'human',        6, True,  False, (255, 0, 0)),
    Label('car',                  26, 13, 'vehicle',      7, True,  False, (0, 0, 142)),
    Label('truck',                27, 14, 'vehicle',      7, True,  False, (0, 0, 70)),
    Label('bus',                  28, 15, 'vehicle',      7, True,  False, (0, 60, 100)),
    Label('caravan',              29, 19, 'vehicle',      7, True,  True,  (0, 0, 90)),
    Label('trailer',              30, 19, 'vehicle',      7, True,  True,  (0, 0, 110)),
    Label('train',                31, 16, 'vehicle',      7, True,  False, (0, 80, 100)),
    Label('motorcycle',           32, 17, 'vehicle',      7, True,  False, (0, 0, 230)),
    Label('bicycle',              33, 18, 'vehicle',      7, True,  False, (119, 11, 32)),
    Label('license plate',        -1, 19, 'vehicle',      7, False, True,  (0, 0, 142)),
]

# Lookup from Cityscapes label ID to train ID, built from the label table.
id_to_trainId = {label.id: label.trainId for label in labels}


def id_to_trainId_map_func(label_ids):
    """Map Cityscapes label IDs to train IDs, element by element.

    The dictionary lookup is done once per *distinct* ID and the result is
    scattered back with NumPy indexing, instead of calling ``dict.get`` once
    per pixel through ``np.vectorize`` (which is a plain Python loop).

    Parameters
    ----------
    label_ids : array_like of int
        Label IDs (scalar, array, or anything ``np.asarray`` accepts, such as
        a PIL image). May be empty.

    Returns
    -------
    numpy.ndarray
        Integer array of train IDs with the same shape as ``label_ids``.

    Raises
    ------
    KeyError
        If an ID occurs that is not present in ``id_to_trainId``.
    """
    ids = np.asarray(label_ids)
    unique_ids, inverse = np.unique(ids, return_inverse=True)
    train_ids = np.array([id_to_trainId[int(v)] for v in unique_ids], dtype=int)
    # reshape: depending on the NumPy version `inverse` may be flat
    return train_ids[inverse].reshape(ids.shape)

# City sub-folders of the raw dataset, grouped by the fold they belong to.
train_dirs = [
    "jena/",
    "zurich/",
    "weimar/",
    "ulm/",
    "tubingen/",
    "stuttgart/",
    "strasbourg/",
    "monchengladbach/",
    "krefeld/",
    "hanover/",
    "hamburg/",
    "erfurt/",
    "dusseldorf/",
    "darmstadt/",
    "cologne/",
    "bremen/",
    "bochum/",
    "aachen/",
]
val_dirs = [
    "frankfurt/",
    "munster/",
    "lindau/",
]
# NOTE: the test entries are written without a trailing slash.
test_dirs = [
    "berlin",
    "bielefeld",
    "bonn",
    "leverkusen",
    "mainz",
    "munich",
]

## Paths 

In [6]:
# Don't copy this blindly -- adjust it to your folder structure.
# RAW_DATAPATH: folder holding the unzipped Cityscapes archives.
# PROC_DATAPATH: folder that receives the preprocessed png files.
# os.path.expanduser('~') also resolves the home directory when $HOME is not
# set, where os.environ['HOME'] would raise a KeyError.
home_dir = os.path.expanduser('~')
os.environ['RAW_DATAPATH'] = os.path.join(home_dir, 'Data', 'Backup_Orig_Data', 'Cityscapes')
os.environ['PROC_DATAPATH'] = os.path.join(home_dir, 'Data', 'Processed_Data', 'Cityscapes')

In [7]:
# Display the configured source and target folders as a quick sanity check.
tuple(os.environ[key] for key in ('RAW_DATAPATH', 'PROC_DATAPATH'))

('/home/pjamscik/Data/Backup_Orig_Data/Cityscapes',
 '/home/pjamscik/Data/Processed_Data/Cityscapes')

In [8]:
# Empty the target folder so files from an earlier run cannot mix with the
# new output. The folder is taken from $PROC_DATAPATH rather than from a
# hard-coded absolute path, so the cell follows the path configuration.
# Like the shell glob `*`, hidden (dot-prefixed) entries are left alone.
proc_root = os.environ['PROC_DATAPATH']
if os.path.isdir(proc_root):
    for entry in os.listdir(proc_root):
        if entry.startswith('.'):
            continue
        entry_path = os.path.join(proc_root, entry)
        if os.path.isdir(entry_path) and not os.path.islink(entry_path):
            shutil.rmtree(entry_path)
        else:
            # plain files and symlinks (the link itself, not its target)
            os.remove(entry_path)

In [11]:
# Output folders, addressed as paths[kind][fold] with kind in {'data', 'labels'}
# and fold in {'train', 'val', 'test'}; each maps to $PROC_DATAPATH/<kind>/<fold>.
paths = {
    kind: {
        fold: os.path.join(os.environ['PROC_DATAPATH'], kind, fold)
        for fold in ['train', 'val', 'test']
    }
    for kind in ['data', 'labels']
}

for fold_dirs in paths.values():
    for fold_dir in fold_dirs.values():
        # exist_ok=True keeps the cell re-runnable when the folders already exist
        os.makedirs(fold_dir, exist_ok=True)

## Copy & Modify 

We'll copy the PNGs from `$RAW_DATAPATH` and strip the `_leftImg8bit` suffix from their file names. Additionally, we'll convert the label IDs to the 20 train classes.

In [17]:
# Copy every raw image into the processed data folder and write the matching
# label image with train IDs into the processed labels folder.

# Explicit fold -> city list lookup (replaces eval() on a variable name).
city_dirs_by_fold = {'train': train_dirs, 'val': val_dirs, 'test': test_dirs}
image_suffix = "_leftImg8bit.png"

for fold in tqdm.tqdm(['train', 'val', 'test']):
    print(f'\nPreprocessing data and labels for fold: {fold}')
    img_dir = os.path.join(os.environ['RAW_DATAPATH'], 'leftImg8bit', fold)
    label_dir = os.path.join(os.environ['RAW_DATAPATH'], 'gtFine', fold)

    for city_dir in tqdm.tqdm(city_dirs_by_fold[fold]):
        img_dir_path = os.path.join(img_dir, city_dir)
        label_dir_path = os.path.join(label_dir, city_dir)

        # sorted() makes the processing order reproducible across file systems
        for fn in sorted(os.listdir(img_dir_path)):
            # skip anything that is not a camera image (e.g. stray system files);
            # such entries have no matching label file and would abort the run
            if not fn.endswith(image_suffix):
                continue
            img_id = fn[:-len(image_suffix)]

            # copy data (file content and permission bits only, no other meta-data)
            inpath_image = os.path.join(img_dir_path, fn)
            outpath_image = os.path.join(paths['data'][fold], img_id + '.png')
            shutil.copy(inpath_image, outpath_image)

            # read-in, change labels, write-out
            gtFine_img_path = os.path.join(label_dir_path, img_id + "_gtFine_labelIds.png")
            # the context manager closes the file handle right after the pixels
            # have been read, so handles do not pile up over thousands of images
            with Image.open(gtFine_img_path) as gtFine_img:
                label_img = id_to_trainId_map_func(np.asarray(gtFine_img))
            label_img = Image.fromarray(label_img.astype(np.uint8))
            out_path_label = os.path.join(paths['labels'][fold], img_id + '.png')
            label_img.save(out_path_label)
              

  0%|          | 0/3 [00:00<?, ?it/s]
  0%|          | 0/18 [00:00<?, ?it/s][A


Preprocessing data and labels for fold: train



  6%|▌         | 1/18 [00:31<08:56, 31.54s/it][A
 11%|█         | 2/18 [01:02<08:20, 31.30s/it][A
 17%|█▋        | 3/18 [01:38<08:11, 32.79s/it][A
 22%|██▏       | 4/18 [02:02<07:01, 30.14s/it][A
 28%|██▊       | 5/18 [02:39<06:57, 32.14s/it][A
 33%|███▎      | 6/18 [03:28<07:27, 37.27s/it][A
 39%|███▉      | 7/18 [05:00<09:50, 53.70s/it][A
 44%|████▍     | 8/18 [05:23<07:26, 44.61s/it][A
 50%|█████     | 9/18 [05:48<05:48, 38.70s/it][A
 56%|█████▌    | 10/18 [06:39<05:37, 42.15s/it][A
 61%|██████    | 11/18 [07:42<05:39, 48.56s/it][A
 67%|██████▋   | 12/18 [08:10<04:13, 42.23s/it][A
 72%|███████▏  | 13/18 [09:08<03:55, 47.05s/it][A
 78%|███████▊  | 14/18 [09:30<02:38, 39.65s/it][A
 83%|████████▎ | 15/18 [10:10<01:59, 39.73s/it][A
 89%|████████▉ | 16/18 [11:32<01:44, 52.49s/it][A
 94%|█████████▍| 17/18 [11:57<00:44, 44.03s/it][A
100%|██████████| 18/18 [12:42<00:00, 42.34s/it][A
 33%|███▎      | 1/3 [12:42<25:24, 762.12s/it]
  0%|          | 0/3 [00:00<?, ?it/s][A


Preprocessing data and labels for fold: val



 33%|███▎      | 1/3 [01:14<02:29, 74.60s/it][A
 67%|██████▋   | 2/3 [02:07<01:08, 68.22s/it][A
100%|██████████| 3/3 [02:25<00:00, 48.38s/it][A
 67%|██████▋   | 2/3 [15:07<09:37, 577.03s/it]
  0%|          | 0/6 [00:00<?, ?it/s][A


Preprocessing data and labels for fold: test



 17%|█▋        | 1/6 [02:47<13:55, 167.18s/it][A
 33%|███▎      | 2/6 [03:39<08:51, 132.85s/it][A
 50%|█████     | 3/6 [03:56<04:54, 98.09s/it] [A
 67%|██████▋   | 4/6 [04:13<02:27, 73.52s/it][A
 83%|████████▎ | 5/6 [05:39<01:17, 77.49s/it][A
100%|██████████| 6/6 [07:40<00:00, 76.67s/it][A
100%|██████████| 3/3 [22:47<00:00, 455.77s/it]


carme_notes.txt  [0m[01;34mData[0m/                        [01;34mProjects[0m/
cookies.txt      preprocess_cityscapes.ipynb  [01;34mResults[0m/
