# Creating train/val/test split for cityscapes dataset

### Final Datasets:
(Created by combining training and validation datasets from cityscapes)
- Train: 2475 images
- Validation: 500 images
- Test: 500 images 


### Original Datasets:
Train Dataset:
- Total number of input images: 2975
- Total number of people saved as cropped output images: 8191
- Total number of people annotations in input data: 16526
- Percent people included: 0.5

Validation Dataset:
- 500 Images

Cityscapes Test Dataset:
- 1525 Images
- Bounding Box annotations not provided

In [1]:
COLAB = False

import os
import cv2
import numpy as np
import pickle

from PIL import Image
from IPython.display import display

if COLAB:
    from google.colab.patches import cv2_imshow


IMG_WIDTH = 2048
IMG_HEIGHT = 1024

---

In [3]:
all_images = []
ignore_directories = ['.DS_Store', 'demo', 'test']
for directory in os.listdir('images/'):
    if directory not in ignore_directories:
        print(directory)
        images = os.listdir('images/'+directory)
        if '.DS_Store' in images:
            images.remove('.DS_Store')
#         print(len(images))
        all_images.extend(images)

zurich
strasbourg
weimar
munster
aachen
tubingen
jena
bochum
darmstadt
dusseldorf
hamburg
cologne
lindau
monchengladbach
frankfurt
krefeld
ulm
hanover
stuttgart
erfurt
bremen


In [44]:
city_counts_all = {}

for filename in all_images:
    city = filename[:-30]
    if city in city_counts_all:
        city_counts_all[city] += 1
    else:
        city_counts_all[city] = 1

print(city_counts_all)

{'zurich': 122, 'strasbourg': 365, 'weimar': 142, 'munster': 174, 'aachen': 174, 'tubingen': 144, 'jena': 119, 'bochum': 96, 'darmstadt': 85, 'dusseldorf': 221, 'hamburg': 248, 'cologne': 154, 'lindau': 59, 'monchengladbach': 94, 'frankfurt': 267, 'krefeld': 99, 'ulm': 95, 'hanover': 196, 'stuttgart': 196, 'erfurt': 109, 'bremen': 316}


{'zurich': 122, 'strasbourg': 365, 'weimar': 142, 'munster': 174, 'aachen': 174, 'tubingen': 144, 'jena': 119, 'bochum': 96, 'darmstadt': 85, 'dusseldorf': 221, 'hamburg': 248, 'cologne': 154, 'lindau': 59, 'monchengladbach': 94, 'frankfurt': 267, 'krefeld': 99, 'ulm': 95, 'hanover': 196, 'stuttgart': 196, 'erfurt': 109, 'bremen': 316}

In [35]:
len(all_images)

3475

In [106]:
# Get city from filename
a='krefeld_000000_012505_leftImg8bit.png'
a[:-30]

'krefeld'

## Create Test Set of 500 images

In [105]:
test_images = np.random.choice(all_images, 500, replace=False)
len(test_images)

500

In [118]:
# print(test_images)

city_counts_test = {}

for filename in test_images:
    city = filename[:-30]
    if city in city_counts_test:
        city_counts_test[city] += 1
    else:
        city_counts_test[city] = 1

print(city_counts_test)

{'cologne': 34, 'erfurt': 16, 'tubingen': 19, 'hamburg': 32, 'stuttgart': 30, 'zurich': 14, 'jena': 16, 'strasbourg': 61, 'munster': 28, 'dusseldorf': 28, 'bremen': 42, 'monchengladbach': 10, 'darmstadt': 10, 'hanover': 25, 'lindau': 9, 'ulm': 19, 'frankfurt': 32, 'weimar': 24, 'krefeld': 15, 'aachen': 23, 'bochum': 13}


{'cologne': 34, 'erfurt': 16, 'tubingen': 19, 'hamburg': 32, 'stuttgart': 30, 'zurich': 14, 'jena': 16, 'strasbourg': 61, 'munster': 28, 'dusseldorf': 28, 'bremen': 42, 'monchengladbach': 10, 'darmstadt': 10, 'hanover': 25, 'lindau': 9, 'ulm': 19, 'frankfurt': 32, 'weimar': 24, 'krefeld': 15, 'aachen': 23, 'bochum': 13}

In [109]:
city_perc_test = {}
for city in city_counts_test.keys():
    city_perc_test[city] = round(city_counts_test[city]/city_counts_all[city],2)
    
print(city_perc_test)

{'cologne': 0.22, 'erfurt': 0.15, 'tubingen': 0.13, 'hamburg': 0.13, 'stuttgart': 0.15, 'zurich': 0.11, 'jena': 0.13, 'strasbourg': 0.17, 'munster': 0.16, 'dusseldorf': 0.13, 'bremen': 0.13, 'monchengladbach': 0.11, 'darmstadt': 0.12, 'hanover': 0.13, 'lindau': 0.15, 'ulm': 0.2, 'frankfurt': 0.12, 'weimar': 0.17, 'krefeld': 0.15, 'aachen': 0.13, 'bochum': 0.14}


{'cologne': 0.22, 'erfurt': 0.15, 'tubingen': 0.13, 'hamburg': 0.13, 'stuttgart': 0.15, 'zurich': 0.11, 'jena': 0.13, 'strasbourg': 0.17, 'munster': 0.16, 'dusseldorf': 0.13, 'bremen': 0.13, 'monchengladbach': 0.11, 'darmstadt': 0.12, 'hanover': 0.13, 'lindau': 0.15, 'ulm': 0.2, 'frankfurt': 0.12, 'weimar': 0.17, 'krefeld': 0.15, 'aachen': 0.13, 'bochum': 0.14}

In [114]:
### Save Test Image Filenames

# with open('test_images_v.pkl', 'wb') as f:
#     pickle.dump(test_images, f)

In [115]:
with open('test_images.pkl', 'rb') as f:
    saved_test_images = pickle.load(f)

## Create Validation Dataset of 500 images

In [65]:
len(all_images)

3475

In [116]:
nontest_images = list(set(all_images) - set(test_images))
len(nontest_images)

2975

In [120]:
val_images = np.random.choice(nontest_images, 500, replace=False)
len(val_images)

500

In [121]:
city_counts_val = {}

for filename in val_images:
    city = filename[:-30]
    if city in city_counts_val:
        city_counts_val[city] += 1
    else:
        city_counts_val[city] = 1

print(city_counts_val)

{'stuttgart': 32, 'krefeld': 13, 'jena': 13, 'dusseldorf': 27, 'frankfurt': 42, 'ulm': 18, 'monchengladbach': 15, 'lindau': 7, 'strasbourg': 51, 'weimar': 21, 'bremen': 43, 'erfurt': 19, 'hamburg': 39, 'tubingen': 26, 'hanover': 20, 'munster': 20, 'cologne': 25, 'zurich': 19, 'aachen': 23, 'bochum': 16, 'darmstadt': 11}


{'stuttgart': 32, 'krefeld': 13, 'jena': 13, 'dusseldorf': 27, 'frankfurt': 42, 'ulm': 18, 'monchengladbach': 15, 'lindau': 7, 'strasbourg': 51, 'weimar': 21, 'bremen': 43, 'erfurt': 19, 'hamburg': 39, 'tubingen': 26, 'hanover': 20, 'munster': 20, 'cologne': 25, 'zurich': 19, 'aachen': 23, 'bochum': 16, 'darmstadt': 11}

In [122]:
city_perc_val = {}
for city in city_counts_val.keys():
    city_perc_val[city] = round(city_counts_val[city]/city_counts_all[city],2)
    
print(city_perc_val)

{'stuttgart': 0.16, 'krefeld': 0.13, 'jena': 0.11, 'dusseldorf': 0.12, 'frankfurt': 0.16, 'ulm': 0.19, 'monchengladbach': 0.16, 'lindau': 0.12, 'strasbourg': 0.14, 'weimar': 0.15, 'bremen': 0.14, 'erfurt': 0.17, 'hamburg': 0.16, 'tubingen': 0.18, 'hanover': 0.1, 'munster': 0.11, 'cologne': 0.16, 'zurich': 0.16, 'aachen': 0.13, 'bochum': 0.17, 'darmstadt': 0.13}


{'stuttgart': 0.16, 'krefeld': 0.13, 'jena': 0.11, 'dusseldorf': 0.12, 'frankfurt': 0.16, 'ulm': 0.19, 'monchengladbach': 0.16, 'lindau': 0.12, 'strasbourg': 0.14, 'weimar': 0.15, 'bremen': 0.14, 'erfurt': 0.17, 'hamburg': 0.16, 'tubingen': 0.18, 'hanover': 0.1, 'munster': 0.11, 'cologne': 0.16, 'zurich': 0.16, 'aachen': 0.13, 'bochum': 0.17, 'darmstadt': 0.13}

In [123]:
# with open('val_images_v.pkl', 'wb') as f:
#     pickle.dump(val_images, f)

In [124]:
with open('val_images.pkl', 'rb') as f:
    saved_val_images = pickle.load(f)

In [127]:
l_val_images = list(saved_val_images)

In [128]:
set([x for x in l_val_images if l_val_images.count(x) > 1])


set()

## Create Training Dataset

In [150]:
train_images = list(set(all_images) - set(test_images) - set(val_images))
len(train_images)

2475

In [154]:
city_counts_train = {}

for filename in train_images:
    city = filename[:-30]
    if city in city_counts_train:
        city_counts_train[city] += 1
    else:
        city_counts_train[city] = 1

print(city_counts_train)

{'bremen': 231, 'dusseldorf': 166, 'bochum': 67, 'strasbourg': 253, 'lindau': 43, 'frankfurt': 193, 'erfurt': 74, 'zurich': 89, 'cologne': 95, 'hanover': 151, 'stuttgart': 134, 'darmstadt': 64, 'hamburg': 177, 'krefeld': 71, 'ulm': 58, 'jena': 90, 'aachen': 128, 'tubingen': 99, 'weimar': 97, 'monchengladbach': 69, 'munster': 126}


{'bremen': 231, 'dusseldorf': 166, 'bochum': 67, 'strasbourg': 253, 'lindau': 43, 'frankfurt': 193, 'erfurt': 74, 'zurich': 89, 'cologne': 95, 'hanover': 151, 'stuttgart': 134, 'darmstadt': 64, 'hamburg': 177, 'krefeld': 71, 'ulm': 58, 'jena': 90, 'aachen': 128, 'tubingen': 99, 'weimar': 97, 'monchengladbach': 69, 'munster': 126}m

In [155]:
city_perc_train = {}
for city in city_counts_train.keys():
    city_perc_train[city] = round(city_counts_train[city]/city_counts_all[city],2)
    
print(city_perc_train)

{'bremen': 0.73, 'dusseldorf': 0.75, 'bochum': 0.7, 'strasbourg': 0.69, 'lindau': 0.73, 'frankfurt': 0.72, 'erfurt': 0.68, 'zurich': 0.73, 'cologne': 0.62, 'hanover': 0.77, 'stuttgart': 0.68, 'darmstadt': 0.75, 'hamburg': 0.71, 'krefeld': 0.72, 'ulm': 0.61, 'jena': 0.76, 'aachen': 0.74, 'tubingen': 0.69, 'weimar': 0.68, 'monchengladbach': 0.73, 'munster': 0.72}


{'bremen': 0.73, 'dusseldorf': 0.75, 'bochum': 0.7, 'strasbourg': 0.69, 'lindau': 0.73, 'frankfurt': 0.72, 'erfurt': 0.68, 'zurich': 0.73, 'cologne': 0.62, 'hanover': 0.77, 'stuttgart': 0.68, 'darmstadt': 0.75, 'hamburg': 0.71, 'krefeld': 0.72, 'ulm': 0.61, 'jena': 0.76, 'aachen': 0.74, 'tubingen': 0.69, 'weimar': 0.68, 'monchengladbach': 0.73, 'munster': 0.72}

In [None]:
# with open('train_images_v.pkl', 'wb') as f:
#     pickle.dump(train_images, f)

In [None]:
with open('train_images.pkl', 'rb') as f:
    saved_train_images = pickle.load(f)

In [None]:
len(saved_train_images)