## Convert GPS coords to State/Region
This notebook matches images from the Semantic Segmentation dataset from BDD100K with the object detection part of the dataset. In short, the object detection dataset consists of 70'000 images, whereas the semantic segmentation dataset consists of 10'000 images. In BDD100K's documentation, they explicitly state that the semantic segmentation dataset is not a subset of the object detection dataset. 

Each sample in the object detection dataset comes with additional information (such as GPS coordinates) in JSON format. This information is not available for the semantic segmentation dataset. However, in this work **we assume that if the randomized file name is the same in both datasets, the two files show the same image**, so we can use the GPS information from the object detection dataset for the semantic segmentation dataset. We take the GPS coordinates and look up the state in which the image was taken, using the Nominatim geocoder from geopy. 

To run this notebook, you need to update the hard-coded dataset paths, which appear in several cells, so that they point to your local copies of the datasets.

In [9]:
# import module
from geopy.geocoders import Nominatim
import os 
from tqdm.notebook import tqdm

In [2]:
# Training images of the 10K (semantic segmentation) subset.
BDD10K_images_path = 'C:/Users/s23243/Downloads/bdd100k_images_10k/bdd100k/images/10k/train'
BDD10K_images = os.listdir(BDD10K_images_path)
# Keep only the part of each file name that precedes the first '-'.
img_ids = [file_name.partition('-')[0] for file_name in BDD10K_images]

In [3]:
# Info (JSON) files of the 100K training split, reduced to the part before the first '-'.
BDD100K_info_path_train = 'C:/Users/s23243/Downloads/bdd100k_info/bdd100k/info/100k/train'
BDD100K_info_train = os.listdir(BDD100K_info_path_train)
info_ids_train = [file_name.partition('-')[0] for file_name in BDD100K_info_train]

# Same for the 100K validation split.
BDD100K_info_path_val = 'C:/Users/s23243/Downloads/bdd100k_info/bdd100k/info/100k/val'
BDD100K_info_val = os.listdir(BDD100K_info_path_val)
info_ids_val = [file_name.partition('-')[0] for file_name in BDD100K_info_val]


In [4]:
# Number of entries in each id list: 10K train images, 100K train info, 100K val info.
for id_list in (img_ids, info_ids_train, info_ids_val):
    print(len(id_list))

7000
70000
10000


In [6]:
# Count how many ids of the 10K training images also occur among the 100K info files.
# The ids are only the part of the file name before the first '-', so this is a
# prefix match, not an exact file-name match.
#
# Sets are used for the lookups: testing membership in a list scans the whole list,
# which is needlessly slow when repeated for every image.
info_id_set_train = set(info_ids_train)
info_id_set_val = set(info_ids_val)

found = 0
in_val = 0

for img in tqdm(img_ids):
    if img in info_id_set_train:
        found += 1
    elif img in info_id_set_val:
        # Only counted when the id is absent from the training info files.
        in_val += 1
print(found, in_val)

  0%|          | 0/7000 [00:00<?, ?it/s]

4097 454


In [7]:
# Validation images of the 10K (semantic segmentation) subset.
BDD10K_images_path_validation = 'C:/Users/s23243/Downloads/bdd100k_images_10k/bdd100k/images/10k/val'
BDD10K_images_val = os.listdir(BDD10K_images_path_validation)
# Keep only the part of each file name that precedes the first '-'.
img_ids_val = [file_name.partition('-')[0] for file_name in BDD10K_images_val]

In [8]:
# Count how many ids of the 10K validation images also occur among the 100K info files.
# Sets are used for the lookups: testing membership in a list scans the whole list,
# which is needlessly slow when repeated for every image.
info_id_set_train = set(info_ids_train)
info_id_set_val = set(info_ids_val)

val_found = 0
val_in_val = 0

for img in tqdm(img_ids_val):
    if img in info_id_set_train:
        val_found += 1
    elif img in info_id_set_val:
        # Only counted when the id is absent from the training info files.
        val_in_val += 1
print(val_found, val_in_val)

  0%|          | 0/1000 [00:00<?, ?it/s]

342 0


In [5]:
import json

In [6]:
# Load the detection labels of the 100K training split.
# The `with` block closes the file as soon as parsing is done (or fails).
with open("C:/Users/s23243/Downloads/bdd100k_labels_release/bdd100k/labels/bdd100k_labels_images_train.json") as f:
    data = json.load(f)

FileNotFoundError: [Errno 2] No such file or directory: 'C:/Users/s23243/Downloads/bdd100k_labels_release/bdd100k/labels/bdd100k_labels_images_train.json'

In [58]:
# Show the first label entry of the first record in the label file.
first_record = data[0]
first_record['labels'][0]

{'category': 'traffic light',
 'attributes': {'occluded': False,
  'truncated': False,
  'trafficLightColor': 'green'},
 'manualShape': True,
 'manualAttributes': True,
 'box2d': {'x1': 1125.902264,
  'y1': 133.184488,
  'x2': 1156.978645,
  'y2': 210.875445},
 'id': 0}

In [30]:
import shutil
from geopy.geocoders import Nominatim
from IPython.display import clear_output
import time 

geolocator = Nominatim(user_agent='bdd100kdata')


def _stems_and_ids(folder, extension):
    """List `folder` and return (stems, ids).

    A stem is a file name with every occurrence of `extension` removed;
    an id is the part of a stem that precedes its first '-'.
    """
    stems = [name.replace(extension, '') for name in os.listdir(folder)]
    ids = [stem.split('-')[0] for stem in stems]
    return stems, ids


# 10K subset: train and val images.
BDD10K_images_path = 'C:/Users/s23243/Downloads/bdd100k_images_10k/bdd100k/images/10k/train'
BDD10K_images, img_ids = _stems_and_ids(BDD10K_images_path, '.jpg')

BDD10K_images_path_val = 'C:/Users/s23243/Downloads/bdd100k_images_10k/bdd100k/images/10k/val'
BDD10K_images_val, img_ids_val = _stems_and_ids(BDD10K_images_path_val, '.jpg')

# 100K set, train split: images and info (JSON) files.
BDD100K_images_path = 'C:/Users/s23243/Downloads/bdd100k_images_100k/bdd100k/images/100k/train'
BDD100K_images, img_ids_100 = _stems_and_ids(BDD100K_images_path, '.jpg')

BDD100K_info_path_train = 'C:/Users/s23243/Downloads/bdd100k_info/bdd100k/info/100k/train'
BDD100K_info_train, info_ids_train = _stems_and_ids(BDD100K_info_path_train, '.json')

# 100K set, val split: images and info (JSON) files.
BDD100K_images_path_val = 'C:/Users/s23243/Downloads/bdd100k_images_100k/bdd100k/images/100k/val'
BDD100K_images_val, img_ids_100_val = _stems_and_ids(BDD100K_images_path_val, '.jpg')

BDD100K_info_path_val = 'C:/Users/s23243/Downloads/bdd100k_info/bdd100k/info/100k/val'
BDD100K_info_val, info_ids_val = _stems_and_ids(BDD100K_info_path_val, '.json')

In [56]:
# Create the output tree for images and labels, each with a train and a val split.
# makedirs(exist_ok=True) creates missing parents and is a no-op for folders that
# already exist, so a partially created tree (e.g. 'bdd10k' present but a split
# folder missing) is completed instead of being skipped.
for split_dir in (
    'bdd10k/images/train',
    'bdd10k/images/val',
    'bdd10k/labels/train',
    'bdd10k/labels/val',
):
    os.makedirs(split_dir, exist_ok=True)

In [59]:
def _list_stems(folder, extension):
    """Return the file names in `folder`, with a trailing `extension` stripped."""
    stems = []
    for name in os.listdir(folder):
        if name.endswith(extension):
            name = name[:-len(extension)]
        stems.append(name)
    return stems


def convert_bdd10k_to_city_folders(
        bdd10k_subset_folder,
        bdd100k_image_folder,
        bdd100k_json_info_folder,
        bdd10k_label_folder,
        save_folder,
        geocoder=None,
        request_delay=1.0,
    ):
    """Sort 10K-subset images and their labels into one folder per state.

    For every image in `bdd10k_subset_folder` whose file name also exists in
    `bdd100k_image_folder`, the info JSON with the same name is read from
    `bdd100k_json_info_folder`. The first entry of its 'locations' list is
    reverse-geocoded, and the 'state' field of the returned address is used
    as the folder name (despite the word "city" in the function name).

    The image is copied to `<save_folder>/<state>/<name>.jpg`. The label
    folder is derived by replacing 'images' with 'labels' in `save_folder`;
    the colormap label `<bdd10k_label_folder>/<name>.png` and the mask found
    at the same path with 'colormaps' replaced by 'masks' are copied there as
    `<name>-colormap.png` and `<name>-mask.png`.

    Matched images are skipped (and counted) when they have no info JSON, no
    recorded location, or no 'state' in the geocoder result.

    Parameters
    ----------
    bdd10k_subset_folder : str
        Folder with the 10K-subset '.jpg' images.
    bdd100k_image_folder : str
        Folder with the 100K '.jpg' images to match against.
    bdd100k_json_info_folder : str
        Folder with the 100K info '.json' files.
    bdd10k_label_folder : str
        Folder with the colormap '.png' labels of the 10K subset.
    save_folder : str
        Destination folder for images; missing folders are created.
    geocoder : object, optional
        Object with a `reverse("lat,lon")` method whose result exposes a
        `raw` dict. Defaults to the module-level `geolocator`.
    request_delay : float, optional
        Seconds to wait after each reverse-geocoding request. The default
        of 1.0 follows Nominatim's usage policy (at most one request per
        second).

    Raises
    ------
    FileNotFoundError
        If an image or one of its two label files cannot be copied.
    """
    if geocoder is None:
        geocoder = geolocator  # Nominatim instance created in an earlier cell

    subset_images = _list_stems(bdd10k_subset_folder, '.jpg')
    # Sets give constant-time membership tests inside the loop.
    bdd100k_images = set(_list_stems(bdd100k_image_folder, '.jpg'))
    bdd100k_json_files = set(_list_stems(bdd100k_json_info_folder, '.json'))

    label_save_folder = save_folder.replace('images', 'labels')

    ctr = 0
    skipped = 0
    # Loop through all images in the 10K subset.
    for img in tqdm(subset_images):

        # Only images that also exist in the given 100K folder can be located.
        if img not in bdd100k_images:
            continue

        # An image without an info file has no GPS data to work with.
        if img not in bdd100k_json_files:
            skipped += 1
            continue

        with open(os.path.join(bdd100k_json_info_folder, img + '.json')) as info_file:
            info = json.load(info_file)

        # Extract latitude and longitude of the first time step; disregard the rest.
        locations = info.get('locations')
        if not locations:
            skipped += 1
            continue
        first_location = locations[0]
        latitude = str(first_location['latitude'])
        longitude = str(first_location['longitude'])

        # Convert the coordinates to a state name.
        location = geocoder.reverse(latitude + "," + longitude)
        if request_delay > 0:
            # Throttle every request, successful or not, to respect the rate limit.
            time.sleep(request_delay)

        address = location.raw.get('address', {}) if location is not None else {}
        state = address.get('state')
        if not state:
            # No result, or an address without a 'state' field.
            skipped += 1
            continue

        # Copy the image to its state folder.
        state_image_path = os.path.join(save_folder, state)
        os.makedirs(state_image_path, exist_ok=True)
        shutil.copyfile(
            os.path.join(bdd10k_subset_folder, img + '.jpg'),
            os.path.join(state_image_path, img + '.jpg'),
        )

        # Copy both label files to the matching label folder.
        state_label_path = os.path.join(label_save_folder, state)
        os.makedirs(state_label_path, exist_ok=True)
        label_name = os.path.join(state_label_path, img)

        colormap_label_path = os.path.join(bdd10k_label_folder, img + '.png')
        mask_label_path = colormap_label_path.replace('colormaps', 'masks')

        shutil.copyfile(colormap_label_path, label_name + '-colormap.png')
        shutil.copyfile(mask_label_path, label_name + '-mask.png')

        ctr += 1

    print("Converted {} images".format(str(ctr)))
    if skipped:
        print("Skipped {} matched images without usable location info".format(skipped))

## Test the four different combinations:
The cells below match each split (train and validation) of the 10K subset against each split of the 100K set, giving four combinations in total.

In [60]:
#      |
#  TT  |  
#------+-----
#      |
#      |

# 10K train images matched against the 100K train split.
BDD10K_image_train_folder = 'C:/Users/s23243/Downloads/bdd100k_images_10k/bdd100k/images/10k/train'
BDD100K_image_train_folder = 'C:/Users/s23243/Downloads/bdd100k_images_100k/bdd100k/images/100k/train'
BDD100K_json_files_train_folder = 'C:/Users/s23243/Downloads/bdd100k_info/bdd100k/info/100k/train'
BDD10K_segm_train_label_folder = 'C:/Users/s23243/Downloads/bdd100k_sem_seg_labels_trainval/bdd100k/labels/sem_seg/colormaps/train'
save_folder = './bdd10k/images/train'

convert_bdd10k_to_city_folders(
    bdd10k_subset_folder=BDD10K_image_train_folder,
    bdd100k_image_folder=BDD100K_image_train_folder,
    bdd100k_json_info_folder=BDD100K_json_files_train_folder,
    bdd10k_label_folder=BDD10K_segm_train_label_folder,
    save_folder=save_folder,
)

  0%|          | 0/7000 [00:00<?, ?it/s]

Converted 2970 images


In [62]:
#      |
#      |  TV
#------+-----
#      |
#      |

# 10K train images matched against the 100K val split.
BDD10K_image_train_folder = 'C:/Users/s23243/Downloads/bdd100k_images_10k/bdd100k/images/10k/train'
BDD100K_image_val_folder = 'C:/Users/s23243/Downloads/bdd100k_images_100k/bdd100k/images/100k/val'
BDD100K_json_files_val_folder = 'C:/Users/s23243/Downloads/bdd100k_info/bdd100k/info/100k/val'
BDD10K_segm_train_label_folder = 'C:/Users/s23243/Downloads/bdd100k_sem_seg_labels_trainval/bdd100k/labels/sem_seg/colormaps/train'
save_folder = './bdd10k/images/train'

convert_bdd10k_to_city_folders(
    bdd10k_subset_folder=BDD10K_image_train_folder,
    bdd100k_image_folder=BDD100K_image_val_folder,
    bdd100k_json_info_folder=BDD100K_json_files_val_folder,
    bdd10k_label_folder=BDD10K_segm_train_label_folder,
    save_folder=save_folder,
)

  0%|          | 0/7000 [00:00<?, ?it/s]

Converted 454 images


In [63]:
#      |
#      |  
#------+-----
#  VT  |
#      |

# 10K val images matched against the 100K train split.
BDD10K_image_val_folder = 'C:/Users/s23243/Downloads/bdd100k_images_10k/bdd100k/images/10k/val'
BDD100K_image_train_folder = 'C:/Users/s23243/Downloads/bdd100k_images_100k/bdd100k/images/100k/train'
BDD100K_json_files_train_folder = 'C:/Users/s23243/Downloads/bdd100k_info/bdd100k/info/100k/train'
BDD10K_segm_val_label_folder = 'C:/Users/s23243/Downloads/bdd100k_sem_seg_labels_trainval/bdd100k/labels/sem_seg/colormaps/val'
# save_folder is where the IMAGES go; the function derives the label folder by
# replacing 'images' with 'labels'. Pointing it at the labels tree would put the
# images and the labels into the same folder.
save_folder = './bdd10k/images/val'

convert_bdd10k_to_city_folders(
    bdd10k_subset_folder=BDD10K_image_val_folder,
    bdd100k_image_folder=BDD100K_image_train_folder,
    bdd100k_json_info_folder=BDD100K_json_files_train_folder,
    bdd10k_label_folder=BDD10K_segm_val_label_folder,
    save_folder=save_folder,
)

  0%|          | 0/1000 [00:00<?, ?it/s]

Converted 0 images


In [64]:
#      |
#      |  
#------+-----
#      |  VV
#      |

# 10K val images matched against the 100K val split.
BDD10K_image_val_folder = 'C:/Users/s23243/Downloads/bdd100k_images_10k/bdd100k/images/10k/val'
BDD100K_image_val_folder = 'C:/Users/s23243/Downloads/bdd100k_images_100k/bdd100k/images/100k/val'
BDD100K_json_files_val_folder = 'C:/Users/s23243/Downloads/bdd100k_info/bdd100k/info/100k/val'
BDD10K_segm_val_label_folder = 'C:/Users/s23243/Downloads/bdd100k_sem_seg_labels_trainval/bdd100k/labels/sem_seg/colormaps/val'
# save_folder is where the IMAGES go; the function derives the label folder by
# replacing 'images' with 'labels'. Pointing it at the labels tree would put the
# images and the labels into the same folder.
save_folder = './bdd10k/images/val'

convert_bdd10k_to_city_folders(
    bdd10k_subset_folder=BDD10K_image_val_folder,
    bdd100k_image_folder=BDD100K_image_val_folder,
    bdd100k_json_info_folder=BDD100K_json_files_val_folder,
    bdd10k_label_folder=BDD10K_segm_val_label_folder,
    save_folder=save_folder,
)

  0%|          | 0/1000 [00:00<?, ?it/s]

Converted 0 images


In [65]:
# Print the number of files in every sub-folder of the training image folder,
# followed by the overall sum.
rootdir = './bdd10k/images/train'
total = 0
for root, subdirs, files in tqdm(os.walk(rootdir)):
    for s in subdirs:
        folder = os.path.join(root, s)
        amount = len(os.listdir(folder))
        print(s, amount)
        total = total + amount
print('total: {}'.format(total))
   

0it [00:00, ?it/s]

California 491
Connecticut 4
Florida 1
Georgia 2
Nevada 1
New Jersey 132
New Mexico 2
New York 2428
Texas 1
יהודה ושומרון 2
מחוז הדרום 25
מחוז המרכז 78
מחוז הצפון 4
מחוז חיפה 14
מחוז ירושלים 28
מחוז תל אביב 211
total: 3424


In [66]:
# Count the 10K validation images whose exact file name also exists in the
# 100K training image folder.
k10_img = os.listdir(BDD10K_image_val_folder)
k100_img = os.listdir(BDD100K_image_train_folder)
# A set makes each lookup constant-time instead of a scan over the whole list.
k100_img_set = set(k100_img)

total = 0
for img in tqdm(k10_img):
    if img in k100_img_set:
        total += 1
print(total)

  0%|          | 0/1000 [00:00<?, ?it/s]

0


In [None]:
# Recount the files per sub-folder of `rootdir`.
# `total` is reset first; otherwise the sum would be added on top of whatever
# value an earlier cell left in `total`.
total = 0
for root, subdirs, files in os.walk(rootdir):
    for s in subdirs:
        amount = len(os.listdir(os.path.join(root, s)))
        print(s, amount)
        total += amount

print('total: {}'.format(total))