# Build a dataset with the flickr API

API reference: https://www.flickr.com/services/api/ <br>
Python API: https://github.com/sybrenstuvel/flickrapi <br>
API allows for 3600 calls per hour.

The quickest way to create a hand-picked dataset is probably to download the pictures manually and use the ids_from_files() function below to get the image IDs from the file names. Alternatively, create a gallery directly on flickr and get your IDs over the API from there (the problem here is that some creators do not allow their pictures to be added to galleries, so it won't cover all pictures).

In [1]:
from configs import cfg
import os
from pathlib import Path
import glob
import json
from pprint import pprint as _pprint
def pprint(data):
    """Pretty-print data while keeping dict keys in their insertion order."""
    _pprint(data, sort_dicts=False)
from PIL import Image
import urllib.request
import flickrapi
from tqdm.auto import tqdm

# flickr API credentials are read from the project config (cfg.CRED)
KEY = cfg.CRED.FLICKR_KEY
SECRET = cfg.CRED.FLICKR_SECRET
# shared API client; the cells below issue all their flickr calls through it
flickr = flickrapi.FlickrAPI(KEY, SECRET)

In [2]:
# build constants for annotation file

# general dataset description; stored under the json key 'info'
INFO = dict(
    description="Dataset",
    url="https://github.com/random9v2/cv-annot",
    version="1.0",
    year=2022,
    contributor="John",
    date_created="2022/08/10",
)

# get flickr licenses ordered by id, without the non-CC license ('id': 0); json key is 'licenses'
# the entry is filtered out by its id instead of by position, so nothing else is dropped
# if the API response ever lacks id 0 or orders its entries differently
LICENSES = sorted(
    (
        lic
        for lic in json.loads(flickr.photos.licenses.getInfo(format="json"))["licenses"]["license"]
        if int(lic["id"]) != 0
    ),
    key=lambda d: int(d["id"]),
)

# person category with the original COCO human keypoints; stored under the json key 'categories'
CATEGORIES_COCO = [
    {
        "supercategory": "person",
        "id": 1,
        "name": "person",
        # names of the keypoints, in annotation order
        "keypoints": [
            "nose",
            "left_eye", "right_eye",
            "left_ear", "right_ear",
            "left_shoulder", "right_shoulder",
            "left_elbow", "right_elbow",
            "left_wrist", "right_wrist",
            "left_hip", "right_hip",
            "left_knee", "right_knee",
            "left_ankle", "right_ankle",
        ],
        # keypoint pairs that are connected for visualization; does not affect training
        "skeleton": [
            [16, 14], [14, 12], [17, 15], [15, 13], [12, 13],
            [6, 12], [7, 13], [6, 7], [6, 8], [7, 9],
            [8, 10], [9, 11], [2, 3], [1, 2], [1, 3],
            [2, 4], [3, 5], [4, 6], [5, 7],
        ],
    },
]

# person category with the original crowd_pose human keypoints; stored under the json key 'categories'
CATEGORIES_CROWD = [
    {
        "supercategory": "person",
        "id": 1,
        "name": "person",
        # names of the keypoints, in annotation order
        "keypoints": [
            "left_shoulder", "right_shoulder",
            "left_elbow", "right_elbow",
            "left_wrist", "right_wrist",
            "left_hip", "right_hip",
            "left_knee", "right_knee",
            "left_ankle", "right_ankle",
            "head",
            "neck",
        ],
        # keypoint pairs that are connected to each other
        "skeleton": [
            [1, 14], [1, 3], [2, 14], [2, 4], [3, 5], [4, 6], [7, 14],
            [7, 9], [8, 14], [8, 10], [9, 11], [10, 12], [13, 14],
        ],
    },
]


# with open(f"configs/crowdpose_keypoint_cat.json", 'w', encoding='utf-8') as f:
#         json.dump({"categories": CATEGORIES_CROWD}, f, ensure_ascii=False, indent=4)


In [3]:
# pprint(LICENSES)

In [5]:
# build/add to dataset
# with big datasets (> 1800 images; every new image costs two API calls), the function must be scheduled to not exceed the hourly API quota (easiest is a sleep() between each image)

ID_PREFIX = '007'  # put in front of the zero-padded counter in file names and image IDs of this dataset (only decorative/helps differentiate sets)
CATEGORIES = CATEGORIES_CROWD  # categories written into newly created annotation files: CATEGORIES_COCO or CATEGORIES_CROWD

def build_dataset(flickr_ids:list, img_dir:str, anno_file:str=None, update:dict=None):
    """
    Builds dataset with annotation file out of flickr image IDs.
    New images will be added to a given annotation file. Old annotations won't be altered.
    Images listed in the annotation file but missing in img_dir are downloaded again.

    The result is written to 'annotationMeta_v<version>.json': next to anno_file if one is given
    (only if images were added), otherwise into img_dir. Existing files are never overwritten.

    args:
    flickr_ids: list of flickr image ids
    img_dir: path to download the new images into (dataset path)
    anno_file (optional): annotation file to add new images to
    update / NOT IMPLEMENTED: update part of the annotation file, e.g. {"info": {}, "licenses": {}}; due to safety, updating images/annotations should be disabled
    """
    final_anno = {}
    max_count_id = 0
    flickr_ids_add = set(flickr_ids)  # add to anno
    flickr_ids_download = set()  # download only (already annotated, file missing)
    version = '1.0'

    if anno_file:
        # the file is written as utf-8 below, so it has to be read as utf-8 as well
        with open(anno_file, encoding='utf-8') as f:
            final_anno = json.load(f)

        # increase version number if images are added
        version = f'{(int(float(final_anno["info"]["version"])) + 1):.1f}'

        # only add new flickr images (images without a flickr id count as '')
        ids_in_json = {image.get("flickr_id") or '' for image in final_anno["images"]}
        flickr_ids_add -= ids_in_json

        # for downloading what isn't there; entries without a flickr id cannot be fetched
        imgs_in_dir = {x.name for x in Path(img_dir).glob("*.jpg")}
        for image in final_anno["images"]:
            if image["file_name"] not in imgs_in_dir and image.get("flickr_id"):
                flickr_ids_download.add(image["flickr_id"])

        # get biggest dataset id; the counter is the 9-digit part behind the prefix,
        # so it is taken arithmetically instead of by cutting characters off the number
        if final_anno["images"]:
            max_id = max(image["id"] for image in final_anno["images"])
            max_count_id = max_id % 10**9
    else:
        # build starter annotation file with header
        # INFO is copied so that setting the version below does not alter the constant
        final_anno["info"] = dict(INFO)
        final_anno["licenses"] = LICENSES
        final_anno["categories"] = CATEGORIES
        final_anno["images"] = []

    # build image meta for all new ids
    image_meta = []
    allowed_licenses = {int(x["id"]) for x in final_anno["licenses"]}
    count_id = 1 if max_count_id == 0 else max_count_id + 1000  # keep dataset image ids between versions apart

    pbar = tqdm(flickr_ids_add)
    for flickr_id in pbar:
        pbar.set_description(f'download+annotate: image: {flickr_id}')
        file_name = f'{ID_PREFIX}{str(count_id).zfill(9)}.jpg'
        response = json.loads(flickr.photos.getInfo(photo_id=flickr_id, format="json"))
        # no default here: a response without a photo item has to end up in the error branch
        photo = response.get("photo")
        if photo:
            if int(photo["license"]) in allowed_licenses:
                url = get_url(flickr_id, 'z')
                download_image(url, img_dir, file_name)
                img_path = Path(img_dir, file_name)
                remove_metadata_img(img_path, quality=98)
                # PIL reports the size as (width, height)
                with Image.open(img_path) as img:
                    width, height = img.size
                image_meta.append(
                    {
                        "license": photo["license"],
                        "file_name": file_name,
                        "dataset_version": version,
                        "height": height,
                        "width": width,
                        "date_captured": photo["dates"]["taken"],
                        "flickr_url": url,
                        "content_url": photo["urls"]["url"][0]["_content"],
                        "flickr_id": photo["id"],
                        "id": int(f'{ID_PREFIX}{str(count_id).zfill(9)}')
                    }
                )
            else:
                print(f'License Error: image {flickr_id} not added; license {photo["license"]} not allowed.')
        else:
            print(f'Error for flickr_id{flickr_id}: json response does not contain a photo item \n json response: \n')
            pprint(response)
        # the counter also advances for skipped images
        count_id += 1

    changes = bool(image_meta)
    final_anno["images"] += image_meta

    # download remaining
    if flickr_ids_download:
        # first file name per flickr id
        file_names = {}
        for image in final_anno["images"]:
            if image.get("flickr_id"):
                file_names.setdefault(image["flickr_id"], image["file_name"])

        pbar_2 = tqdm(flickr_ids_download)
        for flickr_id in pbar_2:
            pbar_2.set_description(f'download: image: {flickr_id}')
            file_name = file_names[flickr_id]
            url = get_url(flickr_id, 'z')
            download_image(url, img_dir, file_name)
            remove_metadata_img(Path(img_dir, file_name))

    # finish and save annotation file
    final_anno["info"]["version"] = version
    save_file = f'annotationMeta_v{version}.json'
    if anno_file:
        if not changes:
            print('Warning: annotation file says: nothing changed')
            return
        save_path = Path(Path(anno_file).parent, save_file)
        exists_message = f'Error: {save_file} already exists.'
    else:
        save_path = Path(img_dir, save_file)
        exists_message = f'Error: {save_file} already exists in image directory.'

    if save_path.is_file():
        print(exists_message)
    else:
        # the image directory does not exist yet if no image was downloaded
        save_path.parent.mkdir(parents=True, exist_ok=True)
        with open(save_path, 'w', encoding='utf-8') as f:
            json.dump(final_anno, f, ensure_ascii=False, indent=4)
    

## helper functions

def get_url(id:str, size_suffix:str=''):
    """Gets URL for given image id and size.

    If the requested size is not offered, the URL of the last size in the API response is
    returned instead and a note is printed.

    args:
    id: flickr image id
    size_suffix: defines image size with suffix: https://www.flickr.com/services/api/misc.urls.html;
        '' selects the standard format whose link carries no size suffix

    returns:
    source URL of the image

    raises:
    ValueError: the API response does not contain any image size
    """
    get_sizes = json.loads(flickr.photos.getSizes(photo_id=id, format="json"))
    sizes = get_sizes.get('sizes', {}).get('size', [])
    if not sizes:
        # without this check the fallback below would fail on None with a meaningless error
        raise ValueError(f'No image sizes available for flickr id {id}; json response: {get_sizes}')

    for size in sizes:
        source = size['source']
        _size_suffix = source.split('_')[-1].split('.')[0]  # gets (size) suffix from image source link
        if _size_suffix == size_suffix:
            return source
        if size_suffix == '' and len(_size_suffix) > 1:  # standard format without suffix in link
            return source

    # requested size not offered: fall back to the last entry of the response
    url = sizes[-1]['source']
    suffix = url.split('_')[-1].split('.')[0]
    print(f'Link/size not found for {id}; biggest image URL available saved, suffix: {suffix if len(suffix)==1 else "none (default)"}')
    return url

def download_image(url:str, output_dir:str, file_name:str):
    """Downloads the image behind url and stores it as output_dir/file_name.

    The output directory is created (including parents) if it does not exist yet.
    """
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    target_file = target_dir / file_name
    urllib.request.urlretrieve(url, target_file)
    
def remove_metadata_img(img_path:str, quality=75):
    """Removes all metadata of an image. Will also compress the image (PIL default is quality=75).

    Only the pixel data is copied into a fresh image, which then overwrites the file at img_path.

    args:
    img_path: path of the image file; it is replaced in place
    quality: quality setting handed to the PIL save call
    """
    # close the source file before the same path is written again
    with Image.open(img_path) as image:
        img_no_meta = Image.new(image.mode, image.size)
        img_no_meta.putdata(list(image.getdata()))
    img_no_meta.save(img_path, quality=quality)  # default quality=75

def ids_from_files(dir):
    """Collect the flickr IDs of all downloaded *.jpg images in a directory.

    The ID is the part of the file name in front of the first underscore.
    flickr file name format: numericalID_imageSecret_imageSize.jpg, e.g. 50235154168_b3201cd930_o.jpg
    """
    flickr_ids = set()
    for image_file in Path(dir).glob("*.jpg"):
        numerical_id, _, _ = image_file.name.partition('_')
        flickr_ids.add(numerical_id)
    return flickr_ids


# ids = ids_from_files('/Users/john/Downloads/fireground_dataset_raw')
# test_ids = ['4882517696', '4882517696']
# build_dataset(ids, '/Users/john/Downloads/fireground_dataset', None)


In [6]:
# After annotating in coco-annotator, add the annotations to the annotation file created above

# keys in annotations to be removed
ANNO_DELETE = ["color", "metadata"]


def add_coco_annotations(anno_meta, anno_coco):
    """
    Adds in external tools created annotations to the annotation file created above.
    The file names must remain the same during the whole process.
    Image ids and category ids are drawn from the anno_meta file; images are matched by
    file name, categories by name. Keys listed in ANNO_DELETE are dropped from every annotation.

    The merged result is saved as 'annotations_v<version>.json' next to anno_meta; an existing
    file is not overwritten.

    args:
    anno_meta: path to .json file – contains meta information for a dataset, e.g., created above with build dataset
    anno_coco: path to .json file – annotation export from coco-annotator

    raises:
    KeyError: an annotation refers to an image or category that is unknown in one of the files
    """
    # read as utf-8: the annotation files are written as utf-8 without ascii escaping
    with open(anno_meta, encoding='utf-8') as f:
        meta_json = json.load(f)
    with open(anno_coco, encoding='utf-8') as f:
        coco_json = json.load(f)

    # build lookup tables with file names and image ids
    key_fn_id_meta = {image["file_name"]: image["id"] for image in meta_json["images"]}
    key_id_fn_coco = {image["id"]: image["file_name"] for image in coco_json["images"]}
    # keep category id from meta_json
    # category name has to be the same
    key_name_cat_meta = {category["name"]: category["id"] for category in meta_json["categories"]}
    key_cat_name_coco = {category["id"]: category["name"] for category in coco_json["categories"]}

    annotations = []
    for anno in coco_json["annotations"]:
        file_name = key_id_fn_coco[anno["image_id"]]
        category_name = key_cat_name_coco[anno["category_id"]]

        # annotation items to keep
        annotation = {key: value for key, value in anno.items() if key not in ANNO_DELETE}

        # the annotation id is kept, image and category ids are replaced by the anno_meta values
        annotation["id"] = anno["id"]
        annotation["image_id"] = key_fn_id_meta[file_name]
        annotation["category_id"] = key_name_cat_meta[category_name]

        annotations.append(annotation)

    # merge and save
    final_anno = meta_json.copy()
    final_anno["annotations"] = annotations
    version = meta_json["info"]["version"]
    save_file = f'annotations_v{version}.json'
    save_path = Path(Path(anno_meta).parent, save_file)
    if save_path.is_file():
        print(f'Error: {save_file} already exists.')
    else:
        with open(save_path, 'w', encoding='utf-8') as f:
            json.dump(final_anno, f, ensure_ascii=False, indent=4)

            
# add_coco_annotations('/Users/john/Downloads/test/annotationMeta_v1.0.json', '/Users/john/Downloads/test/test-4.json')

# some useful functions

In [7]:
def remove_key(container:dict, key:str='segmentation'):
    """Drop dict key&value on all levels; e.g., segmentations in an annotation file.

    Dicts and lists are walked recursively and rebuilt; the input is not modified.

    args:
    container: nested structure of dicts/lists (e.g. a loaded annotation json)
    key: dict key to remove everywhere

    returns:
    copy of container without any entry named key; values that are neither dict nor list are returned as they are
    """
    if isinstance(container, dict):
        return {k: remove_key(v, key) for k, v in container.items() if k != key}
    if isinstance(container, list):
        return [remove_key(item, key) for item in container]
    return container


# example paths for remove_key(): annotation file to read and file to write the
# cleaned copy to (used by the commented-out lines below)
json_path = '/Users/john/git/_tools/coco-annotator/datasets/annotations_coco_person/coco_person_keypoints_val2017_segDel.json'
save_path = '/Users/john/git/_tools/coco-annotator/datasets/annotations_coco_person/coco_person_keypoints_val2017_segDel2.json'

# with open(json_path) as f:
#     b = remove_key(json.load(f))
# with open(save_path, 'w', encoding='utf-8') as f:
#     json.dump(b, f, ensure_ascii=False, indent=4)
