In [9]:
import requests
import pandas as pd
import bs4
import PIL.Image as Image
import collections
import os
from multiprocessing import Pool
import tqdm.notebook as tqdm
from datetime import timedelta, date

In [10]:
#Loading flickr key
with open("private/flickr-id.txt") as f:
    key = f.readlines()[0][:-1]

In [11]:
#testing the connection:
res = requests.get(f"https://www.flickr.com/services/rest/?method=flickr.photos.search&api_key={key}&tags=polytechnique&per_page=50&page=1&extras=url_o,o_dims,original_format,tags")
assert res.status_code == 200

In [12]:
year = 2016

## Downloading meta data in CSV

In [13]:
def get_range_pictures(tag, date_range):
    '''
    Will try to download all the metadata of the images 
    with tag in the date range.
    Metadata corresponds to information such as url, 
    size, tags, license ...
    '''
    min_date,max_date = date_range
    total = None
    count = 0
    d = collections.defaultdict(list)
    while total is None or (count < total and count < 5):
        res = requests.get(f"https://www.flickr.com/services/rest/?method=flickr.photos.search&api_key={key}&tags={tag}&per_page=500&page={count}&min_upload_date={min_date}&max_upload_date={max_date}&sort=date-posted-desc&extras=url_o,o_dims,original_format,tags,license")
        soup = bs4.BeautifulSoup(res.text)
        #return(soup,res)
        gen_infos = soup.find("photos")
        if total is None:
            total = int(gen_infos["pages"])
        photos_info = soup.find_all("photo")
        for image_info in photos_info:
            if "id" not in image_info.attrs:
                print("weird")
                return pd.DataFrame()
            d["id"].append(image_info["id"])
            d["server"].append(image_info["server"])
            d["secret"].append(image_info["secret"])
            d["height_o"].append(image_info["height_o"] if "height_o" in image_info.attrs else None)
            d["width_o"].append(image_info["width_o"] if "width_o" in image_info.attrs else None)
            d["tags"].append(image_info["tags"])
            d["license"].append(image_info["license"] if "license" in image_info.attrs else None)
            d["title"].append(image_info["title"])
            d["url_o"].append(image_info["url_o"] if "url_o" in image_info.attrs else None)
            d["owner"].append(image_info["owner"])
        count += 1
    dataset = pd.DataFrame(d)
    print(f"Found {total} pages, with {dataset['id'].nunique()} unique pictures")
    return dataset

In [14]:
year_start = date(year,1,1)
date_ranges = [(year_start + timedelta(n), year_start + timedelta(n+1)) for n in range(365)]

In [15]:
res = []
total_images = set()

In [195]:
tag = "dog"
for r in tqdm.tqdm(date_ranges[150:]):
    new_images = get_range_pictures(tag, r)
    if "id" in new_images.columns:
        total_images.update(new_images['id'].unique())
    print(f"{len(total_images)} total images")
    res.append(new_images)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=215.0), HTML(value='')))

Found 2 pages, with 528 unique pictures
117913 total images
Found 2 pages, with 628 unique pictures
118541 total images
Found 3 pages, with 1124 unique pictures
119665 total images
Found 2 pages, with 638 unique pictures
120303 total images
Found 2 pages, with 613 unique pictures
120916 total images
Found 2 pages, with 534 unique pictures
121450 total images
Found 3 pages, with 1071 unique pictures
122521 total images
Found 3 pages, with 1111 unique pictures
123632 total images
Found 2 pages, with 653 unique pictures
124285 total images
Found 2 pages, with 656 unique pictures
124941 total images
Found 2 pages, with 609 unique pictures
125550 total images
Found 2 pages, with 595 unique pictures
126145 total images
Found 2 pages, with 554 unique pictures
126699 total images
Found 2 pages, with 630 unique pictures
127329 total images
Found 2 pages, with 653 unique pictures
127982 total images
Found 2 pages, with 615 unique pictures
128597 total images
Found 2 pages, with 610 unique pictur

Found 2 pages, with 555 unique pictures
222227 total images
Found 2 pages, with 545 unique pictures
222772 total images
Found 2 pages, with 583 unique pictures
223355 total images
Found 3 pages, with 1038 unique pictures
224393 total images
Found 3 pages, with 1139 unique pictures
225532 total images
Found 3 pages, with 969 unique pictures
226499 total images
Found 2 pages, with 655 unique pictures
227154 total images
Found 2 pages, with 546 unique pictures
227700 total images
Found 2 pages, with 670 unique pictures
228370 total images
Found 2 pages, with 581 unique pictures
228951 total images
Found 2 pages, with 639 unique pictures
229590 total images
Found 4 pages, with 1466 unique pictures
231056 total images
Found 2 pages, with 571 unique pictures
231627 total images
Found 2 pages, with 715 unique pictures
232342 total images
Found 2 pages, with 550 unique pictures
232892 total images
Found 2 pages, with 654 unique pictures
233546 total images
Found 2 pages, with 619 unique pictur

In [196]:
dataset_year = pd.concat(res)
dataset_year = dataset_year.drop_duplicates().reset_index(drop=True)

In [199]:
dataset_year.to_csv(f"data/metadata/dog_metadata_raw_{year}.csv")

## Downloading images

In [4]:
def download_image_1024(im_row, im_path):
    '''
    Download the image corresponding to the raw in 
    size 1024 and save it at im_path.
    '''
    server_id  = im_row["server"]
    im_id  = im_row["id"]
    secret  = im_row["secret"]
    size_suffix  = "b"
    im_res = requests.get(f"https://live.staticflickr.com/{server_id}/{im_id}_{secret}_{size_suffix}.jpg")
    if im_res.status_code == 200:
        with open(im_path , "wb") as f:
            f.write(im_res.content)
            #print("Done", im_id)
    else:
        print("error downloading", im_id)

def download(im_row, download_path=f"data/dogs-1024-{year}"):
    im_id  = im_row["id"]
    im_path = os.path.join(download_path, f"{im_id}.jpg")
    if not os.path.isfile(im_path):
        download_image_1024(im_row, im_path)
        
def download_dataset(dataset, process=8):
    '''
    Download all the image from the corresponding dataset.
    '''
    with Pool(process) as p:
        p.map(download, [ir[1] for ir in dataset.iterrows()])

In [4]:
dataset = pd.read_csv(f"data/metadata/dog_metadata_raw_{year}.csv", index_col=0)
print(len(dataset))

274800


In [6]:
download_dataset(dataset)

error downloading 18346833310
error downloading 18878548039
error downloading 19090184745
error downloading 18511270424
error downloading 18142639281
error downloading 17520943963
error downloading 17518650424
error downloading 17520674883
error downloading 18000527239
error downloading 19150560922
error downloading 17987099289
error downloading 17585654065
error downloading 18658117705
error downloading 18530181043
error downloading 17550393614
error downloading 18460243789
error downloading 16945652574
error downloading 19178402262
error downloading 18190687104
error downloading 19436982099
error downloading 19081092339
error downloading 18186594383
error downloading 18255298836
error downloading 19257973402
error downloading 18199685234
error downloading 17121800823
error downloading 19075797170
error downloading 18254151636
error downloading 19075593378
error downloading 18825846691
error downloading 18254538946
error downloading 17741007501
error downloading 18245931296
error down

## Head extractor model

In [19]:
import matplotlib.pyplot as plt

import os
import numpy as np

import tensorflow as tf

from object_detection.utils import label_map_util
from object_detection.utils import config_util
from object_detection.utils import visualization_utils as viz_utils
from object_detection.builders import model_builder

import utils

%matplotlib inline

In [9]:
"""
Restore the pretrained model
"""

dog_class_id = 1
num_classes = 1
label_id_offset = 1
category_index = {dog_class_id: {'id': dog_class_id, 'name': 'rubber_ducky'}}

tf.keras.backend.clear_session()
num_classes = 1
pipeline_config = '../models/research/object_detection/configs/tf2/ssd_resnet50_v1_fpn_640x640_coco17_tpu-8.config'
checkpoint_path = 'models/detection/checkpoints/dog-finetuned-checkpoint-2'
configs = config_util.get_configs_from_pipeline_file(pipeline_config)
model_config = configs['model']
model_config.ssd.num_classes = num_classes
model_config.ssd.freeze_batchnorm = True
detection_model = model_builder.build(model_config=model_config, is_training=False)
checkpoint = tf.train.Checkpoint(model=detection_model)
checkpoint.restore(checkpoint_path)
image, shapes = detection_model.preprocess(tf.zeros([1, 640, 640, 3]))
prediction_dict = detection_model.predict(image, shapes)
_ = detection_model.postprocess(prediction_dict, shapes)

In [10]:
def detect(input_tensor):
    '''
    Detection for an image using the trained model.
    '''
    preprocessed_image, shapes = detection_model.preprocess(input_tensor)
    prediction_dict = detection_model.predict(preprocessed_image, shapes)
    return detection_model.postprocess(prediction_dict, shapes)

In [11]:
def get_bounding_boxe(im_path):
    """
    Get the best matching bounding boxe for an image at path im_path
    """
    test_image_np = np.expand_dims(utils.load_image_into_numpy_array(im_path), axis=0)
    input_tensor = tf.convert_to_tensor(test_image_np, dtype=tf.float32)
    pred = detect(input_tensor)
    return pred['detection_boxes'][0,0].numpy(), pred['detection_scores'][0,0].numpy()

## Extract heads

In [5]:
import pickle

bounding_boxes = {}

In [24]:
"""
Will detect the dog-faces in all the images.
And makes a backup every 1000 images
"""

for im_id in tqdm.tqdm(dataset['id']):
    im_path = os.path.join(f'data/dogs-1024-{year}}', f"{im_id}.jpg")
    if not (len(bounding_boxes) -1) % 1000 :
        with open("tmp/bounding_boxes.pkl", 'wb') as f:  
            pickle.dump(bounding_boxes,f)
    if im_id not in bounding_boxes:
        if os.path.isfile(im_path):
            try:
                bounding_boxes[im_id] = get_bounding_boxe(im_path)
            except ValueError:
                print(f"Wrong image format for {im_id}")
            except OSError:
                print("OSError")
        elif im_id not in bounding_boxes :
            print(f'Could not find. {im_id}')

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=274800.0), HTML(value='')))

Wrong image format for 16168203792
Wrong image format for 16168694775
Wrong image format for 15547930933
Wrong image format for 16167466975
Wrong image format for 15981380059
Wrong image format for 15980349449
Wrong image format for 15558943814
Wrong image format for 16188149681
Wrong image format for 16004957577
Wrong image format for 15567336863
Wrong image format for 16188244045
Wrong image format for 16193901151
Wrong image format for 15574559684
Wrong image format for 16170525206
Wrong image format for 15575965853
Wrong image format for 16023906850
Wrong image format for 16025343017
Wrong image format for 16021264268
Wrong image format for 15598727033
Wrong image format for 15597158283
Wrong image format for 16032064998
Wrong image format for 16224342105
Wrong image format for 16232626192
Wrong image format for 16054947528
Wrong image format for 16242248825
Wrong image format for 15621770463
Wrong image format for 16246303792
Wrong image format for 16060492897
Wrong image format f

Wrong image format for 16644119707
Wrong image format for 16638696127
Wrong image format for 16845494432
Wrong image format for 16226367423
Wrong image format for 16238825083
Could not find. 16671745720
Wrong image format for 16236112444
Wrong image format for 16649973897
Wrong image format for 16668913708
Wrong image format for 16647683137
Wrong image format for 16667336600
Wrong image format for 16869541675
Wrong image format for 16248255463
Could not find. 16840566446
Wrong image format for 16866521805
Wrong image format for 16245437813
Wrong image format for 16680391459
Wrong image format for 16690763509
Wrong image format for 16254525614
Wrong image format for 16264149394
Wrong image format for 16890556695
Wrong image format for 16889109171
Wrong image format for 16680103317
Could not find. 16280152073
Wrong image format for 16896177122
Wrong image format for 16288611434
Could not find. 16721848179
Wrong image format for 16731368310
Wrong image format for 16296451874
Wrong image f

Wrong image format for 17440653943
Wrong image format for 17875505948
Wrong image format for 18034341516
Wrong image format for 17424429164
Wrong image format for 18085435196
Wrong image format for 17909148140
Wrong image format for 17902893499
Wrong image format for 17959751989
Wrong image format for 17518997664
Could not find. 18142639281
Could not find. 17520943963
Could not find. 17518650424
Could not find. 17520674883
Could not find. 18000527239
Wrong image format for 17995076070
Could not find. 17987099289
Could not find. 17550393614
Wrong image format for 17565940153
Wrong image format for 17543872004
Wrong image format for 17975140790
Wrong image format for 17975102099
Wrong image format for 17533562124
Wrong image format for 17626883653
Wrong image format for 17619003234
Wrong image format for 18241273685
Wrong image format for 18207127436
Wrong image format for 18285562262
Wrong image format for 18097868800
Wrong image format for 18286267781
Wrong image format for 18283606331

Wrong image format for 19357093094
Wrong image format for 19963332382
Wrong image format for 19391998414
Wrong image format for 19814248059
Wrong image format for 19999785095
Wrong image format for 19993063122
Could not find. 19846694328
Could not find. 19413644293
Could not find. 20033982685
Could not find. 19412925353
Could not find. 20025904042
Could not find. 19845121518
Wrong image format for 19845028759
Wrong image format for 19879898789
Wrong image format for 20093472385
Could not find. 20080646492
Wrong image format for 19892853658
Wrong image format for 19508359373
Wrong image format for 20115462242
Wrong image format for 19502481593
Wrong image format for 19499295193
Wrong image format for 19932184298
Wrong image format for 20122829046
Wrong image format for 19526836933
Wrong image format for 19536435323
Wrong image format for 20176644592
Could not find. 19557862694
Wrong image format for 20161872816
Wrong image format for 20152403976
Could not find. 19983733809
Wrong image f

Wrong image format for 21660691619
Wrong image format for 21847380975
Wrong image format for 21856658631
Wrong image format for 21815960096
Wrong image format for 21836334665
Could not find. 21823609226
Could not find. 21848507165
Wrong image format for 21845360886
Wrong image format for 21703241839
Wrong image format for 21714362679
Wrong image format for 21274697634
Wrong image format for 21708550089
Wrong image format for 22357238382
Wrong image format for 21766337159
Wrong image format for 21917643836
Wrong image format for 21752872930
Wrong image format for 21765358448
Wrong image format for 21766696659
Wrong image format for 21987811181
Wrong image format for 21974756682
Wrong image format for 21795973358
Wrong image format for 21980666235
Wrong image format for 21799125388
Wrong image format for 21962760542
Could not find. 21382288373
Wrong image format for 21964764752
Wrong image format for 21403869444
Wrong image format for 21998568986
Wrong image format for 21426731293
Wrong 

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Wrong image format for 22598044873
Wrong image format for 23225133825
Wrong image format for 23219373115
Wrong image format for 23217515075
Wrong image format for 22634189063
Wrong image format for 23266132755
Wrong image format for 22660353584
Wrong image format for 23172910742
Wrong image format for 22884880327
Wrong image format for 23197773971
Wrong image format for 22689402594
Could not find. 23222596852
Wrong image format for 22703078663
Wrong image format for 22701806174
Wrong image format for 23253196491
Wrong image format for 22701270374
Wrong image format for 23303476586
Wrong image format for 15990777814
Wrong image format for 23298908316
Wrong image format for 23067787790
Wrong image format for 22988206509
Wrong image format for 22727539924
Could not find. 23348992415
Wrong image format for 23264749491
Wrong image format for 23018589269
Wrong image format for 23386552395
Wrong image format for 23278131922
Wrong image format for 22743226924
Wrong image format for 22777359364

In [6]:
# with open(f"data/metadata/bounding_boxes_dog_{year}.pkl", 'rb') as f:
#     bounding_boxes = pickle.load(f)

## Cropping pictures

In [13]:
def get_face(boxe, im, aug_factor=0.1):
    '''
    return a square image of the face
    by cropping the input image.
    '''
    image_shape = im.size
    x_min =(np.round((boxe[1] - (boxe[3]-boxe[3])*aug_factor)*image_shape[0]))
    y_min =(np.round((boxe[0] - (boxe[2]-boxe[0])*aug_factor)*image_shape[1]))
    x_max =(np.round((boxe[3] + (boxe[3]-boxe[3])*aug_factor)*image_shape[0]))
    y_max =(np.round((boxe[2] + (boxe[2]-boxe[0])*aug_factor)*image_shape[1]))
    square_half = ((x_max - x_min) + (y_max - y_min))/4
    x_min2 = int((x_min+x_max) / 2 - square_half)
    x_max2 = int((x_min+x_max) / 2 + square_half)
    y_min2 = int((y_min+y_max) / 2 - square_half)
    y_max2 = int((y_min+y_max) / 2 + square_half)
    return im.crop((x_min2,y_min2,x_max2,y_max2))

In [14]:
too_small = []
results = {}

In [17]:
bd_boxes = bounding_boxes

In [20]:
"""
Crop all the images for which a face has been detected.
this part is quite fast compared to the prvious steps.
"""

class BadImage(Exception):
    pass

for im_id in tqdm.tqdm(dataset['id']):
    im_path = os.path.join(f'data/dogs-1024-{year}', f"{im_id}.jpg")
    try:
        if im_id not in bd_boxes:
            #print("No bd box")
            results[im_id] = "no_bd_box"
            raise BadImage   
        im = Image.open(im_path)
        proba = bd_boxes[im_id][1]
        if proba < 0.4:
            #print("Dog not found")
            results[im_id] = "dog_not_found"
            raise BadImage
        im = get_face(bd_boxes[im_id][0], im)
        h,w = im.size
        assert abs(h-w) < 3, f"{h} {w}"
        if h < 128:
            #print("too small")
            results[im_id] = "too_small"
            raise BadImage
        im_path = os.path.join(f'data/dogs-face-{year}', f"{im_id}.jpg")
        im.save(im_path)
        results[im_id] = "ok"
    except OSError:
        #print("Image not found")
        results[im_id] = "image_not_found"
    except BadImage:
        pass

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=174800.0), HTML(value='')))


