# Download data from urls

Get data from urls obtained in *datascraping.ipynb*.

## Concepts from paper

Concepts from Table 1 in [Measuring Semantic Similarity between Concepts in Visual Domain](http://ieeexplore.ieee.org/document/4665152/).  
Concepts correspond to the categories in our paper.

In [None]:
# Concept (category) names to download. Each name is used verbatim to build
# the "urls/<name>.txt" input path and the "data/<name>" output directory.
concepts = [
    "bay",
    "beach",
    "birds",
    "boeing",
    "buildings",
    "city",
    "clouds",
    "face",
    "f-16",
    "helicopter",
    "mountain",
    "sky",
    "ships",
    "sunset",
    "sunrise",
    "ocean"
]

## Download images using obtained list

Define several functions.

In [None]:
def read_urls(cat):
    """Return the lines of ``urls/<cat>.txt`` as a list of URL strings.

    The file is read as UTF-8 and the trailing newline of each line is
    dropped. Nothing else is stripped or filtered, so a blank line yields
    an empty string in the result.
    """
    url_file = "urls/{}.txt".format(cat)
    collected = []
    with open(url_file, encoding='utf8') as handle:
        for raw_line in handle:
            # Keep only the text that precedes the line terminator.
            collected.append(raw_line.partition('\n')[0])
    return collected

In [None]:
from concurrent import futures
import urllib.request

In [None]:
# Content-Type subtypes that are saved; the subtype doubles as the file extension.
accepted = {"jpeg", "png", "gif"}


def get_one(dirname, urlidx, urls):
    """Download ``urls[urlidx]`` into *dirname* if its content type is accepted.

    Args:
        dirname: Existing directory the file is written to.
        urlidx: Index into *urls*; also used as the zero-padded file name.
        urls: Sequence of URL strings.

    Returns:
        True when the file was saved as ``<dirname>/<urlidx:04d>.<subtype>``,
        False when the response's Content-Type subtype is not in ``accepted``.

    Raises:
        Whatever ``urllib.request.urlopen`` or reading the response raises
        (e.g. ``urllib.error.URLError``, ``http.client.HTTPException``,
        ``OSError``, ``ValueError`` for a malformed URL).
    """
    url = urls[urlidx]
    with urllib.request.urlopen(url) as response:
        ext = response.info().get_content_subtype()
        if ext not in accepted:
            return False
        # Save the body of the response that was just inspected instead of
        # requesting the URL a second time: one round trip per image, and the
        # bytes on disk are guaranteed to match the Content-Type checked above.
        payload = response.read()
    # Written only after the whole body arrived, so a failed transfer leaves
    # no partial file behind.
    with open("{}/{:04d}.{}".format(dirname, urlidx, ext), "wb") as out:
        out.write(payload)
    return True

In [None]:
# Number of worker threads handed to ThreadPoolExecutor by the download helpers.
MAX_WORKERS = 10

In [None]:
def download_many_no_error_handle(urls, dirname):
    """Fetch every URL in *urls* into *dirname* on a thread pool.

    No exception is handled here: the first error raised by a download
    surfaces when the results are consumed. Prints "done" once every
    result has been consumed.
    """
    def fetch(position):
        # Bind the shared arguments so the pool only has to supply the index.
        return get_one(dirname, position, urls)

    with futures.ThreadPoolExecutor(MAX_WORKERS) as pool:
        outcomes = pool.map(fetch, range(len(urls)))
    # Drain the lazy result iterator; a worker's exception is re-raised here.
    for _ in outcomes:
        pass
    print("done")

In [None]:
import tqdm
from  http.client import RemoteDisconnected
from http.client import HTTPException
import ssl

def download_many(urls, dirname):
    """Download every URL in *urls* into *dirname* concurrently.

    Each URL is passed to ``get_one`` on a pool of ``MAX_WORKERS`` threads and
    a tqdm progress bar advances as downloads complete.

    Args:
        urls: Sequence of URL strings.
        dirname: Directory the files are written to.

    Returns:
        list: URLs that were not saved, either because ``get_one`` returned a
        falsy result or because it raised one of the handled errors. The
        order follows completion, not the order of *urls*.
    """
    to_do_map = {}
    fails = []
    # Reporting hook for failures; currently a no-op.
    notify_err = lambda msg: None

    with futures.ThreadPoolExecutor(MAX_WORKERS) as executor:
        for i in range(len(urls)):
            future = executor.submit(get_one, dirname, i, urls)
            # Remember which URL index each future belongs to.
            to_do_map[future] = i

        done_iter = tqdm.tqdm(futures.as_completed(to_do_map), total=len(urls))

        for future in done_iter:
            idx = to_do_map[future]
            try:
                res = future.result()
                if not res:
                    notify_err("Unknown mime type: {}".format(urls[idx]))
                    fails.append(urls[idx])
            # make exception handling separately for debug purpose (now we can merge, but not yet)
            # Each class is its own tuple element: writing `A or B` inside the
            # tuple evaluates to just `A`, which would let plain OSErrors
            # (connection reset, timeout) escape and abort the whole batch.
            # ValueError covers malformed entries such as an empty line.
            except (urllib.error.HTTPError, urllib.error.URLError,
                    RemoteDisconnected, ssl.CertificateError, OSError,
                    UnicodeEncodeError, ValueError):
                notify_err("urllib retrieve rrorr: {}".format(urls[idx]))
                fails.append(urls[idx])
            except HTTPException:
                notify_err("HTTPException rrorr: {}".format(urls[idx]))
                fails.append(urls[idx])

    return fails

In [None]:
def download_one_category(cat):
    """Download all images of category *cat* into ``data/<cat>``.

    The URL list comes from ``read_urls(cat)`` and the target directory is
    created if it does not exist yet.

    Returns:
        list: the URLs that failed, as reported by ``download_many``.
    """
    # Local import: the notebook's module-level `import os` lives in a later cell.
    import os

    urls = read_urls(cat)
    dirname = "data/{}".format(cat)

    # Create the directory from Python rather than through a shell escape so
    # the function also works outside IPython and with names that contain
    # spaces or shell metacharacters.
    os.makedirs(dirname, exist_ok=True)
    return download_many(urls, dirname)

### Execute download

In [None]:
# Run the download for every concept, in list order; the per-concept results
# are collected into a list that becomes the value of this expression.
[download_one_category(concept) for concept in concepts]

### Cleanup corrupted images

Move files that cannot be loaded out of the dataset and into the `corrupted` directory.

In [None]:
from PIL import Image
import glob
import shutil
import os
from keras.preprocessing.image import img_to_array, load_img
# Root directory that unreadable files are moved under (see move_to_broken).
BROKEN = 'corrupted'

In [None]:
def move_to_broken(f):
    """Relocate file *f* under the ``BROKEN`` directory and report it.

    The directory part of *f* is recreated below ``BROKEN``; the file is
    copied there, the original is deleted, and a message is printed.
    """
    quarantine_dir = os.path.dirname(os.path.join(BROKEN, f))
    if not os.path.isdir(quarantine_dir):
        os.makedirs(quarantine_dir)
    # Copy first and delete afterwards, so the original is only removed once
    # the copy has succeeded.
    shutil.copy(f, quarantine_dir)
    os.remove(f)
    print("{} is broken, move it.".format(f))

In [None]:
# Root of the downloaded images, and the size each image is loaded at for the check.
TARGET = "data"
TARGET_SIZE = (256, 256)

os.makedirs(BROKEN, exist_ok=True)

# Try to load every path with an extension below TARGET; anything that fails
# to load is moved out of the dataset.
for f in glob.iglob("{}/**/*.*".format(TARGET), recursive=True):
    try:
        _ = load_img(f, grayscale=False,
                     target_size=TARGET_SIZE)
    except Exception:
        # `Exception`, not a bare `except:`, so that KeyboardInterrupt and
        # SystemExit stop the scan instead of flagging a healthy file as
        # broken and carrying on.
        move_to_broken(f)