In [1]:
import os
import sys
import random
from google_images_download import google_images_download
from PIL import Image

In [3]:
def download_data(cat_info, out_dir, args):
    """Download images for every category via google_images_download.

    Args:
        cat_info: iterable of dicts; each needs a 'keyword' entry (the search
            term) and a 'name' entry (sub-directory name under ``out_dir``).
        out_dir: root directory; each category is downloaded into
            ``{out_dir}/{name}``.
        args: dict of extra options passed through to the downloader's
            ``download()`` call. It is left unmodified.
    """
    for ci in cat_info:
        # Build a per-category copy: writing "keywords"/"output_directory"
        # straight into `args` would leak the last category's values back
        # into the caller's dict.
        request = dict(args)
        request["keywords"] = ci['keyword']
        request["output_directory"] = f"{out_dir}/{ci['name']}"
        gid = google_images_download.googleimagesdownload()
        gid.download(request)

In [4]:
def mkdir_if_needed(path):
    """Create the directory ``path`` unless it already exists.

    Only the final path component is created (``os.mkdir``), so the parent
    directory has to exist already.

    Args:
        path: directory to create.

    Raises:
        FileExistsError: if ``path`` exists but is not a directory. The
            exception message names the offending path.
    """
    if not os.path.exists(path):
        print(f"Creating directory {path}")
        os.mkdir(path)
    elif not os.path.isdir(path):
        message = f'{path} already exists and is not a directory'
        print(message)
        # Carry the explanation in the exception too, so it is still
        # available when stdout is not being watched.
        raise FileExistsError(message)
        
def mkdir_and_parents(path):
    """Create ``path`` and every missing ancestor directory, top-down.

    The path is split on '/' and each successively longer prefix is handed
    to ``mkdir_if_needed``. Empty components (leading, trailing or doubled
    slashes) are skipped, so an absolute path such as '/a/b' yields '/a' and
    '/a/b' rather than '//a' and '//a/b'.

    Args:
        path: '/'-separated directory path, relative or absolute.

    Raises:
        ValueError: if ``path`` is empty.
    """
    if not path:
        raise ValueError("path must be a non-empty string")
    print("Path: {}".format(path))
    # Remember whether the path was absolute; the leading '/' is re-attached
    # to every prefix after the empty components are dropped.
    root = '/' if path.startswith('/') else ''
    components = [part for part in path.split('/') if part]
    for depth in range(1, len(components) + 1):
        subpath = root + '/'.join(components[:depth])
        print(f"Subpath: {subpath}")
        mkdir_if_needed(subpath)
        
    
def prepare_category_directories(new_dir, categories):
    """Create ``{new_dir}/{category name}`` (with parents) for each category.

    Args:
        new_dir: root directory under which the category directories go.
        categories: iterable of categories. Each one is either a dict with a
            'name' key (the layout the rest of this notebook uses for
            category info) or an object exposing a ``name`` attribute.
    """
    for cat in categories:
        # Dicts have no `.name` attribute, so look the name up by key for
        # them and fall back to attribute access for everything else.
        cat_name = cat['name'] if isinstance(cat, dict) else cat.name
        mkdir_and_parents(f'{new_dir}/{cat_name}')

In [5]:
def check_download_data(category_info, download_root):
    """Collect the downloaded files that Pillow can actually decode.

    For every category the tree ``{download_root}/{name}`` is walked
    recursively and each file is opened and fully loaded with PIL. Files
    that fail are reported on stdout and left out.

    Args:
        category_info: iterable of dicts, each with a 'name' entry naming the
            category's sub-directory under ``download_root``.
        download_root: directory the images were downloaded into.

    Returns:
        dict mapping each category name to the list of paths of its
        loadable images (empty list when nothing usable was found).
    """
    def is_valid_image(fp):
        try:
            # The context manager closes the underlying file even when
            # decoding fails part-way through.
            with Image.open(fp) as im:
                im.load()
        except Exception as e:
            print(f"Error loading {fp}: {e}")
            return False
        return True

    valid_images = {}
    for ci in category_info:
        name = ci['name']
        valid_images[name] = []
        category_root = "{}/{}".format(download_root, name)
        for dirpath, _dirnames, filenames in os.walk(category_root):
            for fn in filenames:
                fp = os.path.join(dirpath, fn)
                if is_valid_image(fp):
                    valid_images[name].append(fp)

    return valid_images

In [6]:
def choose_mode(modes, mode_split):
    """Pick one entry of ``modes`` at random, weighted by ``mode_split``.

    A single ``random.random()`` draw is scaled by the total weight and the
    first mode whose cumulative weight exceeds the draw is returned.

    Args:
        modes: candidate values, e.g. ['train', 'eval', 'test'].
        mode_split: one non-negative weight per mode; the weights do not
            need to sum to 1.

    Returns:
        The selected element of ``modes``.

    Raises:
        ValueError: if ``modes`` is empty, the two inputs differ in length,
            or the weights do not sum to a positive number. These cases
            would otherwise fall out of the loop and yield ``None``.
    """
    modes = list(modes)
    weights = list(mode_split)
    if not modes:
        raise ValueError("modes must not be empty")
    if len(modes) != len(weights):
        raise ValueError(
            f"got {len(modes)} modes but {len(weights)} weights")
    total = sum(weights)
    if total <= 0:
        raise ValueError("mode_split must sum to a positive number")

    draw = random.random() * total
    upper = 0
    for mode, weight in zip(modes, weights):
        upper += weight
        if draw < upper:
            return mode
    # Defensive fallback (not expected to be reached for non-negative
    # weights): never hand back None, use the last mode that has weight.
    return next(m for m, w in zip(reversed(modes), reversed(weights)) if w > 0)

In [7]:
def copy_image(src_path, dst_path):
    """Open the image at ``src_path`` with PIL and save it to ``dst_path``.

    The image is re-encoded by ``Image.save`` rather than copied byte for
    byte. Errors are reported on stdout instead of being raised, so one bad
    file does not stop a batch.

    Args:
        src_path: path of the image to read.
        dst_path: path to write the image to.

    Returns:
        True if the image was saved, False if opening or saving failed.
    """
    try:
        # Context manager so the source file handle is released even when
        # saving fails.
        with Image.open(src_path) as im:
            im.save(dst_path)
    except Exception as e:
        print(f"Error trying to save image. Src: {src_path}. Error: {str(e)}")
        return False
    return True

In [8]:
def copy_images(valid_images, problem_dir, data_split=(0.8, 0.1, 0.1)):
    """Distribute the images over train / eval / test directories.

    Every image is assigned to a mode by ``choose_mode``. Train and eval
    images are saved as ``{problem_dir}/{mode}/{category}/NNNN.jpg`` with a
    per-category, per-mode counter. Test images from all categories are
    pooled, shuffled and saved as ``{problem_dir}/test/NNNN.jpg``; their
    categories are recorded in ``{problem_dir}/test/test_ground_truth.txt``
    as tab-separated ``path<TAB>category`` lines.

    Args:
        valid_images: dict mapping category name to a list of image paths.
        problem_dir: root directory of the dataset to build.
        data_split: weights for the 'train', 'eval' and 'test' modes, in
            that order. Defaults to 80% / 10% / 10%.
    """
    data_modes = ['train', 'eval', 'test']
    split = list(data_split)
    # Test images are handled separately after the loop, so only the other
    # modes get per-category directories and counters.
    non_test_modes = [dm for dm in data_modes if dm != 'test']

    for dm in data_modes:
        mkdir_and_parents(f'{problem_dir}/{dm}')

    test_images = []

    for key in valid_images:
        cur_counts = {}
        for dm in non_test_modes:
            cur_counts[dm] = 0
            mkdir_and_parents(f'{problem_dir}/{dm}/{key}')
        for f in valid_images[key]:
            mode = choose_mode(data_modes, split)
            if mode == 'test':
                # Defer: test files are numbered only after shuffling.
                test_images.append((key, f))
            else:
                cur_counts[mode] += 1
                dest = f"{problem_dir}/{mode}/{key}/{cur_counts[mode]:04}.jpg"
                print(f"Copying {f} to {dest}")
                copy_image(f, dest)

    # Shuffle so the test file numbering does not reveal the category.
    random.shuffle(test_images)

    with open(f"{problem_dir}/test/test_ground_truth.txt", 'w') as of:
        for test_num, (key, src) in enumerate(test_images, start=1):
            dest = f"{problem_dir}/test/{test_num:04}.jpg"
            print(f"Copying {src} to {dest}")
            copy_image(src, dest)
            of.write("{}\t{}\n".format(dest, key))

            

In [9]:
def prepare(download_root, category_info, problem_dir, args):
    """Run the full pipeline: download, validate, then build the dataset.

    Steps, in order: make sure ``download_root`` exists, download the images
    of every category into it, keep only the files that load as images, and
    copy those into ``problem_dir``.

    Args:
        download_root: directory that receives the raw downloads.
        category_info: list of category dicts passed on to the download and
            validation steps.
        problem_dir: root directory of the dataset that gets built.
        args: extra options passed on to ``download_data``.
    """
    mkdir_and_parents(str(download_root))
    download_data(category_info, download_root, args)
    usable = check_download_data(category_info, download_root)
    copy_images(usable, problem_dir)

In [12]:
# Start from a clean slate: delete the dataset directory and the raw
# downloads left behind by any earlier run (shell commands via IPython `!`).
!rm -rf data/cities/
!rm -rf data/test_downloads

In [13]:
# Where things live: raw downloads and the finished dataset both sit under
# ROOT_DIR.
ROOT_DIR = './data'
problem_name = 'cities'
download_root = ROOT_DIR + '/test_downloads'
problem_dir = "{}/{}".format(ROOT_DIR, problem_name)

# One entry per class: 'keyword' is the search term, 'name' the directory
# name used for that class.
category_info = [
    {'keyword': 'Tokyo', 'name': 'tokyo'},
    {'keyword': 'New York', 'name': 'newyork'},
]

# Options handed through prepare() to the image downloader.
args = {"limit": 100, "type": "photo", "size": "medium", "format": "jpg"}

prepare(download_root, category_info, problem_dir, args)

Path: ./data/test_downloads
Subpath: .
Subpath: ./data
Subpath: ./data/test_downloads
Creating directory ./data/test_downloads

Item no.: 1 --> Item name = Tokyo
Evaluating...
Starting Download...
Completed Image ====> 1. n-tokyo-a-20180715-870x580.jpg
Completed Image ====> 2. https%3a%2f%2fs3-ap-northeast-1.amazonaws.com%2fpsh-ex-ftnikkei-3937bb4%2fimages%2f5%2f9%2f3%2f7%2f13677395-2-eng-gb%2fthumbnail_image-1.jpg
Completed Image ====> 3. n-tokyo-a-20171013-870x581.jpg
Completed Image ====> 4. hero_tokyo.jpg
Completed Image ====> 5. tokyo-main.jpg
Completed Image ====> 6. lek_office_photos_2017_0004_tokyo_0.jpg
Completed Image ====> 7. tokyo_best_student_cities.jpg
Completed Image ====> 8. 0441edc7-tokyo-banner--.jpg
Completed Image ====> 9. tokyo-earthquake-tokyo-earthquake-japan-tokyo-earthquake-latest-tokyo-earthquake-magnitude-tokyo-earthquake-usgs-tokyo-939132.jpg
Completed Image ====> 10. tokyo_780x520px.ash
Completed Image ====> 11. rcyhr61w_tokyo2.jpg
Completed Image ====> 12.

Completed Image ====> 39. 51s-yl80%2bwl.jpg
Completed Image ====> 40. new-york-2048866_960_720.jpg
Completed Image ====> 41. 08e99cb5_z.jpg
Completed Image ====> 42. times-square-new-york.jpg
Completed Image ====> 43. ficha_nyc.jpg
Completed Image ====> 44. view-of-new-york-brooklyn-bridge-night-slider-big-bus-tours-jan-2017.jpg
Completed Image ====> 45. new-york-ase.jpg
Completed Image ====> 46. 22998-1-ny-in-a-day.jpg
Completed Image ====> 47. ph1.jpg
Completed Image ====> 48. 2%e9%9a%8e%e5%bb%ba%e3%81%a6%e3%83%90%e3%82%b9%e3%81%ab%e3%82%88%e3%82%8b%e3%83%8b%e3%83%a5%e3%83%bc%e3%83%a8%e3%83%bc%e3%82%af%e5%b8%82%e3%82%ac%e3%82%a4%e3%83%89%e4%bb%98%e3%81%8d%e8%a6%b3%e5%85%89%e3%83%84%e3%82%a2%e3%83%bc-in-new-york-city-402398.jpg
Completed Image ====> 49. m_1510260284_588_3.jpg
Completed Image ====> 50. 121321730.jpg
Completed Image ====> 51. new-york-travel.jpg
Completed Image ====> 52. 2500px-skyline-new-york-city.jpg
Completed Image ====> 53. directly-from-ny.jpg
Completed Image ====

Copying ./data/test_downloads/tokyo/Tokyo/9. tokyo-earthquake-tokyo-earthquake-japan-tokyo-earthquake-latest-tokyo-earthquake-magnitude-tokyo-earthquake-usgs-tokyo-939132.jpg to ./data/cities/train/tokyo/0024.jpg
Copying ./data/test_downloads/tokyo/Tokyo/90. 1436325843043.jpg to ./data/cities/train/tokyo/0025.jpg
Copying ./data/test_downloads/tokyo/Tokyo/97. shop01.jpg to ./data/cities/train/tokyo/0026.jpg
Copying ./data/test_downloads/tokyo/Tokyo/3. n-tokyo-a-20171013-870x581.jpg to ./data/cities/train/tokyo/0027.jpg
Copying ./data/test_downloads/tokyo/Tokyo/35. ogp_ja.jpg to ./data/cities/train/tokyo/0028.jpg
Copying ./data/test_downloads/tokyo/Tokyo/69. tokyo-tower-69466.jpg to ./data/cities/train/tokyo/0029.jpg
Copying ./data/test_downloads/tokyo/Tokyo/10. tokyo_780x520px.ash to ./data/cities/train/tokyo/0030.jpg
Copying ./data/test_downloads/tokyo/Tokyo/4. hero_tokyo.jpg to ./data/cities/train/tokyo/0031.jpg
Copying ./data/test_downloads/tokyo/Tokyo/74. movie_thumb.jpg to ./data/c

Copying ./data/test_downloads/newyork/New York/95. time-square-new-york-city-picture-id487537456?b=1&k=6&m=487537456&s=612x612&w=0&h=zumik3gqgmj3ynctej4-tm0dem22xo0onhhxk_ff03u=.jpg to ./data/cities/train/newyork/0007.jpg
Copying ./data/test_downloads/newyork/New York/29. zero-newyork_mv.jpg to ./data/cities/train/newyork/0008.jpg
Copying ./data/test_downloads/newyork/New York/93. newyorkcity-web1024x768-02-36d4b322.jpg to ./data/cities/train/newyork/0009.jpg
Copying ./data/test_downloads/newyork/New York/62. prices_01.jpg to ./data/cities/eval/newyork/0002.jpg
Copying ./data/test_downloads/newyork/New York/36. newyork.jpg to ./data/cities/train/newyork/0010.jpg
Copying ./data/test_downloads/newyork/New York/32. new-york-aerial.jpg to ./data/cities/eval/newyork/0003.jpg
Copying ./data/test_downloads/newyork/New York/84. img_roppongi.jpg to ./data/cities/train/newyork/0011.jpg
Copying ./data/test_downloads/newyork/New York/47. ph1.jpg to ./data/cities/train/newyork/0012.jpg
Copying ./da

Copying ./data/test_downloads/newyork/New York/8. pexels-photo-450597.jpeg to ./data/cities/train/newyork/0071.jpg
Copying ./data/test_downloads/newyork/New York/54. new_york_destination-images_673x263.jpg to ./data/cities/train/newyork/0072.jpg
Copying ./data/test_downloads/newyork/New York/6. 27007567.jpg to ./data/cities/train/newyork/0073.jpg
Copying ./data/test_downloads/newyork/New York/21. new-york-2017-columbus-circle-day?wid=720&hei=1080&fmt=jpeg&qlt=75,0&op_sharpen=0&resmode=sharp2&op_usm=0.8,0.8,5,0&iccembed=0&printres=72&fit=crop.jpg to ./data/cities/train/newyork/0074.jpg
Copying ./data/test_downloads/newyork/New York/31. ilsu_dest_nyc_stage_desktop.jpg to ./data/cities/train/newyork/0075.jpg
Copying ./data/test_downloads/newyork/New York/13. image.jpg to ./data/cities/train/newyork/0076.jpg
Copying ./data/test_downloads/newyork/New York/53. directly-from-ny.jpg to ./data/cities/train/newyork/0077.jpg
Copying ./data/test_downloads/newyork/New York/94. ny2_3169807a-large.jp