In [1]:
import os

# Create the directory that receives the downloaded images.
# exist_ok=True keeps this cell re-runnable: os.mkdir would raise
# FileExistsError as soon as the notebook is executed a second time.
os.makedirs('output', exist_ok=True)

In [2]:
import urllib.request
from urllib.error import HTTPError, URLError
from http.client import HTTPException
from socket import timeout
from random import choice, shuffle
from PIL import Image, ImageFile
from PIL import UnidentifiedImageError
ImageFile.LOAD_TRUNCATED_IMAGES = True
from io import BytesIO
from ssl import CertificateError
import re
# Absolute-URL pattern, assembled from its parts: scheme, host, optional port,
# optional path/query. The whole string must match (anchored with ^ and $).
_URL_PATTERN = ''.join([
    r'^(?:http|ftp)s?://',  # scheme: http, https, ftp or ftps
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|',  # dotted domain name, or ...
    r'localhost|',  # ... the literal host "localhost", or ...
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})',  # ... a dotted-quad IPv4 address
    r'(?::\d+)?',  # optional ":port"
    r'(?:/?|[/?]\S+)$',  # optional path / query without whitespace
])
regex = re.compile(_URL_PATTERN, flags=re.IGNORECASE)

class ImageNet():

  def __init__(self):
    with urllib.request.urlopen('http://www.image-net.org/api/text/imagenet.synset.obtain_synset_list') as response:
      self._wnid_list = response.read().decode('UTF-8').split('\n')
      self._wnid_list.remove('n07749582') # Remove Imagenet lemons !
      self._downloaded = set()

  def get_random_image(self):
    while True:
      wnid = choice(self._wnid_list)
      with urllib.request.urlopen(f'http://www.image-net.org/api/text/imagenet.synset.geturls?wnid={wnid}') as response:
        image_list = response.read().decode('UTF-8').split('\n')

      shuffle(image_list)
      for image in image_list:
        if image=='':
          continue
        if image in self._downloaded:
          continue
        self._downloaded.add(image)
        if re.match(regex,image image):
          continue
        try:
          with urllib.request.urlopen(image, timeout=0.5) as response:
            out = Image.open(BytesIO(response.read()))
            if out.size[0] < 100 or out.size[1] < 100:
              raise ValueError('Image too small')
            return out
        except (HTTPError, URLError, timeout, UnidentifiedImageError, CertificateError, ConnectionError, UnicodeEncodeError, ValueError, OSError, HTTPException):
          pass

In [3]:
# Build the sampler. ImageNet.__init__ fetches the synset id list over HTTP,
# so this cell needs network access.
image_net = ImageNet()

In [4]:
from tqdm import tqdm

# Number of images to collect; files are written as output/0.jpg ... output/{N_IMAGES-1}.jpg.
N_IMAGES = 5000

for i in tqdm(range(N_IMAGES)):
  target = f'output/{i}.jpg'
  # The full run takes hours: skip images saved by an earlier, interrupted
  # run so that re-executing this cell resumes instead of starting over.
  if os.path.exists(target):
    continue
  # Write to a temporary name and rename afterwards, so an interruption in
  # the middle of save() never leaves a truncated .jpg that would be skipped.
  partial = f'{target}.part'
  image_net.get_random_image().convert('RGB').save(partial, format='JPEG')
  os.replace(partial, target)

100%|██████████| 5000/5000 [3:38:18<00:00,  2.62s/it]


In [5]:
from shutil import make_archive, copyfile

# Where the zipped image set ends up (path relative to the working directory).
ARCHIVE_DESTINATION = 'drive/MyDrive/Colab Notebooks/SIGNATEHiroshimaLemon/data/imagenet.zip'

# Zip the contents of output/ into imagenet.zip in the working directory ...
make_archive(base_name='imagenet', format='zip', root_dir='output')
# ... then copy the archive to its destination; copyfile returns the
# destination path, which is what the cell displays.
copyfile(src='imagenet.zip', dst=ARCHIVE_DESTINATION)

'drive/MyDrive/Colab Notebooks/SIGNATEHiroshimaLemon/data/imagenet.zip'