## Make your own face dataset - Notebook 1


#### **Designed by Joon Son Chung**

This script downloads images from Bing Image Search. At the time of writing, the API is free for up to 1,000 search queries per month.

Modify the following parameters, then click `Runtime > Run all`.

### **Section A1** - Import packages and set parameters
- Initialize the Colab instance.
- Mount Google Drive and set necessary paths.
- Make sure the `FOLDER` directory already exists in your Google Drive. It will not be created automatically.

In [None]:
from google.colab import drive
import os, glob, sys, numpy, cv2, random, requests, shutil, pdb, json, time

# mount Google Drive (force_remount refreshes the mount if the cell is re-run)
drive.mount('/content/drive', force_remount=True)

# path of the data directory relative to the home folder of Google Drive
GDRIVE_HOME = '/content/drive/MyDrive'
FOLDER      = 'face_dataset' # This is the directory where your files will be saved

# this is the folder to write to
data_dir        = os.path.join(GDRIVE_HOME,FOLDER)
# local scratch folder where images are downloaded before being archived
temp_path       = './downloaded_images'

# FOLDER is never created by this notebook, so stop early with a clear
# explanation if it is missing (or is a regular file rather than a folder)
if not os.path.isdir(data_dir):
  raise FileNotFoundError(
    "'{}' is not an existing directory: create the '{}' folder in your Google Drive first".format(data_dir, FOLDER)
  )

# max number of images per identity
max_results = 150

### **Task A2** - Set parameters below
- Set the **API key** and the **queries** (`words`).

In [None]:
# Bing Search V7 subscription key: paste yours between the quotes
############################################# TODO #############################################
API_KEY = ""
################################################################################################

# search queries, one entry per identity (the person's name)
############################################# TODO #############################################
words = []
################################################################################################

# report how many identities will be processed
print('We are going to search and download images for {} identities'.format(len(words)))

### **Section A3** - Search and download script
- This is the tool for searching and downloading from Bing.

In [None]:
def _remove_if_present(*paths):
  """Delete each of the given files that exists on disk."""
  for path in paths:
    if os.path.exists(path):
      os.remove(path)


def search_and_download(term,tgt_dir,API_KEY,MAX_RESULTS=250,GROUP_SIZE=50):
  """Search Bing Image Search for `term` and save the images it returns.

  Images are written to `tgt_dir/term` as `B<8-digit index><ext>`, each with a
  `B<8-digit index>.json` file holding the search result entry it came from.
  Only JPG/JPEG/PNG URLs are fetched, and any file that OpenCV cannot decode
  is deleted again.

  Args:
    term: search query; also used as the name of the output sub-directory.
    tgt_dir: parent directory under which `term/` is created.
    API_KEY: Bing Search V7 subscription key.
    MAX_RESULTS: upper bound on the number of search results to go through.
    GROUP_SIZE: number of results requested per API call.

  Returns:
    The number of images that were saved and could be decoded.

  Raises:
    requests.HTTPError: if a search request to the Bing API fails. Failures
      of individual image downloads are logged and skipped instead.
  """
  # local import so the function does not depend on the notebook's import cell
  from urllib.parse import urlparse

  # Saved at tgt_dir/term
  save_dir = os.path.join(tgt_dir,term)

  # Make directory if missing
  os.makedirs(save_dir, exist_ok=True)

  # Allowed extensions in CAPITALS
  allow_exts = ('.JPG','.JPEG','.PNG')

  URL = "https://api.bing.microsoft.com/v7.0/images/search"

  headers = {"Ocp-Apim-Subscription-Key" : API_KEY}
  params = {"q": term, "offset": 0, "count": GROUP_SIZE}

  # make the search
  print("[INFO] searching Bing API for '{}'".format(term))
  search = requests.get(URL, headers=headers, params=params, timeout=30)
  search.raise_for_status()

  # grab the results from the search, including the total number of estimated results returned by the Bing API
  results = search.json()
  first_page = results.get("value", [])
  estNumResults = min(results.get("totalEstimatedMatches", len(first_page)), MAX_RESULTS)
  print("[INFO] {} total results for '{}'".format(estNumResults, term))

  # initialize the total number of images downloaded thus far
  total = 0

  # loop over the estimated number of results in `GROUP_SIZE` groups
  for offset in range(0, estNumResults, GROUP_SIZE):

    # the initial search already returned the group at offset 0, so reuse it
    # instead of spending a second API query on the same page
    if offset > 0:
      print("[INFO] making request for group {}-{} of {}...".format(offset, offset + GROUP_SIZE, estNumResults))
      params["offset"] = offset
      search = requests.get(URL, headers=headers, params=params, timeout=30)
      search.raise_for_status()
      results = search.json()
    print("[INFO] saving images for group {}-{} of {}...".format(offset, offset + GROUP_SIZE, estNumResults))

    # never go through more than MAX_RESULTS results overall, even when
    # MAX_RESULTS is not a multiple of GROUP_SIZE
    group = results.get("value", [])[:MAX_RESULTS - offset]

    # loop over the results
    for v in group:

      url = v.get("contentUrl")
      if not url:
        continue

      # take the extension from the URL path only, so that a query string
      # (e.g. "photo.jpg?w=500") does not hide a valid extension
      ext = os.path.splitext(urlparse(url).path)[1]

      # if extension is not in a list, continue
      if ext.upper() not in allow_exts:
        print("[INFO] extension not allowed: {}".format(url))
        continue

      # determine the path to the output image
      stem = "B{}".format(str(total).zfill(8))
      save_file = os.path.join(save_dir, stem + ext)
      save_json = os.path.join(save_dir, stem + '.json')

      # try to download the image
      try:

        # make a request to download the image
        print("[INFO] fetching: {}".format(url))
        r = requests.get(url, timeout=30)
        r.raise_for_status()

        # write the image to disk
        with open(save_file, "wb") as f:
          f.write(r.content)
        with open(save_json, "w") as json_file:
          json.dump(v, json_file, indent=2)

      # catch network and file errors that prevent us from downloading the
      # image; KeyboardInterrupt is deliberately not caught so the cell can
      # still be stopped
      except (requests.RequestException, OSError):
        print("[INFO] skipping: {}".format(url))
        _remove_if_present(save_file, save_json)
        continue

      # try to load the image from disk
      image = cv2.imread(save_file)

      # if the image is `None`, then delete the image and the json file
      if image is None:
        print("[INFO] deleting: {}".format(save_file))
        _remove_if_present(save_file, save_json)
        continue

      # update the counter
      total += 1

  return total

### **Section A4** - Execute download script
- This part executes the download script and creates one zip file per identity.

In [None]:
# download the images of every identity and archive them to Google Drive
for word in words:
  # archive path without the '.zip' extension (make_archive appends it)
  zip_path = os.path.join(data_dir, 'B_{}'.format(word))

  # skip if zip file already exists
  if os.path.exists(zip_path+'.zip'):
    print('[INFO] skipping {} since zip file already exists'.format(word))
    continue

  # start from an empty scratch folder so that leftovers of an earlier,
  # interrupted run for this identity do not end up in the archive
  word_dir = os.path.join(temp_path,word)
  shutil.rmtree(word_dir, ignore_errors=True)

  # search and download
  search_and_download(word,temp_path,API_KEY,MAX_RESULTS=max_results)

  # make archive of the folder for this person
  shutil.make_archive(zip_path, 'zip', root_dir=word_dir)