# Google Image Downloader

By [Jeff Ocampo](http://www.JOcampo.com)

__Purpose:__ Automate the download of images from Google to support other analyses. 

__Background:__ I originally used this method to support some image recognition work I was doing with CNNs. I decided to formalize this part to make it a bit easier to get pictures to work with. 

__Parameters:__ (1) Search term (2) number of images to retrieve (3) base directory

__Returns:__ A folder in the base directory that contains (1) an Excel workbook listing the files and (2) another folder with the images

__Other things of note:__ (1) Only works for .jpg files. (2) Sometimes the query returns extra files that aren't .jpg files, so the process ignores any file it can't work with. 

</br>

## User Set Parameters
</br>

In [17]:
# Keyword sent to the image search; also used to name the results folder,
# the renamed image files and the Excel sheet.
search_term = 'xray'
# Passed to the downloader as its "limit" argument.
number_of_images = 1000
# Root folder for the temp and results folders. Paths are built by plain string
# concatenation, so this must end with a '/'.
base_directory = 'd:/projects/python/Google_Image_Downloader/'

## Processing
</br>

__Import packages__

In [18]:
import os
import sys
from google_images_download import google_images_download
import pandas as pd
import numpy as np
import shutil 

__Define temp variables__

In [19]:
# Working area: holds the raw downloads and the captured downloader log.
temp_folder = '{}temp/'.format(base_directory)
log_file = '{}log.txt'.format(temp_folder)

# Output area: one folder per search term, containing the images and the
# Excel workbook that lists them. Every folder path keeps its trailing '/'
# because later cells append to these strings directly.
results_folder = '{}{}_results/'.format(base_directory, search_term)
results_image_folder = '{}images/'.format(results_folder)
excel_list = '{}file_list.xlsx'.format(results_folder)

__Create folder locations__

In [20]:
# Create the working and output folders. exist_ok=True makes re-running the
# notebook safe and avoids the check-then-create race of testing
# os.path.exists() before calling os.makedirs().
for folder in (temp_folder, results_folder, results_image_folder):
    os.makedirs(folder, exist_ok=True)

__Retrieve images from Google__

In [21]:
from contextlib import redirect_stdout

# Run the download from inside the temp folder; the file-processing cell
# expects the images under temp_folder + 'downloads/<search_term>/'.
os.chdir(temp_folder)

arguments = {
    "keywords": search_term,
    "limit": number_of_images,
    # print_urls makes the downloader print each image URL; that output is
    # captured in the log below and parsed by the next cell.
    "print_urls": True,
    # NOTE(review): the library's documented option appears to be lowercase
    # "type"; confirm that this capitalised key is actually honoured.
    "Type": "photo",
    "format": "jpg",
    "chromedriver": "C:/Program Files (x86)/Google/Chrome/Application/chromedriver.exe",
}

# Capture everything the downloader prints into the log file.
# redirect_stdout restores whatever sys.stdout was before (inside Jupyter that
# is the kernel's stream, not sys.__stdout__) and, together with the `with`
# on the file, guarantees the log is closed even if the download raises.
with open(log_file, 'w') as log, redirect_stdout(log):
    response = google_images_download.googleimagesdownload()
    paths = response.download(arguments)

__Process files returned__

In [22]:
# Read the captured downloader output; the context manager closes the file
# even if reading fails.
with open(log_file, 'r') as search_log_file:
    lines = search_log_file.read().split('\n')

search_url = []
search_jpg_name = []

# A successful download shows up as two consecutive lines: one starting with
# "Image" (the URL) immediately followed by one starting with "Compl" (the
# saved file name). Walk the log as adjacent pairs and keep only those.
for line1, line2 in zip(lines, lines[1:]):
    if line1[:5].lower() == 'image' and line2[:5].lower() == 'compl':
        search_url.append(line1.replace('Image URL: ', '').replace("'", ""))
        search_jpg_name.append(line2.replace('Completed Image ====> ', '').replace("'", ""))

# Lookup table: downloaded file name -> source URL.
dl_search_list = pd.DataFrame({'jpg_name': search_jpg_name, 'url': search_url})
dl_search_list.set_index('jpg_name', inplace=True)

In [23]:
download_path = temp_folder + 'downloads/' + search_term + '/'

# Only .jpg files are processed; anything else the query returned is ignored.
search_files = [fn for fn in os.listdir(download_path) if fn.upper().endswith('.JPG')]

# Plain dict for the name -> URL lookup so that a file with no matching log
# entry yields an empty URL instead of raising KeyError part-way through the
# moves (which would leave the images half-renamed).
url_by_name = dict(zip(dl_search_list.index, dl_search_list['url']))

file_names = []
original_names = []
urls = []

for i, original_name in enumerate(search_files):
    # Images are renamed to <search_term>_<n>.JPG and moved to the results folder.
    file_name = search_term + '_' + str(i)
    os.rename(download_path + original_name, results_image_folder + file_name + '.JPG')
    file_names.append(file_name)
    original_names.append(original_name)
    urls.append(url_by_name.get(original_name, ''))

final_list = pd.DataFrame({'file_name': file_names, 'jpg_name': original_names, 'url': urls})
final_list.set_index('file_name', inplace=True)

# The context manager saves and closes the workbook exactly once;
# ExcelWriter.save() no longer exists in pandas 2.x.
with pd.ExcelWriter(excel_list) as writer:
    final_list.to_excel(writer, sheet_name=search_term)

__Cleaning__

In [24]:
# Leave the temp tree before deleting it: the retrieval cell changed the
# working directory into temp_folder, and on Windows a directory that is the
# process's current working directory cannot be removed.
os.chdir(base_directory)

downloads_folder = temp_folder + 'downloads/'
term_download_folder = downloads_folder + search_term + '/'

# Remove whatever is left of this search's raw downloads (non-.jpg files and
# any sub-folders included).
if os.path.isdir(term_download_folder):
    shutil.rmtree(term_download_folder)

if os.path.exists(temp_folder + 'log.txt'):
    os.remove(temp_folder + 'log.txt')

# Remove the now-empty parent folders, innermost first. A folder that still
# holds other content is left in place rather than raising.
for folder in (downloads_folder, temp_folder):
    if os.path.isdir(folder) and not os.listdir(folder):
        os.rmdir(folder)


