# Selenium 구글 이미지 스크래핑

## Selenium 및 웹 드라이버 설치

In [None]:
!pip install Selenium
!apt-get update
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin

import sys
# Colab setup snippet: put the chromedriver location at the front of sys.path.
sys.path.insert(0, '/usr/lib/chromium-browser/chromedriver')

from selenium import webdriver

# Chrome flags for a display-less container: no GUI, no sandbox, and no
# reliance on /dev/shm for shared memory.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')

# Do not pass the driver path positionally: Selenium 3 already defaults to
# 'chromedriver' on PATH, and recent Selenium 4 releases reject a positional
# string here (the first positional parameter is now `options`).
wd = webdriver.Chrome(options=chrome_options)

Collecting Selenium
[?25l  Downloading https://files.pythonhosted.org/packages/80/d6/4294f0b4bce4de0abf13e17190289f9d0613b0a44e5dd6a7f5ca98459853/selenium-3.141.0-py2.py3-none-any.whl (904kB)
[K     |████████████████████████████████| 911kB 2.6MB/s 
Installing collected packages: Selenium
Successfully installed Selenium-3.141.0
Get:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/ InRelease [3,626 B]
Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Ign:3 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Get:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release [697 B]
Hit:5 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu bionic InRelease
Hit:6 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release.gpg [836 B]
Get:8 htt

## 필요 라이브러리 import

In [None]:
import os
import time
import socket

from urllib.request import urlretrieve
from urllib.error import HTTPError, URLError
from selenium.common.exceptions import ElementClickInterceptedException, NoSuchElementException, ElementNotInteractableException
from PIL import Image

## `scroll_down()`: 스크롤을 내리는 함수

In [None]:
def scroll_down(max_idle_rounds=3):
  """Scroll the page held by the global driver `wd` until nothing more loads.

  The page is scrolled to the bottom repeatedly. When the document height
  stops growing, the "more results" button is looked up: if it is visible it
  is clicked once and scrolling continues; once the height stalls again after
  that click, the function returns. If the button is missing, the function
  returns immediately.

  Args:
    max_idle_rounds: How many consecutive rounds the height may stay
      unchanged while the button exists but is hidden before giving up.
      Without this bound the loop would never terminate in that state.
  """
  scroll_count = 0
  idle_rounds = 0
  print("[scroll_down(): 스크롤 다운 시작]")

  last_height = wd.execute_script("return document.body.scrollHeight")
  after_click = False

  while True:
    print(f"[스크롤 다운: {scroll_count}]")
    wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    scroll_count += 1
    time.sleep(1)  # give lazily loaded thumbnails time to extend the page

    # Height of the document after this scroll step.
    new_height = wd.execute_script("return document.body.scrollHeight")

    if last_height != new_height:
      # The page grew, so keep scrolling.
      idle_rounds = 0
      last_height = new_height
      continue

    if after_click:
      # Height stalled again after the button was used: nothing left to load.
      break

    try:
      # find_element("xpath", ...) works on both Selenium 3 and 4;
      # find_element_by_xpath was removed in Selenium 4.
      more_button = wd.find_element(
          "xpath", '//*[@id="islmp"]/div/div/div/div/div[5]/input')
    except NoSuchElementException as e:
      print(e)
      break

    if more_button.is_displayed():
      more_button.click()
      after_click = True
      idle_rounds = 0
    else:
      # Button exists but is hidden; allow a few extra rounds in case the page
      # is still loading, then stop instead of looping forever.
      idle_rounds += 1
      if idle_rounds >= max_idle_rounds:
        break

## `click_and_save()`: 썸네일 이미지 선택 후 원본 이미지 저장

In [None]:
def click_and_save(dir_name, index, img, img_list_length):
  """Click a thumbnail and download the image shown in the preview panel.

  The file is stored in `dir_name` as `<scraped_count + 1>.png` or `.jpg`,
  and the global `scraped_count` is incremented after a successful download.

  Args:
    dir_name: Directory the image is written into.
    index: Zero-based position of the thumbnail (used for progress output).
    img: Thumbnail element to click.
    img_list_length: Total number of thumbnails (used for progress output).

  HTTPError and ElementClickInterceptedException are printed and swallowed;
  other exceptions (e.g. NoSuchElementException, URLError) propagate to the
  caller.
  """
  global scraped_count
  from urllib.parse import urlparse

  try:
    img.click()
    wd.implicitly_wait(3)
    # find_element("xpath", ...) works on both Selenium 3 and 4;
    # find_element_by_xpath was removed in Selenium 4.
    src = wd.find_element(
        "xpath",
        '//*[@id="Sva75c"]/div/div/div[3]/div[2]/c-wiz/div[1]/div[1]/div/div[2]/a/img'
    ).get_attribute('src')

    if not src:
      # No usable source URL; skip instead of failing on None.
      print(f"{index+1}/{img_list_length} 이미지 src 없음")
      return

    # Decide the extension from the URL path only, so query strings
    # ("...png?w=800") and upper-case extensions are classified correctly.
    # Inline data URIs carry the type in their prefix instead.
    extension = os.path.splitext(urlparse(src).path)[1].lower()
    is_png = extension == '.png' or src.startswith('data:image/png')
    suffix, label = ('.png', 'PNG') if is_png else ('.jpg', 'JPG')

    urlretrieve(src, os.path.join(dir_name, str(scraped_count + 1) + suffix))
    print(f"{index+1}/{img_list_length} {label} 이미지 저장")

    scraped_count += 1

  except HTTPError as e:
    print(e)

  except ElementClickInterceptedException as e:
    print(e)

## `scraping()`: 구글 이미지 스크래핑 시작

In [None]:
def scraping(dir_name, query):
  """Run a Google image search for `query` and save the results to `dir_name`.

  Uses the global driver `wd`, scrolls the result page with `scroll_down()`,
  then calls `click_and_save()` for every thumbnail found. A thumbnail whose
  click is intercepted or whose preview element is missing is retried once
  after scrolling 100px further down. The driver is always quit at the end,
  including when an exception escapes.

  Args:
    dir_name: Directory the downloaded images are written into.
    query: Search term; it is URL-encoded before being put into the URL.
  """
  from urllib.parse import quote_plus

  # Quote the query so spaces, '&', '#' etc. cannot corrupt the URL.
  # The original URL was missing the '&' between biw and bih.
  url = (
      f"https://www.google.com/search?q={quote_plus(query)}"
      "&tbm=isch&hl=ko&tbs=ko&tbs=isz%3Al&sa=X"
      "&ved=0CAEQpwVqFwoTCLDc_PCGi-sCFQAAAAABAC&biw=1522&bih=780"
  )

  try:
    wd.get(url)
    wd.maximize_window()

    scroll_down()

    # find_element(s)("xpath" / "css selector", ...) works on both Selenium 3
    # and 4; the find_element_by_* helpers were removed in Selenium 4.
    div = wd.find_element("xpath", '//*[@id="islrg"]/div[1]')
    img_list = div.find_elements("css selector", 'div.bRMDJf.islir > img')
    print(img_list)
    total = len(img_list)

    for index, img in enumerate(img_list):
      try:
        try:
          click_and_save(dir_name, index, img, total)
        except (ElementClickInterceptedException, NoSuchElementException) as e:
          print(e)
          # Nudge the page down and retry once. The original script string
          # lacked the closing parenthesis and was invalid JavaScript.
          wd.execute_script("window.scrollTo(0, window.scrollY + 100)")
          time.sleep(1)
          click_and_save(dir_name, index, img, total)
      except (ElementClickInterceptedException, NoSuchElementException,
              ConnectionResetError, URLError, socket.timeout,
              socket.gaierror) as e:
        # Also covers a failed retry: skip this image, keep the run alive.
        print(e)
      except ElementNotInteractableException as e:
        print(e)
        break

    try:
      print("[스크랩핑 종료 (성공률: %.2f%%)]" % (scraped_count / len(img_list) * 100.0))
    except ZeroDivisionError as e:
      # No thumbnails were found.
      print(e)
  finally:
    # Release the browser even if navigation or scraping raised.
    wd.quit()

## `filter_and_remove()`: 일정 해상도 이하이거나 손상된 이미지 제거

In [None]:
def filter_and_remove(dir_name, query, filter_size):
  """Delete images in `dir_name` that are too small or cannot be opened.

  A file is removed when both its width and its height are below
  `filter_size`, or when opening it as an image raises OSError. A summary
  line with the number of removed files and the global `scraped_count` is
  printed at the end.

  Args:
    dir_name: Directory whose files are inspected.
    query: Unused; kept so existing callers keep working.
    filter_size: Pixel threshold applied to both width and height.
  """
  filtered_count = 0

  for index, file_name in enumerate(os.listdir(dir_name)):
    file_path = os.path.join(dir_name, file_name)
    damaged = False
    too_small = False

    try:
      # The context manager closes the file handle for kept images too
      # (the original only closed it right before deleting).
      with Image.open(file_path) as img:
        too_small = img.width < filter_size and img.height < filter_size
    except OSError as e:
      print(e)
      damaged = True

    if not (damaged or too_small):
      continue

    try:
      os.remove(file_path)
    except OSError as e:
      # E.g. a sub-directory or a permission problem: report and move on
      # instead of aborting the whole pass.
      print(e)
      continue

    if too_small:
      print(f"{index} 이미지 제거")
    filtered_count += 1

  print(f"[이미지 제거 개수: {filtered_count}/{scraped_count}]")

In [None]:
# Give every new socket (including urlretrieve downloads) a 30-second timeout
# so one stalled download cannot hang the run.
socket.setdefaulttimeout(30)

path = "./"
query = input("검색어 입력: ").strip()
if not query:
  # An empty query would make dir_name "./" and let filter_and_remove()
  # delete images from the working directory itself.
  raise ValueError("검색어가 비어 있습니다.")

dir_name = os.path.join(path, query)
# exist_ok lets the cell be re-run for the same query without crashing.
os.makedirs(dir_name, exist_ok=True)
print(f"[{dir_name} 디렉토리 생성]")

# Create the browser only after the input is validated, so a rejected query
# does not leave a Chrome process behind. The driver path is not passed
# positionally; recent Selenium 4 releases reject a positional string here.
wd = webdriver.Chrome(options=chrome_options)
scraped_count = 0

scraping(dir_name, query)
filter_and_remove(dir_name, query, 400)

검색어 입력: forest
[./forest 디렉토리 생성]
[scroll_down(): 스크롤 다운 시작]
[스크롤 다운: 0]
[스크롤 다운: 1]
[스크롤 다운: 2]
[스크롤 다운: 3]
[스크롤 다운: 4]
[스크롤 다운: 5]
[스크롤 다운: 6]
[스크롤 다운: 7]
[스크롤 다운: 8]
[스크롤 다운: 9]
[스크롤 다운: 10]
[<selenium.webdriver.remote.webelement.WebElement (session="29317866cb0e9065be4bf7a4f416f703", element="106301db-a502-4949-929c-816e3aa26b02")>, <selenium.webdriver.remote.webelement.WebElement (session="29317866cb0e9065be4bf7a4f416f703", element="23098546-f629-4e03-8d6c-8b66323f8d13")>, <selenium.webdriver.remote.webelement.WebElement (session="29317866cb0e9065be4bf7a4f416f703", element="f34a25dd-4734-480f-8bb1-50e09f5c3190")>, <selenium.webdriver.remote.webelement.WebElement (session="29317866cb0e9065be4bf7a4f416f703", element="955c94a7-a00c-45fc-a0ef-4ab12ff51f8f")>, <selenium.webdriver.remote.webelement.WebElement (session="29317866cb0e9065be4bf7a4f416f703", element="3ccc057e-07f1-4456-8027-cb994133ed96")>, <selenium.webdriver.remote.webelement.WebElement (session="29317866cb0e9065be4bf7a4f4

In [None]:
!ls

forest	sample_data


In [None]:
!rmdir sea

rmdir: failed to remove 'sea': No such file or directory


In [None]:
!rmdir forest

rmdir: failed to remove 'forest': Directory not empty


## 이미지 확인

In [None]:
!ls forest

100.jpg  185.jpg  266.jpg  362.jpg  452.jpg  542.jpg  632.jpg  695.jpg	751.jpg
101.jpg  186.jpg  26.jpg   364.jpg  457.jpg  545.jpg  636.jpg  699.jpg	752.jpg
110.jpg  192.jpg  273.jpg  368.jpg  459.jpg  547.jpg  637.jpg  704.jpg	753.jpg
115.jpg  193.jpg  274.png  36.jpg   45.jpg   548.png  640.jpg  713.jpg	757.jpg
116.jpg  216.jpg  27.jpg   370.jpg  461.jpg  549.jpg  641.jpg  715.jpg	758.jpg
119.jpg  217.jpg  28.jpg   383.jpg  464.jpg  54.jpg   647.jpg  716.jpg	760.jpg
128.jpg  218.jpg  294.jpg  385.jpg  465.jpg  557.jpg  651.jpg  718.jpg	761.jpg
12.jpg	 220.jpg  296.jpg  399.jpg  466.jpg  55.jpg   656.jpg  719.jpg	762.jpg
140.jpg  221.jpg  298.jpg  402.jpg  470.jpg  561.jpg  657.jpg  720.jpg	763.jpg
145.jpg  225.jpg  299.jpg  410.jpg  475.jpg  566.jpg  659.jpg  722.png	80.jpg
146.jpg  226.jpg  301.jpg  411.jpg  476.jpg  569.jpg  660.jpg  723.jpg	85.jpg
149.jpg  227.jpg  30.jpg   417.jpg  47.jpg   580.jpg  665.jpg  724.jpg	87.jpg
14.jpg	 228.jpg  312.jpg  418.jpg  490.jpg  590.jpg  666

In [None]:
# Ask Colab to download one of the scraped images; the path is hard-coded to
# a file produced by the "forest" run above.
from google.colab import files
files.download('./forest/14.jpg')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>