In [12]:
import os
import time
from concurrent.futures import ThreadPoolExecutor, as_completed, TimeoutError
from urllib.parse import quote_plus

import pandas as pd
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By

class Crawler_google_images:
    """Download Google Images search results for one keyword via headless Chrome.

    Images are written to ``./images_/<index>_<keyword>/`` (spaces in the
    keyword replaced by underscores) as ``0.jpg``, ``1.jpg``, ...
    """

    # Number of consecutive scroll steps that may yield no new candidate URL
    # before download_images() gives up instead of scrolling forever.
    MAX_IDLE_SCROLLS = 5

    def __init__(self, index, keyword):
        """Store the search keyword and its numeric index, and build the search URL.

        Args:
            index: Number used as the prefix of the output directory name.
            keyword: Search phrase; it is URL-encoded so characters such as
                ``&`` or ``#`` cannot corrupt the query string.
        """
        self.keyword = keyword
        self.url = 'https://www.google.com.hk/search?q=' + quote_plus(keyword) + '&tbm=isch'
        self.index = index

    def init_browser(self):
        """Start headless Chrome, open the search URL and return the driver.

        Raises:
            Exception: whatever Selenium raises while starting Chrome or
                loading the page; the driver is shut down before re-raising.
        """
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument("--disable-infobars")
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--disable-gpu")
        # Chrome expects "width,height" for this switch.
        chrome_options.add_argument("--window-size=1920,1080")
        browser = webdriver.Chrome(options=chrome_options)
        try:
            # Maximize so that as many thumbnails as possible are rendered.
            browser.maximize_window()
            browser.get(self.url)
        except Exception:
            # Do not leak the Chrome/chromedriver processes on a failed start.
            browser.quit()
            raise
        return browser

    def download_images(self, browser, num_images=5):
        """Scroll the result page and save up to ``num_images`` thumbnails.

        Args:
            browser: Selenium driver already showing the search results.
            num_images: Maximum number of images to save.

        Returns:
            The number of images actually saved. This can be smaller than
            ``num_images`` when the page stops producing new candidates.
        """
        picpath = f'./images_/{str(self.index) + "_" + self.keyword.replace(" ", "_")}'
        os.makedirs(picpath, exist_ok=True)

        seen_urls = set()  # URLs already attempted, to avoid duplicates
        count = 0  # images saved so far; also the next file number
        pos = 0  # current vertical scroll offset in pixels
        idle_scrolls = 0

        while count < num_images and idle_scrolls < self.MAX_IDLE_SCROLLS:
            pos += 500
            browser.execute_script('var q=document.documentElement.scrollTop=' + str(pos))
            time.sleep(1)

            found_new = False
            for img_element in browser.find_elements(By.TAG_NAME, 'img'):
                if count >= num_images:
                    break
                img_url = img_element.get_attribute('src')
                # Skip missing sources and very long values (inline data, not
                # real image URLs).
                if not isinstance(img_url, str) or len(img_url) > 200:
                    continue
                # Filter out Google's own icons.
                if 'images' not in img_url:
                    continue
                # Each pass re-reads the whole page, so the same URL recurs.
                if img_url in seen_urls:
                    continue
                seen_urls.add(img_url)
                found_new = True

                filename = picpath + "/" + str(count) + ".jpg"
                try:
                    r = requests.get(img_url, timeout=10)
                    # Do not store an HTTP error page as if it were an image.
                    r.raise_for_status()
                    with open(filename, 'wb') as f:
                        f.write(r.content)
                except (requests.RequestException, OSError) as e:
                    print(f'Failed to download image {img_url}: {e}')
                    continue

                count += 1
                print(f'This is {count} image for {self.keyword}')
                # Small pause between downloads to reduce the risk of blocking.
                time.sleep(0.2)

            idle_scrolls = 0 if found_new else idle_scrolls + 1

        return count

    def run(self, num_images=5):
        """Open the browser, download the images and always release the browser.

        Errors are reported on stdout rather than raised. The success message
        is printed only when no error occurred.
        """
        browser = None
        succeeded = False
        try:
            browser = self.init_browser()
            self.download_images(browser, num_images)
            succeeded = True
        except Exception as e:
            print(f"Error occurred for keyword '{self.keyword}': {e}")
        finally:
            if browser is not None:
                # quit() ends the whole driver session; close() would only
                # close the window and leave chromedriver running.
                browser.quit()
            if succeeded:
                print(f"Images for {self.keyword} downloaded successfully.")

def download_images_for_subject(index, subject, num_images_to_download, timeout):
    """Crawl images for one subject, waiting at most ``timeout`` seconds.

    Args:
        index: Number used as the prefix of the subject's output directory.
        subject: Search keyword.
        num_images_to_download: Maximum number of images to fetch.
        timeout: Seconds to wait for the crawl before moving on.

    The crawl runs in a single worker thread. Python threads cannot be
    killed, so after a timeout the worker is left to finish in the
    background while this function returns.
    """
    print(f"Downloading images for subject: {subject}")
    crawler = Crawler_google_images(index=index, keyword=subject)
    # The executor is managed by hand: leaving a `with` block would call
    # shutdown(wait=True) and block until the worker ends, which would make
    # the timeout below ineffective.
    executor = ThreadPoolExecutor(max_workers=1)
    try:
        future = executor.submit(crawler.run, num_images=num_images_to_download)
        future.result(timeout=timeout)
    except TimeoutError:
        print(f"Timeout occurred for subject '{subject}'")
    except Exception as e:
        print(f"Error occurred for subject '{subject}': {e}")
    finally:
        executor.shutdown(wait=False)

if __name__ == '__main__':
    num_images_to_download = 10  # images to download per subject
    timeout = 60  # per-subject time budget in seconds

    # Read the subjects from the CSV file.
    csv_file_path = '/opt/tiger/trl/consistency/git-demo/data/CF-triples_labeled.csv'
    df = pd.read_csv(csv_file_path)

    # Half-open window [start_index, end_index) over the unique subjects.
    # start_index lets an interrupted run be resumed; end_index=None means
    # "through the last subject".
    start_index = 3460
    end_index = None
    subjects = df['subject'].unique()[start_index:end_index]

    # The index is the subject's position among the unique subjects and is
    # passed on as the output-directory prefix.
    for index, subject in enumerate(subjects, start=start_index):
        try:
            download_images_for_subject(index, subject, num_images_to_download, timeout)
        except Exception as e:
            print(f"Skipping subject '{subject}' due to error: {e}")

    print("All images downloaded successfully.")

Downloading images for subject: G. K. Chesterton


This is 1 image for G. K. Chesterton
This is 2 image for G. K. Chesterton
This is 3 image for G. K. Chesterton
This is 4 image for G. K. Chesterton
This is 5 image for G. K. Chesterton
This is 6 image for G. K. Chesterton
This is 7 image for G. K. Chesterton
This is 8 image for G. K. Chesterton
This is 9 image for G. K. Chesterton
This is 10 image for G. K. Chesterton
Images for G. K. Chesterton downloaded successfully.
Downloading images for subject: Buffalo Bisons
This is 1 image for Buffalo Bisons
This is 2 image for Buffalo Bisons
This is 3 image for Buffalo Bisons
This is 4 image for Buffalo Bisons
This is 5 image for Buffalo Bisons
This is 6 image for Buffalo Bisons
This is 7 image for Buffalo Bisons
This is 8 image for Buffalo Bisons
This is 9 image for Buffalo Bisons
This is 10 image for Buffalo Bisons
Images for Buffalo Bisons downloaded successfully.
Downloading images for subject: Afghanistan
This is 1 image for Afghanistan
This is 2 image for Afghanistan
This is 3 image for

In [None]:
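# date: 2020.5.25
# author: pmy
# aim: crawl images from Google Images
# Known issue: the downloaded images are not guaranteed to match what is shown on the page.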

from selenium import webdriver
import time
import os
import requests

# Change `keyword` to search for something else; consider changing the
# storage directory as well.
keyword = 'cat'
url = f'https://www.google.com.hk/search?q={keyword}&source=lnms&tbm=isch'


class Crawler_google_images:
    """Download full-size Google Images results by clicking each thumbnail.

    The search URL is taken from the module-level ``url`` and images are
    stored under ``./cat``.
    """

    # Number of consecutive scroll rounds that may save nothing new before
    # download_images() gives up instead of looping forever.
    MAX_IDLE_ROUNDS = 5

    def __init__(self):
        """Remember the module-level search URL."""
        self.url = url

    def init_browser(self):
        """Start Chrome, open the search URL, maximize the window and return the driver."""
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument("--disable-infobars")
        browser = webdriver.Chrome(options=chrome_options)
        browser.get(self.url)
        # Maximize so that all thumbnails visible in the window can be reached.
        browser.maximize_window()
        return browser

    def download_images(self, browser, num=5):
        """Scroll the result page, open each thumbnail and save the large image.

        Args:
            browser: Selenium driver already showing the search results.
            num: Number of images to save.

        Returns:
            ``"stop"`` once ``num`` images were saved (immediately when
            ``num <= 0``), or ``None`` when the page stopped yielding new
            images before that number was reached.
        """
        if num <= 0:
            return "stop"

        # Storage directory.
        picpath = './cat'
        os.makedirs(picpath, exist_ok=True)

        count = 0  # images saved so far; also the next file number
        pos = 0  # vertical scroll offset in pixels
        seen_srcs = set()  # sources already handled, to avoid re-saving them
        idle_rounds = 0

        while idle_rounds < self.MAX_IDLE_ROUNDS:
            count_before = count
            try:
                # Scroll down one step and collect the thumbnail links.
                js = 'var q=document.documentElement.scrollTop=' + str(pos)
                pos += 500
                browser.execute_script(js)
                time.sleep(1)
                # "xpath" is the locator-strategy string; the
                # find_elements_by_xpath helper no longer exists in Selenium 4.
                img_elements = browser.find_elements(
                    'xpath', '//a[@class="wXeWr islib nfEiy mM5pbd"]')
            except Exception:
                print("划不动了")
                idle_rounds += 1
                continue

            try:
                for img_element in img_elements:
                    # Open the large-image preview.
                    img_element.click()
                    time.sleep(0.5)
                    try:
                        # Several elements match, so they are filtered below.
                        candidates = browser.find_elements(
                            'xpath', '//img[@class="n3VNCb"]')
                        for candidate in candidates:
                            src = candidate.get_attribute('src')
                            if not src or src in seen_srcs:
                                continue
                            # Drop thumbnails and unrelated noise.
                            if src.startswith('http') and not src.startswith(
                                    'https://encrypted-tbn0.gstatic.com'):
                                seen_srcs.add(src)
                                print('Found' + str(count) + 'st image url')
                                self.save_img(count, src, picpath)
                                count += 1
                                # Leave once the requested number is reached.
                                if count >= num:
                                    return "stop"
                    except Exception:
                        print('获取图片失败')

                # Go back to the result list.
                browser.back()
                time.sleep(0.3)
            except Exception:
                print('获取页面失败')

            idle_rounds = 0 if count > count_before else idle_rounds + 1

        return None

    def save_img(self, count, img_src, picpath):
        """Download ``img_src`` and write it to ``<picpath>/<count>.jpg``.

        Raises:
            requests.RequestException: on network failure, timeout or an
                HTTP error status.
            OSError: if the file cannot be written.
        """
        filename = picpath + '/' + str(count) + '.jpg'
        r = requests.get(img_src, timeout=10)
        # Do not store an HTTP error page as if it were an image.
        r.raise_for_status()
        with open(filename, 'wb') as f:
            f.write(r.content)

    def run(self):
        """Crawl 100 images and always shut the browser down afterwards.

        Exceptions from the crawl propagate to the caller.
        """
        browser = self.init_browser()
        try:
            self.download_images(browser, 100)  # number of images to crawl
        finally:
            # quit() ends the whole driver session; close() would only close
            # the window and leave chromedriver running.
            browser.quit()
        print("############爬取完成")


# Script entry point: build the crawler and run one full crawl.
if __name__ == "__main__":
    craw = Crawler_google_images()
    craw.run()

