In [2]:
import base64

from icrawler import ImageDownloader
from icrawler.builtin import GoogleImageCrawler
from six.moves.urllib.parse import urlparse

Custom downloader that names each saved image after the base64-encoded path of its URL instead of a sequential number.

In [3]:
class MyImageDownloader(ImageDownloader):
    """Image downloader that derives file names from the image URL path.

    Overrides ``get_filename`` so that each file is named after the
    base64-encoded path component of its source URL.
    """

    def get_filename(self, task, default_ext):
        """Build the file name for a download task.

        Args:
            task: Mapping with a ``'file_url'`` entry holding the image URL.
            default_ext: Extension used when the URL path has no extension
                or its extension is not a recognised image type.

        Returns:
            A string of the form ``'<encoded path>.<extension>'``.

        Note:
            Only the path component of the URL is encoded (not the host or
            query string), so identical paths on different hosts produce the
            same file name.
        """
        url_path = urlparse(task['file_url'])[2]

        # Start from the fallback and keep the URL's own extension only when
        # it is a known image type; the original spelling/case is preserved.
        extension = default_ext
        if '.' in url_path:
            candidate = url_path.split('.')[-1]
            if candidate.lower() in (
                    'jpg', 'jpeg', 'png', 'bmp', 'tiff', 'gif', 'ppm', 'pgm'
            ):
                extension = candidate

        # The standard base64 alphabet includes '/' (a path separator) and
        # '+'. The URL-safe variant emits '_' and '-' instead, so the result
        # is always usable as a single file name.
        filename = base64.urlsafe_b64encode(url_path.encode()).decode()
        return '{}.{}'.format(filename, extension)

In [16]:
# Crawler for the "Real" image set: results are written to Smiles/Real and
# named by MyImageDownloader, using four downloader threads.
google_crawler_real = GoogleImageCrawler(
    storage=dict(root_dir='Smiles/Real'),
    downloader_threads=4,
    downloader_cls=MyImageDownloader,
)

# Search for 'crows feet smile' with max_num set to 500.
google_crawler_real.crawl(max_num=500, keyword='crows feet smile')

2023-05-23 19:40:12,325 - INFO - icrawler.crawler - start crawling...
2023-05-23 19:40:12,326 - INFO - icrawler.crawler - starting 1 feeder threads...
2023-05-23 19:40:12,328 - INFO - feeder - thread feeder-001 exit
2023-05-23 19:40:12,330 - INFO - icrawler.crawler - starting 1 parser threads...
2023-05-23 19:40:12,333 - INFO - icrawler.crawler - starting 4 downloader threads...
2023-05-23 19:40:13,307 - INFO - parser - parsing result page https://www.google.com/search?q=crows+feet+smile&ijn=0&start=0&tbs=&tbm=isch
2023-05-23 19:40:13,356 - INFO - downloader - skip downloading file L3dwLWNvbnRlbnQvdXBsb2Fkcy8yMDIxLzA0L01EU1VOX19MYXVnaC1MaW5lcy1hbmQtQ3Jvd3MtRmVldC1DYW4tQmUtQ29tYmF0ZWQucG5n.png
2023-05-23 19:40:13,359 - INFO - downloader - skip downloading file L2kvcGl4L3NjYWxlZC8yMDEzLzAyLzAxL2FydGljbGUtMjI3MjI0MC0xNzI4NDhENDAwMDAwNURDLTY0OV8zMDh4MTg1LmpwZw==.jpg
2023-05-23 19:40:13,362 - INFO - downloader - skip downloading file L2FzYy93cC1jb250ZW50L3VwbG9hZHMvMjAxOS8wNS9DYW4teW91LWdld

2023-05-23 19:40:14,791 - INFO - downloader - skip downloading file L2Jsb2cvd3AtY29udGVudC91cGxvYWRzLzIwMjIvMTIvQ3Jvd3MtRmVldC1CbG9nLVYyLmpwZw==.jpg
2023-05-23 19:40:14,792 - INFO - downloader - skip downloading file L3MvZmlsZXMvMS8wNzE5LzkxMDcvZmlsZXMvQmxvZ181YjQzMjliMS04YmQ1LTQ5NTgtYTJiMi1mZDQ2MjA3ZTEwYzZfMTAyNHgxMDI0LmpwZw==.jpg
2023-05-23 19:40:14,793 - INFO - downloader - skip downloading file L21lZGlhL3d5c2l3eWcvRGFsdG9uLU1hZ2F6aW4tS3JfaGVuZl9lLUhlYWRlcl9Nb2JpbGUuanBn.jpg
2023-05-23 19:40:14,794 - INFO - downloader - skip downloading file L3dwLWNvbnRlbnQvdXBsb2Fkcy8yMDIzLzAzLzA2MTUxNzI4L0JPVE9YLUNyb3dzLUZlZXQtQmVmb3JlLUFmdGVyLUNoYXJsb3R0ZS5qcGc=.jpg
2023-05-23 19:40:14,795 - INFO - downloader - skip downloading file L3dwLWNvbnRlbnQvdXBsb2Fkcy8yMDE5LzA2LzIuanBn.jpg
2023-05-23 19:40:15,299 - INFO - downloader - image #49	https://images.squarespace-cdn.com/content/v1/5efa73cb497a1b335455a4a3/1594059762033-4IHZ4ZSG08AH3TH74JWL/Levon-BA-Screenshot-004321.jpg
2023-05-23 19:40:15,301 - 

2023-05-23 19:40:32,459 - ERROR - downloader - Response status code 400, file https://i0.wp.com/thewrightobgyn.com/wp-content/uploads/2017/12/alexandra_side_after.jpg
2023-05-23 19:40:32,812 - INFO - parser - parsing result page https://www.google.com/search?q=crows+feet+smile&ijn=2&start=200&tbs=&tbm=isch
2023-05-23 19:40:33,559 - INFO - parser - parsing result page https://www.google.com/search?q=crows+feet+smile&ijn=3&start=300&tbs=&tbm=isch
2023-05-23 19:40:34,392 - INFO - parser - parsing result page https://www.google.com/search?q=crows+feet+smile&ijn=4&start=400&tbs=&tbm=isch
2023-05-23 19:40:34,532 - INFO - downloader - image #72	https://uploads.dailyvanity.sg/wp-content/uploads/2019/12/get-rid-of-crows-feet-smile.jpg
2023-05-23 19:40:35,182 - INFO - downloader - image #73	https://contourclinics.com.au/wp-content/uploads/How-many-botox-units-.jpg
2023-05-23 19:40:35,867 - INFO - downloader - image #74	https://www.roxspanewportbeach.com/wp-content/uploads/2018/04/Screen-Shot-202

In [14]:
# Crawler for the "Fake" image set: results are written to Smiles/Fake and
# named by MyImageDownloader, using four parser and four downloader threads.
google_crawler_fake = GoogleImageCrawler(
    storage=dict(root_dir='Smiles/Fake'),
    downloader_threads=4,
    parser_threads=4,
    downloader_cls=MyImageDownloader,
)

# Search for 'smiling selfie' with max_num set to 500.
google_crawler_fake.crawl(max_num=500, keyword='smiling selfie')

2023-05-23 18:49:28,843 - INFO - icrawler.crawler - start crawling...
2023-05-23 18:49:28,844 - INFO - icrawler.crawler - starting 1 feeder threads...
2023-05-23 18:49:28,846 - INFO - feeder - thread feeder-001 exit
2023-05-23 18:49:28,849 - INFO - icrawler.crawler - starting 4 parser threads...
2023-05-23 18:49:28,855 - INFO - icrawler.crawler - starting 4 downloader threads...
2023-05-23 18:49:29,664 - INFO - parser - parsing result page https://www.google.com/search?q=smiling+selfie&ijn=2&start=200&tbs=&tbm=isch
2023-05-23 18:49:29,723 - INFO - downloader - skip downloading file L2ltYWdlLXBob3RvL2hhbmRzb21lLWJydW5ldC15b3VuZy1tYW4tbWFraW5nLTI2MG53LTY1MDQxMjYyNS5qcGc=.jpg
2023-05-23 18:49:29,727 - INFO - downloader - skip downloading file LzczNngvNmIvYzMvY2YvNmJjM2NmODIzYWM0MWJhYzYzM2U3ODUwMTVkN2UxMDQuanBn.jpg
2023-05-23 18:49:29,730 - INFO - downloader - skip downloading file L2EvMDBqMjAwL3o5LzY0OTYzNi5qcGc=.jpg
2023-05-23 18:49:29,736 - INFO - downloader - skip downloading file L3dw

2023-05-23 18:49:30,942 - INFO - downloader - skip downloading file L3YyL2pwZy8wMi85Ny83OS80OS8xMDAwX0ZfMjk3Nzk0OTk3XzJlZUFDTTlMVTNud0pkWXdBdzBiamY3TnEyNWxqTFViLmpwZw==.jpg
2023-05-23 18:49:30,943 - INFO - downloader - skip downloading file L2luY29taW5nL2FydGljbGUyNDMyMDQ2NS5lY2UvQUxURVJOQVRFUy9zMTIwMGMvMV9TY3JlZW5zaG90LTIwMjEtMDYtMTUtYXQtMDg1MjU3LnBuZw==.png
2023-05-23 18:49:30,945 - INFO - downloader - skip downloading file L2libmxpdmUvdXBsb2Fkcy8yMDIyLzA2L2FudXNoa2Etc2hhcm1hLTEuanBn.jpg
2023-05-23 18:49:30,946 - INFO - downloader - skip downloading file L21lZGlhL0JlQ0dSVUZDY0FJejN2Zi5qcGc=.jpg
2023-05-23 18:49:30,947 - INFO - downloader - skip downloading file L2ZyZWUtcGhvdG8vcG9ydHJhaXQtY3V0ZS1naXJsLXdpdGgtbG9uZy1oYWlyLXNub3ctd2hpdGUtc21pbGUtbWFraW5nLXNlbGZpZS1zdHJlZXQtY2l0eS1zaGUtd2VhcnMtdmlub3VzLWxpcHMtc21pbGluZ18xOTc1MzEtNjM4LmpwZw==.jpg
2023-05-23 18:49:30,948 - INFO - downloader - skip downloading file L3YyL2pwZy8wMi83Mi80My80Mi8xMDAwX0ZfMjcyNDM0MjA3X0hldTVVb3kwMkppQVpUMVRVb3N

2023-05-23 18:49:37,746 - INFO - downloader - skip downloading file L29yaWdpbmFscy9hYy85MS9iZC9hYzkxYmRhZTc4N2M0MDY3MzlhODQxZjQ3ZTIxMjBmNy5qcGc=.jpg
2023-05-23 18:49:37,747 - INFO - downloader - skip downloading file L3hzOGxnOHNuNXJ4MjEuanBn.jpg
2023-05-23 18:49:37,748 - INFO - downloader - skip downloading file L2Fzc2V0cy91cGxvYWRzLzIwMjMvMDQvV29tYW4tdGFraW5nLXNlbGZpZS5qcGc=.jpg
2023-05-23 18:49:37,749 - INFO - downloader - skip downloading file L1Bydi9JbWFnZXMvUGFnZXMvUGFnZV8xNTAzNDQvc2VsZmllLXNlbGZpZWRheS1zbWlsZS1iZWFyZC1tZW4tbG92ZWxpZmUtaW5zdGEtMTEtMjItMjAxNy02LTU2LTE1LXBtLWwuanBn.jpg
2023-05-23 18:49:37,750 - INFO - downloader - skip downloading file L25vdy93cC1jb250ZW50L3VwbG9hZHMvUG9vamEtSGVnZGUtc2VsZmllLXdpdGgtYmVhdXRpZnVsLXNtaWxlLmpwZw==.jpg
2023-05-23 18:49:37,934 - INFO - parser - no more page urls for thread parser-002 to parse
2023-05-23 18:49:37,935 - INFO - parser - thread parser-002 exit
2023-05-23 18:49:38,016 - ERROR - downloader - Response status code 404, file https