# Crawling for Images
Use a search engine to crawl for images and build the dataset. The dataset consists of four classes: ripe tomatoes, rotten, raw and not tomatoes

In [0]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [0]:
import numpy as np

## Storage setup (GDrive)
The datasets will be stored in Google Drive

In [3]:
import os

# Load the Drive helper and mount
from google.colab import drive

# This will prompt for authorization.
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [4]:
!pip install icrawler
from icrawler.builtin import GoogleImageCrawler, BingImageCrawler



In [5]:
os.getcwd()

'/content'

## Crawling for the images

In [0]:
# Define my own class and method, to add a prefix to the files
from icrawler import ImageDownloader

class MyImageDownloader(ImageDownloader):

#     def __init__(self, *args, **kwargs):
#         super(MyImageDownloader).__init__(*args, **kwargs)
#         self.filename_prefix = ''

    def get_filename(self, task, default_ext):
        return self.filename_prefix + super(ImageDownloader, self).get_filename(task, default_ext)

### Ripe Tomatoes

In [0]:
from pathlib import Path
import collections

# Setup the path where the images are going to be stored
dir_path = Path('gdrive/My Drive/datasets/tomato_classification/tomato_ripe')
dir_path.mkdir(parents=True, exist_ok=True)

In [39]:
collections.Counter(p.suffix for p in dir_path.glob('*.*'))

Counter({'.jpeg': 3, '.jpg': 134, '.png': 6})

In [0]:
# Setup the crawler - Google images
google_crawler = GoogleImageCrawler(downloader_cls=MyImageDownloader, storage={'root_dir': dir_path})
filters = dict(type='photo', color='color')

# A prefix, so I dont overwrite images already in the folder
google_crawler.downloader.filename_prefix = 'july18_'
google_crawler.crawl(keyword='ripe tomato', filters=filters, max_num=200, min_size=(200,200))

In [0]:
# Setup the crawler - Bing
bing_crawler = BingImageCrawler(downloader_cls=MyImageDownloader, downloader_threads=4,
                                storage={'root_dir': dir_path})
filters = dict(type='photo', color='color')

bing_crawler.downloader.filename_prefix = 'july18_'
bing_crawler.crawl(keyword='ripe tomato', filters=filters, offset=0, max_num=200)

In [43]:
collections.Counter(p.suffix for p in dir_path.glob('*.*'))

Counter({'.jpeg': 3, '.jpg': 226, '.png': 4})

### Rotten Tomatoes

In [0]:
# Setup the path where the images are going to be stored
dir_path = Path('gdrive/My Drive/datasets/tomato_classification/tomato_rotten')
dir_path.mkdir(parents=True, exist_ok=True)

In [45]:
collections.Counter(p.suffix for p in dir_path.glob('*.*'))

Counter({'.jpeg': 3, '.jpg': 103, '.png': 2})

In [0]:
#Crawl
google_crawler = GoogleImageCrawler(downloader_cls=MyImageDownloader, storage={'root_dir': dir_path})
filters = dict(type='photo', color='color')

# A prefix, so I dont overwrite images already in the folder
google_crawler.downloader.filename_prefix = 'july18_'
google_crawler.crawl(keyword='rotten tomato', filters=filters, max_num=200, min_size=(200,200))

In [0]:
# Setup the crawler - Bing
bing_crawler = BingImageCrawler(downloader_cls=MyImageDownloader, downloader_threads=4,
                                storage={'root_dir': dir_path})
filters = dict(type='photo', color='color')

bing_crawler.downloader.filename_prefix = 'july18_'
bing_crawler.crawl(keyword='spoilt tomato', filters=filters, offset=0, max_num=200)

In [48]:
collections.Counter(p.suffix for p in dir_path.glob('*.*'))

Counter({'.jpeg': 3, '.jpg': 201, '.png': 2})

### Unripe tomatoes
(raw doesn't work - its not the opposite of ripe)

In [0]:
# Setup the path where the images are going to be stored
dir_path = Path('gdrive/My Drive/datasets/tomato_classification/tomato_unripe')
dir_path.mkdir(parents=True, exist_ok=True)

In [50]:
collections.Counter(p.suffix for p in dir_path.glob('*.*'))

Counter({'.JPG': 1, '.jpeg': 6, '.jpg': 131, '.png': 4})

In [0]:
#Crawl
google_crawler = GoogleImageCrawler(downloader_cls=MyImageDownloader, storage={'root_dir': dir_path})
filters = dict(type='photo', color='color')

# A prefix, so I dont overwrite images already in the folder
google_crawler.downloader.filename_prefix = 'july18_'
google_crawler.crawl(keyword='unripe tomatoes', filters=filters, max_num=200, min_size=(200,200))

In [0]:
# Setup the crawler - Bing
bing_crawler = BingImageCrawler(downloader_cls=MyImageDownloader, downloader_threads=4,
                                storage={'root_dir': dir_path})
filters = dict(type='photo', color='color')

bing_crawler.downloader.filename_prefix = 'july18_'
bing_crawler.crawl(keyword='unripe tomato', filters=filters, offset=0, max_num=200)

In [53]:
collections.Counter(p.suffix for p in dir_path.glob('*.*'))

Counter({'.JPG': 1, '.jpeg': 6, '.jpg': 235, '.png': 2})

### Not tomatoes
This is a catch all category - by definition hard to define. Including here: apples, red potatoes, onions, green jaamun, red berries, olives, peaches

In [0]:
# Setup the path where the images are going to be stored
dir_path = Path('gdrive/My Drive/datasets/tomato_classification/not_tomato')
dir_path.mkdir(parents=True, exist_ok=True)

In [55]:
collections.Counter(p.suffix for p in dir_path.glob('*.*'))

Counter({'.JPG': 1,
         '.cms': 1,
         '.jpeg': 5,
         '.jpg': 198,
         '.png': 13,
         '.webp': 1})

In [56]:
# Crawl
google_crawler = GoogleImageCrawler(downloader_cls=MyImageDownloader, storage={'root_dir': dir_path})
filters = dict(type='photo', color='color')

google_crawler.downloader.filename_prefix = 'strawberry'
google_crawler.crawl(keyword='strawberry', filters=filters, max_num=20, min_size=(200,200))

google_crawler.downloader.filename_prefix = 'red_apple'
google_crawler.crawl(keyword='red apples', filters=filters, max_num=20, min_size=(200,200))

google_crawler.downloader.filename_prefix = 'green_apple'
google_crawler.crawl(keyword='green apples', filters=filters, max_num=20, min_size=(200,200))

google_crawler.downloader.filename_prefix = 'red_potato'
google_crawler.crawl(keyword='red potatoes', filters=filters, max_num=20, min_size=(200,200))

google_crawler.downloader.filename_prefix = 'onion'
google_crawler.crawl(keyword='onions', filters=filters, max_num=20, min_size=(200,200))

google_crawler.downloader.filename_prefix = 'red_berries'
google_crawler.crawl(keyword='red berries', filters=filters, max_num=20, min_size=(200,200))

google_crawler.downloader.filename_prefix = 'jamun_green'
google_crawler.crawl(keyword='jamun green', filters=filters, max_num=20, min_size=(200,200))

google_crawler.downloader.filename_prefix = 'peaches'
google_crawler.crawl(keyword='peaches', filters=filters, max_num=20, min_size=(200,200))

google_crawler.downloader.filename_prefix = 'olives'
google_crawler.crawl(keyword='olives', filters=filters, max_num=20, min_size=(200,200))

google_crawler.downloader.filename_prefix = 'red_plants'
google_crawler.crawl(keyword='red plants', filters=filters, max_num=20, min_size=(200,200))

google_crawler.downloader.filename_prefix = 'green_plants'
google_crawler.crawl(keyword='green plants', filters=filters, max_num=20, min_size=(200,200))

google_crawler.downloader.filename_prefix = 'red_flowers'
google_crawler.crawl(keyword='red flowers', filters=filters, max_num=20, min_size=(200,200))

2019-07-18 18:47:56,048 - INFO - icrawler.crawler - start crawling...
2019-07-18 18:47:56,049 - INFO - icrawler.crawler - starting 1 feeder threads...
2019-07-18 18:47:56,054 - INFO - feeder - thread feeder-001 exit
2019-07-18 18:47:56,054 - INFO - icrawler.crawler - starting 1 parser threads...
2019-07-18 18:47:56,065 - INFO - icrawler.crawler - starting 1 downloader threads...
2019-07-18 18:47:56,641 - INFO - parser - parsing result page https://www.google.com/search?q=strawberry&ijn=0&start=0&tbs=itp%3Aphoto%2Cic%3Acolor&tbm=isch
2019-07-18 18:47:57,213 - INFO - downloader - image #1	https://upload.wikimedia.org/wikipedia/commons/thumb/2/29/PerfectStrawberry.jpg/220px-PerfectStrawberry.jpg
2019-07-18 18:47:59,968 - INFO - downloader - image #2	https://www.aces.edu/wp-content/uploads/2019/04/strawberry-.jpg
2019-07-18 18:48:00,121 - INFO - downloader - image #3	https://thumbs-prod.si-cdn.com/k5ZCVtFyySLEDAHe7FeDQMBfQ-w=/800x600/filters:no_upscale()/https://public-media.si-cdn.com/fil