# Crawling for Images
Use a search engine to crawl for images and build the dataset. The dataset consists of four classes: ripe tomatoes, rotten tomatoes, unripe tomatoes, and not tomatoes.

## Storage setup (GDrive)
The datasets will be stored in Google Drive, mounted at `/content/gdrive`.

In [1]:
import os

# Load the Drive helper and mount
from google.colab import drive

# Mount Google Drive at /content/gdrive. This will prompt for authorization
# (an OAuth code pasted back into the cell). The dataset folders used later
# are given as 'gdrive/My Drive/...', i.e. relative to the /content directory.
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [2]:
!pip install icrawler
from icrawler.builtin import GoogleImageCrawler

Collecting icrawler
  Downloading https://files.pythonhosted.org/packages/78/a2/1ac26a2c39b87bef4ef8cb39cb4f33e04041f4a9f04f8cc3dfa1251e0304/icrawler-0.6.2-py2.py3-none-any.whl
Installing collected packages: icrawler
Successfully installed icrawler-0.6.2


In [3]:
# Show the current working directory: the relative 'gdrive/...' dataset paths
# used in the cells below are resolved against it.
os.getcwd()

'/content'

## Crawling for the images

### Ripe Tomatoes

In [0]:
# Folder on the mounted Drive that receives the ripe-tomato images.
# The path is a plain Python string, so the space in 'My Drive' must NOT be
# backslash-escaped: a backslash here becomes part of the directory name that
# os.path and the crawler see.
dir_path = 'gdrive/My Drive/datasets/tomato_ripe'

# Create the folder (and any missing parents) from Python instead of the shell,
# so the path is passed as one argument and needs no quoting.
if not os.path.isdir(dir_path):
    os.makedirs(dir_path)

In [0]:
# Crawl Google Images for 'ripe tomato' and store the results under dir_path.
# The search is restricted with the type='photo' / color='color' filters, and
# the crawl is capped at max_num=200 images with min_size=(200, 200).
filters = {'type': 'photo', 'color': 'color'}
google_crawler = GoogleImageCrawler(storage={'root_dir': dir_path})
google_crawler.crawl(
    keyword='ripe tomato',
    filters=filters,
    max_num=200,
    min_size=(200, 200),
)

### Rotten Tomatoes

In [0]:
# Folder on the mounted Drive that receives the rotten-tomato images.
dir_path = 'gdrive/My Drive/datasets/tomato_rotten'

# Create the folder (and any missing parents) from Python. Interpolating this
# path unquoted into a shell `mkdir` would split it at the space in 'My Drive'
# and create two unrelated directories instead.
if not os.path.isdir(dir_path):
    os.makedirs(dir_path)

In [0]:
# Crawl Google Images for 'rotten tomato' and store the results under dir_path.
# Same settings as the other tomato classes: type='photo' / color='color'
# filters, max_num=200, min_size=(200, 200).
filters = {'type': 'photo', 'color': 'color'}
google_crawler = GoogleImageCrawler(storage={'root_dir': dir_path})
google_crawler.crawl(
    keyword='rotten tomato',
    filters=filters,
    max_num=200,
    min_size=(200, 200),
)

### Unripe tomatoes
(The keyword "raw" doesn't work: it's not the opposite of "ripe", so "unripe" is used instead.)

In [0]:
# Folder on the mounted Drive that receives the unripe-tomato images.
dir_path = 'gdrive/My Drive/datasets/tomato_unripe'

# Create the folder (and any missing parents) from Python. Interpolating this
# path unquoted into a shell `mkdir` would split it at the space in 'My Drive'
# and create two unrelated directories instead.
if not os.path.isdir(dir_path):
    os.makedirs(dir_path)

In [0]:
# Crawl Google Images for 'unripe tomatoes' and store the results under dir_path.
# Same settings as the other tomato classes: type='photo' / color='color'
# filters, max_num=200, min_size=(200, 200).
filters = {'type': 'photo', 'color': 'color'}
google_crawler = GoogleImageCrawler(storage={'root_dir': dir_path})
google_crawler.crawl(
    keyword='unripe tomatoes',
    filters=filters,
    max_num=200,
    min_size=(200, 200),
)

### Not tomatoes
This is a catch-all category and, by definition, hard to define. Included here: red and green apples, red potatoes, onions, red berries, green jamun, peaches, olives, red and green plants, and red flowers.

In [0]:
# Folder on the mounted Drive that receives the "not tomato" images.
dir_path = 'gdrive/My Drive/datasets/not_tomato'

# Create the folder (and any missing parents) from Python. Interpolating this
# path unquoted into a shell `mkdir` would split it at the space in 'My Drive'
# and create two unrelated directories instead.
if not os.path.isdir(dir_path):
    os.makedirs(dir_path)

In [0]:
from icrawler import ImageDownloader

class MyImageDownloader(ImageDownloader):
    """ImageDownloader whose saved file names start with ``filename_prefix``.

    The prefix is meant to be assigned on the downloader instance before a
    crawl, e.g. ``crawler.downloader.filename_prefix = 'red_apple'``.
    Presumably this keeps files from different keywords apart when they are
    crawled into the same directory -- TODO confirm against icrawler's
    file-numbering behaviour.
    """

    # Class-level default so get_filename() works even if no prefix was
    # assigned; an instance attribute set later simply shadows it.
    filename_prefix = ''

    def get_filename(self, task, default_ext):
        """Return the parent class's file name with ``filename_prefix`` prepended.

        Args:
            task: The download task, passed through unchanged to the parent.
            default_ext: Fallback extension, passed through unchanged.

        Returns:
            str: ``filename_prefix`` followed by the name chosen by
            ``ImageDownloader.get_filename``.
        """
        # super(MyImageDownloader, self) starts the lookup at ImageDownloader.
        # Naming ImageDownloader as the first argument instead would start the
        # lookup *after* it and bypass ImageDownloader's own get_filename.
        base_name = super(MyImageDownloader, self).get_filename(task, default_ext)
        return self.filename_prefix + base_name

In [0]:
# (filename prefix, search keyword) pairs for the "not tomato" class, in the
# order they are crawled. To add another look-alike category, add a row here.
NOT_TOMATO_QUERIES = [
    ('red_apple', 'red apples'),
    ('green_apple', 'green apples'),
    ('red_potato', 'red potatoes'),
    ('onion', 'onions'),
    ('red_berries', 'red berries'),
    ('jamun_green', 'jamun green'),
    ('peaches', 'peaches'),
    ('olives', 'olives'),
    ('red_plants', 'red plants'),
    ('green_plants', 'green plants'),
    ('red_flowers', 'red flowers'),
]

# One crawler is reused for every keyword; all images go to the same dir_path.
google_crawler = GoogleImageCrawler(downloader_cls=MyImageDownloader, storage={'root_dir': dir_path})
filters = dict(type='photo', color='color')

for filename_prefix, keyword in NOT_TOMATO_QUERIES:
    # The prefix must be set before the crawl starts so that every file saved
    # for this keyword carries it in its name.
    google_crawler.downloader.filename_prefix = filename_prefix
    google_crawler.crawl(keyword=keyword, filters=filters, max_num=20, min_size=(200,200))