# Custom BeautifulSoup Scraper to collect images from Unsplash

## Importing Necessary libraries

In [31]:
# Libraries for scrapping
from bs4 import BeautifulSoup
import requests
from urllib.request import urlretrieve

# Libraries for duplicate removal
import numpy as np
import hashlib
import matplotlib.gridspec as gridspec
%matplotlib inline
import time

# Libraries for colab directory handling
import os
from google.colab import files


## Scraper Function

In [4]:
# Function to modularize the scrapes
def extract(url):

  # Requesting url using urllib.requests
  req=requests.get('https://unsplash.com/s/photos/'+url)
  content=req.text

  # Beautifying the content code using BeautifulSoup
  soup=BeautifulSoup(content)

  # Filtering images using element class name 
  images = soup.find_all("img", {"class": "YVj9w"})
  images1 = soup.find_all("img", {"class": "D1hjc"})
  # Counter for counting images
  c=0
  for tags in images:

    # for s in str(tags):
    s=str(tags)
    if s.startswith('<img alt="'):

      # Getting the images
      path = s[s.find('https:'):s.find('" srcset="')]

      #  Retrieving the images and saving them locally
      urlretrieve(path,f"/content/{title}/{url}-{c}.jpg")
      c+=1

## List of the animals for modularization

In [3]:
# List for making directories on colab and specifying the search keywords 
folder_list = ['elephant','buffalo','rhino','zebra']

## Scrapping all the images

In [5]:
for title in folder_list:

  # Making folder of animal names using os library
  os.mkdir(title)

  # Custom keywords for each animal
  urls=[f'{title}s-wildlife',f'baby-{title}',f'{title}',f'{title}-family',f'{title}-africa',f'{title}-jungle']
  print(urls)
  
  # Extracting images based on each keyword
  for url in urls:
    print(url)
    extract(url)

['elephants-wildlife', 'baby-elephant', 'elephant', 'elephant-family', 'elephant-africa', 'elephant-jungle']
elephants-wildlife
baby-elephant
elephant
elephant-family
elephant-africa
elephant-jungle
['buffalos-wildlife', 'baby-buffalo', 'buffalo', 'buffalo-family', 'buffalo-africa', 'buffalo-jungle']
buffalos-wildlife
baby-buffalo
buffalo
buffalo-family
buffalo-africa
buffalo-jungle
['rhinos-wildlife', 'baby-rhino', 'rhino', 'rhino-family', 'rhino-africa', 'rhino-jungle']
rhinos-wildlife
baby-rhino
rhino
rhino-family
rhino-africa
rhino-jungle
['zebras-wildlife', 'baby-zebra', 'zebra', 'zebra-family', 'zebra-africa', 'zebra-jungle']
zebras-wildlife
baby-zebra
zebra
zebra-family
zebra-africa
zebra-jungle


## Function to print image count of each folder

In [22]:
def print_count():
  for title in folder_list:
    a = os.listdir(f'/content/{title}/')
    print(title,len(a))

## Count of images scraped per animal

In [23]:
print_count()

elephant 354
buffalo 303
rhino 297
zebra 324


## Removing Duplicates

### Custom Hash Function to hash np arrays for duplicate removal

In [9]:
def file_hash(filepath):
    with open(filepath, 'rb') as f:
        return md5(f.read()).hexdigest()

## Changing current directory

In [16]:
os.chdir('/content')
os.getcwd()

'/content'

## Function to removal duplicates from the local directory

In [32]:
def duplicate_remove():

  # List to store tuples of duplicates
  duplicates = []

  # Hashing using the custom hash function
  hash_keys = dict()

  #listdir('.') = current directory
  for index, filename in  enumerate(os.listdir('.')):  
      if os.path.isfile(filename):
          with open(filename, 'rb') as f:
              filehash = hashlib.md5(f.read()).hexdigest()

          # Appending new entries to the hash
          if filehash not in hash_keys: 
              hash_keys[filehash] = index

          # Adding tuples of duplicates
          else:
              duplicates.append((index,hash_keys[filehash]))

  # Remoing all the duplicates from the local directory using os
  for index in duplicates:
    os.remove(file_list[index[0]])

## Looping over all the folders and removing duplicates

In [26]:
for title in folder_list:
  os.chdir(f'/content/{title}')
  print(os.getcwd())
  try:
    duplicate_remove()
  except:
    continue

/content/elephant
/content/buffalo
/content/rhino
/content/zebra


## Count of images scraped per animal after duplicacy removal

In [27]:
print_count()

elephant 90
buffalo 303
rhino 297
zebra 324


## Saving the data locally

In [29]:
!zip -r /content/elephant.zip /content/elephant
!zip -r /content/buffalo.zip /content/buffalo
!zip -r /content/rhino.zip /content/rhino
!zip -r /content/zebra.zip /content/zebra
# files.download('/content/elephant.zip')

  adding: content/elephant/ (stored 0%)
  adding: content/elephant/elephants-wildlife-44.jpg (deflated 0%)
  adding: content/elephant/elephant-jungle-27.jpg (deflated 0%)
  adding: content/elephant/elephant-5.jpg (deflated 1%)
  adding: content/elephant/elephants-wildlife-55.jpg (deflated 0%)
  adding: content/elephant/elephants-wildlife-16.jpg (deflated 1%)
  adding: content/elephant/elephant-46.jpg (deflated 1%)
  adding: content/elephant/elephant-africa-50.jpg (deflated 0%)
  adding: content/elephant/baby-elephant-27.jpg (deflated 1%)
  adding: content/elephant/elephant-53.jpg (deflated 0%)
  adding: content/elephant/elephant-51.jpg (deflated 1%)
  adding: content/elephant/elephant-family-22.jpg (deflated 0%)
  adding: content/elephant/elephants-wildlife-32.jpg (deflated 1%)
  adding: content/elephant/elephant-family-58.jpg (deflated 0%)
  adding: content/elephant/elephant-jungle-10.jpg (deflated 0%)
  adding: content/elephant/elephant-30.jpg (deflated 2%)
  adding: content/elephant