# Scraper for CoralNet

This scraper allows the download of a full dataset on CoralNet.

Datasets can also be filled with new images when more are confirmed.

For each source, download:
- `labelset.csv`
- `metadata.csv`
- `annotations.csv`

## Import necessary libraries

In [3]:
from bs4 import BeautifulSoup
from skimage import io

import glob
import matplotlib.pyplot as plt
import os
import pandas as pd
import requests
import warnings
warnings.filterwarnings('ignore')



## Prepare directory for data download

On the `/data/jantina` drive, the sources are organised as follows:
```
CoralNet
│ 
└─── WAPA_RFM
│   │
│   └─── images
│   │      
│   └─── labels
│   │      
│   └─── masks    
│   │      
│   └─── output      
│   │      
│   └─── other   
│
└─── other
    │     
```

In [5]:
# All relative paths below start from the folder holding the sources in use.
os.chdir("/data/jantina/CoralNet/used")

# Folder of this source: created on the first run only.
path = "Vavau"
isExist = os.path.exists(path)
if not isExist:
    os.mkdir(path)
    print("[INFO] The new directory is created!")

# Step into the source folder and make sure it has an image folder.
os.chdir(path)
isExist = os.path.exists("images")
if not isExist:
    os.mkdir("images")
    print("[INFO] The new image folder is created!")

# Load the image metadata; the "Name" column is kept as the list of images.
df = pd.read_csv('other/metadata.csv')
imageList = df['Name']
print('[INFO] Number of images: ', len(imageList))

[INFO] Number of images:  3880


## Get image names and links from specific source

Parse the CoralNet source code using BeautifulSoup.

In [23]:
# Collect, for every browse page in the chosen range, the image names and the
# URL of the original image file. `names` and `links` are filled in page order
# and are paired by position by the download step.

# declare variables
searchList = imageList
names = []
links = []

# grab the link from the specific source
site_url = 'https://coralnet.ucsd.edu'
page_url = "https://coralnet.ucsd.edu/source/841/browse/images/"

# NOTE(review): the CSRF token and the cookie below are credentials copied
# from a logged-in browser session. They expire and should not be committed;
# consider reading them from the environment instead.
csrf_token = "THM7izCXWjOfKu1l6106C3PMTFZNOItTg6nPbyCKbBKdkiUSRM7XAn5kXvTeo2HQ"
cookie = "__utmc=209852167; __utmz=209852167.1651666647.1.1.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided); csrftoken=S2NTHiiDHomFqLglSikWZwmQkIFLUkHtfroBAhiqWGiD0z9SD3rNXQCooyzcuEVq; sessionid=vyo2ksxe1ig372r6hbv1lr5ws68egxv2; __utma=209852167.1649723512.1651666647.1654178964.1654182435.30; __utmt=1; __utmb=209852167.1.10.1654182435"

# seconds to wait for the server before giving up, so a stalled connection
# cannot hang the cell forever
request_timeout = 60

# grab image names and links
for i in range(80, 81):
    # grab the source code of browse page i
    r = requests.post(page_url,
                      data={"csrfmiddlewaretoken": csrf_token,
                            # optional search filters, uncomment as needed
                            #"image_form_type": "search",
                            #"photo_date_1": 2020,
                            #"annotation_status": "confirmed",
                            #"last_annotated_1": 2020,
                            #"sort_method": "name",
                            #"sort_direction": "asc",
                            "page": i},
                      headers={'Referer': page_url,
                               "Cookie": cookie},
                      timeout=request_timeout)
    # fail loudly instead of parsing an error page
    r.raise_for_status()

    # parse with BeautifulSoup
    soup = BeautifulSoup(r.text, "html.parser")

    # extract names; an image without a title cannot provide a name, so only
    # titled images are considered (indexing a missing title raises KeyError)
    for image in soup.find_all("img", title=True):
        name = image["title"].split(" - Confirmed")[0]
        names.append(name)

        # for source with nasty names
        #if name.find(":")!=-1:
            #continue
        #else:
            #name = name.split("/")[0] + ".jpeg"

    # extract links; anchors without an href are left out because testing
    # `"/image/" in None` raises a TypeError
    for anchor in soup.find_all('a', href=True):
        href = anchor['href']
        if "/image/" not in href:
            continue

        # open the page of the image and look for the original image file
        img_url = site_url + href
        img_page = requests.get(img_url, timeout=request_timeout)
        img_page.raise_for_status()
        img_soup = BeautifulSoup(img_page.text, "html.parser")
        div = img_soup.find(id="original_image_container")
        original = div.find('img') if div is not None else None
        if original is None or not original.get('src'):
            # stop here: skipping the entry would shift every later link
            # relative to its name
            raise RuntimeError(f'[ERROR] No original image found on {img_url}')
        links.append(original['src'])

# names and links are paired by position later on, so a mismatch means the
# pairs cannot be trusted
if len(names) != len(links):
    print(f'[WARNING] Found {len(names)} names but {len(links)} links.')

print(f'[INFO] Found {len(links)} images to download.')

[INFO] Found 400 images to download.


In [11]:
# Show every collected image URL, one per line.
for url in links:
    print(url)

https://coralnet-production.s3.amazonaws.com:443/media/images/7a41jgkbzf.JPG?Signature=v9LBaEWByqq89sRlGWtjYgqPgx0%3D&Expires=1654419826&AWSAccessKeyId=AKIAYVKEQ3B4DIOYONO3
https://coralnet-production.s3.amazonaws.com:443/media/images/so64ohwv7j.JPG?Signature=aWCrjwH6LLXWAN6r6FAr4T0YSN0%3D&Expires=1654419827&AWSAccessKeyId=AKIAYVKEQ3B4DIOYONO3
https://coralnet-production.s3.amazonaws.com:443/media/images/s4ry514te8.JPG?Signature=ey5B9uoJLYlL23%2Fo%2B3oOxAadGC8%3D&Expires=1654419828&AWSAccessKeyId=AKIAYVKEQ3B4DIOYONO3
https://coralnet-production.s3.amazonaws.com:443/media/images/t3ycjqe9hd.JPG?Signature=Dh9ktxGU2owY1TtpslhBfbEenI0%3D&Expires=1654419828&AWSAccessKeyId=AKIAYVKEQ3B4DIOYONO3
https://coralnet-production.s3.amazonaws.com:443/media/images/bqswveesw2.JPG?Signature=GxkrvGtXmFgf3O2n5immnn4iXg0%3D&Expires=1654419829&AWSAccessKeyId=AKIAYVKEQ3B4DIOYONO3
https://coralnet-production.s3.amazonaws.com:443/media/images/t9tayybctu.JPG?Signature=FzEVuYSI3ai9Xfg4aPL%2B2i5dwmg%3D&Expires=165

## Filling in only missing images

If the source is new, skip this step.

In [None]:
# Restrict `searchList` to the images that are not on disk yet.

# Folder holding the images that were already downloaded. It is listed
# directly and the working directory is left unchanged: os.chdir() returns
# None, so its result cannot be used as the directory to list.
# NOTE(review): this path points at the WAPA_RFM source while the other cells
# work on Vavau -- confirm it before running.
directory = "/data/jantina/CoralNet/WAPA_RFM/images_almost_all/"

# names of the files already present in that folder
filenames = os.listdir(directory)

# keep only the names that have no file yet
searchList = list(set(searchList) - set(filenames))
print(f'[INFO] Found {len(searchList)} images to download.')

## Download all the images

In [24]:
# Download every scraped image whose name is in `searchList` into the image
# folder. `names` and `links` are paired by position.

# go to the image folder
os.chdir("/data/jantina/CoralNet/used/Vavau/images/")

# build the lookup set once instead of on every iteration
wanted = set(searchList)

# pairing by position only makes sense when both lists have the same length;
# zip() stops at the shorter one
if len(names) != len(links):
    print(f'[WARNING] Found {len(names)} names but {len(links)} links.')

k = 0
for name, link in zip(names, links):
    # cross reference the names with the searchList
    if name not in wanted:
        continue

    # get the image from the URL; an error response is reported and skipped
    # so that its body is never saved as if it were an image
    try:
        r = requests.get(link, allow_redirects=True, timeout=60)
        r.raise_for_status()
    except requests.RequestException as err:
        print(f"[WARNING] could not download {name}: {err}")
        continue

    # write it to a local file, closing the file even when the write fails
    try:
        with open(name, 'wb') as image_file:
            image_file.write(r.content)
    except OSError as err:
        print(f"[WARNING] could not save {name}: {err}")
        continue

    # increment the number of images we have found
    k = k + 1
    print("downloaded image " + name)

print(f'[INFO] Found {k} images.')

downloaded image (2423).JPG
downloaded image (2424).JPG
downloaded image (2425).JPG
downloaded image (2426).JPG
downloaded image (2427).JPG
downloaded image (2428).JPG
downloaded image (2429).JPG
downloaded image (242).JPG
downloaded image (2430).JPG
downloaded image (2431).JPG
downloaded image (2432).JPG
downloaded image (2433).JPG
downloaded image (2434).JPG
downloaded image (2435).JPG
downloaded image (2436).JPG
downloaded image (2437).JPG
downloaded image (2438).JPG
downloaded image (2439).JPG
downloaded image (243).JPG
downloaded image (2440).JPG
downloaded image (2441).JPG
downloaded image (2442).JPG
downloaded image (2443).JPG
downloaded image (2444).JPG
downloaded image (2445).JPG
downloaded image (2446).JPG
downloaded image (2447).JPG
downloaded image (2448).JPG
downloaded image (2449).JPG
downloaded image (244).JPG
downloaded image (2450).JPG
downloaded image (2451).JPG
downloaded image (2452).JPG
downloaded image (2453).JPG
downloaded image (2454).JPG
downloaded image (2455)

downloaded image (2689).JPG
downloaded image (268).JPG
downloaded image (2690).JPG
downloaded image (2691).JPG
downloaded image (2692).JPG
downloaded image (2693).JPG
downloaded image (2694).JPG
downloaded image (2695).JPG
downloaded image (2696).JPG
downloaded image (2697).JPG
downloaded image (2698).JPG
downloaded image (2699).JPG
downloaded image (269).JPG
downloaded image (26).JPG
downloaded image (2700).JPG
downloaded image (2701).JPG
downloaded image (2702).JPG
downloaded image (2703).JPG
downloaded image (2704).JPG
downloaded image (2705).JPG
downloaded image (2706).JPG
downloaded image (2707).JPG
downloaded image (2708).JPG
downloaded image (2709).JPG
downloaded image (270).JPG
downloaded image (2710).JPG
downloaded image (2711).JPG
downloaded image (2712).JPG
downloaded image (2713).JPG
downloaded image (2714).JPG
downloaded image (2715).JPG
downloaded image (2716).JPG
downloaded image (2717).JPG
downloaded image (2718).JPG
downloaded image (2719).JPG
downloaded image (271).JP

## Check for incorrectly downloaded images

Images downloaded from broken links will have a size of 333 bytes.

In [25]:
# Report every file of the image folder whose size is exactly 333 bytes.
dir_name = '/data/jantina/CoralNet/used/Vavau/images/'

# Pair each regular file of the folder with its size in bytes.
files_with_size = []
for file_path in glob.glob(dir_name + '*'):
    if os.path.isfile(file_path):
        files_with_size.append((file_path, os.path.getsize(file_path)))

# Print the files that match the suspicious size.
for file_path, file_size in files_with_size:
    if file_size != 333:
        continue
    print(file_size, ' -->', file_path)