# Scraper for CoralNet

This scraper downloads a full dataset from CoralNet.

Datasets can also be filled with new images when more are confirmed.

For each source, download:
- `labelset.csv`
- `metadata.csv`
- `annotations.csv`

## Import necessary libraries

In [3]:
from bs4 import BeautifulSoup
from skimage import io

import matplotlib.pyplot as plt
import os
import pandas as pd
import requests
import warnings
warnings.filterwarnings('ignore')

## Prepare directory for data download

In the `/data/jantina` drive the sources are organised as follows:
```
CoralNet
│ 
└─── WAPA_RFM
│   │
│   └─── images
│   │      
│   └─── labels
│   │      
│   └─── masks    
│   │      
│   └─── output      
│   │      
│   └─── other   
│
└─── other
    │     
```

In [61]:
# Start from the CoralNet root so the relative folder names below resolve.
os.chdir("/data/jantina/CoralNet")

# Name of the source folder; the image folder lives inside it.
path = "WAPA_RFM"

# Each step is (folder to ensure, message printed when it had to be created,
# whether to step into the folder afterwards). The source folder is entered
# before "images" is checked, so "images" is resolved inside the source folder.
for folder, message, enter in (
    (path, "[INFO] The new directory is created!", True),
    ("images", "[INFO] The new image folder is created!", False),
):
    isExist = os.path.exists(folder)
    if not isExist:
        os.makedirs(folder)
        print(message)
    if enter:
        os.chdir(folder)

# Load the image metadata of the source; the "Name" column lists the images.
df = pd.read_csv('metadata.csv')
imageList = df['Name']
print('[INFO] Number of images: ', imageList.size)

Number of Images:  2640


## Get image names and links from specific source

Parse the CoralNet source code using BeautifulSoup.

In [62]:
# Collect image names and full-resolution image links from the CoralNet
# browse pages of one source.
#
# Results:
#   names -- image names taken from the thumbnail titles
#   links -- URLs of the original images, taken from each image's detail page
# The download step pairs names[k] with links[k] by index, so both lists are
# expected to be filled in the same order.
searchList = imageList
names = []
links = []

# grab the link from the specific source
site_url = 'https://coralnet.ucsd.edu'
page_url = "https://coralnet.ucsd.edu/source/2112/browse/images/"

# NOTE(review): these are credentials of one logged-in browser session.
# They presumably expire, and they should not be committed to version
# control -- replace them with values from your own session before running.
csrf_token = "3zCbRyJMzORpgQNzmyoUyiT99XEAQREYqYdTKxJzO6NnQEG67jvLwC9HdNy1qbSV"
session_cookie = "__utmc=209852167; __utmz=209852167.1651666647.1.1.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided); csrftoken=S2NTHiiDHomFqLglSikWZwmQkIFLUkHtfroBAhiqWGiD0z9SD3rNXQCooyzcuEVq; sessionid=vyo2ksxe1ig372r6hbv1lr5ws68egxv2; __utma=209852167.1649723512.1651666647.1651666647.1651736260.2; __utmt=1; __utmb=209852167.5.10.1651736260"

# grab image names and links
# The page range is hard-coded for this source (pages 1 to 132); adjust it
# when the number of result pages changes.
for i in range(1, 133):
    # grab the source code of one result page
    r = requests.post(page_url,
                      data={"csrfmiddlewaretoken": csrf_token,
                            "image_form_type": "search",
                            "photo_date_1": 2020,
                            #"annotation_status": "confirmed",
                            "last_annotated_1": 2020,
                            "sort_method": "name",
                            "sort_direction": "asc",
                            "page": i},
                      headers={'Referer': page_url,
                               "Cookie": session_cookie})

    # parse with BeautifulSoup
    soup = BeautifulSoup(r.text, "html.parser")

    # extract names
    for image in soup.find_all("img"):
        title = image.get("title")
        if title is None:
            # An <img> without a title carries no image name; indexing it
            # with image["title"] would raise KeyError.
            continue
        name = title.split(" - Confirmed")[0]
        names.append(name)

        # for source with nasty names
        #if name.find(":")!=-1:
            #continue
        #else:
            #name = name.split("/")[0] + ".jpeg"

    # extract links
    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        # Anchors without an href return None, which cannot be searched.
        if not href or "/image/" not in href:
            continue

        # Open the image's own page to find the full-resolution file. Separate
        # names are used so the listing page's response and soup are not
        # overwritten while its anchors are still being processed.
        img_url = site_url + href
        detail_response = requests.get(img_url)
        detail_soup = BeautifulSoup(detail_response.text, "html.parser")
        div = detail_soup.find(id="original_image_container")
        original_img = div.find('img') if div is not None else None
        if original_img is None:
            # Stop instead of skipping: a skipped link would shift every
            # later entry of `links` relative to `names`.
            raise RuntimeError(
                f'No original image found on {img_url}; '
                'check that the session cookie is still valid.')
        link = original_img.attrs['src']
        links.append(link)

print(f'[INFO] Found {len(links)} images to download.')

## Filling in only missing images

If the source is new, skip this step.

In [None]:
# Reduce searchList to the images that are not on disk yet.

# Folder holding the images that were downloaded previously.
directory = "/data/jantina/CoralNet/WAPA_RFM/images_almost_all/"

# os.chdir() returns None, so its result must not be used as the path.
# The working-directory switch of the original cell is kept, but the folder
# is listed through its explicit path.
os.chdir(directory)
filenames = os.listdir(directory)

# keep only the names without a matching file in the folder
searchList = list(set(searchList) - set(filenames))
print(f'[INFO] Found {len(searchList)} images to download.')

## Download all the images

In [None]:
# Download every collected image whose name is in searchList and store it
# under its name in the image folder.

# go to the image folder
os.chdir("/data/jantina/CoralNet/WAPA_RFM/images/")

# Build the lookup set once instead of on every iteration.
wanted = set(searchList)

# names and links are paired by position; a length mismatch means some
# pairs cannot be formed.
if len(names) != len(links):
    print(f'[WARN] {len(names)} names but {len(links)} links; '
          'entries beyond the shorter list are skipped.')

k = 0
for name, link in zip(names, links):
    # cross reference the names with the searchList
    if name not in wanted:
        continue

    # get the image from the URL
    r = requests.get(link, allow_redirects=True)
    if not r.ok:
        # Do not store an HTTP error page as if it were an image.
        print(f'[WARN] Skipping {name}: HTTP status {r.status_code}')
        continue

    # write it to a local file; the context manager closes the handle
    try:
        with open(name, 'wb') as image_file:
            image_file.write(r.content)
    except OSError as err:
        # e.g. a name that is not a valid file name; report and move on
        print(f'[WARN] Could not write {name}: {err}')
        continue

    # increment the number of images we have found
    k = k + 1
    print("downloaded image " + name)

print(f'[INFO] Found {k} images.')