In [1]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

In [4]:

def fetch_observations(species_name, per_page=30, page=1, timeout=30):
    """
    Fetch one page of observations for a given species name from iNaturalist.

    Parameters
    ----------
    species_name : str
        Name sent both as the free-text query (`q`) and as `taxon_name`.
    per_page : int
        Number of observations requested per page.
    page : int
        Page number to request.
    timeout : float
        Seconds to wait for the server before giving up.

    Returns
    -------
    dict or None
        The decoded JSON body on HTTP 200; None on any other status code or
        when the request itself fails (the reason is printed).
    """
    url = "https://api.inaturalist.org/v1/observations"
    params = {
        "q": species_name,
        "per_page": per_page,
        "page": page,
        "photos": True,
        "taxon_name": species_name,
        "iconic_taxa": "Actinopterygii"  # Ray-finned fishes
    }
    try:
        # Without a timeout, requests waits indefinitely on a stalled connection.
        response = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException as exc:
        # Report network failures the same way as HTTP errors so a long
        # batch run is not aborted by one bad request.
        print(f"Error: {exc}")
        return None

    if response.status_code == 200:
        return response.json()

    print(f"Error: {response.status_code}")
    return None

def download_image(url, save_path, timeout=30):
    """
    Download an image from a URL and save it to the specified path.

    Parameters
    ----------
    url : str
        Address of the image to fetch.
    save_path : str
        File path the image bytes are written to (overwritten if present).
    timeout : float
        Seconds to wait for the server before giving up.

    Returns
    -------
    bool
        True when the image was written; False when the server did not
        answer with HTTP 200 or the request failed (a message is printed).
    """
    try:
        # Using the response as a context manager releases the streamed
        # connection even when the body is never read (non-200 replies).
        with requests.get(url, stream=True, timeout=timeout) as response:
            if response.status_code != 200:
                print(f"Failed to download image: {url}")
                return False
            with open(save_path, 'wb') as file:
                for chunk in response.iter_content(1024):
                    file.write(chunk)
    except requests.RequestException as exc:
        print(f"Failed to download image: {url} ({exc})")
        return False
    return True
        
def download_fish_images(species_name, download_dir, max_images=10, per_page=200):
    """
    Download images of a specific fish species from iNaturalist.

    Observations are requested page by page until `max_images` photos have
    been handled or the API has no further results, so limits larger than a
    single page of observations can actually be reached.

    Parameters
    ----------
    species_name : str
        Species name used for the search, e.g. "Eviota teresae".
    download_dir : str
        Directory the images are saved into (created if missing).
    max_images : int
        Maximum number of photos to download. Nothing is requested when
        this is zero or negative.
    per_page : int
        Number of observations requested per API call.

    Returns
    -------
    None
    """
    os.makedirs(download_dir, exist_ok=True)

    # Keep the spaced name for the API query; only file names use underscores.
    file_stem = species_name.replace(" ", "_")

    count = 0
    page = 1
    while count < max_images:
        observations = fetch_observations(species_name, per_page=per_page, page=page)
        if not observations:
            if page == 1:
                print("No observations found.")
            return

        results = observations.get('results', [])
        for result in results:
            for photo in result.get('photos', []):
                if count >= max_images:
                    return
                image_url = photo.get('url')
                if not image_url:
                    continue
                # Construct the URL for the original-sized image
                original_url = image_url.replace("square", "original")
                image_id = photo.get('id')
                # Take the extension from the URL path only, so a trailing
                # query string never ends up in the file name.
                extension = os.path.splitext(urlparse(original_url).path)[1]
                save_path = os.path.join(download_dir, f"inaturalist_{file_stem}_{image_id}{extension}")
                download_image(original_url, save_path)
                print(f"Downloaded: {save_path}")
                count += 1

        # An empty or short page means there is nothing further to request.
        # Prefer the page size the server reports, in case it differs from
        # the one that was asked for.
        page_size = observations.get('per_page', per_page)
        if not results or len(results) < page_size:
            return
        page += 1
                


In [8]:
# Non-hidden entries of the processed image dataset directory.
PATH = "/Users/leonardo/Documents/Projects/cryptovision/data/processed/cv_images_dataset"

acctual_species = []
for folder in os.listdir(PATH):
    if folder.startswith("."):
        continue
    acctual_species.append(folder)

# Non-hidden entries of the web species directory on the external volume.
SDD_PATH = '/Volumes/T7_shield/CryptoVision/Data/web/species'

all_species = []
for folder in os.listdir(SDD_PATH):
    if folder.startswith("."):
        continue
    all_species.append(folder)

In [9]:
# Entries present in all_species but absent from acctual_species,
# kept in the order they appear in all_species.
difference = []
for item in all_species:
    if item in acctual_species:
        continue
    difference.append(item)
difference

['Plesiopidae_Assessor_flavissimus',
 'Gobiidae_Lythrypnus_nesiotes',
 'Monacanthidae_Monacanthus_tuckeri',
 'Gobiidae_Eviota_nebulosa',
 'Gobiidae_Eviota_teresae',
 'Gobiidae_Trimma_capostriatum',
 'Gobiidae_Eviota_monostigma',
 'Dactyloscopidae_Gillellus_uranidae',
 'Gobiidae_Eviota_prasites',
 'Gobiesocidae_Acyrtus_lanthanum',
 'Pseudochromidae_Pseudoplesiops_revellei',
 'Gobiidae_Trimma_maiandros',
 'Pseudochromidae_Pseudochromis_flammicauda',
 'Gobiidae_Cryptopsilotris_batrachodes',
 'Gobiidae_Hetereleotris_vulgaris']

In [10]:
PATH = "/Users/leonardo/Documents/Projects/cryptovision/data/processed/cv_images_dataset"

# Upper bound on images per species; it does not change between iterations.
max_images_to_download = 1000

# Only the folders listed in `difference` are processed; iterate over
# os.listdir(PATH) instead to process every folder of the dataset.
for folder in difference:
    if folder == ".DS_Store":
        continue

    # Folder names follow "<family>_<genus>_<species>". Splitting on the
    # first two underscores only keeps any further "_" inside the species
    # part, and names with fewer parts are skipped instead of raising
    # ValueError and aborting the whole batch.
    parts = folder.split("_", 2)
    if len(parts) != 3:
        print(f"Skipping unexpected folder name: {folder}")
        continue
    family, genus, species = parts

    print(f"==========> Downloading images for {folder}...")
    download_directory = f"/Volumes/T7_shield/CryptoVision/Data/inaturalist/{family}_{genus}_{species}"
    download_fish_images(f"{genus} {species}", download_directory, max_images=max_images_to_download)
    

Downloaded: /Volumes/T7_shield/CryptoVision/Data/inaturalist/Plesiopidae_Assessor_flavissimus/inaturalist_Assessor_flavissimus_416127847.png
Downloaded: /Volumes/T7_shield/CryptoVision/Data/inaturalist/Plesiopidae_Assessor_flavissimus/inaturalist_Assessor_flavissimus_416127853.png
Downloaded: /Volumes/T7_shield/CryptoVision/Data/inaturalist/Plesiopidae_Assessor_flavissimus/inaturalist_Assessor_flavissimus_391818292.png
Downloaded: /Volumes/T7_shield/CryptoVision/Data/inaturalist/Plesiopidae_Assessor_flavissimus/inaturalist_Assessor_flavissimus_390263686.png
Downloaded: /Volumes/T7_shield/CryptoVision/Data/inaturalist/Plesiopidae_Assessor_flavissimus/inaturalist_Assessor_flavissimus_390263666.png
Downloaded: /Volumes/T7_shield/CryptoVision/Data/inaturalist/Plesiopidae_Assessor_flavissimus/inaturalist_Assessor_flavissimus_390263658.png
Downloaded: /Volumes/T7_shield/CryptoVision/Data/inaturalist/Plesiopidae_Assessor_flavissimus/inaturalist_Assessor_flavissimus_365270324.jpg
Downloaded: /

In [9]:
# Target species name; it is also used as the name of the output folder.
species = "Priolepis Dawsoni"
# Local folder that receives the downloaded images.
download_directory = "/Users/leonardo/Documents/Projects/cryptovision/data/raw/" + species
# Upper bound on the number of images to download.
max_images_to_download = 1000
download_fish_images(species, download_directory, max_images=max_images_to_download)

Downloaded: /Users/leonardo/Documents/Projects/cryptovision/data/raw/Priolepis Dawsoni/Priolepis Dawsoni_19504470.jpg


In [17]:
import requests

def fetch_observations(taxon_name, rank, per_page=30, page=1, timeout=30):
    """
    Fetch one page of observations for a given taxon name and rank from iNaturalist.

    Parameters
    ----------
    taxon_name : str
        Taxon name sent as the `taxon_name` query parameter.
    rank : str
        Taxonomic rank sent as the `rank` query parameter.
    per_page : int
        Number of observations requested per page.
    page : int
        Page number to request.
    timeout : float
        Seconds to wait for the server before giving up.

    Returns
    -------
    dict or None
        The decoded JSON body on HTTP 200; None on any other status code or
        when the request itself fails (the reason is printed).
    """
    url = "https://api.inaturalist.org/v1/observations"
    params = {
        "taxon_name": taxon_name,
        "rank": rank,
        "per_page": per_page,
        "page": page,
        "photos": True
    }
    try:
        # Without a timeout, requests waits indefinitely on a stalled connection.
        response = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException as exc:
        # Report network failures the same way as HTTP errors.
        print(f"Error: {exc}")
        return None

    if response.status_code == 200:
        return response.json()

    print(f"Error: {response.status_code}")
    return None
    
import os

def download_image(url, save_path, timeout=30):
    """
    Download an image from a URL and save it to the specified path.

    Parameters
    ----------
    url : str
        Address of the image to fetch.
    save_path : str
        File path the image bytes are written to (overwritten if present).
    timeout : float
        Seconds to wait for the server before giving up.

    Returns
    -------
    bool
        True when the image was written; False when the server did not
        answer with HTTP 200 or the request failed (a message is printed).
    """
    try:
        # Using the response as a context manager releases the streamed
        # connection even when the body is never read (non-200 replies).
        with requests.get(url, stream=True, timeout=timeout) as response:
            if response.status_code != 200:
                print(f"Failed to download image: {url}")
                return False
            with open(save_path, 'wb') as file:
                for chunk in response.iter_content(1024):
                    file.write(chunk)
    except requests.RequestException as exc:
        print(f"Failed to download image: {url} ({exc})")
        return False
    return True
        
def download_taxon_images(taxon_name, rank, download_dir, max_images=10, per_page=200):
    """
    Download images of a specific taxon (genus or family) from iNaturalist.

    Observations are requested page by page until `max_images` photos have
    been handled or the API has no further results, so limits larger than a
    single page of observations can actually be reached.

    Parameters
    ----------
    taxon_name : str
        Taxon name to search for; also used as the file name prefix.
    rank : str
        Taxonomic rank passed through to the observation query.
    download_dir : str
        Directory the images are saved into (created if missing).
    max_images : int
        Maximum number of photos to download. Nothing is requested when
        this is zero or negative.
    per_page : int
        Number of observations requested per API call.

    Returns
    -------
    None
    """
    # Imported here so the cell stays runnable on its own.
    from urllib.parse import urlparse

    os.makedirs(download_dir, exist_ok=True)

    count = 0
    page = 1
    while count < max_images:
        observations = fetch_observations(taxon_name, rank, per_page=per_page, page=page)
        if not observations:
            if page == 1:
                print("No observations found.")
            return

        results = observations.get('results', [])
        for result in results:
            for photo in result.get('photos', []):
                if count >= max_images:
                    return
                image_url = photo.get('url')
                if not image_url:
                    continue
                # Construct the URL for the original-sized image
                original_url = image_url.replace("square", "original")
                image_id = photo.get('id')
                # Take the extension from the URL path only, so a trailing
                # query string never ends up in the file name.
                extension = os.path.splitext(urlparse(original_url).path)[1]
                save_path = os.path.join(download_dir, f"{taxon_name}_{image_id}{extension}")
                download_image(original_url, save_path)
                print(f"Downloaded: {save_path}")
                count += 1

        # An empty or short page means there is nothing further to request.
        # Prefer the page size the server reports, in case it differs from
        # the one that was asked for.
        page_size = observations.get('per_page', per_page)
        if not results or len(results) < page_size:
            return
        page += 1

In [19]:
# Target genus or family name; it is also used as the output folder name.
taxon = "Gobiidae"
# Rank of the taxon above: 'genus' or 'family'.
rank = "family"
# Local folder that receives the downloaded images.
download_directory = "/Users/leonardo/Documents/Projects/cryptovision/data/raw/" + taxon
# Upper bound on the number of images to download.
max_images_to_download = 1000
download_taxon_images(taxon, rank, download_directory, max_images=max_images_to_download)

Downloaded: /Users/leonardo/Documents/Projects/cryptovision/data/raw/Gobiidae/Gobiidae_458919348.jpg
Downloaded: /Users/leonardo/Documents/Projects/cryptovision/data/raw/Gobiidae/Gobiidae_458522284.jpeg
Downloaded: /Users/leonardo/Documents/Projects/cryptovision/data/raw/Gobiidae/Gobiidae_458522299.jpeg
Downloaded: /Users/leonardo/Documents/Projects/cryptovision/data/raw/Gobiidae/Gobiidae_458522315.jpeg
Downloaded: /Users/leonardo/Documents/Projects/cryptovision/data/raw/Gobiidae/Gobiidae_458522325.jpeg
Downloaded: /Users/leonardo/Documents/Projects/cryptovision/data/raw/Gobiidae/Gobiidae_458522334.jpeg
Downloaded: /Users/leonardo/Documents/Projects/cryptovision/data/raw/Gobiidae/Gobiidae_458522345.jpeg
Downloaded: /Users/leonardo/Documents/Projects/cryptovision/data/raw/Gobiidae/Gobiidae_458070337.jpeg
Downloaded: /Users/leonardo/Documents/Projects/cryptovision/data/raw/Gobiidae/Gobiidae_458037628.jpeg
Downloaded: /Users/leonardo/Documents/Projects/cryptovision/data/raw/Gobiidae/Gobii

In [14]:
import requests

def fetch_species_by_genus(genus_name, timeout=30):
    """
    Fetch species data for a given genus from FishBase.

    Parameters
    ----------
    genus_name : str
        Genus sent as the `Genus` query parameter.
    timeout : float
        Seconds to wait for the server before giving up.

    Returns
    -------
    dict or None
        The decoded JSON body on HTTP 200; None on any other status code or
        when the request itself fails (the reason is printed).
    """
    url = "https://fishbase.ropensci.org/species"
    params = {
        "Genus": genus_name
    }
    try:
        # Without a timeout, requests waits indefinitely on a stalled connection.
        response = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException as exc:
        # Report network failures the same way as HTTP errors.
        print(f"Error: {exc}")
        return None

    if response.status_code == 200:
        return response.json()

    print(f"Error: {response.status_code}")
    return None
    
import os

def download_image(url, save_path, timeout=30):
    """
    Download an image from a URL and save it to the specified path.

    Parameters
    ----------
    url : str
        Address of the image to fetch.
    save_path : str
        File path the image bytes are written to (overwritten if present).
    timeout : float
        Seconds to wait for the server before giving up.

    Returns
    -------
    bool
        True when the image was written; False when the server did not
        answer with HTTP 200 or the request failed (a message is printed).
    """
    try:
        # Using the response as a context manager releases the streamed
        # connection even when the body is never read (non-200 replies).
        with requests.get(url, stream=True, timeout=timeout) as response:
            if response.status_code != 200:
                print(f"Failed to download image: {url}")
                return False
            with open(save_path, 'wb') as file:
                for chunk in response.iter_content(1024):
                    file.write(chunk)
    except requests.RequestException as exc:
        print(f"Failed to download image: {url} ({exc})")
        return False
    return True
        
def download_genus_images(genus_name, download_dir, max_images_per_species=5):
    """
    Download images of all species within a specific genus from FishBase.

    The species list comes from `fetch_species_by_genus`; for every species
    the FishBase summary page is fetched and its thumbnail `<img>` sources
    are rewritten into picture URLs and downloaded.

    Parameters
    ----------
    genus_name : str
        Genus whose species are looked up.
    download_dir : str
        Directory the images are saved into (created if missing).
    max_images_per_species : int
        Maximum number of images downloaded from each species page.

    Returns
    -------
    None
    """
    # Imported once here (not per species) so the cell stays runnable on its own.
    from urllib.parse import urljoin, urlparse
    from bs4 import BeautifulSoup

    os.makedirs(download_dir, exist_ok=True)

    species_data = fetch_species_by_genus(genus_name)
    if not species_data:
        print("No species found for the given genus.")
        return

    for species in species_data.get('data', []):
        species_name = species.get('Species')
        genus = species.get('Genus')
        if not (species_name and genus):
            continue

        full_species_name = f"{genus} {species_name}"
        print(f"Fetching images for {full_species_name}...")
        # Construct the URL to the species summary page
        species_url = f"https://www.fishbase.se/summary/{genus}-{species_name}.html"
        try:
            response = requests.get(species_url, timeout=30)
        except requests.RequestException:
            # Treat a failed request like a non-200 reply and move on.
            response = None
        if response is None or response.status_code != 200:
            print(f"Failed to fetch species page for {full_species_name}")
            continue

        # Parse the page to find image URLs
        soup = BeautifulSoup(response.content, 'html.parser')
        count = 0
        for img in soup.find_all('img'):
            if count >= max_images_per_species:
                break
            img_url = img.get('src')
            if not img_url or 'thumbnails' not in img_url.lower():
                continue
            # Rewrite the thumbnail address into the full picture address.
            # NOTE(review): the check above ignores case but this replace
            # does not - confirm which spelling FishBase actually serves.
            img_url = img_url.replace('Thumbnails', 'Pictures')
            img_url = img_url.replace('tn_', '')
            # Resolve against the page URL so "/x", "../x" and "x" style
            # sources all become valid absolute URLs.
            img_url = urljoin(species_url, img_url)
            # Name the file after the URL path only, without any query string.
            img_name = os.path.basename(urlparse(img_url).path)
            if not img_name:
                continue
            save_path = os.path.join(download_dir, img_name)
            download_image(img_url, save_path)
            print(f"Downloaded: {save_path}")
            count += 1
                
# Target genus; it is also used as the output folder name.
genus = "Eviota"
# Rank label; it is not passed to download_genus_images below.
rank = "genus"
# Local folder that receives the downloaded images.
download_directory = "/Users/leonardo/Documents/Projects/cryptovision/data/raw/fishbase/" + genus
# Upper bound on the number of images downloaded per species.
max_images = 5
download_genus_images(genus, download_directory, max_images_per_species=max_images)

Error: 403
No species found for the given genus.


In [7]:
from pptx import Presentation

def extract_text_from_pptx(pptx_path):
    """
    Collect the text of every slide in a PowerPoint file.

    Returns a list with one string per slide, in slide order. Each string
    is the text of all shapes on that slide that expose a `text`
    attribute, joined with newlines.
    """
    deck = Presentation(pptx_path)
    return [
        "\n".join(shape.text for shape in slide.shapes if hasattr(shape, "text"))
        for slide in deck.slides
    ]

# Example usage
pptx_path = '/Volumes/T7_shield/Eviota Guide_2.pptx'
slides_text = extract_text_from_pptx(pptx_path)

# Print each slide's text under a header with the slide's index plus one.
for i, text in enumerate(slides_text):
    header = f"--- Slide {i+1} ---"
    print(header)
    print(text)
    print()


--- Slide 1 ---
LIRS field trips – Aug 2022/Oct 2023
Lizard Island Marine Field Station 

Eviota Guide from:
Taxonomy, diversity and distribution of Australia’s smallest vertebrate (SJB & CHG).

Photos: Chris Goatley, Simon Brandl, Jordan Casey, Christopher R Hemingson 
Compiled by: Kyra Jean MCipolla

Field crew:
Simon J Brandl
Christopher HR Goatley
Jordan M Casey
Joey DiBattista
Kyra Jean M Cipolla
Christopher R Hemingson
John Majoris
Christina Marconi


--- Slide 2 ---
LIRS23_0029
Eviota_zebrina
LIRS23_0030
LIRS23_0031
LIRS23_0032
LIRS23_0315
Put just “eviota” but KJ pretty sure zebrina
LIRS23_0316


--- Slide 3 ---
Eviota_infulata
W

--- Slide 4 ---
Eviota_melasma
LIRS23_0038
LIRS23_0037
LIRS23_0078
LIRS23_0107
LIRS23_0165
LIRS23_0343
LIRS23_0346
LIRS23_0347
LIRS23_0407
Put just “eviota” but KJ pretty sure melasma-yay

LIRS23_0416
LIRS23_0417

--- Slide 5 ---
LIRS23_0106
Eviota monostigma
LIRS23_0270
LIRS23_0271
LIRS23_0051
LIRS23_0355
LIRS23_0360
LIRS23_0386
LIRS23_0832

--- Slid

In [13]:
# Maps a species label ("Eviota_<species>") to the list of image IDs assigned
# to it. The copy loop further down uses each key as a destination folder name
# and each ID as a file stem ("<id>.jpg").
# NOTE(review): the IDs appear to be transcribed by hand from the slide text
# extracted from the Eviota guide deck - verify against the deck before
# relying on them.
eviota_guide = {
    "Eviota_melasma":
        ['LIRS23_0038', 'LIRS23_0037', 'LIRS23_0078', 'LIRS23_0107', 'LIRS23_0165', 'LIRS23_0343', 'LIRS23_0346', 'LIRS23_0347', 'LIRS23_0407', 'LIRS23_0416', 'LIRS23_0417'],
    'Eviota_zebrina': 
        ['LIRS23_0029', 'LIRS23_0030', 'LIRS23_0031', 'LIRS23_0032', 'LIRS23_0315', 'LIRS23_0316'],
    'Eviota_monostigma':
        ['LIRS23_0106', 'LIRS23_0270', 'LIRS23_0271', 'LIRS23_0051', 'LIRS23_0355', 'LIRS23_0360', 'LIRS23_0386', 'LIRS23_0832'],
    'Eviota_maculosa':
        ['LIRS23_0081', 'LIRS23_0130', 'LIRS23_0133', 'LIRS23_0312', 'LIRS23_0313', 'LIRS23_0314', 'LIRS23_0402', 'LIRS23_0403', 'LIRS23_0472', 'LIRS23_0478', 'LIRS23_0604'],
    'Eviota_nebulosa':
        ['LIRS23_0072', 'LIRS23_0082', 'LIRS23_0111', 'LIRS23_0112', 'LIRS23_0272', 'LIRS23_0406', 'LIRS23_0419', 'LIRS23_0765', 'LIRS23_0766', 'LIRS23_0768', 'LIRS23_0831', 'LIRS23_0867', 'LIRS23_0868', 'LIRS23_0387'],
    "Eviota_atriventris":
        ['LIRS23_0047', 'LIRS23_0073', 'LIRS23_0094', 'LIRS23_0435', 'LIRS23_0481', 'LIRS23_0501', 'LIRS23_0502', 'LIRS23_0503'],
    'Eviota_prasites':
        ['LIRS23_0046', 'LIRS23_0059', 'LIRS23_0159', 'LIRS23_0344', 'LIRS23_0350', 'LIRS23_0359', 'LIRS23_0389', 'LIRS23_0390', 'LIRS23_0391', 'LIRS23_0432', 'LIRS23_0433', 'LIRS23_0434', 'LIRS23_0087', 'LIRS23_0476', 'LIRS23_0477', 'LIRS23_0055'],
    'Eviota_teresae':
    ['LIRS23_0396', 'LIRS23_0397', 'LIRS23_0629', 'LIRS23_0678', 'LIRS23_0679', 'LIRS23_0726', 'LIRS23_0727', 'LIRS23_0728', 'LIRS23_0729', 'LIRS23_0730', 'LIRS23_0764', 'LIRS23_0837'],
}

In [15]:
import shutil

main_path = "/Volumes/T7_shield/CryptoVision/Data/others/chris_images/"

# Index the source folder by lower-cased file name. The images on disk use an
# upper-case ".JPG" extension, so looking up "<id>.jpg" directly only works on
# a case-insensitive filesystem; the index makes the match independent of that.
source_names = os.listdir(main_path) if os.path.isdir(main_path) else []
source_by_lower_name = {name.lower(): name for name in source_names}

for species, image_ids in eviota_guide.items():
    species_dir = f"/Volumes/T7_shield/CryptoVision/Data/eviota/{species}"
    for image_id in image_ids:
        image_path = os.path.join(main_path, f"{image_id}.jpg")
        actual_name = source_by_lower_name.get(f"{image_id}.jpg".lower())
        if actual_name is None:
            print(f"Image not found: {image_path}")
            continue
        # Create the species folder lazily so a species without any image
        # does not get an empty folder.
        os.makedirs(species_dir, exist_ok=True)
        # The destination keeps the lower-case ".jpg" name used so far.
        shutil.copy(os.path.join(main_path, actual_name), os.path.join(species_dir, f"{image_id}.jpg"))

In [9]:
import os

def find_file(file_name, search_path=None, case_sensitive=True):
    """
    Search for a file by name in a directory tree.

    Parameters:
    -----------
    file_name : str
        Name of the file to search for.
    search_path : str, optional
        The root directory to start the search from (default is root '/' or 'C:\\').
    case_sensitive : bool, optional
        When True (default) the name must match exactly. When False the
        comparison ignores case, so 'photo.jpg' also finds 'PHOTO.JPG'.

    Returns:
    --------
    list
        A list of full paths to the matching files, built from the names as
        they are stored on disk. Empty when nothing matches.
    """
    if search_path is None:
        search_path = "/" if os.name != 'nt' else "C:\\"

    wanted = file_name if case_sensitive else file_name.casefold()

    matches = []
    for root, _dirs, files in os.walk(search_path):
        for candidate in files:
            key = candidate if case_sensitive else candidate.casefold()
            if key == wanted:
                # Join with the on-disk name so the returned path is valid
                # even when the match ignored case.
                matches.append(os.path.join(root, candidate))

    return matches

# Example usage

file_to_find = 'LIRS23_0397.JPG'
results = find_file(file_to_find, search_path="/Volumes/T7_shield")

# Report every location where the file was found, or say that it was not.
if not results:
    print("File not found.")
else:
    print("File(s) found:")
    for path in results:
        print(path)


File(s) found:
/Volumes/T7_shield/CryptoVision/Data/others/chris_images/LIRS23_0397.JPG
/Volumes/T7_shield/CryptoVision/Data/others/cv_organized/unknown/LIRS23_0397.JPG
