In [1]:
import json
import os
from itertools import chain
from pathlib import Path
import numpy as np

from datasets import Features, Image, Value, load_dataset, DatasetDict
from dotenv import load_dotenv
from google.cloud import storage

In [2]:
# Pull settings from a local .env file (if one is present) into the environment.
load_dotenv()

# Google Cloud settings; os.getenv returns None when a variable is not set.
GOOGLE_PROJECT = os.getenv("GOOGLE_PROJECT")
# NOTE(review): GOOGLE_BUCKET_URL is not referenced in the visible cells — confirm it is still needed.
GOOGLE_BUCKET_URL = os.getenv("GOOGLE_BUCKET_URL")
# Bucket that receives the JSON mappings and .npy arrays uploaded below.
BUCKET_NAME = "thesis-gbif-mappings"

# Hugging Face Hub repositories: the raw input and the two derived datasets
# pushed by this notebook.
HUGGING_FACE_SOURCE_DATASET = "jkbkaiser/thesis-gbif-raw"
HUGGING_FACE_HIERARCHY_DATASET = "jkbkaiser/thesis-gbif-hierarchy"
HUGGING_FACE_FLAT_DATASET = "jkbkaiser/thesis-gbif-flat"

# Local directory where artifacts are written before they are uploaded.
DATA_DIR = Path("./../../data")
GBIF_DATA_DIR = DATA_DIR / "gbif"
EXTRACT_LABELS_DIR = GBIF_DATA_DIR / "extract-labels"

In [3]:
# Create the local output directory (and any missing parents). exist_ok=True
# keeps the cell safe to re-run and avoids the race between a separate
# exists() check and the mkdir() call.
EXTRACT_LABELS_DIR.mkdir(parents=True, exist_ok=True)

In [15]:
def _upload_file_to_bucket(source_file_name, destination_blob_name):
    """Upload a local file to ``BUCKET_NAME``, replacing any blob with the same name.

    Args:
        source_file_name: Path of the local file to upload.
        destination_blob_name: Name of the blob inside the bucket.
    """
    storage_client = storage.Client(GOOGLE_PROJECT)
    bucket = storage_client.bucket(BUCKET_NAME)
    blob = bucket.blob(destination_blob_name)

    # Remove a previous version first so the upload always creates a fresh object.
    if blob.exists():
        print("deleted existing blob")
        blob.delete()

    # Cache-Control metadata asking clients not to cache the object, so readers
    # fetch the latest upload.
    blob.cache_control = "no-store,no-cache,max-age=0"
    blob.upload_from_filename(source_file_name, if_generation_match=None)
    print(f"File {source_file_name} uploaded to {destination_blob_name}.")


def upload_mappings(mappings, destination_blob_name):
    """Write ``mappings`` as JSON under ``EXTRACT_LABELS_DIR`` and upload the file.

    Args:
        mappings: JSON-serialisable object. JSON object keys are always strings,
            so integer dictionary keys are stored as strings in the file.
        destination_blob_name: File name used both locally and as the blob name.
    """
    source_file_name = EXTRACT_LABELS_DIR / destination_blob_name
    with open(source_file_name, "w") as f:
        json.dump(mappings, f)

    _upload_file_to_bucket(source_file_name, destination_blob_name)


def upload_numpy_array(mask, destination_blob_name):
    """Save ``mask`` in ``.npy`` format under ``EXTRACT_LABELS_DIR`` and upload the file.

    Args:
        mask: Array passed to ``np.save``.
        destination_blob_name: File name used both locally and as the blob name.
    """
    source_file_name = EXTRACT_LABELS_DIR / destination_blob_name
    with open(source_file_name, "wb") as f:
        np.save(f, mask)

    _upload_file_to_bucket(source_file_name, destination_blob_name)

In [5]:
# Load the raw dataset from the Hub and take its "data" split.
ds = load_dataset(HUGGING_FACE_SOURCE_DATASET)["data"]
print(ds)

def species_exists(row):
    """Return True when the row has a species value, False when it is None."""
    return row["species"] is not None

# Keep only rows that have a species label.
ds = ds.filter(species_exists)

# Drop the taxonomy key and identifier columns; only image and species are
# used by the following cells.
ds = ds.remove_columns(
    [
        "kingdom_key",
        "phylum_key",
        "order_key",
        "family_key",
        "genus_key",
        "scientific_name",
        "id",
    ]
)

def extract_names(entry):
    """Lower-case the species name and derive the genus from its first word.

    The genus is everything before the first space. When the name contains no
    space a warning is printed and the whole name is used as the genus.

    Returns:
        dict with the original ``image``, the ``genus`` and the lower-cased ``species``.
    """
    full_name = entry["species"].lower()
    genus, separator, _ = full_name.partition(" ")

    if not separator:
        # No space found: the name cannot be split into genus + epithet.
        print("Could not extract", [full_name])

    return dict(image=entry["image"], genus=genus, species=full_name)

# Normalise species names and add the derived genus column.
ds = ds.map(extract_names)
print(ds)

Dataset({
    features: ['image', 'id', 'kingdom_key', 'phylum_key', 'order_key', 'family_key', 'genus_key', 'scientific_name', 'species'],
    num_rows: 999
})
Dataset({
    features: ['image', 'species', 'genus'],
    num_rows: 750
})


# Separate labels

## Get Ids

In [6]:
def get_mapping_per_level(ds):
    """Build id<->name lookup tables for the species and genus columns.

    Ids are the positions of the names in ``ds.unique(...)`` for each column,
    so species ids and genus ids are numbered independently, both from 0.

    Returns:
        Tuple ``(id2species, species2id, id2genus, genus2id)``.
    """
    numpy_view = ds.with_format("numpy")

    species_names = numpy_view.unique("species")
    id2species = dict(enumerate(species_names))
    species2id = {name: idx for idx, name in id2species.items()}

    genus_names = numpy_view.unique("genus")
    id2genus = dict(enumerate(genus_names))
    genus2id = {name: idx for idx, name in id2genus.items()}

    return id2species, species2id, id2genus, genus2id

# Build the per-level lookup tables from the cleaned dataset.
id2species, species2id, id2genus, genus2id = get_mapping_per_level(ds)

# Bundle all four tables into one JSON document.
mappings= {
    "id2species": id2species,
    "species2id": species2id,
    "id2genus": id2genus,
    "genus2id": genus2id,
}

upload_mappings(mappings, "mappings_per_level.json")
# Number of distinct species and genera.
print(len(id2species))
print(len(id2genus))

deleted existing blob
File ../../data/gbif/extract-labels/mappings_per_level.json uploaded to mappings_per_level.json.
257
185


In [7]:
def map_to_id(row, genus2id, species2id):
    """Replace the genus and species names of ``row`` with their integer ids.

    The row is modified in place and also returned. A name missing from the
    corresponding lookup raises ``KeyError``.
    """
    for column, lookup in (("genus", genus2id), ("species", species2id)):
        row[column] = lookup[row[column]]
    return row

# Replace the string labels with their per-level integer ids.
# NOTE(review): "heirarchy" is a misspelling of "hierarchy"; the name is kept
# because later cells reference it.
heirarchy_ds = ds.map(lambda x: map_to_id(x, genus2id, species2id))
# Declare the label columns as int32 and the image column as a decoded image.
features = Features(
    {
        "image": Image(mode=None, decode=True, id=None),
        "species": Value(dtype="int32", id=None),
        "genus": Value(dtype="int32", id=None),
    }
)
heirarchy_ds = heirarchy_ds.cast(features)

In [8]:
# Sanity check: look up the name of the most frequent species id.
u, c = np.unique(heirarchy_ds["species"], return_counts=True)
label = u[np.argmax(c)]
id2species[label]

'coccinella septempunctata'

In [9]:
# Hold out 20% of the rows, then split that hold-out in half to obtain the
# validation and test sets (80/10/10 overall). The seed is fixed on both splits.
ds_train_validtest = heirarchy_ds.train_test_split(test_size=0.2, seed=42)
ds_validtest = ds_train_validtest["test"].train_test_split(test_size=0.5, seed=42)

ds_dict = DatasetDict({
    "train": ds_train_validtest["train"],
    "valid": ds_validtest["train"],
    "test": ds_validtest["test"],
})

ds_dict

DatasetDict({
    train: Dataset({
        features: ['image', 'species', 'genus'],
        num_rows: 600
    })
    valid: Dataset({
        features: ['image', 'species', 'genus'],
        num_rows: 75
    })
    test: Dataset({
        features: ['image', 'species', 'genus'],
        num_rows: 75
    })
})

In [10]:
# Publish the three splits to the Hub as a private dataset repository.
ds_dict.push_to_hub(HUGGING_FACE_HIERARCHY_DATASET, private=True)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/75 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/75 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/datasets/jkbkaiser/thesis-gbif-hierarchy/commit/c139bfe03284d5b5628ebffa2d6b23f9cbfb6b78', commit_message='Upload dataset', commit_description='', oid='c139bfe03284d5b5628ebffa2d6b23f9cbfb6b78', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/jkbkaiser/thesis-gbif-hierarchy', endpoint='https://huggingface.co', repo_type='dataset', repo_id='jkbkaiser/thesis-gbif-hierarchy'), pr_revision=None, pr_num=None)

## Extract hierarchy

In [11]:
def extract_hierarchy(ds):
    """Collect the species<->genus relations found in the rows of ``ds``.

    Each row must provide ``"species"`` and ``"genus"`` entries. If a species
    appears with more than one genus, the first genus is kept in the
    species->genus table and a "mismatch" line is printed; the species is still
    listed under every genus it appeared with.

    Returns:
        Tuple ``(species_to_genus, genus_to_species)`` where the first maps a
        species to one genus and the second maps a genus to the list of its
        species in order of first appearance.
    """
    genus_of = {}
    members_of = {}

    for record in ds:
        species_id, genus_id = record["species"], record["genus"]

        # setdefault stores the genus on first sight and otherwise returns the
        # genus recorded earlier for this species.
        known_genus = genus_of.setdefault(species_id, genus_id)
        if known_genus != genus_id:
            print("mismatch", known_genus, genus_id)

        members = members_of.setdefault(genus_id, [])
        if species_id not in members:
            members.append(species_id)

    return genus_of, members_of

# Derive the hierarchy from the id-encoded dataset, so keys and values are integer ids.
species_to_genus, genus_to_species = extract_hierarchy(heirarchy_ds)

In [36]:
num_species = len(species_to_genus)
num_genus = len(genus_to_species)

# mask[g, s] == 1 when species id s belongs to genus id g.
# Rows are addressed by genus id explicitly instead of relying on the
# iteration order of genus_to_species matching the id order; an id outside
# the expected range now raises IndexError rather than silently misaligning rows.
mask = np.zeros((num_genus, num_species))
for genus_id, species_ids in genus_to_species.items():
    mask[genus_id, species_ids] = 1

print(mask.shape)

(185, 257)


In [37]:
# Store the genus-by-species membership mask next to the mappings.
upload_numpy_array(mask, "hierarchy_mask.npy")

deleted existing blob
File ../../data/gbif/extract-labels/hierarchy_mask.npy uploaded to hierarchy_mask.npy.


In [33]:
# Dense lookup table: species_to_genus_arr[species_id] == genus_id.
species_to_genus_arr = np.zeros(num_species, dtype=np.int32)
for key, value in species_to_genus.items():
    species_to_genus_arr[key] = value

# Sanity check on the last species id. The ids are derived from the data
# rather than hard-coded, so the cell keeps working when the dataset changes.
if num_species:
    last_species_id = num_species - 1
    last_genus_id = int(species_to_genus_arr[last_species_id])
    print(last_genus_id)
    print(id2species[last_species_id])
    print(id2genus[last_genus_id])

23
cicindela duodecimguttata
cicindela


In [38]:
# Store the species-id -> genus-id lookup array.
upload_numpy_array(species_to_genus_arr, "species_to_genus.npy")

deleted existing blob
File ../../data/gbif/extract-labels/species_to_genus.npy uploaded to species_to_genus.npy.


In [47]:
# One-hot encoding of the hierarchy: entry [s, g] is 1 when species id s
# belongs to genus id g, and 0 otherwise.
species_to_genus_matrix = np.zeros((num_species, num_genus), dtype=np.float32)
species_ids = np.fromiter(species_to_genus.keys(), dtype=np.intp, count=len(species_to_genus))
genus_ids = np.fromiter(species_to_genus.values(), dtype=np.intp, count=len(species_to_genus))
# Set all (species, genus) pairs in one vectorised assignment.
species_to_genus_matrix[species_ids, genus_ids] = 1
print(species_to_genus_matrix.dtype)

float32


In [48]:
# Store the one-hot species -> genus matrix.
upload_numpy_array(species_to_genus_matrix, "species_to_genus_matrix.npy")

deleted existing blob
File ../../data/gbif/extract-labels/species_to_genus_matrix.npy uploaded to species_to_genus_matrix.npy.


# Shared labels

In [15]:
def get_flat_mapping(ds):
    """Build one shared id space covering genus names first, then species names.

    Returns:
        Tuple ``(id2labels, labels2id, split)``; ``split`` is the number of
        genus labels, i.e. the first id that belongs to a species.
    """
    numpy_view = ds.with_format("numpy")

    genus_names = numpy_view.unique("genus")
    species_names = numpy_view.unique("species")

    # Genus labels occupy ids [0, split); species labels follow.
    all_labels = genus_names + species_names
    split = len(genus_names)

    id2labels = dict(enumerate(all_labels))
    # NOTE(review): if a name occurs both as genus and as species, the later
    # (species) id wins in this reverse lookup — confirm this cannot happen.
    labels2id = {name: idx for idx, name in id2labels.items()}

    return id2labels, labels2id, split

# Build the shared (flat) label space: genus ids first, species ids after `split`.
id2labels, labels2id, split = get_flat_mapping(ds)

flat_mappings = {
    "id2labels": id2labels,
    "labels2id": labels2id,
    "split": split,
}

# Upload the flat mappings built above. The previous version passed the
# per-level `mappings` dict here, so flat_mapping.json held the wrong content.
upload_mappings(flat_mappings, "flat_mapping.json")
print(split)
print(len(id2labels))

deleted existing blob
File ../../data/gbif/extract-labels/flat_mapping.json uploaded to flat_mapping.json.
185
442


In [16]:
def map_to_label(row, labels2id):
    """Replace the genus and species names of ``row`` with ids from the shared lookup.

    The row is modified in place and also returned. A name missing from
    ``labels2id`` raises ``KeyError``.
    """
    for column in ("genus", "species"):
        row[column] = labels2id[row[column]]
    return row

# Encode genus and species with ids from the shared (flat) label space.
flat_ds = ds.map(lambda x: map_to_label(x, labels2id))
# Declare the label columns as int32 and the image column as a decoded image.
features = Features(
    {
        "image": Image(mode=None, decode=True, id=None),
        "species": Value(dtype="int32", id=None),
        "genus": Value(dtype="int32", id=None),
    }
)
flat_ds = flat_ds.cast(features)

Map:   0%|          | 0/750 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/750 [00:00<?, ? examples/s]

In [17]:
# Sanity check: look up the name of the most frequent species label in the flat id space.
u, c = np.unique(flat_ds["species"], return_counts=True)
label = u[np.argmax(c)]
id2labels[label]

'coccinella septempunctata'

In [18]:
# Same 80/10/10 split as for the hierarchy dataset, with the same seed.
# NOTE(review): this reuses the names ds_train_validtest / ds_validtest / ds_dict
# from the hierarchy split above, overwriting them.
ds_train_validtest = flat_ds.train_test_split(test_size=0.2, seed=42)
ds_validtest = ds_train_validtest["test"].train_test_split(test_size=0.5, seed=42)

ds_dict = DatasetDict({
    "train": ds_train_validtest["train"],
    "valid": ds_validtest["train"],
    "test": ds_validtest["test"],
})

ds_dict

DatasetDict({
    train: Dataset({
        features: ['image', 'species', 'genus'],
        num_rows: 600
    })
    valid: Dataset({
        features: ['image', 'species', 'genus'],
        num_rows: 75
    })
    test: Dataset({
        features: ['image', 'species', 'genus'],
        num_rows: 75
    })
})

In [19]:
# Publish the flat-label splits to the Hub as a private dataset repository.
ds_dict.push_to_hub(HUGGING_FACE_FLAT_DATASET, private=True)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/75 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/75 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/jkbkaiser/thesis-gbif-flat/commit/214fd745bd34a241260ac443cd88078295ddf2b2', commit_message='Upload dataset', commit_description='', oid='214fd745bd34a241260ac443cd88078295ddf2b2', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/jkbkaiser/thesis-gbif-flat', endpoint='https://huggingface.co', repo_type='dataset', repo_id='jkbkaiser/thesis-gbif-flat'), pr_revision=None, pr_num=None)