In [None]:
import yaml
import requests

# Download the YAML file. The timeout keeps a stalled connection from hanging
# the kernel, and raise_for_status() stops the cell on an HTTP error instead of
# parsing an error page as if it were the resource list.
file_url = "https://raw.githubusercontent.com/NFDI4BIOIMAGE/training/main/resources/nfdi4bioimage.yml"
response = requests.get(file_url, timeout=30)
response.raise_for_status()

# Parse YAML
data = yaml.safe_load(response.content)

# An empty document parses to None and a bare "resources:" key yields None;
# normalise both cases to an empty list so the loop below cannot crash.
resources = (data or {}).get("resources") or []

# Count all entries and split them by whether a non-empty license is set
total_entries = len(resources)
entries_without_license = []   # URLs only
entries_with_license = []      # (URL, license) pairs

for entry in resources:
    entry_url = entry.get("url", "No URL")
    entry_license = entry.get("license")
    if entry_license:
        entries_with_license.append((entry_url, entry_license))
    else:
        entries_without_license.append(entry_url)

print(f"Total entries: {total_entries}")
print(f"Entries without license: {len(entries_without_license)}")
print(f"Entries with license: {len(entries_with_license)}")
print("\nEntries without license:")
for url in entries_without_license:
    print(f"- {url}")
print("\nEntries with license:")
# Named `license_name` so the interactive `license` builtin is not shadowed.
for url, license_name in entries_with_license:
    print(f"- {url} (License: {license_name})")




Total entries: 756
Entries without license: 76
Entries with license: 680

Entries without license:
- https://focalplane.biologists.com/2023/07/26/sharing-your-poster-on-figshare/
- https://datamanagement.hms.harvard.edu/news/promoting-data-management-nikon-imaging-center-and-cell-biology-microscopy-facility
- https://blog.delmic.com/data-handling-in-large-scale-electron-microscopy
- https://focalplane.biologists.com/2023/06/01/tracking-in-napari/
- https://focalplane.biologists.com/2023/05/03/feature-extraction-in-napari/
- https://focalplane.biologists.com/2023/03/30/annotating-3d-images-in-napari/
- https://focalplane.biologists.com/2023/04/13/quality-assurance-of-segmentation-results/
- https://focalplane.biologists.com/2023/03/02/rescaling-images-and-pixel-anisotropy/
- https://forum.image.sc/t/user-friendly-image-metadata-annotation-tool-workflow-for-omero/87925/1
- https://focalplane.biologists.com/2024/04/03/how-to-write-a-bug-report/
- https://focalplane.biologists.com/2024/07/

In [None]:
# Count all entries without a 'type' tag and, among those, the ones whose URL
# refers to a Zenodo record for which get_zenodo_pdfs() reports at least one PDF.
# TODO: make sure that the PDFs are slides and not text documents
from caching import extract_zenodo_ids, get_zenodo_pdfs

total_entries = 0
entries_without_type = []             # URLs only
entries_with_type = []                # (URL, type) pairs
entries_without_type_zenodo_pdf = []  # subset of entries_without_type
test = []  # only referenced by the commented-out direct-.pdf check in the raw cell below

for entry in data.get("resources", []):
    total_entries += 1
    url = entry.get("url", "No URL")
    entry_type = entry.get("type")
    if entry_type:
        entries_with_type.append((url, entry_type))
        continue

    entries_without_type.append(url)
    # Deliberately not named `zenodo_ids`: the caching loop further down the
    # notebook iterates over a global of that name, and reusing it here would
    # silently replace it with the ids of the last URL seen.
    url_record_ids = extract_zenodo_ids(url)
    # any() stops at the first record that yields PDF files, so no further
    # records are fetched once the URL qualifies.
    if url_record_ids and any(get_zenodo_pdfs(record_id) for record_id in url_record_ids):
        entries_without_type_zenodo_pdf.append(url)

print(f"Total entries: {total_entries}")
print(f"Entries with type: {len(entries_with_type)}")
print(f"Entries without type: {len(entries_without_type)}")
print(f"Entries without type that are Zenodo PDF links: {len(entries_without_type_zenodo_pdf)}")

print("\nEntries without type that are Zenodo PDF links:")
for url in entries_without_type_zenodo_pdf:
    print(f"- {url}")

Failed to fetch Zenodo record 14729452, skipping...
Failed to fetch Zenodo record 14845059, skipping...
Failed to fetch Zenodo record 14832855, skipping...
Failed to fetch Zenodo record 13880367, skipping...
Failed to fetch Zenodo record 14917722, skipping...
Failed to fetch Zenodo record 14913673, skipping...
Failed to fetch Zenodo record 14893791, skipping...
Failed to fetch Zenodo record 14053758, skipping...
Failed to fetch Zenodo record 14043615, skipping...
Failed to fetch Zenodo record 14937632, skipping...
Failed to fetch Zenodo record 14933318, skipping...
Failed to fetch Zenodo record 14944040, skipping...
Failed to fetch Zenodo record 15031842, skipping...
Failed to fetch Zenodo record 15026373, skipping...
Failed to fetch Zenodo record 14988921, skipping...
Failed to fetch Zenodo record 14979253, skipping...
Failed to fetch Zenodo record 14975462, skipping...
Failed to fetch Zenodo record 14909526, skipping...
Failed to fetch Zenodo record 15001649, skipping...
Failed to fe

In [None]:
# RAW CELL: 
# try for later 
# Check for URLs that point directly to a .pdf file
# one could additionally check whether the link comes from GitHub etc.
# if isinstance(url, str) and url.lower().endswith(".pdf"):
#     entries_without_type_zenodo_pdf.append(url)
#     test.append(url)

### Caching the Embeddings for each Presentation Slide in our Training Material 
For each Zenodo record in our resources file "nfdi4bioimage.yml", we can generate different embeddings (visual, text and mixed).
Instead of calculating them over and over again for different tasks, we can calculate the embeddings once and store them somewhere (e.g. on Hugging Face) to load them again at any time.
For now, the embeddings are stored as a dataset on Hugging Face.

To get started, you have to choose between using the free GitHub Models or your own API key (from OpenAI). For that, adjust the variables __use_gh_models__ and __use_openai__ in the cell below. If necessary, also edit the name of the API key (as stored in your environment).

In [1]:
import sys
import os

# Add the root directory (the parent of the current working directory) to
# sys.path. The membership check keeps the cell idempotent: re-running it
# does not pile up duplicate entries on sys.path.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.append(project_root)

In [2]:
from huggingface_hub import login
import os

# Authenticate the current session with the token stored in the HF_TOKEN
# environment variable.
# NOTE(review): if HF_TOKEN is unset this passes token=None; the recorded
# widget output suggests login() then prompts interactively - confirm.
hf_token = os.environ.get("HF_TOKEN")
login(token=hf_token)

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Decide on whether to use GH Models or the OpenAI API.
# Set one of the two flags to True.
use_gh_models = True
use_openai = False

# Fail right here with a clear message; otherwise `token` would never be
# defined and a later cell would stop with an unrelated NameError.
if not (use_gh_models or use_openai):
    raise ValueError("No backend selected: set use_gh_models or use_openai to True.")

# Name of the environment variable that holds the API key; edit it if your key
# is stored under a different name. If both flags are set, the OpenAI key is
# the one that ends up in `token`.
token_env_var = "OPENAI_API_KEY" if use_openai else "GITHUB_TOKEN"
token = os.environ[token_env_var]

1. Create lists of valid licenses to filter out unwanted entries

In [7]:
# License strings (lower-case spellings) that are accepted when selecting
# records. Each spelling is listed exactly once; the list order is kept from
# the original definition.
valid_licenses = [
    'cc-by-3.0',
    'mozilla public license 2.0',
    'cc0 1.0 universal',
    'cc0-1.0',
    'apache-2.0',
    'bsd 3-clause',
    'mit-license',
    'odc-by-1.0',
    'apache license 2.0',
    'cc-by-3.0 unported',
    'cc-by-nc-3.0 unported',
    'cc0',
    'bsd3-clause',
    'cc-zero',
    'public domain',
    'mit license',
    'bsd 3-clause "new" or "revised" license',
    'cc0 (mostly, but can differ depending on resource)',
    'cc-by-4.0 international',
    'bsd-3-clause',
    'cc-by-nc-4.0',
    'creative commons attribution 4.0 international',
    'mit',
    'cc-by-4.0',
    'bsd-2-clause',
    'academic free license version 3.0',
    'creative commons attribution 3.0 (cc by 3.0) license',
]

In [5]:
# License strings that do not identify a usable license.
# NOTE(review): 'unkown' looks like a deliberate match for a misspelling in
# the source data - confirm before "fixing" it.
unclear_licenses = [
    'custom license',
    'unlicensed',
    'nan',
    'none',
    'unknown',
    'other-open',
    'unkown',
]

2. Load all Zenodo Record IDs from our Training Material

In [6]:
from caching import get_zenodo_ids_from_yaml
import requests

file_url = "https://raw.githubusercontent.com/NFDI4BIOIMAGE/training/main/resources/nfdi4bioimage.yml"
yaml_file = "nfdi4bioimage.yml"

# Download the current Training Material yaml file from the Git Repository.
# The timeout avoids hanging on a stalled connection; raise_for_status()
# ensures an HTTP error page is never written to disk and then reported as a
# successful download.
response = requests.get(file_url, timeout=30)
response.raise_for_status()

with open(yaml_file, "wb") as file:
    file.write(response.content)
print(f"File downloaded successfully as {yaml_file}")

# Extract the Zenodo Record IDs. The two license lists are handed to the
# helper; presumably it uses them to filter entries by license - see `caching`.
zenodo_ids = get_zenodo_ids_from_yaml(yaml_file, valid_licenses, unclear_licenses)
print(f"Found {len(zenodo_ids)} Zenodo records: {zenodo_ids}")

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


File downloaded successfully as nfdi4bioimage.yml
Found 39 Zenodo records: ['10008464', '10008465', '10083555', '10654775', '10679054', '10687658', '10793699', '10815329', '10816895', '10886749', '10939519', '10942559', '10970869', '10972692', '10990107', '11031746', '11066250', '11107798', '11265038', '11396199', '11472148', '11474407', '11548617', '12623730', '3778431', '4317149', '4328911', '4330625', '4334697', '4461261', '4630788', '4748510', '4748534', '4778265', '8070038', '8323588', '8329305', '8329306', '8414318']


3. Calculate and save all embeddings in a Huggingface Dataset

In [3]:
from caching import ensure_repo_exists, load_cache_dataset, append_rows_to_dataset, embed_text, embed_visual, embed_mixed, cache_hf
from datasets import concatenate_datasets, load_dataset

# Hugging Face dataset repository passed to cache_hf() for every record.
# NOTE(review): the recorded output below mentions
# 'haesleinhuepf/SlightInsight_Cache', which differs from this value - the
# output looks stale; confirm which repository is intended.
repo_name = "ScaDS-AI/SlightInsight_Cache"
# Call cache_hf() once per Zenodo record id. Judging by the logged output, each
# call processes the record's PDF files and uploads the result - see `caching`
# for the exact behaviour. An exception in any call stops the whole loop.
for record_id in zenodo_ids:
    cache_hf(record_id, token, use_openai, repo_name)

Repository 'haesleinhuepf/SlightInsight_Cache' created.
Processing 2023-Moore-N4BI-AHM-Welcome.pdf from Zenodo Record 10008464


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Finished processing Zenodo Record 10008464.
Repository 'haesleinhuepf/SlightInsight_Cache' already exists.


README.md:   0%|          | 0.00/510 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


train-00000-of-00001.parquet:   0%|          | 0.00/874k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/53 [00:00<?, ? examples/s]

Processing 2023-Moore-N4BI-AHM-Welcome.pdf from Zenodo Record 10008465


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Finished processing Zenodo Record 10008465.
Repository 'haesleinhuepf/SlightInsight_Cache' already exists.


README.md:   0%|          | 0.00/514 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/1.37M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/106 [00:00<?, ? examples/s]

Processing Bio-Image_Data_Strudel_TU-Dresden_TP_Workshop_2023.pdf from Zenodo Record 10083555


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Finished processing Zenodo Record 10083555.
Repository 'haesleinhuepf/SlightInsight_Cache' already exists.


README.md:   0%|          | 0.00/514 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/1.52M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/115 [00:00<?, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Finished processing Zenodo Record 10654775.
Repository 'haesleinhuepf/SlightInsight_Cache' already exists.
Processing ECBIA_2024_EF.pdf from Zenodo Record 10679054


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Finished processing Zenodo Record 10679054.
Repository 'haesleinhuepf/SlightInsight_Cache' already exists.


README.md:   0%|          | 0.00/514 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/1.73M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/126 [00:00<?, ? examples/s]

Processing SWAT4HCLS_2024_v11.pdf from Zenodo Record 10687658


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Finished processing Zenodo Record 10687658.
Repository 'haesleinhuepf/SlightInsight_Cache' already exists.


README.md:   0%|          | 0.00/514 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/1.74M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/127 [00:00<?, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Finished processing Zenodo Record 10793699.
Repository 'haesleinhuepf/SlightInsight_Cache' already exists.
Processing LLMs_BIA_v3.pdf from Zenodo Record 10815329


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Finished processing Zenodo Record 10815329.
Repository 'haesleinhuepf/SlightInsight_Cache' already exists.


README.md:   0%|          | 0.00/514 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/2.64M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/179 [00:00<?, ? examples/s]

Processing Cultivating Open Training_v3.pdf from Zenodo Record 10816895


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Finished processing Zenodo Record 10816895.
Repository 'haesleinhuepf/SlightInsight_Cache' already exists.


README.md:   0%|          | 0.00/514 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/3.27M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/214 [00:00<?, ? examples/s]

Processing FLIMfit_GerBi-FLIM-2024_Anca Margineanu.pdf from Zenodo Record 10886749
Processing FLUTE-software_GerBI-FLIM-2024_Chiara-Stringari.pdf from Zenodo Record 10886749
Processing napari-FLIM-phasor-plotter_GerBI-FLIM-2024_Conni-Wetzker+Marcelo-Zoccoler.pdf from Zenodo Record 10886749


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Finished processing Zenodo Record 10886749.
Repository 'haesleinhuepf/SlightInsight_Cache' already exists.


README.md:   0%|          | 0.00/514 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/4.37M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/280 [00:00<?, ? examples/s]

Processing 20240410_NFDI4BIOIMAGE_RDM_Bio-Medicine_Schmidt.pdf from Zenodo Record 10939519


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Finished processing Zenodo Record 10939519.
Repository 'haesleinhuepf/SlightInsight_Cache' already exists.


README.md:   0%|          | 0.00/514 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/4.50M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/289 [00:00<?, ? examples/s]

Processing Stefano_Della_Chiesa_2nd_SaxFDM-Beratungsstammtisch_v01.pdf from Zenodo Record 10942559


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Finished processing Zenodo Record 10942559.
Repository 'haesleinhuepf/SlightInsight_Cache' already exists.


README.md:   0%|          | 0.00/514 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/4.75M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/306 [00:00<?, ? examples/s]

Processing Intro_DM_de.pdf from Zenodo Record 10970869


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Finished processing Zenodo Record 10970869.
Repository 'haesleinhuepf/SlightInsight_Cache' already exists.


README.md:   0%|          | 0.00/514 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/5.19M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/338 [00:00<?, ? examples/s]

Processing DataWeek_git_de.pdf from Zenodo Record 10972692


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Finished processing Zenodo Record 10972692.
Repository 'haesleinhuepf/SlightInsight_Cache' already exists.


README.md:   0%|          | 0.00/514 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/6.04M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/413 [00:00<?, ? examples/s]

Processing DataWeek_Sharing+Licensing.pdf from Zenodo Record 10990107


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Finished processing Zenodo Record 10990107.
Repository 'haesleinhuepf/SlightInsight_Cache' already exists.


README.md:   0%|          | 0.00/514 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/7.33M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/527 [00:00<?, ? examples/s]

Processing nfdi-mpg_bioimage_20240418.pdf from Zenodo Record 11031746


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Finished processing Zenodo Record 11031746.
Repository 'haesleinhuepf/SlightInsight_Cache' already exists.


README.md:   0%|          | 0.00/514 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/7.63M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/553 [00:00<?, ? examples/s]

Processing Cultivating_open_training_v6.pdf from Zenodo Record 11066250


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Finished processing Zenodo Record 11066250.
Repository 'haesleinhuepf/SlightInsight_Cache' already exists.


README.md:   0%|          | 0.00/514 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/7.90M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/577 [00:00<?, ? examples/s]

Processing 2024-04-19_data-week_ws-datenorganisation.pdf from Zenodo Record 11107798


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Finished processing Zenodo Record 11107798.
Repository 'haesleinhuepf/SlightInsight_Cache' already exists.


README.md:   0%|          | 0.00/514 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/8.45M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/626 [00:00<?, ? examples/s]

Processing MICROSAM_TALK_HUMAN_TECHNOPOLE_MAY_24.pdf from Zenodo Record 11265038
Processing MICROSAM_TALK_SWISSBIAS_APR_24.pptx.pdf from Zenodo Record 11265038


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Finished processing Zenodo Record 11265038.
Repository 'haesleinhuepf/SlightInsight_Cache' already exists.


README.md:   0%|          | 0.00/515 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/10.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/770 [00:00<?, ? examples/s]

Processing 20240528_datenschutz_vertiefungsworkshop.pdf from Zenodo Record 11396199


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Finished processing Zenodo Record 11396199.
Repository 'haesleinhuepf/SlightInsight_Cache' already exists.


README.md:   0%|          | 0.00/517 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/10.6M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/815 [00:00<?, ? examples/s]

Processing 2024-06-04_FDM-Vertiefungskurs_Urheberrecht.pdf from Zenodo Record 11472148


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Finished processing Zenodo Record 11472148.
Repository 'haesleinhuepf/SlightInsight_Cache' already exists.


README.md:   0%|          | 0.00/517 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/11.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/856 [00:00<?, ? examples/s]

Processing GuideToFAIRBioImageData24_practical_tasks.pdf from Zenodo Record 11474407


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Finished processing Zenodo Record 11474407.
Repository 'haesleinhuepf/SlightInsight_Cache' already exists.


README.md:   0%|          | 0.00/517 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/11.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/864 [00:00<?, ? examples/s]

Processing S2-1_Simon_Parker-Research_Data_Protection_and_Data_Sharing_via_GHGA.pdf from Zenodo Record 11548617
Processing S3-1_Tom_Boissonnet-Day_To_Day_Image_Data_Management_With_OMERO.pdf from Zenodo Record 11548617
Processing S3-2_Riccardo_Massei-Data_Management_of_High_Content_Screening_Data_Using_OMERO.pdf from Zenodo Record 11548617
Processing S3-3_Marcelo_Zoccoler-Image_Analysis_with_OMERO.pdf from Zenodo Record 11548617
Processing S4-1_Simon_Bekemeier-Electronic-Lab-Notebooks-and-eLabFTW-An-Introduction.pdf from Zenodo Record 11548617
Processing S4-2_Luca_Bertinetti_First_Steps_Towards_Effective_Data_Management_in_a_Multidisciplinary_Research_Group_with_eLabFTW.pdf from Zenodo Record 11548617
Processing S4-3_Marc_Gentzel_eLabFTW_at_a_Facility.pdf from Zenodo Record 11548617
Processing WS_program_and_campus_survey_Conni_Wetzker.pdf from Zenodo Record 11548617


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Finished processing Zenodo Record 11548617.
Repository 'haesleinhuepf/SlightInsight_Cache' already exists.


README.md:   0%|          | 0.00/518 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/12.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1001 [00:00<?, ? examples/s]

Processing 01_Introduction_BIDS_2024.pdf from Zenodo Record 12623730
Processing 02_Introduction_RDM_2024.pdf from Zenodo Record 12623730
Processing 03_RSM_Image_Processing.pdf from Zenodo Record 12623730
Processing 04_Image_segmentation.pdf from Zenodo Record 12623730
Processing 05_Surface_Recon_QA.pdf from Zenodo Record 12623730
Processing 06_Chatbots.pdf from Zenodo Record 12623730
Processing 07_distributed_gpu_computing.pdf from Zenodo Record 12623730
Processing 08_Sup_Unsup_Machine_Learning.pdf from Zenodo Record 12623730
Processing 09_Deep_Learning.pdf from Zenodo Record 12623730
Processing 10_function_calling.pdf from Zenodo Record 12623730
Processing 11_prompteng_rag_finetuning.pdf from Zenodo Record 12623730
Processing 12_Vision_models.pdf from Zenodo Record 12623730
Processing 13_quality_assurance.pdf from Zenodo Record 12623730
Processing 14_Summary.pdf from Zenodo Record 12623730


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Finished processing Zenodo Record 12623730.
Repository 'haesleinhuepf/SlightInsight_Cache' already exists.


README.md:   0%|          | 0.00/518 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/24.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1864 [00:00<?, ? examples/s]

Processing CrashkursFDM_UniLeipzig_20200430.pdf from Zenodo Record 3778431
Processing CrashkursFDM_UniLeipzig_Uebungen_20200430.pdf from Zenodo Record 3778431


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Finished processing Zenodo Record 3778431.
Repository 'haesleinhuepf/SlightInsight_Cache' already exists.


README.md:   0%|          | 0.00/518 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/26.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2038 [00:00<?, ? examples/s]

Processing Creating Open Computational Curricula - CZI EOSS Meeting Dec 2020.pdf from Zenodo Record 4317149


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Finished processing Zenodo Record 4317149.
Repository 'haesleinhuepf/SlightInsight_Cache' already exists.


README.md:   0%|          | 0.00/518 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/26.6M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2062 [00:00<?, ? examples/s]

Processing 2020-12 QuPath-CZI (Compressed).pdf from Zenodo Record 4328911


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Finished processing Zenodo Record 4328911.
Repository 'haesleinhuepf/SlightInsight_Cache' already exists.


README.md:   0%|          | 0.00/518 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/27.4M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2108 [00:00<?, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Finished processing Zenodo Record 4330625.
Repository 'haesleinhuepf/SlightInsight_Cache' already exists.
Processing Nextflow CZI EOSS 2020.pdf from Zenodo Record 4334697


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Finished processing Zenodo Record 4334697.
Repository 'haesleinhuepf/SlightInsight_Cache' already exists.


README.md:   0%|          | 0.00/518 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/27.8M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2135 [00:00<?, ? examples/s]

Processing 210121_SaxFDM_DigitalKitchen_Repositorien.pdf from Zenodo Record 4461261


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Finished processing Zenodo Record 4461261.
Repository 'haesleinhuepf/SlightInsight_Cache' already exists.


README.md:   0%|          | 0.00/518 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/28.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2156 [00:00<?, ? examples/s]

Processing dmp_workshop_teil1_20210323.pdf from Zenodo Record 4630788


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Finished processing Zenodo Record 4630788.
Repository 'haesleinhuepf/SlightInsight_Cache' already exists.


README.md:   0%|          | 0.00/518 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/29.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2207 [00:00<?, ? examples/s]

Processing EinblickeFDM_Recht_UniLeipzig_20210511_v03.pdf from Zenodo Record 4748510


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Finished processing Zenodo Record 4748510.
Repository 'haesleinhuepf/SlightInsight_Cache' already exists.


README.md:   0%|          | 0.00/518 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/29.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2232 [00:00<?, ? examples/s]

Processing DMP_Workshop_Teil2_20210330.pdf from Zenodo Record 4748534


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Finished processing Zenodo Record 4748534.
Repository 'haesleinhuepf/SlightInsight_Cache' already exists.


README.md:   0%|          | 0.00/518 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2262 [00:00<?, ? examples/s]

Processing intro_rdm_os_srds_short.pdf from Zenodo Record 4778265


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Finished processing Zenodo Record 4778265.
Repository 'haesleinhuepf/SlightInsight_Cache' already exists.


README.md:   0%|          | 0.00/518 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/30.6M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2308 [00:00<?, ? examples/s]

Processing 202310_GENERAL_OMERO_Material_01_WhatIsOMERO.pdf from Zenodo Record 8323588
Processing 202310_GENERAL_OMERO_Material_02_ConnectToOMERO.pdf from Zenodo Record 8323588
Processing 202310_GENERAL_OMERO_Material_03_OMERO_Explained.pdf from Zenodo Record 8323588
Processing 202310_GENERAL_OMERO_Material_04_UserGroups.pdf from Zenodo Record 8323588
Processing 202310_GENERAL_OMERO_Material_05_UploadingData.pdf from Zenodo Record 8323588
Processing 202310_GENERAL_OMERO_Material_06-0_DataOrganization.pdf from Zenodo Record 8323588
Processing 202310_GENERAL_OMERO_Material_06-1_DataSearch.pdf from Zenodo Record 8323588
Processing 202310_GENERAL_OMERO_Material_07-0_Metadata.pdf from Zenodo Record 8323588
Processing 202310_GENERAL_OMERO_Material_07-1_Metadata_Tags.pdf from Zenodo Record 8323588
Processing 202310_GENERAL_OMERO_Material_07-2_Metadata_KeyValuePairs.pdf from Zenodo Record 8323588
Processing 202310_GENERAL_OMERO_Material_07-3_Metadata_Ontologies.pdf from Zenodo Record 8323588
P

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Finished processing Zenodo Record 8323588.
Repository 'haesleinhuepf/SlightInsight_Cache' already exists.


README.md:   0%|          | 0.00/518 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/33.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2540 [00:00<?, ? examples/s]

Processing Thinking_Data_Management_On_Different_Scales.pdf from Zenodo Record 8329305


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Finished processing Zenodo Record 8329305.
Repository 'haesleinhuepf/SlightInsight_Cache' already exists.


README.md:   0%|          | 0.00/518 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/33.6M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2570 [00:00<?, ? examples/s]

Processing Thinking_Data_Management_On_Different_Scales.pdf from Zenodo Record 8329306


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Finished processing Zenodo Record 8329306.
Repository 'haesleinhuepf/SlightInsight_Cache' already exists.


README.md:   0%|          | 0.00/518 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/33.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2600 [00:00<?, ? examples/s]

Processing 20231005_PIC_UFZ_NFDI4BIOIMAGE_Massei_public.pdf from Zenodo Record 8414318


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Finished processing Zenodo Record 8414318.


4. Example on how to load the data again for a specific slide from a specific Record/Presentation

In [2]:
from caching import load_single_hf_cache
import pandas as pd

# Example lookup: one slide of one record.
# The first argument is the Zenodo record id (as a string). The second appears
# to be the slide number, judging by the `page_number` column in the recorded
# output - confirm against `caching`.
example_record_id = "10083555"
example_slide = 3
df = load_single_hf_cache(example_record_id, example_slide)
df

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Unnamed: 0,key,zenodo_record_id,zenodo_filename,page_number,text_embedding,visual_embedding,mixed_embedding,extracted_text
0,record10083555_pdf1_slide3,10083555,Bio-Image_Data_Strudel_TU-Dresden_TP_Workshop_...,3,"[-0.8083789, -0.57003796, 0.026535584, 0.15640...","[-0.002609865041449666, -0.11352983117103577, ...","[-0.53784376, -0.4322776, 0.20752871, -0.15392...",What is the ‚Nationale Forschungsdaten Infrast...


5. How to load the whole dataset

In [3]:
from caching import load_full_hf_cache

# Load every cached row, then preview only the first rows rather than
# displaying the whole table.
df_full = load_full_hf_cache()
df_full.head()

Unnamed: 0,key,zenodo_record_id,zenodo_filename,page_number,text_embedding,visual_embedding,mixed_embedding,extracted_text
0,record10008464_pdf1_slide1,10008464,2023-Moore-N4BI-AHM-Welcome.pdf,1,"[0.8054575, -0.4204579, 0.111390926, 0.2636367...","[0.22024887800216675, 0.6564452648162842, 0.02...","[0.22330284, -0.5643485, 0.32713842, -0.074667...","Welcome to\nBioImage Town!\nJosh Moore, Senior..."
1,record10008464_pdf1_slide2,10008464,2023-Moore-N4BI-AHM-Welcome.pdf,2,"[1.1522328, 0.02467385, 0.23145455, 0.17258961...","[-0.3846272826194763, -0.01668522134423256, -0...","[0.34757608, -0.6562839, 0.5335755, -0.1906935...",Special welcome\nHonored New\nGuests Colleague...
2,record10008464_pdf1_slide3,10008464,2023-Moore-N4BI-AHM-Welcome.pdf,3,"[1.0057557, 0.18344171, 0.03796136, 0.23549518...","[-0.32376205921173096, 0.06897055357694626, 0....","[0.06456853, -0.79703176, 0.871621, -0.8768049...",Value of bioimages\nhttps://ome-model.readthed...
3,record10008464_pdf1_slide4,10008464,2023-Moore-N4BI-AHM-Welcome.pdf,4,"[1.2563236, 0.25261688, 0.0040982994, 0.202414...","[-0.03493745997548103, -0.11242333054542542, 0...","[0.3643795, -0.40593308, 0.97395766, -0.495041...",Value of bioimages\nAll-Hands Meeting · Josh M...
4,record10008464_pdf1_slide5,10008464,2023-Moore-N4BI-AHM-Welcome.pdf,5,"[0.8471789, -0.40933934, 0.85754687, -0.496454...","[-0.6693567037582397, -0.02434205450117588, -0...","[-0.22939722, -0.6752343, 0.57570195, 0.046955...","Image Data Management isn’t always FAIR\n""Clar..."
