### Notebook to check the licenses of all Zenodo records in our cache and upload a README to Hugging Face with the corresponding license terms

In [5]:
import requests

def get_zenodo_license(record_id):
    """Return the license identifier of a Zenodo record.

    Queries ``https://zenodo.org/api/records/<record_id>`` and reads
    ``metadata.license.id`` from the JSON reply.

    Args:
        record_id: Zenodo record identifier, interpolated into the API URL.

    Returns:
        str: the license id on success; ``"No ID available"`` when the
        license entry has no ``id`` key; ``"No license information found."``
        when the license entry is missing or empty; or an
        ``"Error: Unable to fetch data (Status Code: ...)"`` message when the
        HTTP status is not 200. Failures are reported through the returned
        string, never through an exception raised by this function itself.
    """
    reply = requests.get(f"https://zenodo.org/api/records/{record_id}")

    # Any status other than 200 is reported back as a message string.
    if reply.status_code != 200:
        return f"Error: Unable to fetch data (Status Code: {reply.status_code})"

    license_entry = reply.json().get("metadata", {}).get("license", {})

    # A missing or empty license entry gets its own message.
    if not license_entry:
        return "No license information found."

    return license_entry.get("id", "No ID available")


## Load the Data
Load the dataset from Hugging Face and convert it to a pandas DataFrame (the conversion is done automatically by `load_full_hf_cache`).

In [1]:
from caching import load_full_hf_cache
import pandas as pd

# Hugging Face dataset repository that holds the cache; the upload cell at the
# end of the notebook passes the same name as its `repo_id`.
repo_name = "ScaDS-AI/SlightInsight_Cache"

# Load the complete cache; the following cells use the result as a DataFrame.
df = load_full_hf_cache(repo_name=repo_name)

Using the latest cached version of the dataset since ScaDS-AI/SlightInsight_Cache couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /home/lea/.cache/huggingface/datasets/ScaDS-AI___slight_insight_cache/default/0.0.0/a402c19f8ff75a5f2f671c647710c447bd15a55b (last modified on Thu Mar  6 15:58:01 2025).


In [2]:
# Preview the first rows to check which columns the cache provides.
df.head()

Unnamed: 0,key,zenodo_record_id,zenodo_filename,page_number,text,visual,mixed
0,record10008464_pdf1_slide1,10008464,2023-Moore-N4BI-AHM-Welcome.pdf,1,"[0.8054575, -0.4204579, 0.111390926, 0.2636367...","[0.22024887800216675, 0.6564452648162842, 0.02...","[0.22330284, -0.5643485, 0.32713842, -0.074667..."
1,record10008464_pdf1_slide2,10008464,2023-Moore-N4BI-AHM-Welcome.pdf,2,"[1.1522328, 0.02467385, 0.23145455, 0.17258961...","[-0.3846272826194763, -0.01668522134423256, -0...","[0.34757608, -0.6562839, 0.5335755, -0.1906935..."
2,record10008464_pdf1_slide3,10008464,2023-Moore-N4BI-AHM-Welcome.pdf,3,"[1.0057557, 0.18344171, 0.03796136, 0.23549518...","[-0.32376205921173096, 0.06897055357694626, 0....","[0.06456853, -0.79703176, 0.871621, -0.8768049..."
3,record10008464_pdf1_slide4,10008464,2023-Moore-N4BI-AHM-Welcome.pdf,4,"[1.2563236, 0.25261688, 0.0040982994, 0.202414...","[-0.03493745997548103, -0.11242333054542542, 0...","[0.3643795, -0.40593308, 0.97395766, -0.495041..."
4,record10008464_pdf1_slide5,10008464,2023-Moore-N4BI-AHM-Welcome.pdf,5,"[0.8471789, -0.40933934, 0.85754687, -0.496454...","[-0.6693567037582397, -0.02434205450117588, -0...","[-0.22939722, -0.6752343, 0.57570195, 0.046955..."


## Extract all unique IDs

In [3]:
# Every slide row carries its record ID, so reduce the column to one entry per
# Zenodo record before querying the Zenodo API.
unique_zenodo_ids = df["zenodo_record_id"].unique()
print(unique_zenodo_ids)

['10008464' '10008465' '10083555' '10679054' '10687658' '10815329'
 '10816895' '10886749' '10939519' '10942559' '10970869' '10972692'
 '10990107' '11031746' '11066250' '11107798' '11265038' '11396199'
 '11472148' '11474407' '11548617' '12623730' '3778431' '4317149' '4328911'
 '4334697' '4461261' '4630788' '4748510' '4748534' '4778265' '8323588'
 '8329305' '8329306' '8414318']


Create a Set to ensure that all entries have the same license

## Now create the ReadMe file with the following information:
- Links to the original Zenodo records
- Authors
- Licenses of the original records

First, gather information for each record:

In [6]:
# One Markdown bullet per Zenodo record: link, authors and license.
records_info = []

for record in unique_zenodo_ids:
    # Per-record fallbacks. Without them a failed request would either raise a
    # NameError (first record) or silently reuse the authors/URL of the
    # previously processed record.
    authors = "Unknown Author"
    record_url = f"https://zenodo.org/record/{record}"

    url = f"https://zenodo.org/api/records/{record}"
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        creators = data.get("metadata", {}).get("creators", [])
        # Keep the fallback when the record lists no creators at all, so the
        # README never shows an empty "Authors" field.
        authors = ", ".join(
            creator.get("name", "Unknown Author") for creator in creators
        ) or authors
        record_url = data.get("links", {}).get("html", record_url)
    else:
        # Name the record so the failing entry can be identified in the output.
        print(f"Error: Unable to fetch data for record {record} (Status Code: {response.status_code})")

    # The helper reports failures as a message string, which then appears in
    # the README entry in place of a license id.
    license = get_zenodo_license(record)

    records_info.append(f"- **[Zenodo Record {record}]({record_url})**  \n  **Authors**: {authors}  \n  **License**: {license}\n")

Second, combine it with some more general information (in Markdown Style)

In [7]:
# Markdown text of the dataset card: a description of the dataset columns,
# followed by one entry per Zenodo record (taken from `records_info`).
# NOTE(review): the uploaded card has no YAML front matter, which the Hub
# reports as "empty or missing yaml metadata in repo card" - consider adding
# one once the license to declare for the whole dataset has been decided.
readme_content = (
    "# About this Dataset\n\n"
    "This Dataset contains data from several Presentation Slides. For each Slide the following information is available:\n\n"
    "- key: recordID_pdfNumber_slideNumber\n\n"
    "- zenodo_record_id : record ID\n\n"
    "- zenodo_filename: PDF Filename\n\n"
    # The dataset also has a `page_number` column; it was missing from this list.
    "- page_number: Page Number of the Slide within the PDF\n\n"
    "- text: Text Embedding (using the mixedbread-ai/mxbai-embed-large-v1 model)\n\n"
    "- visual: Vision Embedding (using the openai/clip-vit-base-patch32 model)\n\n"
    "- mixed: Mixed Embedding (text embedding of a generated structured response that describes the Slide as an Image, using gpt-4o model)\n\n"
    "\n"
    "# Zenodo Records Information\n\n"
    "This repository contains data from Zenodo records.\n\n"
    "## Records\n\n" +
    "\n".join(records_info)
)

Third, upload the combined information as a valid Markdown file to the Huggingface Repository

In [8]:
# Keep a local copy of the generated dataset card next to the notebook.
with open("HUGGINGFACE_README.md", mode="w", encoding="utf-8") as readme_out:
    readme_out.write(readme_content)

In [9]:
from huggingface_hub import HfApi
import io

api = HfApi()
# Upload from an in-memory buffer built from `readme_content`, so the upload
# does not depend on the copy written to disk.
readme_file = io.BytesIO(readme_content.encode("utf-8"))

try:
    api.upload_file(
        path_or_fileobj=readme_file,
        path_in_repo="README.md",
        repo_id=repo_name,
        repo_type="dataset",
        create_pr=True,  # open a pull request with the commit
    )
except Exception as e:
    # Best effort: report the failure instead of aborting the notebook run.
    print(f"Error uploading README.md: {e}")
else:
    # Report success outside the `try` body, using `repo_name`. The previous
    # message referenced the undefined name `repo_id`; the resulting NameError
    # was caught above and made a successful upload look like a failure.
    print(f"README.md successfully uploaded to {repo_name}!")

Error uploading README.md: 404 Client Error. (Request ID: Root=1-67d9699f-23d06f5e55a8647f341e84fc;313fb058-b5c0-4823-b19f-38ec139417e1)

Repository Not Found for url: https://huggingface.co/api/datasets/ScaDS-AI/SlightInsight_Cache/preupload/main?create_pr=1.
Please make sure you specified the correct `repo_id` and `repo_type`.
If you are trying to access a private or gated repo, make sure you are authenticated.
Note: Creating a commit assumes that the repo already exists on the Huggingface Hub. Please use `create_repo` if it's not the case.


- empty or missing yaml metadata in repo card
