In [15]:
# Import required libraries

from datasets import load_dataset
from dotenv import load_dotenv
from huggingface_hub import HfApi, HfFolder
import markdown
import os
import pandas as pd
import requests
import subprocess
import transformers

In [16]:
# Log in to Hugging Face

load_dotenv()

hf_token = os.getenv('HF_TOKEN')
HfFolder.save_token(hf_token)


result = subprocess.run(["huggingface-cli", "login", "--token", hf_token], stdout=subprocess.PIPE, stderr=subprocess.PIPE)

if result.returncode == 0:
    print("Successfully logged in to Hugging Face.")
else:
    print("Failed to log in to Hugging Face.")
    print(result.stderr.decode())

Successfully logged in to Hugging Face.


In [17]:
# Collect and print MPEP datasets

api = HfApi()

def search_datasets_by_label(label):
    datasets = api.list_datasets(search=label)
    return datasets

mpep_datasets = search_datasets_by_label('MPEP')

# Extract and print the first entry's value parameter from the target JSON object and external_id for each dataset
for dataset in mpep_datasets:
    ds = load_dataset(dataset.id)
    target_entries = ds['train']['target'][:1]
    external_ids = ds['train']['external_id'][:1]
    print(f"Dataset: {dataset.id}")
    if target_entries:
        first_target = target_entries[0]
        if isinstance(first_target, list) and first_target:
            value = first_target[0].get('value', 'No value found')
            print(f"External ID: {external_ids[0]}")
            print(f"Value: {value}")
        elif isinstance(first_target, dict):
            value = first_target.get('value', 'No value found')
            print(f"External ID: {external_ids[0]}")
            print(f"Value: {value}")
        else:
            print("No target value found")
    else:
        print("No target entries found")
    print("\n" + "-"*40 + "\n")


Dataset: DIBT/MPEP_DUTCH
External ID: 1788
Value: Als een AI-enthousiasteling, houd je ervan om programma's te maken die de menselijke taal begrijpen. Je nieuwste project is een programma dat woorden kan herkennen en vervangen door hun antoniemen in een stuk tekst.
Om de effectiviteit van je programma aan te tonen, besluit je het te testen op een nieuwsartikel over een recent politiek evenement. Om het uitdagender te maken, wil je ook dat je programma onderscheid maakt tussen homoniemen, en daarnaast contextuele aanwijzingen gebruikt woorden correct te vervangen.
Hier is een stapsgewijze uitleg van hoe je programma werkt:
1. Het programma leest de inputtekst en identificeert alle woorden die antoniemen hebben.
2. Voor elk van deze woorden identificeert het programma de context waarin het verschijnt om het juiste antoniem te bepalen.
3. Het programma vervangt vervolgens het originele woord door zijn antoniem in de tekst.
4. Als het originele woord meerdere betekenissen heeft, gebruikt h

In [18]:
def query_hf_inference(model, prompt, token):
    api_url = f"https://api-inference.huggingface.co/models/{model}"
    headers = {"Authorization": f"Bearer {token}"}
    try:
        response = requests.post(api_url, headers=headers, json={"inputs": prompt}, timeout=30)
        # Check for valid response
        if response.status_code == 200:
            try:
                result = response.json()
                if isinstance(result, list) and len(result) > 0:
                    return result[0].get('generated_text', 'No generated text found')
                else:
                    return "Unexpected response format"
            except ValueError:
                return "Invalid JSON response"
        else:
            return f"Error {response.status_code}: {response.text}"
    except requests.exceptions.Timeout:
        return "Request timed out"


# List of target models
models = [
    "google/gemma-2-27b-it",
    "meta-llama/Meta-Llama-3-70B-Instruct",
    "Qwen/Qwen2-72B-Instruct",
    "mistralai/Mixtral-8x22B-Instruct-v0.1",
]

# Number of prompts to process
num_prompts = 5

In [19]:
def search_datasets_by_label(label):
    # Search datasets with the specified label
    datasets = api.list_datasets(search=label)
    return datasets

# Search for datasets with the "MPEP" label
mpep_datasets = search_datasets_by_label('MPEP')

# Directories to save responses and cache
os.makedirs('./responses', exist_ok=True)
os.makedirs('./cache', exist_ok=True)

# Load existing cache files
processed_external_ids = {}
for cache_file in os.listdir('./cache'):
    if cache_file.endswith('.txt'):
        language = cache_file.replace('.txt', '')
        with open(f'./cache/{cache_file}', 'r') as f:
            processed_external_ids[language] = set(f.read().splitlines())

# Process each dataset
for dataset in mpep_datasets:
    # Load the dataset
    ds = load_dataset(dataset.id)
    # Check if the dataset has 500 or more prompts
    if len(ds['train']) < 500:
        print(f"Skipping dataset {dataset.id} as it has less than 500 prompts.")
        continue
    
    # Extract the first 'num_prompts' entries from the target JSON object
    target_entries = ds['train']['target'][:num_prompts]
    external_ids = ds['train']['external_id'][:num_prompts]
    
    # Get the language from the dataset id (assuming the format is consistent)
    language = dataset.id.split('_')[-1].lower()
    
    # Initialize the cache set for the language if not already done
    if language not in processed_external_ids:
        processed_external_ids[language] = set()
    
    # Open a file to write the responses
    with open(f'./responses/{language}.md', 'w') as file:
        # Capture responses for each model
        for model in models:
            file.write(f"# Model: {model}\n\n")
            for i, target_entry in enumerate(target_entries):
                external_id = external_ids[i]
                # Check if this external_id has already been processed
                if external_id in processed_external_ids[language]:
                    print(f"Skipping prompt with external_id: {external_id} as it is already processed.")
                    continue
                
                if isinstance(target_entry, list) and target_entry:
                    prompt = target_entry[0].get('value', 'No value found')
                elif isinstance(target_entry, dict):
                    prompt = target_entry.get('value', 'No value found')
                else:
                    prompt = "No prompt found"
                
                response_text = query_hf_inference(model, prompt, os.getenv('HF_TOKEN'))
                
                file.write(f"## Prompt:\n{prompt}\n\n")
                file.write(f"## Response:\n{response_text}\n\n")
                file.write("\n" + "-"*40 + "\n\n")
                
                # Add the processed external_id to the set
                processed_external_ids[language].add(external_id)
    
    # Write all processed external_ids for this language to the cache file
    with open(f'./cache/{language}.txt', 'w') as cache_file:
        for external_id in processed_external_ids[language]:
            cache_file.write(f"{external_id}\n")

print("Script completed successfully.")

Skipping prompt with external_id: 1788 as it is already processed.
Skipping prompt with external_id: 1752 as it is already processed.
Skipping prompt with external_id: 1162 as it is already processed.
Skipping prompt with external_id: 765 as it is already processed.
Skipping prompt with external_id: 964 as it is already processed.
Skipping prompt with external_id: 1788 as it is already processed.
Skipping prompt with external_id: 1752 as it is already processed.
Skipping prompt with external_id: 1162 as it is already processed.
Skipping prompt with external_id: 765 as it is already processed.
Skipping prompt with external_id: 964 as it is already processed.
Skipping prompt with external_id: 1788 as it is already processed.
Skipping prompt with external_id: 1752 as it is already processed.
Skipping prompt with external_id: 1162 as it is already processed.
Skipping prompt with external_id: 765 as it is already processed.
Skipping prompt with external_id: 964 as it is already processed.
S