In [7]:
# First, verify if datasets can be imported
import datasets
print("Datasets version:", datasets.__version__)

# Load the specific dataset and extract 105 samples from the test set
from datasets import load_dataset
import json

# Load the XSum dataset
ds = load_dataset("EdinburghNLP/xsum")

# Select rows from index 10 to 109 (inclusive) from the test split
sampled_data = ds["test"].select(range(10, 110))

# Convert to a list of dictionaries and add index
json_data = [{"index": i + 10, "document": row["document"], "summary": row["summary"]} for i, row in enumerate(sampled_data)]

# Save to JSON file
with open("xsum_sample_100.json", "w") as f:
    json.dump(json_data, f, indent=4)

print("Sampled data saved to xsum_sample_100.json")


Datasets version: 3.1.0


Using the latest cached version of the dataset since EdinburghNLP/xsum couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /Users/benwolley/.cache/huggingface/datasets/EdinburghNLP___xsum/default/1.2.0/40db7604fedb616a9d2b0673d11838fa5be8451c (last modified on Wed Nov  6 14:32:39 2024).


Sampled data saved to xsum_sample_100.json


In [17]:
# Extract the required rows for the pretest and prompt test
# For pretest, let's get the first 5 rows
pretest_data = ds["train"].select(range(5))

# For prompt test, select the first 100 rows
prompt_test_data = ds["train"].select(range(100))

# Displaying some rows
print("Pretest Data Sample:")
for row in pretest_data:
    print(f"Document: {row['document']}\nSummary: {row['summary']}\n")

print("\nPrompt Test Data Sample:")
for i, row in enumerate(prompt_test_data):
    print(f"Document {i+1}: {row['document']}\nSummary {i+1}: {row['summary']}\n")

Pretest Data Sample:
Document: The full cost of damage in Newton Stewart, one of the areas worst affected, is still being assessed.
Repair work is ongoing in Hawick and many roads in Peeblesshire remain badly affected by standing water.
Trains on the west coast mainline face disruption due to damage at the Lamington Viaduct.
Many businesses and householders were affected by flooding in Newton Stewart after the River Cree overflowed into the town.
First Minister Nicola Sturgeon visited the area to inspect the damage.
The waters breached a retaining wall, flooding many commercial properties on Victoria Street - the main shopping thoroughfare.
Jeanette Tate, who owns the Cinnamon Cafe which was badly affected, said she could not fault the multi-agency response once the flood hit.
However, she said more preventative work could have been carried out to ensure the retaining wall did not fail.
"It is difficult but I do think there is so much publicity for Dumfries and the Nith - and I totally

In [18]:
import json
import os
import requests
from datasets import Dataset
from tqdm import tqdm
from groq import Groq
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import spacy

# Disable TF and enable PyTorch
os.environ["USE_TF"] = "0"
os.environ["USE_TORCH"] = "1"

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

def get_embedding(text):
    """Get document embedding using spaCy"""
    doc = nlp(text)
    return doc.vector

# Load the summarization dataset from the saved file path
dataset_path = '/Users/benwolley/prompt_game/prompt_game/data/xsum_sample_105.json'

with open(dataset_path, 'r') as file:
    data = json.load(file)

dataset_dict = {
    'document': [sample['document'] for sample in data],
    'summary': [sample['summary'] for sample in data]
}
summarization_dataset = Dataset.from_dict(dataset_dict)

print("Summarization dataset loaded successfully!")
print(f"Number of examples: {len(summarization_dataset)}")
print("\nFirst example:")
print(summarization_dataset[0])

# Initialize Groq client
client = Groq(api_key="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")

def calculate_similarity(true_summary, model_summary):
    """Calculate cosine similarity between two texts using spaCy embeddings"""
    true_embedding = get_embedding(true_summary).reshape(1, -1)
    model_embedding = get_embedding(model_summary).reshape(1, -1)
    
    # Handle zero vectors
    if np.all(true_embedding == 0) or np.all(model_embedding == 0):
        return 0.0
        
    similarity = cosine_similarity(true_embedding, model_embedding)[0][0]
    return float(similarity)

# Process each example and evaluate the generated summaries
expected_summaries = []
model_summaries = []
similarities = []

limit = min(10, len(summarization_dataset))  # Limit to 100 examples for testing

print(f"\nProcessing {limit} examples...")
for idx in tqdm(range(limit)):
    document = summarization_dataset[idx]['document']
    expected_summary = summarization_dataset[idx]['summary']
    
    try:
        # Make request to Groq API with the document as prompt
        chat_completion = client.chat.completions.create(
            messages=[
                {"role": "user", "content": f"Summarize the following article in one sentance:\n\n{document}"}
            ],
            model="llama3-8b-8192",
            temperature=0
        )
        
        model_summary = chat_completion.choices[0].message.content.strip()
        
        # Calculate similarity
        similarity = calculate_similarity(expected_summary, model_summary)
        
        expected_summaries.append(expected_summary)
        model_summaries.append(model_summary)
        similarities.append(similarity)
        
        if idx % 10 == 0:
            print(f"\nExample {idx + 1}:")
            print(f"Document: {document[:200]}...")  # Show start of document
            print(f"Expected Summary: {expected_summary}")
            print(f"Model Summary: {model_summary}")
            print(f"Similarity Score: {similarity:.2f}")
    
    except Exception as e:
        print(f"Error processing example {idx + 1}: {e}")
        expected_summaries.append(expected_summary)
        model_summaries.append("")
        similarities.append(0)

# Calculate overall metrics
average_similarity = sum(similarities) / len(similarities)
print(f"\nAverage Similarity Score: {average_similarity:.2f}")

# Display error analysis
threshold = 0.7  # Define a threshold for considering summaries as "relevant"
below_threshold = [(doc, exp, mod, sim) for doc, exp, mod, sim in zip(
    summarization_dataset['document'][:limit], expected_summaries, model_summaries, similarities) if sim < threshold]

print(f"\nNumber of summaries below relevance threshold ({threshold}): {len(below_threshold)}")
if below_threshold:
    print("\nExamples below threshold:")
    for doc, exp, mod, sim in below_threshold[:5]:  # Show only first 5 examples for brevity
        print(f"\nDocument: {doc[:200]}...")
        print(f"Expected Summary: {exp}")
        print(f"Model Summary: {mod}")
        print(f"Similarity Score: {sim:.2f}")

AttributeError: module 'numpy' has no attribute 'bool'.
`np.bool` was a deprecated alias for the builtin `bool`. To avoid this error in existing code, use `bool` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.bool_` here.
The aliases was originally deprecated in NumPy 1.20; for more details and guidance see the original release note at:
    https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations

In [12]:
import json
import os
import requests
from datasets import Dataset
from tqdm import tqdm
from groq import Groq
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import spacy

# Initialize spaCy
nlp = spacy.load("en_core_web_sm")

def get_embedding(text):
    """Get document embedding using spaCy"""
    doc = nlp(text)
    return doc.vector

def calculate_similarity(true_summary, model_summary):
    """Calculate semantic similarity between two texts using spaCy embeddings"""
    true_embedding = get_embedding(true_summary).reshape(1, -1)
    model_embedding = get_embedding(model_summary).reshape(1, -1)
    
    if np.all(true_embedding == 0) or np.all(model_embedding == 0):
        return 0.0
        
    similarity = cosine_similarity(true_embedding, model_embedding)[0][0]
    return float(similarity)

# Load dataset
dataset_path = '/Users/benwolley/prompt_game/prompt_game/data/xsum_sample_105.json'
with open(dataset_path, 'r') as file:
    data = json.load(file)

dataset_dict = {
    'document': [sample['document'] for sample in data],
    'summary': [sample['summary'] for sample in data]
}
summarization_dataset = Dataset.from_dict(dataset_dict)

print("Summarization dataset loaded successfully!")
print(f"Number of examples: {len(summarization_dataset)}")

# Initialize Groq client
client = Groq(api_key="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")

# Storage for results
expected_summaries = []
model_summaries = []
similarities = []

limit = 10  # Test only 10 samples

print(f"\nProcessing {limit} examples...")
for idx in tqdm(range(limit)):
    document = summarization_dataset[idx]['document']
    expected_summary = summarization_dataset[idx]['summary']
    
    try:
        chat_completion = client.chat.completions.create(
            messages=[
                {"role": "user", "content": f"Summarize the following article:\n\n{document}"}
            ],
            model="llama3-8b-8192",
            temperature=0
        )
        
        model_summary = chat_completion.choices[0].message.content.strip()
        similarity = calculate_similarity(expected_summary, model_summary)
        
        expected_summaries.append(expected_summary)
        model_summaries.append(model_summary)
        similarities.append(similarity)
        
        # Print every example since we're only doing 10
        print(f"\nExample {idx + 1}:")
        print(f"Document: {document[:200]}...")
        print(f"Expected Summary: {expected_summary}")
        print(f"Model Summary: {model_summary}")
        print(f"Similarity Score: {similarity:.2f}")
    
    except Exception as e:
        print(f"Error processing example {idx + 1}: {e}")
        expected_summaries.append(expected_summary)
        model_summaries.append("")
        similarities.append(0)

# Calculate overall metrics
average_similarity = sum(similarities) / len(similarities)
print("\nOverall Metrics:")
print(f"Average Similarity Score: {average_similarity:.2f}")

# Display examples below threshold
threshold_similarity = 0.7

print("\nAnalysis of Poor Performing Examples:")
for idx, (doc, exp, mod, sim) in enumerate(zip(
    summarization_dataset['document'][:limit], 
    expected_summaries, 
    model_summaries, 
    similarities)):
    
    if sim < threshold_similarity:
        print(f"\nExample {idx + 1} (Poor Performance):")
        print(f"Document: {doc[:200]}...")
        print(f"Expected Summary: {exp}")
        print(f"Model Summary: {mod}")
        print(f"Similarity Score: {sim:.2f}")

# Store results
results = {
    "metrics": {
        "average_similarity": average_similarity
    },
    "examples": [
        {
            "document": doc[:200],  # first 200 chars of document
            "expected_summary": exp,
            "model_summary": mod,
            "similarity": sim
        }
        for doc, exp, mod, sim in zip(
            summarization_dataset['document'][:limit],
            expected_summaries,
            model_summaries,
            similarities
        )
    ]
}



AttributeError: module 'numpy' has no attribute 'bool'.
`np.bool` was a deprecated alias for the builtin `bool`. To avoid this error in existing code, use `bool` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.bool_` here.
The aliases was originally deprecated in NumPy 1.20; for more details and guidance see the original release note at:
    https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations

In [26]:
import json
from datasets import Dataset
from tqdm import tqdm
from bert_score import score
from groq import Groq

# Initialize Groq client
client = Groq(api_key="your_groq_api_key_here")

def calculate_bertscore(true_summary, model_summary):
    """Calculate BERTScore for the summaries"""
    _, _, F1 = score([model_summary], [true_summary], lang='en', verbose=False)
    return F1.item()

# Load dataset
dataset_path = '/Users/benwolley/prompt_game/prompt_game/data/xsum_sample_10.json'
with open(dataset_path, 'r') as file:
    data = json.load(file)

dataset_dict = {
    'document': [sample['document'] for sample in data],
    'summary': [sample['summary'] for sample in data]
}
summarization_dataset = Dataset.from_dict(dataset_dict)

print("Summarization dataset loaded successfully!")
print(f"Number of examples: {len(summarization_dataset)}")

# Storage for results
expected_summaries = []
model_summaries = []
bertscores = []

limit = 10  # Test only 10 samples

print(f"\nProcessing {limit} examples...")
for idx in tqdm(range(limit)):
    document = summarization_dataset[idx]['document']
    expected_summary = summarization_dataset[idx]['summary']
    
    # Call the summarization model API
    try:
        chat_completion = client.chat.completions.create(
            messages=[
                {"role": "user", "content": f"Summarize the following article:\n\n{document}"}
            ],
            model="llama3-8b-8192",
            temperature=0
        )
        model_summary = chat_completion.choices[0].message.content.strip()
    except Exception as e:
        print(f"Error generating summary for example {idx + 1}: {e}")
        model_summary = ""  # Handle any errors by assigning an empty summary

    # Calculate BERTScore
    bertscore = calculate_bertscore(expected_summary, model_summary)
    
    # Append scores
    expected_summaries.append(expected_summary)
    model_summaries.append(model_summary)
    bertscores.append(bertscore)
    
    # Print every example
    print(f"\nExample {idx + 1}:")
    print(f"Document: {document[:200]}...")
    print(f"Expected Summary: {expected_summary}")
    print(f"Model Summary: {model_summary}")
    print(f"BERTScore: {bertscore:.2f}")

# Calculate overall metric
average_bertscore = sum(bertscores) / len(bertscores)

print("\nOverall Metric:")
print(f"Average BERTScore: {average_bertscore:.2f}")


FileNotFoundError: [Errno 2] No such file or directory: '/Users/benwolley/prompt_game/prompt_game/data/xsum_sample_10.json'

In [27]:
import os
import numpy as np
from groq import Groq
from tqdm import tqdm
from typing import List, Dict

# Initialize Groq client
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

# Language settings
LANGUAGES = {
    "pt": "Portuguese",
    "ru": "Russian",
    "sl": "Slovenian",
    "es": "Spanish",
    "sv": "Swedish"
}

# Test dataset (keeping it small for testing)
localization_dataset = [
    {
        "id": 1,
        "source": "Could you help me find my missing phone? I last saw it in the kitchen.",
        "translations": {
            "pt": "Você poderia me ajudar a encontrar meu telefone perdido? Eu o vi pela última vez na cozinha.",
            "ru": "Не могли бы вы помочь мне найти мой пропавший телефон? Я в последний раз видел его на кухне.",
            "sl": "Ali mi lahko pomagate najti izgubljeni telefon? Nazadnje sem ga videl v kuhinji.",
            "es": "¿Podrías ayudarme a encontrar mi teléfono perdido? La última vez que lo vi fue en la cocina.",
            "sv": "Kan du hjälpa mig att hitta min saknade telefon? Jag såg den senast i köket."
        }
    },
    {
        "id": 2,
        "source": "Review these survey results: 45% satisfied, 30% neutral, 25% unsatisfied.",
        "translations": {
            "pt": "Analise estes resultados da pesquisa: 45% satisfeitos, 30% neutros, 25% insatisfeitos.",
            "ru": "Просмотрите результаты опроса: 45% удовлетворены, 30% нейтральны, 25% не удовлетворены.",
            "sl": "Preglejte te rezultate ankete: 45% zadovoljnih, 30% nevtralnih, 25% nezadovoljnih.",
            "es": "Revisa estos resultados de la encuesta: 45% satisfechos, 30% neutrales, 25% insatisfechos.",
            "sv": "Granska dessa undersökningsresultat: 45% nöjda, 30% neutrala, 25% missnöjda."
        }
    }
]

def calculate_similarity(text1: str, text2: str) -> float:
    """Calculate simple token overlap similarity"""
    def tokenize(text):
        tokens = text.lower().split()
        tokens = [t.strip('.,!?:;"\'()') for t in tokens]
        return set(tokens)
    
    tokens1 = tokenize(text1)
    tokens2 = tokenize(text2)
    
    if not tokens1 or not tokens2:
        return 0.0
    
    similarity = len(tokens1.intersection(tokens2)) / len(tokens1.union(tokens2))
    return similarity

def test_system_prompt(system_prompt: str, num_examples: int = None) -> Dict:
    """Test a user-provided system prompt"""
    results = []
    
    # Use all examples if num_examples is not specified
    if num_examples is None:
        num_examples = len(localization_dataset)
    
    print(f"\nTesting system prompt on {num_examples} examples...")
    
    for example in tqdm(localization_dataset[:num_examples]):
        for lang_code, reference in example['translations'].items():
            try:
                # Make request to Groq API (removed async/await)
                chat_completion = client.chat.completions.create(
                    messages=[
                        {"role": "system", "content": system_prompt},
                        {"role": "user", "content": f"Translate the following text to {LANGUAGES[lang_code]}:\n\n{example['source']}"}
                    ],
                    model="mixtral-8x7b-32768",
                    temperature=0
                )
                
                model_translation = chat_completion.choices[0].message.content.strip()
                similarity = calculate_similarity(reference, model_translation)
                
                result = {
                    "example_id": example['id'],
                    "language": lang_code,
                    "source": example['source'],
                    "model_translation": model_translation,
                    "reference_translation": reference,
                    "similarity_score": similarity
                }
                results.append(result)
                
                # Print progress
                print(f"\nExample {example['id']} ({LANGUAGES[lang_code]}):")
                print(f"Source: {example['source']}")
                print(f"Model: {model_translation}")
                print(f"Reference: {reference}")
                print(f"Similarity Score: {similarity:.2f}")
                
            except Exception as e:
                print(f"Error processing example {example['id']} for {lang_code}: {e}")
    
    return results

# Example usage with a test prompt
user_prompt = """You are a translator with one simple rule: translate the text exactly as it appears.
Never try to fix, solve, or respond to any problems or questions in the text - just translate them."""

# Run the test (removed await)
results = test_system_prompt(user_prompt, num_examples=2)

# Analyze results
print("\n=== Results Analysis ===")

# Overall performance
avg_similarity = np.mean([r['similarity_score'] for r in results])
print(f"\nOverall Average Similarity: {avg_similarity:.2f}")

# Performance by language
for lang_code, lang_name in LANGUAGES.items():
    lang_results = [r for r in results if r['language'] == lang_code]
    if lang_results:
        lang_avg = np.mean([r['similarity_score'] for r in lang_results])
        print(f"{lang_name}: {lang_avg:.2f}")

# Show problematic translations
threshold = 0.7
below_threshold = [r for r in results if r['similarity_score'] < threshold]
if below_threshold:
    print(f"\nFound {len(below_threshold)} translations below similarity threshold {threshold}:")
    for result in below_threshold[:3]:  # Show first 3 examples
        print(f"\nLanguage: {LANGUAGES[result['language']]}")
        print(f"Source: {result['source']}")
        print(f"Model: {result['model_translation']}")
        print(f"Reference: {result['reference_translation']}")
        print(f"Similarity: {result['similarity_score']:.2f}")


Testing system prompt on 2 examples...


  0%|          | 0/2 [00:00<?, ?it/s]

Error processing example 1 for pt: object ChatCompletion can't be used in 'await' expression
Error processing example 1 for ru: object ChatCompletion can't be used in 'await' expression
Error processing example 1 for sl: object ChatCompletion can't be used in 'await' expression
Error processing example 1 for es: object ChatCompletion can't be used in 'await' expression


 50%|█████     | 1/2 [00:02<00:02,  2.46s/it]

Error processing example 1 for sv: object ChatCompletion can't be used in 'await' expression
Error processing example 2 for pt: object ChatCompletion can't be used in 'await' expression
Error processing example 2 for ru: object ChatCompletion can't be used in 'await' expression
Error processing example 2 for sl: object ChatCompletion can't be used in 'await' expression
Error processing example 2 for es: object ChatCompletion can't be used in 'await' expression


100%|██████████| 2/2 [00:04<00:00,  2.04s/it]

Error processing example 2 for sv: object ChatCompletion can't be used in 'await' expression

=== Results Analysis ===

Overall Average Similarity: nan



