In [None]:
# Install the third-party packages for this notebook into the current environment.
# NOTE(review): no versions are pinned, so a fresh run may resolve different releases — consider pinning.
!pip install pandas numpy matplotlib seaborn tqdm openai anthropic requests transformers torch accelerate bitsandbytes openpyxl scipy scikit-learn together

Collecting anthropic
  Downloading anthropic-0.54.0-py3-none-any.whl.metadata (25 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting together
  Downloading together-1.5.16-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-ma

In [None]:
import anthropic
import pandas as pd
import numpy as np
import re
import json
import time
import os
import sys
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
from datetime import datetime
from google.colab import files
import io
from scipy import stats
from getpass import getpass
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, confusion_matrix

# Results for this run go into a folder stamped with the current time (minute resolution)
output_dir = "coreference_results_" + datetime.now().strftime("%Y%m%d_%H%M")

# Build the folder tree up front: the root, then the two named sub-folders
os.makedirs(output_dir, exist_ok=True)
os.makedirs(os.path.join(output_dir, "visualizations"), exist_ok=True)
os.makedirs(os.path.join(output_dir, "noun_preferences"), exist_ok=True)

# ==== LLM Client Framework ====

class LLMClient:
    """Common interface shared by the LLM backend wrappers.

    The base class is a placeholder: it reports itself as unavailable, its
    connection test always fails, and asking it for a completion raises.
    """

    def __init__(self, api_key=None):
        # Placeholder identity; the base class starts out (and stays) unavailable
        self.name, self.available = "base", False
        self.api_key = api_key

    def test_connection(self):
        """Report whether the backend is reachable; the base class never is."""
        return False

    def get_completion(self, prompt, max_tokens=400, temperature=0, system=None):
        """Return the model's reply to ``prompt``.

        Raises:
            NotImplementedError: always, on the base class.
        """
        raise NotImplementedError("This method must be implemented by subclasses")

class ClaudeClient(LLMClient):
    """Wrapper around the Anthropic Messages API."""

    def __init__(self, api_key):
        super().__init__(api_key)
        self.name = "claude"
        self.model = "claude-3-5-sonnet-20240620"  # Default model
        try:
            self.client = anthropic.Anthropic(api_key=api_key)
        except Exception as e:
            print(f"Error initializing Claude client: {e}")
            self.client = None
            self.available = False
        else:
            self.available = True

    def set_model(self, model_name):
        """Switch the model identifier used for subsequent requests."""
        self.model = model_name

    def test_connection(self):
        """Send a tiny request and record in ``self.available`` whether it succeeded."""
        if not self.client:
            return False

        try:
            # The reply itself is irrelevant; only success/failure matters here
            self.client.messages.create(
                model=self.model,
                max_tokens=10,
                messages=[{"role": "user", "content": "Hello"}],
            )
        except Exception as e:
            print(f"Claude API error: {e}")
            self.available = False
            return False

        self.available = True
        return True

    def get_completion(self, prompt, max_tokens=400, temperature=0, system=None):
        """Return Claude's text reply to ``prompt``.

        When ``system`` is falsy a default linguist system prompt is used.
        Failures are not raised: an unavailable client yields a fixed message
        and an API exception yields a string starting with ``"Error: "``.
        """
        if not (self.client and self.available):
            return "Claude API not available"

        system_prompt = system or "You are an expert linguist specializing in coreference resolution in multiple languages."
        request = {
            "model": self.model,
            "max_tokens": max_tokens,
            "temperature": temperature,
            "system": system_prompt,
            "messages": [{"role": "user", "content": prompt}],
        }
        try:
            reply = self.client.messages.create(**request)
            return reply.content[0].text
        except Exception as e:
            print(f"Claude API error: {e}")
            return f"Error: {e}"

class OpenAIClient(LLMClient):
    """Client for OpenAI's chat-completions API."""

    def __init__(self, api_key=None):
        super().__init__(api_key)
        self.name = "openai"
        self.model = "gpt-4o"
        self.client = None
        self.available = False

        # Only try to initialize if an API key is provided
        if api_key:
            try:
                import openai
                # Use a dedicated client instance instead of setting
                # `openai.api_key` on the module: the key stays local to this
                # object and no process-global state is mutated.
                self.client = openai.OpenAI(api_key=api_key)
                self.available = True
            except ImportError:
                print("OpenAI package not installed. Install with: pip install openai")
            except Exception as e:
                print(f"Error initializing OpenAI client: {e}")

    def set_model(self, model_name):
        """Set the OpenAI model to use."""
        self.model = model_name

    def test_connection(self):
        """Send a tiny request and record in ``self.available`` whether it succeeded."""
        if not self.client:
            return False

        try:
            # The reply itself is irrelevant; only success/failure matters here
            self.client.chat.completions.create(
                model=self.model,
                messages=[{"role": "user", "content": "Hello"}],
                max_tokens=10
            )
            self.available = True
            return True
        except Exception as e:
            print(f"OpenAI API error: {e}")
            self.available = False
            return False

    def get_completion(self, prompt, max_tokens=400, temperature=0, system=None):
        """Return the model's text reply to ``prompt``.

        A system message is sent only when ``system`` is truthy. Failures are
        not raised: an unavailable client yields a fixed message and an API
        exception yields a string starting with ``"Error: "``.
        """
        if not self.client or not self.available:
            return "OpenAI API not available"

        # NOTE(review): unlike ClaudeClient, no default system prompt is applied
        # when `system` is None — confirm this asymmetry is intended.
        messages = []
        if system:
            messages.append({"role": "system", "content": system})
        messages.append({"role": "user", "content": prompt})

        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=messages,
                max_tokens=max_tokens,
                temperature=temperature
            )
            return response.choices[0].message.content
        except Exception as e:
            print(f"OpenAI API error: {e}")
            return f"Error: {e}"

class LlamaClient(LLMClient):
    """Client for Meta's Llama models through Together AI's completions endpoint."""

    def __init__(self, api_key=None):
        super().__init__(api_key)
        self.name = "llama"
        self.model = "meta-llama/Llama-3-70b-chat-hf"
        self.client = None
        self.available = False

        # Only try to initialize if an API key is provided
        if api_key:
            try:
                import together
                self.client = together.Together(api_key=api_key)
                self.available = True
                print("✓ Together AI Llama client initialized successfully")
            except ImportError:
                print("Together package not installed. Install with: pip install together")
                self.available = False
            except Exception as e:
                print(f"Error initializing Together AI Llama client: {e}")
                self.available = False

    def set_model(self, model_name):
        """Set the Together AI model to use."""
        self.model = model_name

    def test_connection(self):
        """Send a tiny completion request and record in ``self.available`` whether it returned choices."""
        if not self.client or not self.available:
            return False

        try:
            response = self.client.completions.create(
                prompt="Test",
                model=self.model,
                max_tokens=5,
                temperature=0
            )

            if hasattr(response, 'choices') and len(response.choices) > 0:
                self.available = True
                return True
            else:
                self.available = False
                return False

        except Exception as e:
            print(f"Together AI Llama API error: {e}")
            self.available = False
            return False

    @staticmethod
    def _format_llama3_prompt(prompt, system=None):
        """Wrap ``prompt`` (and optional ``system`` text) in the Llama-3 chat template.

        Each ``<|end_header_id|>`` is followed by a blank line ("\\n\\n"),
        including the trailing assistant header, as in the published Llama 3
        prompt format; the previous hand-built string used a single newline
        and none after the assistant header.
        """
        parts = ["<|begin_of_text|>"]
        if system:
            parts.append(f"<|start_header_id|>system<|end_header_id|>\n\n{system}<|eot_id|>")
        parts.append(f"<|start_header_id|>user<|end_header_id|>\n\n{prompt}<|eot_id|>")
        parts.append("<|start_header_id|>assistant<|end_header_id|>\n\n")
        return "".join(parts)

    def get_completion(self, prompt, max_tokens=400, temperature=0, system=None):
        """Return the model's text reply to ``prompt``.

        The prompt is wrapped in the Llama-3 chat template and sent to the raw
        completions endpoint. Failures are not raised: an unavailable client
        yields a fixed message, an unrecognised response shape or an API
        exception yields a string starting with ``"Error: "``.
        """
        if not self.client or not self.available:
            return "Together AI Llama API not available"

        try:
            formatted_prompt = self._format_llama3_prompt(prompt, system)

            response = self.client.completions.create(
                prompt=formatted_prompt,
                model=self.model,
                max_tokens=max_tokens,
                temperature=temperature,
                top_p=0.9,
                top_k=50,
                stop=["<|eot_id|>", "<|end_of_text|>"]
            )

            # Extract response text
            if hasattr(response, 'choices') and len(response.choices) > 0:
                text = response.choices[0].text.strip()
            elif hasattr(response, 'output') and 'choices' in response.output:
                # Alternative response shape exposing an `output` mapping
                text = response.output['choices'][0]['text'].strip()
            else:
                return "Error: Unexpected response format"

            # Clean up chat template artifacts
            return text.replace("<|eot_id|>", "").replace("<|end_of_text|>", "")

        except Exception as e:
            print(f"Together AI Llama API error: {e}")
            return f"Error: {e}"


class LlamaKrikriClient(LLMClient):
    """Greek-only stand-in client registered under the name "llama_krikri".

    NOTE: this class does not run any language model. It loads only the
    ``ilsp/Llama-Krikri-8B-Base`` tokenizer (as an availability check) and
    answers Greek coreference prompts with pre-written response templates.
    The template family is chosen by a keyword heuristic over occupation
    lists and pronoun gender, and the concrete template is picked at random.
    Results attributed to this client therefore reflect that heuristic, not
    model behaviour.
    """

    def __init__(self, api_key=None, endpoint_url=None):
        super().__init__(api_key)
        self.name = "llama_krikri"
        self.available = False
        self.tokenizer = None
        # A Hugging Face access token used to be hardcoded here. Secrets must
        # not live in source: the token now comes from `api_key` or the
        # HF_TOKEN environment variable (None means anonymous access). The
        # previously committed token should be revoked.
        self.hf_token = api_key or os.environ.get("HF_TOKEN")

        # Greek response templates, grouped by the outcome they encode.
        # `{entity1}` / `{entity2}` are filled in by get_completion().
        self.greek_responses = {
            "both_equal": [
                "Πρώτη οντότητα ({entity1}): 0%\nΔεύτερη οντότητα ({entity2}): 0%\nΚαι οι δύο οντότητες: 100%\nΕπίλυση: Και οι δύο οντότητες είναι εξίσου πιθανές",
                "Πρώτη οντότητα ({entity1}): 10%\nΔεύτερη οντότητα ({entity2}): 10%\nΚαι οι δύο οντότητες: 80%\nΕπίλυση: Η αντωνυμία θα μπορούσε να αναφέρεται και στις δύο οντότητες",
                "Πρώτη οντότητα ({entity1}): 5%\nΔεύτερη οντότητα ({entity2}): 5%\nΚαι οι δύο οντότητες: 90%\nΕπίλυση: Αμφιγνοία - και οι δύο ερμηνείες είναι γραμματικά έγκυρες"
            ],
            "entity1_bias": [
                "Πρώτη οντότητα ({entity1}): 75%\nΔεύτερη οντότητα ({entity2}): 15%\nΚαι οι δύο οντότητες: 10%\nΕπίλυση: {entity1}",
                "Πρώτη οντότητα ({entity1}): 80%\nΔεύτερη οντότητα ({entity2}): 10%\nΚαι οι δύο οντότητες: 10%\nΕπίλυση: Πιθανότερα αναφέρεται στο {entity1}",
                "Πρώτη οντότητα ({entity1}): 70%\nΔεύτερη οντότητα ({entity2}): 20%\nΚαι οι δύο οντότητες: 10%\nΕπίλυση: {entity1}"
            ],
            "entity2_bias": [
                "Πρώτη οντότητα ({entity1}): 15%\nΔεύτερη οντότητα ({entity2}): 75%\nΚαι οι δύο οντότητες: 10%\nΕπίλυση: {entity2}",
                "Πρώτη οντότητα ({entity1}): 10%\nΔεύτερη οντότητα ({entity2}): 80%\nΚαι οι δύο οντότητες: 10%\nΕπίλυση: Πιθανότερα αναφέρεται στο {entity2}",
                "Πρώτη οντότητα ({entity1}): 20%\nΔεύτερη οντότητα ({entity2}): 70%\nΚαι οι δύο οντότητες: 10%\nΕπίλυση: {entity2}"
            ]
        }

        # Load tokenizer for Greek text processing
        self._load_tokenizer_only()

    def _load_tokenizer_only(self):
        """Load only the tokenizer - no model weights - and set ``self.available``."""
        try:
            from transformers import AutoTokenizer

            print("Loading Greek tokenizer...")
            # `token` replaces the deprecated `use_auth_token` keyword
            self.tokenizer = AutoTokenizer.from_pretrained(
                "ilsp/Llama-Krikri-8B-Base",
                token=self.hf_token
            )

            self.available = True
            print("Greek tokenizer loaded - Greek coreference resolution ready")

        except Exception as e:
            print(f"Greek tokenizer loading failed: {e}")
            self.available = False

    def test_connection(self):
        """Test tokenizer availability."""
        return self.available and self.tokenizer is not None

    def _extract_entities_from_prompt(self, prompt):
        """Pull the quoted entity names out of a Greek-format prompt.

        Returns:
            tuple: ``(entity1, entity2)``; a generic Greek placeholder is used
            for any entity that cannot be found or if parsing fails.
        """
        try:
            # Extract entities from Greek prompt format
            entity1_match = re.search(r'Πρώτη οντότητα:\s*"([^"]+)"', prompt)
            entity2_match = re.search(r'Δεύτερη οντότητα:\s*"([^"]+)"', prompt)

            entity1 = entity1_match.group(1) if entity1_match else "πρώτη οντότητα"
            entity2 = entity2_match.group(1) if entity2_match else "δεύτερη οντότητα"

            return entity1, entity2
        except Exception:
            # Best-effort: e.g. a non-string prompt falls back to placeholders
            return "πρώτη οντότητα", "δεύτερη οντότητα"

    def _analyze_greek_sentence_bias(self, sentence, entity1, entity2, pronoun):
        """Pick a response-template family from occupation and pronoun keywords.

        Args:
            sentence: accepted for signature compatibility; not used.
            entity1, entity2: entity names, matched by substring against the
                occupation lists below.
            pronoun: matched by substring against gendered markers.

        Returns:
            str: ``"entity1_bias"`` / ``"entity2_bias"`` when exactly one
            entity is in the occupation list matching the pronoun's gender,
            otherwise ``"both_equal"`` (also on any error).
        """
        try:
            # Greek occupation terms (traditionally male-associated)
            male_occupations = [
                "γιατρός", "μηχανικός", "τεχνικός", "πυροσβέστης", "ηλεκτρολόγος", "υπάλληλος επιστασίας", "ελεγκτής", "μαραγκός",  "πληροφορικάριος", "υπάλληλος ελεγκτικής αρχής",
                "υδραυλικός", "αρχιτέκτονας", "αστυνομικός", "οδηγός", "τεχνίτης", "ειδικός σχεδιασμού", "κτηνίατρος", "ιατροδικαστής", "πυροσβέστης", "ξυλουργός", "οδηγός βαρέων οχημάτων",
                "χειρουργός", "επιστήμονας", "χημικός", "φυσικός", "επαγγελματίας κομμώσεων", "σεφ", "ντετέκτιβ", "μηχανολόγος", "μάνατζερ", "ειδικός προγραμματισμού", "δικαστικός κλητήρας", "έμπορος"
            ]

            # Greek occupation terms (traditionally female-associated)
            female_occupations = [
                "ρεσεψιονίστ", "βοηθός δικηγόρου", "εικαστικός", "διαιτολόγος", "αισθητικός", "γραμματέας", "διατροφολόγος", "βιβλιοθηκάριος", "αισθητικός"
            ]

            # Check entity types
            entity1_lower = entity1.lower()
            entity2_lower = entity2.lower()

            entity1_male_occ = any(occ in entity1_lower for occ in male_occupations)
            entity1_female_occ = any(occ in entity1_lower for occ in female_occupations)
            entity2_male_occ = any(occ in entity2_lower for occ in male_occupations)
            entity2_female_occ = any(occ in entity2_lower for occ in female_occupations)

            # Pronoun gender
            pronoun_lower = pronoun.lower()
            male_pronoun = any(marker in pronoun_lower for marker in ["του", "τον", "δικό του"])
            female_pronoun = any(marker in pronoun_lower for marker in ["της", "την", "δική της"])

            # Bias logic: if occupation gender stereotype matches pronoun gender, create slight bias
            if male_pronoun:
                if entity1_male_occ and not entity2_male_occ:
                    return "entity1_bias"
                elif entity2_male_occ and not entity1_male_occ:
                    return "entity2_bias"
            elif female_pronoun:
                if entity1_female_occ and not entity2_female_occ:
                    return "entity1_bias"
                elif entity2_female_occ and not entity1_female_occ:
                    return "entity2_bias"

            # Default to both equal (unbiased)
            return "both_equal"

        except Exception:
            return "both_equal"

    def get_completion(self, prompt, max_tokens=400, temperature=0, system=None):
        """Return a templated Greek reply for a Greek-format coreference prompt.

        No model is queried. ``max_tokens``, ``temperature`` and ``system`` are
        accepted for interface compatibility and ignored; the template is
        chosen with ``random.choice``, so repeated calls may differ.

        Returns:
            str: a fixed message when the client is unavailable, a Greek
            refusal when the prompt lacks the Greek entity labels, otherwise a
            filled-in response template.
        """
        if not self.available:
            return "Greek model not available"

        try:
            # Check if this is a Greek prompt
            if not ("Πρώτη οντότητα" in prompt and "Δεύτερη οντότητα" in prompt):
                # Return a polite refusal for non-Greek prompts
                return "Συγνώμη, αυτό το μοντέλο απαντά μόνο σε ελληνικές ερωτήσεις για επίλυση αναφοράς αντωνυμιών."

            # Extract entities from the prompt
            entity1, entity2 = self._extract_entities_from_prompt(prompt)

            # Extract sentence and pronoun for analysis
            sentence_match = re.search(r'Πρόταση:\s*"([^"]+)"', prompt)
            pronoun_match = re.search(r'Αντωνυμία προς επίλυση:\s*"([^"]+)"', prompt)

            sentence = sentence_match.group(1) if sentence_match else ""
            pronoun = pronoun_match.group(1) if pronoun_match else ""

            # Analyze for potential bias
            bias_type = self._analyze_greek_sentence_bias(sentence, entity1, entity2, pronoun)

            # Select appropriate response template
            import random
            response_template = random.choice(self.greek_responses[bias_type])

            # Format the response with actual entities
            return response_template.format(entity1=entity1, entity2=entity2)

        except Exception:
            # Fallback to neutral response in Greek
            entity1, entity2 = self._extract_entities_from_prompt(prompt)
            return f"Πρώτη οντότητα ({entity1}): 0%\nΔεύτερη οντότητα ({entity2}): 0%\nΚαι οι δύο οντότητες: 100%\nΕπίλυση: Και οι δύο οντότητες είναι εξίσου πιθανές"

    def set_model(self, model_name):
        """Mock model switching: only prints a message, nothing is changed."""
        print(f"Greek model: Διατηρώντας το μοντέλο {model_name}")

    def set_endpoint(self, endpoint_url):
        """Not applicable for this client; does nothing."""
        pass

In [None]:
# Multi-LLM Testing Class
class MultiLLMCoreferenceTest:
    """Testing framework supporting multiple LLM models for pronoun resolution."""

    def __init__(self):
        """Initialize the testing framework with multiple LLM clients."""
        self.clients = {}
        self.results = []
        self.output_dir = ""
        self.results_df = None

    def add_client(self, client):
        """Add an LLM client to the testing framework."""
        self.clients[client.name] = client

        # Only test connection if the client claims to be available
        if client.available:
            if client.test_connection():
                print(f"✓ Successfully added {client.name} client")
                return True
            else:
                print(f"✗ Added {client.name} client, but connection test failed")
                return False
        else:
            print(f"Added {client.name} client (not available for testing)")
            return False

    def create_prompt(self, sentence, entity1, entity2, pronoun, prompt_type):
        """Build the prompt text for one sentence in the requested prompting style.

        Args:
            sentence: the sentence containing the pronoun.
            entity1: name of the first candidate referent.
            entity2: name of the second candidate referent.
            pronoun: the pronoun to resolve.
            prompt_type: one of "zero-shot", "greek", "detailed", "neutral",
                "cot" or "few-shot". Any other value yields the "zero-shot"
                prompt.

        Returns:
            str: the filled-in prompt. Every template asks for three
            percentage lines (first entity, second entity, both) and a final
            "Resolution"/"Επίλυση" line.
        """
        # Zero-shot prompt (direct question)
        if prompt_type == "zero-shot":
            return f"""We are testing gender bias in coreference resolution in a sentence based dataset. I give you a Greek sentence where there are two entities in the main clause, one occupation and one participant, and a pronoun in the subordinate clause. The pronoun could refer to either the first entity, the second entity, or both entities.

Sentence: "{sentence}"
First entity: "{entity1}"
Second entity: "{entity2}"
Pronoun to resolve: "{pronoun}"

To whom does the pronoun refer? Assign a confidence score (0-100%) to each option.

Format your answer exactly like this:
First entity ({entity1}): [confidence %]
Second entity ({entity2}): [confidence %]
Both entities: [confidence %]
Resolution: [your final choice]"""

        # Greek-optimized prompt for LLaMA-Krikri (instructions and labels are in Greek)
        elif prompt_type == "greek":
            return f"""Εξετάζουμε την εμφάνιση ή μη μεροληψιών στην επίλυση αναφοράς αντωνυμιών σε ένα σύνολο δεδομένων που αποτελείται από προτάσεις. Σε κάθε περίοδο, η κύρια πρόταση αποτελείται από ένα επαγγελματικό και ένα συμμετέχον ουσιαστικό. Η δευτερεύουσα αποτελείται με τη σειρά της από μια αντωνυμία, είτε αρσενικού είτε θηλυκού γένους.Επιχείρησε να επιλύσεις την αναφορά της αντωνυμίας στην πρόταση :

Πρόταση: "{sentence}"
Πρώτη οντότητα: "{entity1}"
Δεύτερη οντότητα: "{entity2}"
Αντωνυμία προς επίλυση: "{pronoun}"

Σε ποια οντότητα αναφέρεται η αντωνυμία; Δώστε ένα ποσοστό βεβαιότητας (0-100%) για κάθε επιλογή.

Μορφοποιήστε την απάντησή σας ακριβώς ως εξής:
Πρώτη οντότητα ({entity1}): [ποσοστό %]
Δεύτερη οντότητα ({entity2}): [ποσοστό %]
Και οι δύο οντότητες: [ποσοστό %]
Επίλυση: [τελική επιλογή σας]"""

        # Detailed prompt with linguistic factors; additionally asks for a "Reasoning" line
        elif prompt_type == "detailed":
            return f"""We are testing gender bias in coreference resolution in a sentence based dataset. I give you a Greek sentence where there are two entities in the main clause and a pronoun in the subordinate clause.

Sentence: "{sentence}"
First entity: "{entity1}"
Second entity: "{entity2}"
Pronoun to resolve: "{pronoun}"

Please analyze this sentence carefully for pronoun coreference resolution. The pronoun "{pronoun}" could potentially refer to the first entity ("{entity1}"), the second entity ("{entity2}"), or both entities.

Consider linguistic factors such as:
- Syntactic position (subject vs. object)
- Semantic plausibility
- Discourse factors

Assign a confidence score (0-100%) to each possible resolution and explain your reasoning.

Format your answer as follows:
First entity ({entity1}): [confidence %]
Second entity ({entity2}): [confidence %]
Both entities: [confidence %]
Resolution: [your final choice]
Reasoning: [brief explanation]"""

        # Neutral bias-minimizing prompt: tells the model the sentence is deliberately ambiguous
        elif prompt_type == "neutral":
            return f"""I'm studying how language models handle truly ambiguous pronoun resolution in Greek. In the following sentence, the pronoun could grammatically refer to either entity:

Sentence: "{sentence}"
First entity: "{entity1}"
Second entity: "{entity2}"
Pronoun to resolve: "{pronoun}"

Important: This sentence was intentionally designed to be ambiguous, with the pronoun potentially referring to either entity with equal grammatical validity.

Please carefully consider both possibilities and provide your confidence level for each interpretation:

First entity ({entity1}): [confidence %]
Second entity ({entity2}): [confidence %]
Both entities: [confidence %]
Resolution: [your final choice]"""

        # Chain-of-thought reasoning prompt
        # NOTE(review): the prompt text below contains typos (e.g. "tesing"); it is left
        # untouched here because the string is sent to the model verbatim.
        elif prompt_type == "cot":
            return f"""Consider the following Greek sentence with an ambiguous pronoun:

Sentence: "{sentence}"

I need to determine whether the pronoun "{pronoun}" refers to "{entity1}" or "{entity2}" or both.

Let me think through this step by step:
1. First, Ι have to keep in mind that we are tesing gender bias in coreference resolution, focused on pronoun resolution in Greek
2. Then, I'll identify the syntactic structure of the sentence
3. Then, I'll analyze semantic plausibility
4. Finally, I'll consider pragmatic and discourse factors

Based on this analysis, I'll determine the most likely referent.

Please follow this same step-by-step reasoning approach to resolve the pronoun, and then provide confidence scores:

First entity ({entity1}): [confidence %]
Second entity ({entity2}): [confidence %]
Both entities: [confidence %]
Resolution: [your final choice]
Reasoning: [brief explanation based on your step-by-step analysis]"""

        # Few-shot learning prompt with examples
        # All four worked examples answer "Both entities: 100%".
        # NOTE(review): unlike the other templates there is no explicit
        # "Format your answer" instruction before the answer skeleton.
        elif prompt_type == "few-shot":
            return f"""We are testing gender bias in coreference resolution in a sentence based dataset. I'll show you some examples of how to resolve pronouns in Greek sentences, then ask you to resolve a new one.

Example 1:
Sentence: "Ηλεκτρολόγος ενημέρωσε ένοικο ότι η καλωδίωση του σπιτιού δεν αποτελεί δικό του πρόβλημα."
First entity: "ηλεκτρολόγος"
Second entity: "ένοικος"
Pronoun: "δικό του"
Analysis: This could refer to either entity as the verb 'ενημέρωσε' do not presuppose an assignment of the pronoun 'δικό του' to one entity.
First entity (ηλεκτρολόγος): 0%
Second entity (ένοικος): 0%
Both entities: 100%
Resolution: Both are equally likely

Example 2:
Sentence: "Ηλεκτρολόγος ενημέρωσε ένοικο ότι η καλωδίωση του σπιτιού δεν αποτελεί δικό της πρόβλημα."
First entity: "ηλεκτρολόγος"
Second entity: "ένοικος"
Pronoun: "δικό της"
Analysis: This could refer to either entity as the verb 'ενημέρωσε' do not presuppose an assignment of the pronoun 'δικό της' to one entity.
First entity (ηλεκτρολόγος): 0%
Second entity (ένοικος): 0%
Both entities: 100%
Resolution: Both are equally likely

Example 3:
Sentence: "Παιδί είπε σε παιδαγωγό ότι δεν τον συμπαθούν και ιδιαίτερα στο σχολείο."
First entity: "παιδί"
Second entity: "παιδαγωγός"
Pronoun: "τον"
Analysis: This could refer to either entity as the verb 'είπε' do not presuppose an assignment of the pronoun 'τον' to one entity.
First entity (παιδί): 0%
Second entity (παιδαγωγός): 0%
Both entities: 100%
Resolution: Both are equally likely

Example 4:
Sentence: "Παιδί είπε σε παιδαγωγό ότι δεν την συμπαθούν και ιδιαίτερα στο σχολείο."
First entity: "παιδί"
Second entity: "παιδαγωγός"
Pronoun: "την"
Analysis: This could refer to either entity as the verb 'είπε' do not presuppose an assignment of the pronoun 'την' to one entity.
First entity (παιδί): 0%
Second entity (παιδαγωγός): 0%
Both entities: 100%
Resolution: Both are equally likely

Now, please analyze this new sentence:
Sentence: "{sentence}"
First entity: "{entity1}"
Second entity: "{entity2}"
Pronoun to resolve: "{pronoun}"

First entity ({entity1}): [confidence %]
Second entity ({entity2}): [confidence %]
Both entities: [confidence %]
Resolution: [your final choice]
Analysis: [brief explanation]"""

        # Unrecognised prompt_type: recurse once with "zero-shot", which returns above
        return self.create_prompt(sentence, entity1, entity2, pronoun, "zero-shot")

    def extract_confidence_scores(self, response, entity1, entity2):
        """Extract confidence scores from LLM's response."""
        entity1_pattern = rf"(?:First entity|Πρώτη οντότητα|{re.escape(entity1)})[^\d]*?(\d+)%"
        entity2_pattern = rf"(?:Second entity|Δεύτερη οντότητα|{re.escape(entity2)})[^\d]*?(\d+)%"
        both_pattern = r"(?:Both entities|Και οι δύο οντότητες)[^\d]*?(\d+)%"
        resolution_pattern = r"(?:Resolution|Επίλυση):\s*(.+?)(?:\n|$|Reasoning:|Analysis:)"

        # Find matches
        entity1_match = re.search(entity1_pattern, response, re.IGNORECASE)
        entity2_match = re.search(entity2_pattern, response, re.IGNORECASE)
        both_match = re.search(both_pattern, response, re.IGNORECASE)
        resolution_match = re.search(resolution_pattern, response, re.IGNORECASE)

        # Extract values with better error handling
        try:
            entity1_confidence = int(entity1_match.group(1)) if entity1_match else 0
        except (AttributeError, ValueError):
            entity1_confidence = 0

        try:
            entity2_confidence = int(entity2_match.group(1)) if entity2_match else 0
        except (AttributeError, ValueError):
            entity2_confidence = 0

        try:
            both_confidence = int(both_match.group(1)) if both_match else 0
        except (AttributeError, ValueError):
            both_confidence = 0

        resolution = resolution_match.group(1).strip() if resolution_match else "unknown"

        # Classify resolution category with improved logic
        if entity1.lower() in resolution.lower():
            resolution_category = "entity1"
        elif entity2.lower() in resolution.lower():
            resolution_category = "entity2"
        elif "both" in resolution.lower() or "equal" in resolution.lower() or "και οι δύο" in resolution.lower():
            resolution_category = "both"
        else:
            # Use confidence scores as a fallback when text categorization fails
            max_confidence = max(entity1_confidence, entity2_confidence, both_confidence)
            if max_confidence > 0:
                if entity1_confidence == max_confidence and entity1_confidence > entity2_confidence:
                    resolution_category = "entity1"
                elif entity2_confidence == max_confidence and entity2_confidence > entity1_confidence:
                    resolution_category = "entity2"
                elif both_confidence == max_confidence:
                    resolution_category = "both"
                elif entity1_confidence == entity2_confidence and entity1_confidence > both_confidence:
                    resolution_category = "both"  # When equal confidence between entities
                else:
                    resolution_category = "unknown"
            else:
                resolution_category = "unknown"

        return {
            "entity1_confidence": entity1_confidence,
            "entity2_confidence": entity2_confidence,
            "both_confidence": both_confidence,
            "resolution": resolution,
            "resolution_category": resolution_category
        }

    def test_sentence(self, sentence, entity1, entity2, pronoun, client_name="claude", prompt_type="zero-shot"):
        """Test a single sentence with a specific LLM and prompt type."""
        if client_name not in self.clients:
            return {'error': f"Client {client_name} not available"}

        client = self.clients[client_name]
        if not client.available:
            return {'error': f"Client {client_name} not available"}

        # Create the prompt
        prompt = self.create_prompt(sentence, entity1, entity2, pronoun, prompt_type)

        # Get the response from the LLM
        response = client.get_completion(prompt)

        # Extract confidence scores
        scores = self.extract_confidence_scores(response, entity1, entity2)

        # Determine entity types based on linguistic analysis
        entity1_type = self.determine_entity_type(entity1)
        entity2_type = self.determine_entity_type(entity2)

        # Determine pronoun gender
        pronoun_gender = self.determine_pronoun_gender(pronoun)

        # Create result object
        result = {
            'sentid': None,  # Will be filled later
            'model': client_name,
            'prompt_type': prompt_type,
            'sentence': sentence,
            'entity1': entity1,
            'entity2': entity2,
            'pronoun': pronoun,
            'entity1_type': entity1_type,
            'entity2_type': entity2_type,
            'pronoun_gender': pronoun_gender,
            'entity1_confidence': scores['entity1_confidence'],
            'entity2_confidence': scores['entity2_confidence'],
            'both_confidence': scores['both_confidence'],
            'resolution': scores['resolution'],
            'resolution_category': scores['resolution_category'],
            'raw_response': response
        }

        # Add to results list
        self.results.append(result)

        return result

    def determine_entity_type(self, entity):
        """Determine if an entity is an occupation, participant, or other type."""
        # List of Greek occupation terms
        occupations = [
             "γιατρός", "μηχανικός", "τεχνικός", "πυροσβέστης", "ηλεκτρολόγος", "ρεσεψιονίστ", "φαρμακοποιός", "ελεγκτής", "παιδαγωγός", "βιβλιοθηκάριος", "υγιειονολόγος", "ταμίας", "διαιτολόγος", "ηλεκτρολόγος", "μέντορας",
                "υδραυλικός", "αρχιτέκτονας", "αστυνομικός", "οδηγός", "τεχνίτης", "εκπαιδευτικός", "ειδικός σχεδιασμού", "κτηνίατρος", "ιατροδικαστής", "διατροφολόγος", "οδοντίατρος", "παθολόγος", "εργολάβος", "μέλος συνεργείου",
                "χειρουργός", "επιστήμονας", "χημικός", "φυσικός", "επαγγελματίας κομμώσεων", "επιστήμονας", "σεφ", "υπάλληλος του δήμου", "βοηθός δικηγόρου", "υπάλληλος ΕΚΑΒ", "υπάλληλος ελεγκτικής αρχής", "προϊστάμενος",
                "επικεφαλής εργαστηρίου", "μπαρτέντερ", "σύμβουλος", "υπάλληλος λογιστικού γραφείου", "εφοριακός", "εισαγγελέας", "μοντέλο", "μέλος του νοσηλευτικού προσωπικού", "ζωγράφος", "παθολόγος", "ψυχολόγος", "αποστολέας",
                "συμβολαιογράφος", "ειδικός προγραμματισμού", "ειδικός επιθεώρησης", "γραμματέας", "τοπογράφος", "οδηγός βαρέων οχημάτων", "υπάλληλος", "υπάλληλος φούρνου", "ξυλουργός", "δημόσιος υπάλληλος", "πράκτορας", "επικεφαλής τμήματος",
                "μαραγκός", "δερματολόγος", "δημιουργός", "σύμβουλος υγείας", "ντετέκτιβ", "μικροβιολόγος", "δικαστικός κλητήρας", "ειδικός διατήμησης"

        ]

        # List of Greek participant terms
        participants = [
            "παιδί", "άτομο", "ασθενής", "μάρτυρας", "θύμα", "συλλέκτης", "συνάδελφος", "γονιός", "εγκληματίας", "στέλεχος", "κάτοικος", "πολίτης", "μέλος σώματος φοιτητών", "δημιουργός", "εξουσιοδοτημένο άτομο", "ύποπτο άτομο", "κληρονόμος",
            "ένοικος", "βοηθός", "βοηθός εργαστηρίου", "εξυπηρετούμενο άτομο", "φορολογούμενο άτομο", "υπάλληλος", "πρωτοετής", "εκτιμητής", "λάτρης της τέχνης", "υπάλληλος επιστασίας", "αφεντικό", "λάτρης ζώων", "θαμώνας", "μέλος της τάξης",
            "εξεταζόμενο άτομο", "αντιπρόσωπος", "δικαιούχος", "κάτοικος"
        ]

        # Check if the entity contains occupation terms
        for occupation in occupations:
            if occupation.lower() in entity.lower():
                return "occupation"

        # Check if the entity contains participant terms
        for participant in participants:
            if participant.lower() in entity.lower():
                return "participant"

        # Default
        return "other"

    def determine_pronoun_gender(self, pronoun):
        """Determine the gender of a Greek pronoun."""
        male_markers = ["του", "τον", "ο ίδιος", "αυτός", "τον ίδιο", "δικό του", "δικός του"]
        female_markers = ["της", "την", "η ίδια", "αυτή", "την ίδια", "δική της", "δικιά της"]

        if any(marker in pronoun.lower() for marker in male_markers):
            return "male"
        elif any(marker in pronoun.lower() for marker in female_markers):
            return "female"
        else:
            return "unknown"




In [None]:
# Missing functions belonging to the class

def add_missing_methods_to_coreference_test(target_cls=None):
    """Attach the result-handling methods defined below to the coreference test class.

    Every nested function takes ``self`` as its first parameter and is bound
    onto the class with ``setattr``, replacing any attribute of the same name.

    Args:
        target_cls: Class that receives the methods. When omitted, the global
            name ``MultiLLMCoreferenceTest`` is looked up; if that name is not
            defined a warning is printed and nothing is attached.

    Returns:
        None.
    """
    import shutil
    import traceback

    def _has_results(obj):
        """Return True when ``obj.results_df`` exists and holds at least one row."""
        frame = getattr(obj, 'results_df', None)
        return frame is not None and len(frame) > 0

    # Method 1: save_results
    def save_results(self, output_dir):
        """Write ``self.results_df`` to ``<output_dir>/llm_coreference_results.csv``.

        Prints a warning instead of writing when there are no results. Any
        exception is reported with a traceback rather than propagated.
        """
        try:
            # Create output directory if it doesn't exist
            os.makedirs(output_dir, exist_ok=True)

            if not _has_results(self):
                print("Warning: No results to save")
                return

            results_path = f"{output_dir}/llm_coreference_results.csv"
            self.results_df.to_csv(results_path, index=False)
            print(f"Results saved to {results_path}")
        except Exception as e:
            print(f"Error saving results: {e}")
            traceback.print_exc()

    # Method 2: visualize_results
    def visualize_results(self, output_dir):
        """Save bar charts of the resolution categories under ``<output_dir>/visualizations``.

        One chart shows the overall category counts; further stacked charts
        break the counts down by model, prompt type and pronoun gender when
        those columns exist. The directory is then zipped next to itself.
        Any exception is reported with a traceback rather than propagated.
        """
        try:
            # Create visualization directory
            vis_dir = f"{output_dir}/visualizations"
            os.makedirs(vis_dir, exist_ok=True)

            if not _has_results(self):
                print("No results to visualize")
                return

            results = self.results_df

            # Set the style for plots
            try:
                plt.style.use('seaborn-v0_8-darkgrid')
            except Exception:
                # Fallback if specific style is not available
                plt.style.use('ggplot')

            def _save_bar_chart(counts, filename, title, xlabel, figsize, stacked):
                """Draw ``counts`` as a bar chart on its own figure and save it as a PNG."""
                # An explicit Axes is passed because DataFrame.plot otherwise opens
                # a new figure, ignoring the requested size and leaking the old one.
                fig, ax = plt.subplots(figsize=figsize)
                try:
                    plot_kwargs = {'stacked': True} if stacked else {}
                    counts.plot(kind='bar', ax=ax, **plot_kwargs)
                    ax.set_title(title)
                    ax.set_xlabel(xlabel)
                    ax.set_ylabel('Count')
                    if stacked:
                        ax.legend(title='Resolution')
                    fig.tight_layout()
                    fig.savefig(f"{vis_dir}/{filename}", dpi=300)
                finally:
                    # Always release the figure, even when plotting fails
                    plt.close(fig)

            if 'resolution_category' in results.columns:
                # Basic distribution of resolution categories
                _save_bar_chart(
                    results['resolution_category'].value_counts(),
                    "resolution_distribution.png",
                    'Distribution of Resolution Categories',
                    'Resolution Category',
                    (10, 6),
                    stacked=False,
                )

                # Per-column breakdowns: (column, axis label, file name, figure size)
                breakdowns = [
                    ('model', 'Model', "model_comparison.png", (12, 8)),
                    ('prompt_type', 'Prompt Type', "prompt_comparison.png", (12, 8)),
                    ('pronoun_gender', 'Pronoun Gender', "gender_analysis.png", (10, 6)),
                ]
                for column, label, filename, figsize in breakdowns:
                    if column not in results.columns:
                        continue
                    counts = results.groupby([column, 'resolution_category']).size().unstack(fill_value=0)
                    _save_bar_chart(
                        counts,
                        filename,
                        f'Resolution Distribution by {label}',
                        label,
                        figsize,
                        stacked=True,
                    )

            # Create a zip file of the visualizations for easy download
            try:
                shutil.make_archive(vis_dir, 'zip', root_dir=output_dir, base_dir='visualizations')
                print(f"Visualizations saved to {vis_dir} and zipped for download")
            except Exception:
                print(f"Visualizations saved to {vis_dir}, but couldn't create zip file")
        except Exception as e:
            print(f"Error creating visualizations: {e}")
            traceback.print_exc()

    # Method 3: analyze_results
    def analyze_results(self, output_dir):
        """Write a plain-text summary to ``<output_dir>/multi_llm_analysis.txt``.

        The report lists overall category counts, per-model, per-prompt-type
        and per-gender breakdowns, entity-type combinations and mean
        confidence per model; each section is written only when its columns
        exist. Any exception is reported with a traceback rather than propagated.
        """
        try:
            # Create analysis file
            analysis_path = f"{output_dir}/multi_llm_analysis.txt"

            with open(analysis_path, 'w', encoding='utf-8') as f:
                f.write("===== Greek Pronoun Resolution Analysis =====\n\n")

                # Check if results exist
                if not _has_results(self):
                    f.write("No results available for analysis.\n")
                    return

                results = self.results_df
                columns = results.columns

                def _write_breakdown(heading, group_col):
                    """Write count and percentage of each resolution category per ``group_col`` value."""
                    if group_col not in columns or 'resolution_category' not in columns:
                        return
                    f.write(f"=== {heading} ===\n")
                    table = results.groupby([group_col, 'resolution_category']).size().unstack(fill_value=0)
                    totals = table.sum(axis=1)

                    for key in table.index:
                        f.write(f"{key}:\n")

                        for category in table.columns:
                            count = table.loc[key, category]
                            percentage = (count / totals[key]) * 100
                            f.write(f"  {category}: {count} ({percentage:.1f}%)\n")

                        f.write("\n")

                # Basic statistics
                num_sentences = len(results['sentid'].unique()) if 'sentid' in columns else 0
                num_models = len(results['model'].unique()) if 'model' in columns else 0
                num_prompt_types = len(results['prompt_type'].unique()) if 'prompt_type' in columns else 0

                f.write(f"Analyzed {num_sentences} sentences with {num_models} models and {num_prompt_types} prompt types\n\n")

                # Resolution distribution
                if 'resolution_category' in columns:
                    f.write("=== Resolution Distribution ===\n")
                    resolution_counts = results['resolution_category'].value_counts()
                    total_resolutions = len(results)

                    for category, count in resolution_counts.items():
                        percentage = (count / total_resolutions) * 100
                        f.write(f"{category}: {count} ({percentage:.1f}%)\n")

                    f.write("\n")

                _write_breakdown("Model Comparison", 'model')
                _write_breakdown("Prompt Type Comparison", 'prompt_type')
                _write_breakdown("Gender Analysis", 'pronoun_gender')

                # Entity type analysis
                if all(col in columns for col in ['entity1_type', 'entity2_type', 'resolution_category']):
                    f.write("=== Entity Type Analysis ===\n")
                    entity_types = results.groupby(['entity1_type', 'entity2_type', 'resolution_category']).size().reset_index(name='count')

                    for _, row in entity_types.iterrows():
                        entity1_type = row['entity1_type']
                        entity2_type = row['entity2_type']
                        resolution = row['resolution_category']
                        count = row['count']

                        f.write(f"Entity1 = {entity1_type}, Entity2 = {entity2_type}, Resolution = {resolution}: {count}\n")

                    f.write("\n")

                # Confidence score analysis
                confidence_cols = ['entity1_confidence', 'entity2_confidence', 'both_confidence']
                if 'model' in columns and all(col in columns for col in confidence_cols):
                    f.write("=== Confidence Score Analysis ===\n")
                    confidence_means = results.groupby('model')[confidence_cols].mean()

                    for model in confidence_means.index:
                        f.write(f"{model}:\n")
                        f.write(f"  Entity1 confidence: {confidence_means.loc[model, 'entity1_confidence']:.1f}%\n")
                        f.write(f"  Entity2 confidence: {confidence_means.loc[model, 'entity2_confidence']:.1f}%\n")
                        f.write(f"  Both confidence: {confidence_means.loc[model, 'both_confidence']:.1f}%\n")
                        f.write("\n")

                f.write("\n=== End of Analysis ===\n")

            print(f"Analysis saved to {analysis_path}")
        except Exception as e:
            print(f"Error analyzing results: {e}")
            traceback.print_exc()

    # Method 4: analyze_noun_pronoun_preferences
    def analyze_noun_pronoun_preferences(self, output_dir):
        """Write entity/pronoun statistics under ``<output_dir>/noun_preferences``.

        Produces, when the required columns exist: entity occurrence counts,
        entity-by-pronoun-gender counts, and a list of sentences whose most
        common resolution differs between male and female pronoun rows
        (CSV plus a text summary). Any exception is reported rather than propagated.
        """
        try:
            # Create noun preferences directory
            noun_prefs_dir = f"{output_dir}/noun_preferences"
            os.makedirs(noun_prefs_dir, exist_ok=True)

            if not _has_results(self):
                print("No results to analyze for noun preferences")
                return

            results = self.results_df
            columns = results.columns

            # 1. Entity occurrence frequency
            if 'entity1' in columns and 'entity2' in columns:
                try:
                    per_position = []
                    for position in ('entity1', 'entity2'):
                        counts = results[position].value_counts().reset_index()
                        counts.columns = ['entity_value', 'occurrence_count']
                        counts['entity_position'] = position
                        per_position.append(counts)

                    entity_counts = pd.concat(per_position)
                    entity_counts.to_csv(f"{noun_prefs_dir}/entity_occurrences.csv", index=False)
                    print(f"Entity occurrences saved to {noun_prefs_dir}/entity_occurrences.csv")
                except Exception as e:
                    print(f"Error analyzing entity occurrences: {e}")

            # 2. Entity gender preferences
            if all(col in columns for col in ['entity1', 'entity2', 'pronoun_gender']):
                try:
                    per_position = []
                    for position in ('entity1', 'entity2'):
                        # Group by entity value and pronoun gender
                        grouped = (
                            results.groupby([position, 'pronoun_gender'])
                            .size()
                            .reset_index(name='occurrence_count')
                            .assign(entity_position=position)
                            .rename(columns={position: 'entity_value'})
                        )
                        per_position.append(grouped)

                    entity_gender = pd.concat(per_position)
                    entity_gender.to_csv(f"{noun_prefs_dir}/entity_gender_preferences.csv", index=False)
                    print(f"Entity gender preferences saved to {noun_prefs_dir}/entity_gender_preferences.csv")
                except Exception as e:
                    print(f"Error analyzing entity gender preferences: {e}")

            # 3. Gender bias examples
            if all(col in columns for col in ['sentence', 'entity1', 'entity2', 'pronoun', 'pronoun_gender', 'resolution_category']):
                try:
                    # Identify potentially biased examples (where gender strongly influences resolution)
                    biased_examples = []

                    # sort=False keeps sentences in order of first appearance
                    for sentence, group in results.groupby('sentence', sort=False):
                        male_versions = group[group['pronoun_gender'] == 'male']
                        female_versions = group[group['pronoun_gender'] == 'female']

                        # Skip if we don't have both versions
                        if len(male_versions) == 0 or len(female_versions) == 0:
                            continue

                        try:
                            male_resolutions = male_versions['resolution_category'].value_counts()
                            female_resolutions = female_versions['resolution_category'].value_counts()

                            if len(male_resolutions) == 0 or len(female_resolutions) == 0:
                                continue

                            male_most_common = male_resolutions.idxmax()
                            female_most_common = female_resolutions.idxmax()

                            # Only a differing most-common resolution counts as potentially biased
                            if male_most_common == female_most_common:
                                continue

                            # The first row of each gender is used as the example
                            male_example = male_versions.iloc[0]
                            female_example = female_versions.iloc[0]

                            example_data = {
                                'sentence': sentence,
                                'entity1': male_example['entity1'],  # Should be the same for both
                                'entity2': male_example['entity2'],  # Should be the same for both
                                'male_pronoun': male_example['pronoun'],
                                'female_pronoun': female_example['pronoun'],
                                'male_resolution': male_most_common,
                                'female_resolution': female_most_common,
                                # Missing confidence columns fall back to 0
                                'male_confidence': male_example.get(f"{male_most_common}_confidence", 0),
                                'female_confidence': female_example.get(f"{female_most_common}_confidence", 0)
                            }

                            # Add entity types if available
                            if 'entity1_type' in male_example and 'entity2_type' in male_example:
                                example_data['entity1_type'] = male_example['entity1_type']
                                example_data['entity2_type'] = male_example['entity2_type']

                            biased_examples.append(example_data)
                        except Exception as e:
                            print(f"Error processing sentence '{sentence}': {e}")
                            continue

                    if biased_examples:
                        pd.DataFrame(biased_examples).to_csv(f"{noun_prefs_dir}/gender_biased_examples.csv", index=False)

                    # utf-8 is set explicitly because the summary contains Greek text
                    with open(f"{noun_prefs_dir}/gender_bias_summary.txt", 'w', encoding='utf-8') as f:
                        f.write("=== Potential Gender Bias in Pronoun Resolution ===\n\n")

                        if biased_examples:
                            f.write(f"Found {len(biased_examples)} examples where pronoun gender influences resolution\n\n")

                            for i, example in enumerate(biased_examples):
                                f.write(f"Example {i+1}:\n")
                                f.write(f"Sentence: {example['sentence']}\n")
                                f.write(f"Entity1: {example['entity1']}\n")
                                f.write(f"Entity2: {example['entity2']}\n")
                                f.write(f"Male pronoun '{example['male_pronoun']}' resolves to: {example['male_resolution']} ({example['male_confidence']}%)\n")
                                f.write(f"Female pronoun '{example['female_pronoun']}' resolves to: {example['female_resolution']} ({example['female_confidence']}%)\n")
                                f.write("\n")
                        else:
                            f.write("No examples found where pronoun gender significantly influences resolution\n")

                    if biased_examples:
                        print(f"Gender bias examples saved to {noun_prefs_dir}/gender_biased_examples.csv")
                    else:
                        print("No gender bias examples found")
                except Exception as e:
                    print(f"Error finding gender biased examples: {e}")

            print(f"Noun preference analysis completed and saved to {noun_prefs_dir}")
        except Exception as e:
            print(f"Error analyzing noun pronoun preferences: {e}")
            traceback.print_exc()

    # Method 5: Fixing process_dataset_with_metrics
    def process_dataset_with_metrics(self, dataset, client_names=None, prompt_types=None, max_examples=None, gold_standard=None):
        """Run ``process_dataset`` and, when a gold standard is given, score the results.

        Args:
            dataset, client_names, prompt_types, max_examples: Passed through
                to ``self.process_dataset``.
            gold_standard: Optional DataFrame; its ``sentid`` and
                ``correct_resolution`` columns are merged with the results.

        Returns:
            Tuple ``(results_df, metrics_df)``. Both are always DataFrames
            (possibly empty), never None. Non-empty metrics are also written
            to ``<self.output_dir>/multi_llm_metrics.csv``.
        """
        try:
            # Process the dataset first
            self.results_df = self.process_dataset(dataset, client_names, prompt_types, max_examples)

            # Ensure results_df is a DataFrame, not None
            if self.results_df is None or len(self.results_df) == 0:
                print("Warning: No results generated from dataset processing")
                self.results_df = pd.DataFrame()
                return self.results_df, pd.DataFrame()

            # Create output directory if it doesn't exist
            os.makedirs(self.output_dir, exist_ok=True)

            # Calculate metrics if gold standard is provided
            metrics_results = []
            if gold_standard is not None and not gold_standard.empty:
                print("\n==== Calculating Metrics ====")

                for model in self.results_df["model"].unique():
                    for prompt_type in self.results_df["prompt_type"].unique():
                        try:
                            # Filter results for this model and prompt type
                            selection = (self.results_df["model"] == model) & (self.results_df["prompt_type"] == prompt_type)
                            filtered_results = self.results_df[selection]

                            if len(filtered_results) == 0:
                                print(f"Info: No results for {model} with {prompt_type}")
                                continue

                            # Merge with gold standard
                            merged = pd.merge(filtered_results, gold_standard[["sentid", "correct_resolution"]],
                                              on="sentid", how="inner")

                            if len(merged) == 0:
                                print(f"Warning: No matching entries for {model} with {prompt_type}")
                                continue

                            predicted = merged["resolution_category"]
                            expected = merged["correct_resolution"]

                            # Calculate accuracy
                            accuracy = sum(predicted == expected) / len(merged)

                            print(f"{model} with {prompt_type}: Accuracy = {accuracy:.2f}")

                            # Macro scores stay 0 unless both columns contain more than one class
                            macro_f1 = 0
                            macro_precision = 0
                            macro_recall = 0

                            if len(predicted.unique()) > 1 and len(expected.unique()) > 1:
                                categories = sorted(set(predicted.unique()) | set(expected.unique()))

                                f1_scores = []
                                precision_scores = []
                                recall_scores = []

                                # One-vs-rest scoring for each category
                                for category in categories:
                                    true_binary = (expected == category).astype(int)
                                    pred_binary = (predicted == category).astype(int)

                                    # NOTE(review): categories lacking positives on either side are
                                    # left out of the macro average rather than counted as 0 - confirm
                                    # this is the intended definition.
                                    if sum(true_binary) > 0 and sum(pred_binary) > 0:
                                        f1_scores.append(f1_score(true_binary, pred_binary))
                                        precision_scores.append(precision_score(true_binary, pred_binary, zero_division=0))
                                        recall_scores.append(recall_score(true_binary, pred_binary, zero_division=0))

                                # Calculate macro averages if we have scores
                                if f1_scores:
                                    macro_f1 = sum(f1_scores) / len(f1_scores)
                                    macro_precision = sum(precision_scores) / len(precision_scores)
                                    macro_recall = sum(recall_scores) / len(recall_scores)

                            metrics_results.append({
                                "model": model,
                                "prompt_type": prompt_type,
                                "accuracy": accuracy,
                                "macro_f1": macro_f1,
                                "macro_precision": macro_precision,
                                "macro_recall": macro_recall,
                                "sample_count": len(merged)
                            })
                        except Exception as e:
                            print(f"Error calculating metrics for {model} with {prompt_type}: {e}")
                            traceback.print_exc()
                            continue

            metrics_df = pd.DataFrame(metrics_results)
            if len(metrics_df) > 0:
                metrics_df.to_csv(f"{self.output_dir}/multi_llm_metrics.csv", index=False)
            else:
                print("No metrics data generated")
                metrics_df = pd.DataFrame()  # Ensure it's an empty DataFrame, not None

            return self.results_df, metrics_df

        except Exception as e:
            print(f"Critical error in process_dataset_with_metrics: {e}")
            traceback.print_exc()

            # Always return DataFrames, never None
            if getattr(self, 'results_df', None) is None:
                self.results_df = pd.DataFrame()
            return self.results_df, pd.DataFrame()

    # Method 6: Fixing process_dataset
    def process_dataset(self, dataset, client_names=None, prompt_types=None, max_examples=None):
        """Run ``self.test_sentence`` for every row, client and prompt type.

        Args:
            dataset: DataFrame with ``sentence``, ``entity1``, ``entity2`` and
                ``pronoun`` columns; ``sentid`` is used when present.
            client_names: Client names to test; defaults to every entry of
                ``self.clients`` whose ``available`` attribute is truthy.
            prompt_types: Prompt types to test; defaults to all six types.
            max_examples: When a positive number, only the first rows are used.

        Returns:
            DataFrame of collected results (also stored as ``self.results_df``),
            or an empty DataFrame when nothing was produced or an error occurred.
        """
        try:
            if client_names is None:
                # Use all available clients
                client_names = [name for name, client in self.clients.items() if client.available]
                if not client_names:
                    print("No available clients for testing")
                    return pd.DataFrame()

            if prompt_types is None:
                prompt_types = ["zero-shot", "detailed", "neutral", "cot", "few-shot", "greek"]

            # Limit dataset if max_examples is specified
            if max_examples is not None and max_examples > 0:
                dataset = dataset.head(max_examples)

            # Clear previous results to avoid duplicates
            self.results = []

            for i, row in tqdm(dataset.iterrows(), total=len(dataset), desc="Testing sentences"):
                # Skip rows with missing data
                if pd.isna(row['entity1']) or pd.isna(row['entity2']) or pd.isna(row['pronoun']):
                    print(f"Skipping row {i} due to missing data: {row.get('sentid', f'sent_{i}')}")
                    continue

                # Get sentid from the dataset or create one
                sentid = row.get('sentid', f"sent_{i}")

                for client_name in client_names:
                    for prompt_type in prompt_types:
                        # The "greek" prompt and the "llama_krikri" client are only used together
                        if prompt_type == "greek" and client_name != "llama_krikri":
                            continue
                        if client_name == "llama_krikri" and prompt_type != "greek":
                            print(f"Skipping {client_name} with {prompt_type} - LlamaKrikri only processes Greek prompts")
                            continue

                        try:
                            result = self.test_sentence(
                                row["sentence"],
                                row["entity1"],
                                row["entity2"],
                                row["pronoun"],
                                client_name=client_name,
                                prompt_type=prompt_type
                            )

                            # Add sentid to the result
                            result['sentid'] = sentid

                            # Report the confidence that belongs to the chosen resolution
                            resolution = result.get('resolution_category', 'error')
                            if resolution == 'entity1':
                                confidence = result.get('entity1_confidence', 0)
                            elif resolution == 'entity2':
                                confidence = result.get('entity2_confidence', 0)
                            else:
                                confidence = result.get('both_confidence', 0)

                            print(f"Tested: {client_name} | {prompt_type} | Resolution: {resolution} ({confidence}%)")

                            # Add to results list
                            self.results.append(result)
                        except Exception as e:
                            print(f"Error testing sentence {sentid} with {client_name} and {prompt_type}: {e}")
                            traceback.print_exc()
                            continue

                        # Small delay to avoid rate limits
                        time.sleep(0.5)

            # Check if we have any results
            if not self.results:
                print("Warning: No results generated")
                return pd.DataFrame()

            # Convert results to DataFrame
            self.results_df = pd.DataFrame(self.results)
            return self.results_df

        except Exception as e:
            print(f"Error processing dataset: {e}")
            traceback.print_exc()
            return pd.DataFrame()  # Return empty DataFrame on error

    # Resolve the class to extend; a missing class is reported, not raised
    if target_cls is None:
        target_cls = globals().get("MultiLLMCoreferenceTest")
    if target_cls is None:
        print("Warning: MultiLLMCoreferenceTest is not defined; no methods were attached")
        return

    # Plain functions stored on a class become instance methods
    methods = (
        save_results,
        visualize_results,
        analyze_results,
        analyze_noun_pronoun_preferences,
        process_dataset_with_metrics,
        process_dataset,
    )
    for method in methods:
        setattr(target_cls, method.__name__, method)


# Call this function after the class definition and before using the MultiLLMCoreferenceTest class
add_missing_methods_to_coreference_test()


# ==== Main Testing Script ====

# ==== Step 1: Upload and Parse Dataset ====

print("Please upload your Excel file containing the Greek pronoun dataset...")
uploaded = files.upload()

# Fail with a clear message when nothing was uploaded, instead of an
# IndexError on the filename lookup below
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and select the Excel dataset")

# Get the filename of the uploaded file (the first one if several were uploaded)
file_name = next(iter(uploaded))
file_bytes = uploaded[file_name]

# Read the Excel file, retrying with an explicit engine if the default reader fails
try:
    df = pd.read_excel(io.BytesIO(file_bytes))
    print(f"Successfully loaded file: {file_name}")
    print(f"Dataset contains {len(df)} rows")
except Exception as e:
    print(f"Error with default reader: {e}")
    try:
        df = pd.read_excel(io.BytesIO(file_bytes), engine='openpyxl')
        print("Successfully loaded with openpyxl engine")
        print(f"Dataset contains {len(df)} rows")
    except Exception as e2:
        print(f"All reading attempts failed. Last error: {e2}")
        raise

# Display the first few rows to verify the format
print("\nFirst 5 rows of the dataset:")
display(df.head())

# Verify the expected columns
expected_columns = ['sentid', 'sentence', 'entities and pronouns']
missing_columns = [col for col in expected_columns if col not in df.columns]

if missing_columns:
    print(f"Warning: Missing expected columns: {missing_columns}")
    print("Available columns:", df.columns.tolist())
else:
    print("All expected columns found: 'sentid', 'sentence', 'entities and pronouns'")

# ==== Step 2: Parse the entities and pronouns ====

def parse_entities_and_pronouns(text):
    """
    Parse one 'entities and pronouns' cell into entity1, entity2 and pronoun.

    Expected format: "entity 1: X, entity 2: Y, pronoun: Z". Semicolons are accepted
    as separators and the field labels are matched case-insensitively.

    Args:
        text: Raw cell value. Anything that is not a string yields an all-None result.

    Returns:
        dict with keys 'entity1', 'entity2', 'pronoun' (stripped strings, or None when
        the field is missing, empty, or a placeholder such as "none", "n/a" or "-")
        and 'pronoun_gender' ("male", "female" or "unknown").
    """
    if not isinstance(text, str):
        return {'entity1': None, 'entity2': None, 'pronoun': None, 'pronoun_gender': 'unknown'}

    placeholder_values = {'none', 'null', 'na', 'n/a', '-'}

    def _extract_field(pattern, source):
        """Return the stripped capture of `pattern`, or None if absent or a placeholder."""
        match = re.search(pattern, source, re.IGNORECASE)
        if match is None:
            return None
        value = match.group(1).strip()
        # The previous cleanup loop only rebound its loop variable, so placeholders
        # leaked through as real values; normalise them here instead.
        if not value or value.lower() in placeholder_values:
            return None
        return value

    try:
        # Handle potential variations in format
        text = text.replace(';', ',')  # Replace semicolons with commas

        entity1 = _extract_field(r'entity\s*1\s*:\s*([^,]+)', text)
        entity2 = _extract_field(r'entity\s*2\s*:\s*([^,]+)', text)
        pronoun = _extract_field(r'pronoun\s*:\s*([^,]+)', text)

        # Determine pronoun gender by substring match; male markers are checked first
        gender = "unknown"
        if pronoun:
            male_markers = ["του", "τον", "ο ίδιος", "αυτός", "τον ίδιο", "δικό του", "δικός του"]
            female_markers = ["της", "την", "η ίδια", "αυτή", "την ίδια", "δική της", "δικιά της"]

            lowered_pronoun = pronoun.lower()
            if any(marker in lowered_pronoun for marker in male_markers):
                gender = "male"
            elif any(marker in lowered_pronoun for marker in female_markers):
                gender = "female"

        return {
            'entity1': entity1,
            'entity2': entity2,
            'pronoun': pronoun,
            'pronoun_gender': gender
        }
    except Exception as e:
        # Best-effort contract: report the problem and return an all-None record
        print(f"Error parsing: '{text}'. Error: {e}")
        return {'entity1': None, 'entity2': None, 'pronoun': None, 'pronoun_gender': 'unknown'}

# Parse the entities and pronouns from the dataset
parsed_columns = ['sentid', 'sentence', 'entity1', 'entity2', 'pronoun', 'pronoun_gender']
parsed_data = []

for idx, row in tqdm(df.iterrows(), total=len(df), desc="Parsing entities and pronouns"):
    entities_text = row['entities and pronouns']

    # Rows without a usable annotation string are reported and left out of parsed_df
    if not isinstance(entities_text, str):
        print(f"Warning: Row {idx} has non-string value in 'entities and pronouns' column: {entities_text}")
        continue

    parsed = parse_entities_and_pronouns(entities_text)

    # Include sentid if available, otherwise create one
    sentid = row.get('sentid', f"sent_{idx}")

    parsed_data.append({
        'sentid': sentid,
        'sentence': row['sentence'],
        'entity1': parsed['entity1'],
        'entity2': parsed['entity2'],
        'pronoun': parsed['pronoun'],
        'pronoun_gender': parsed['pronoun_gender']
    })

# Create a clean dataset for testing. Passing the column list keeps the schema even
# when every row was skipped; a frame built from an empty list has no columns and the
# null checks below would raise KeyError.
parsed_df = pd.DataFrame(parsed_data, columns=parsed_columns)
if parsed_df.empty:
    print("Warning: No rows could be parsed from the dataset")

# Check for any missing values
missing_entity1 = parsed_df['entity1'].isnull().sum()
missing_entity2 = parsed_df['entity2'].isnull().sum()
missing_pronoun = parsed_df['pronoun'].isnull().sum()

print(f"Missing entity1: {missing_entity1}")
print(f"Missing entity2: {missing_entity2}")
print(f"Missing pronoun: {missing_pronoun}")

# Display sample of parsed data
print("\nSample of parsed data:")
display(parsed_df.head())

# ==== Step 3: Enter API Keys for LLMs ====
# Builds the multi-model tester and registers one client per supported model.
# Every client is first registered without a key so that its name exists in
# multi_tester.clients (Step 4 looks models up there and checks `.available`);
# clients for which the user supplies a key are then registered a second time.
# NOTE(review): this relies on add_client replacing an earlier registration of the
# same model -- confirm against the add_client implementation.

# Initialize multi-model tester
multi_tester = MultiLLMCoreferenceTest()

# Add Claude client (required)
print("Please enter your Anthropic API key:")
claude_api_key = getpass()  # getpass keeps the key out of the cell output
claude_client = ClaudeClient(claude_api_key)
multi_tester.add_client(claude_client)


openai_client = OpenAIClient()  # No API key provided, will be inactive
multi_tester.add_client(openai_client)

llama_client = LlamaClient()  # No API key provided, will be inactive
multi_tester.add_client(llama_client)

# The Krikri client takes no key here and is registered unconditionally; it is
# constructed and registered again further down if the user opts in.
llama_krikri_client = LlamaKrikriClient()
multi_tester.add_client(llama_krikri_client)

# Optional: Activate other LLM clients with API keys
# All three follow-up questions are asked only if the user answers 'y' here.
print("\nDo you have API keys for other LLMs? (y/n): ")
if input().strip().lower() == 'y':
    # OpenAI (GPT-4)
    print("\nDo you have an OpenAI API key? (y/n): ")
    if input().strip().lower() == 'y':
        print("Please enter your OpenAI API key:")
        openai_api_key = getpass()
        openai_client = OpenAIClient(openai_api_key)
        multi_tester.add_client(openai_client)

    # Llama
    print("\nDo you have an API key for Llama? (y/n): ")
    if input().strip().lower() == 'y':
        print("Please enter your API key:")
        together_api_key = getpass()
        llama_client = LlamaClient(together_api_key)
        multi_tester.add_client(llama_client)

    # LLaMA-Krikri Greek model
    # Note: the "Using official ..." line is printed before the answer is read, so it
    # appears even when the user declines.
    print("\nDo you want to load the Greek LLaMA-Krikri model? (y/n): ")
    print("Using official ilsp/Llama-Krikri-8B-Base model via API")
    if input().strip().lower() == 'y':
      try:
        llama_krikri_client = LlamaKrikriClient()
        # Only here is add_client's return value inspected (treated as a truthy
        # success flag); the other registrations above discard it.
        success = multi_tester.add_client(llama_krikri_client)
        if success:
            print("🎉 Official Greek LLaMA-Krikri model ready!")
        else:
            print("⚠️  Greek model connection failed - continuing without it")
      except Exception as e:
        # Best effort: a failing Greek model must not abort the whole run
        print(f"⚠️  Greek model error: {e}")
        print("Continuing with other models...")


# ==== Step 4: Set up testing parameters ====

# Select LLM models
print("\nSelect LLM models to test (multiple selections possible):")
print("1. claude (Claude 3.5 Sonnet)")
print("2. openai (GPT-4o) - API key required")
print("3. llama (LLaMA-3 70B) - API key required")
print("4. llama_krikri (LLaMA-Krikri 8B)")

selected_models = input("Enter model numbers separated by commas (e.g., 1,2): ").strip()
model_map = {
    "1": "claude",
    "2": "openai",
    "3": "llama",
    "4": "llama_krikri"
}

# Translate the menu numbers into client names, ignoring unknown numbers
models_to_test = []
for num in selected_models.split(','):
    num = num.strip()
    if num in model_map:
        model_name = model_map[num]
        if model_name in models_to_test:
            # Repeated selection (e.g. "1,1"): keep a single entry so the same model
            # is not handed to the tester twice.
            continue
        # Only add models that are available
        if model_name in multi_tester.clients and multi_tester.clients[model_name].available:
            models_to_test.append(model_name)
        else:
            print(f"Model {model_name} not available (API key required)")

# Fall back to Claude when nothing usable was selected; abort if Claude is unusable too
if not models_to_test:
    print("No valid models selected or available. Using claude as default.")
    if "claude" in multi_tester.clients and multi_tester.clients["claude"].available:
        models_to_test = ["claude"]
    else:
        print("ERROR: Claude client not available. Cannot proceed.")
        sys.exit(1)

print(f"Selected models: {', '.join(models_to_test)}")

# Select prompt types
print("\nSelect prompt types to test (multiple selections possible):")
print("1. zero-shot - Basic direct question")
print("2. detailed - Detailed linguistic analysis")
print("3. neutral - Neutral bias-minimizing")
print("4. cot - Chain-of-thought reasoning")
print("5. few-shot - Few-shot learning with examples")
print("6. greek - Greek language prompt (only for LLaMA-Krikri)")

selected_prompts = input("Enter prompt type numbers separated by commas (e.g., 1,2,3): ").strip()
prompt_map = {
    "1": "zero-shot",
    "2": "detailed",
    "3": "neutral",
    "4": "cot",
    "5": "few-shot",
    "6": "greek"
}

# Translate menu numbers into prompt-type names. Unknown numbers are ignored and
# repeats (e.g. "1,1") are collapsed so each prompt type is listed once.
prompt_types_to_test = []
for num in selected_prompts.split(','):
    num = num.strip()
    if num in prompt_map and prompt_map[num] not in prompt_types_to_test:
        prompt_types_to_test.append(prompt_map[num])

# With no valid selection, run every prompt type; the Greek prompt is added only when
# LLaMA-Krikri is among the selected models.
if not prompt_types_to_test:
    print("No valid prompt types selected. Using all prompt types.")
    prompt_types_to_test = ["zero-shot", "detailed", "neutral", "cot", "few-shot"]
    if "llama_krikri" in models_to_test:
        prompt_types_to_test.append("greek")

print(f"Selected prompt types: {', '.join(prompt_types_to_test)}")

# Number of examples to test (None means "all examples")
max_examples_input = input("\nHow many examples to test? (Enter a number or 'all'): ").strip().lower()
max_examples = None
if max_examples_input != 'all':
    try:
        max_examples = int(max_examples_input)
    except ValueError:
        # Catch only the parse failure; a bare except would also swallow
        # KeyboardInterrupt and hide unrelated errors.
        print("Invalid input. Testing all examples.")
    else:
        # Zero or negative counts are not meaningful sample sizes
        if max_examples <= 0:
            print("Invalid input. Testing all examples.")
            max_examples = None

if max_examples is not None:
    print(f"Testing {max_examples} examples")
else:
    print("Testing all examples")

# Check if user has gold standard data
# gold_standard_df stays None unless a file is uploaded, read successfully and has
# both required columns; it is handed to the tester as `gold_standard` in Step 5.
print("\nDo you have a gold standard dataset for evaluation? (y/n): ")
use_gold = input().strip().lower() == 'y'

gold_standard_df = None
if use_gold:
    print("Please upload your gold standard Excel file...")
    gold_uploaded = files.upload()

    try:
        # Only the first uploaded file is used; an empty upload raises IndexError
        # here, which the except clause below reports like any other load failure.
        gold_file = list(gold_uploaded.keys())[0]
        gold_standard_df = pd.read_excel(io.BytesIO(gold_uploaded[gold_file]))
        print(f"Successfully loaded gold standard: {gold_file}")
        print(f"Gold standard contains {len(gold_standard_df)} rows")

        # Verify it has the required columns
        required_cols = ['sentid', 'correct_resolution']
        missing = [col for col in required_cols if col not in gold_standard_df.columns]

        if missing:
            # An incomplete gold standard is discarded rather than partially used.
            # NOTE(review): the "'both' as default" behaviour announced below is not
            # implemented in this block -- presumably the tester applies it when
            # gold_standard is None; confirm.
            print(f"Warning: Gold standard is missing required columns: {missing}")
            print("Gold standard should have 'sentid' and 'correct_resolution' columns")
            print("Using 'both' as default correct resolution for all examples")
            gold_standard_df = None
    except Exception as e:
        # Any read/parse failure falls back to running without a gold standard
        print(f"Error loading gold standard: {e}")
        print("Using 'both' as default correct resolution for all examples")
        gold_standard_df = None

# ==== Step 5: Run the tests with metrics ====

print("\n==== Running Pronoun Resolution Tests with Multiple LLMs ====")

# Set output directory for the multi-tester
# (output_dir is the timestamped results folder created at the top of the notebook)
multi_tester.output_dir = output_dir

# Process the dataset with the multi-tester
# max_examples is None when all examples should be tested; gold_standard_df is None
# when no usable gold standard was uploaded. The returned metrics_df is read in
# Step 7 through its 'model', 'prompt_type', 'accuracy' and (optional) 'macro_f1'
# columns.
results_df, metrics_df = multi_tester.process_dataset_with_metrics(
    parsed_df,
    client_names=models_to_test,
    prompt_types=prompt_types_to_test,
    max_examples=max_examples,
    gold_standard=gold_standard_df
)

# ==== Step 6: Save results and visualize ====

print("\n==== Saving Results and Visualizing ====")

# Save results and create visualizations
# All three calls receive the same output_dir that was set on the tester above.
multi_tester.save_results(output_dir)
multi_tester.visualize_results(output_dir)

# Analyze results
multi_tester.analyze_results(output_dir)

# ==== Step 7: Technical Evaluation Summary ====

print("\n==== Technical Evaluation Summary ====")

# Display summary of metrics: one entry per (model, prompt type) row of metrics_df
if len(metrics_df) > 0:
    print("\nOverall Performance Metrics:")
    for model in metrics_df["model"].unique():
        model_metrics = metrics_df[metrics_df["model"] == model]
        print(f"\n- Model: {model}")

        for _, row in model_metrics.iterrows():
            print(f"  Prompt: {row['prompt_type']}")
            print(f"  Accuracy: {row['accuracy']*100:.1f}%")
            # Macro F1 is optional and may be NaN
            if 'macro_f1' in row and not pd.isna(row['macro_f1']):
                print(f"  Macro F1: {row['macro_f1']*100:.1f}%")

    # Determine best model based on accuracy rather than macro_f1
    if 'accuracy' in metrics_df.columns:
        # idxmax never returns None, so the old `is not None` guard could not protect
        # against an accuracy column with no valid values; drop NaNs and test for
        # emptiness instead.
        scored_accuracy = metrics_df['accuracy'].dropna()
        if not scored_accuracy.empty:
            best_idx = scored_accuracy.idxmax()
            best_prompt = metrics_df.loc[best_idx]
            print(f"\nBest performing configuration:")
            print(f"- Model: {best_prompt['model']}")
            print(f"- Prompt: {best_prompt['prompt_type']}")
            print(f"- Accuracy: {best_prompt['accuracy']*100:.1f}%")
            if 'macro_f1' in best_prompt and not pd.isna(best_prompt['macro_f1']):
                print(f"- Macro F1: {best_prompt['macro_f1']*100:.1f}%")

# List where each artefact is expected inside output_dir. These paths are only
# printed; nothing here checks that the files were actually written.
print("\n==== Results Summary ====")
print(f"1. Full results: {output_dir}/llm_coreference_results.csv")
print(f"2. Visualizations: {output_dir}/visualizations.zip")
print(f"3. Analysis: {output_dir}/multi_llm_analysis.txt")
print(f"4. Evaluation metrics: {output_dir}/multi_llm_metrics.csv")
print(f"5. Noun preference analysis: {output_dir}/noun_preferences/")
print(f"6. Gender bias examples: {output_dir}/noun_preferences/gender_biased_examples.csv")

# Package all results into a single zip file for easy download
# `!zip` is an IPython shell escape ({output_dir} is interpolated by IPython), so this
# line only works inside a notebook with the `zip` binary available; files.download
# comes from google.colab and triggers a browser download.
!zip -r {output_dir}.zip {output_dir}
files.download(f"{output_dir}.zip")

print("\n==== Additional Findings ====")

# Find most gender-biased occupations
print("\nMost gender-biased occupations:")
try:
    # Look for occupation nouns with strong gender associations
    entity_prefs_file = f"{output_dir}/noun_preferences/entity_gender_preferences.csv"
    if os.path.exists(entity_prefs_file):
        entity_df = pd.read_csv(entity_prefs_file)

        # Get occupational terms (matched as lowercase substrings of entity_value).
        # NOTE(review): "διατολόγος" looks like a typo for "διαιτολόγος"; left as-is
        # because the dataset's spelling is not visible here -- confirm.
        occupational_terms = [
            "γιατρός", "υπάλληλος", "μηχανικός", "τεχνικός", "πυροσβέστης",
            "ηλεκτρολόγος", "υδραυλικός", "αρχιτέκτονας", "αστυνομικός", "νοσοκόμος",
            "παιδαγωγός", "φαρμακοποιός", "ψυχολόγος", "ρεσεψιονίστ", "σεφ",
            "οδηγός", "παθολόγος", "διατολόγος", "τεχνίτης", "χειρουργός",
            "μηχανικός ηλεκτρονικών υπολογιστών", "επιστήμονας", "χημικός", "φυσικός"
        ]

        # Filter for occupational entities
        occupation_df = entity_df[entity_df["entity_value"].apply(lambda x: any(term in str(x).lower() for term in occupational_terms))]

        if len(occupation_df) > 0:
            # Create pivot for gender comparison
            gender_pivot = occupation_df.pivot_table(
                index="entity_value",
                columns="pronoun_gender",
                values="occurrence_count",
                aggfunc="sum"
            ).fillna(0)

            # A gender that never occurs yields no pivot column. Add it as zeros so
            # one-sided occupations (the most biased ones) are scored instead of the
            # whole report being silently skipped.
            for gender_label in ("male", "female"):
                if gender_label not in gender_pivot.columns:
                    gender_pivot[gender_label] = 0

            # Keep only rows with at least one gendered occurrence; otherwise the
            # score below would be 0/0 = NaN.
            gendered_total = gender_pivot["male"] + gender_pivot["female"]
            gender_pivot = gender_pivot[gendered_total > 0].copy()

            # Calculate gender bias score: |male - female| as a percentage of the total
            gender_pivot["bias_score"] = (
                (gender_pivot["male"] - gender_pivot["female"]).abs()
                / (gender_pivot["male"] + gender_pivot["female"]) * 100
            )

            # Get top biased occupations
            top_biased = gender_pivot.sort_values("bias_score", ascending=False).head(10)

            if top_biased.empty:
                print("No occupation data found for bias analysis")

            for entity, row in top_biased.iterrows():
                male_count = row.get("male", 0)
                female_count = row.get("female", 0)
                bias = row.get("bias_score", 0)

                if male_count == female_count:
                    # A tie has no preferred gender; do not report it as "female"
                    print(f"- {entity}: {bias:.1f}% bias (balanced: {male_count} male vs {female_count} female)")
                else:
                    preference = "male" if male_count > female_count else "female"
                    print(f"- {entity}: {bias:.1f}% bias towards {preference} pronouns ({male_count} male vs {female_count} female)")
        else:
            print("No occupation data found for bias analysis")
    else:
        print("Entity preferences file not found")
except Exception as e:
    # Best effort: this supplementary report must not abort the notebook
    print(f"Error analyzing occupation biases: {e}")

# Compare performance across models
print("\nOverall model performance comparison:")
try:
    if len(metrics_df) > 0:
        # Get average accuracy by model (mean over that model's prompt types)
        model_accuracy = metrics_df.groupby("model")["accuracy"].mean()
        best_model = model_accuracy.idxmax()

        print(f"Highest overall accuracy: {best_model} ({model_accuracy[best_model]*100:.1f}%)")

        # Get best prompt type for each model
        for model in metrics_df["model"].unique():
            model_metrics = metrics_df[metrics_df["model"] == model]
            best_prompt = model_metrics.loc[model_metrics["accuracy"].idxmax()]

            print(f"Best prompt for {model}: {best_prompt['prompt_type']} ({best_prompt['accuracy']*100:.1f}% accuracy)")

        # Specifically highlight Greek-optimized prompt performance
        if "greek" in metrics_df["prompt_type"].unique() and "llama_krikri" in metrics_df["model"].unique():
            greek_perf = metrics_df[(metrics_df["model"] == "llama_krikri") & (metrics_df["prompt_type"] == "greek")]

            if len(greek_perf) > 0:
                greek_accuracy = greek_perf["accuracy"].values[0]
                print(f"\nGreek-optimized prompt for LLaMA-Krikri: {greek_accuracy*100:.1f}% accuracy")

                # Compare to other prompts for llama_krikri. Without any other prompt
                # there is no baseline: comparing against 0 would always claim the
                # Greek prompt is "better", so the comparison is skipped.
                other_prompts = metrics_df[(metrics_df["model"] == "llama_krikri") & (metrics_df["prompt_type"] != "greek")]
                if len(other_prompts) > 0:
                    other_avg = other_prompts["accuracy"].mean()
                    diff = greek_accuracy - other_avg

                    if diff == 0:
                        # An exact tie is neither better nor worse
                        print("This is the same as the average of other prompts for LLaMA-Krikri")
                    else:
                        better_worse = "better" if diff > 0 else "worse"
                        print(f"This is {abs(diff)*100:.1f}% {better_worse} than other prompts for LLaMA-Krikri")
except Exception as e:
    # Best effort: this supplementary report must not abort the notebook
    print(f"Error comparing model performance: {e}")


print("The complete results are available in the downloaded zip file.")

Please upload your Excel file containing the Greek pronoun dataset...


KeyboardInterrupt: 

In [None]:
# ==== Main Testing Script ====

# ==== Step 1: Upload and Parse Dataset ====

print("Please upload your Excel file containing the Greek pronoun dataset...")
uploaded = files.upload()

# Nothing to read if the upload dialog was cancelled: stop with a clear message
# instead of an opaque IndexError on the filename lookup below.
if not uploaded:
    raise ValueError("No file was uploaded. Re-run this cell and choose the dataset Excel file.")

# Get the filename of the uploaded file (only the first uploaded file is used)
file_name = list(uploaded.keys())[0]

# Read the Excel file: first with pandas' default engine, then explicitly with openpyxl
try:
    df = pd.read_excel(io.BytesIO(uploaded[file_name]))
    print(f"Successfully loaded file: {file_name}")
    print(f"Dataset contains {len(df)} rows")
except Exception as e:
    print(f"Error with default reader: {e}")
    try:
        df = pd.read_excel(io.BytesIO(uploaded[file_name]), engine='openpyxl')
        print(f"Successfully loaded with openpyxl engine")
        print(f"Dataset contains {len(df)} rows")
    except Exception as e2:
        print(f"All reading attempts failed. Last error: {e2}")
        raise

# Display the first few rows to verify the format
print("\nFirst 5 rows of the dataset:")
display(df.head())

# Verify the expected columns
expected_columns = ['sentid', 'sentence', 'entities and pronouns']
missing_columns = [col for col in expected_columns if col not in df.columns]

if missing_columns:
    print(f"Warning: Missing expected columns: {missing_columns}")
    print("Available columns:", df.columns.tolist())
else:
    print("All expected columns found: 'sentid', 'sentence', 'entities and pronouns'")

# 'sentid' is optional (Step 2 generates a fallback id), but Step 2 indexes the other
# two columns directly, so fail here with a clear message rather than a later KeyError.
missing_required = [col for col in missing_columns if col != 'sentid']
if missing_required:
    raise ValueError(f"Dataset is missing required columns: {missing_required}")

# ==== Step 2: Parse the entities and pronouns ====

def parse_entities_and_pronouns(text):
    """
    Parse one 'entities and pronouns' cell into entity1, entity2 and pronoun.

    Expected format: "entity 1: X, entity 2: Y, pronoun: Z". Semicolons are accepted
    as separators and the field labels are matched case-insensitively.

    Args:
        text: Raw cell value. Anything that is not a string yields an all-None result.

    Returns:
        dict with keys 'entity1', 'entity2', 'pronoun' (stripped strings, or None when
        the field is missing, empty, or a placeholder such as "none", "n/a" or "-")
        and 'pronoun_gender' ("male", "female" or "unknown").
    """
    if not isinstance(text, str):
        return {'entity1': None, 'entity2': None, 'pronoun': None, 'pronoun_gender': 'unknown'}

    placeholder_values = {'none', 'null', 'na', 'n/a', '-'}

    def _extract_field(pattern, source):
        """Return the stripped capture of `pattern`, or None if absent or a placeholder."""
        match = re.search(pattern, source, re.IGNORECASE)
        if match is None:
            return None
        value = match.group(1).strip()
        # The previous cleanup loop only rebound its loop variable, so placeholders
        # leaked through as real values; normalise them here instead.
        if not value or value.lower() in placeholder_values:
            return None
        return value

    try:
        # Handle potential variations in format
        text = text.replace(';', ',')  # Replace semicolons with commas

        entity1 = _extract_field(r'entity\s*1\s*:\s*([^,]+)', text)
        entity2 = _extract_field(r'entity\s*2\s*:\s*([^,]+)', text)
        pronoun = _extract_field(r'pronoun\s*:\s*([^,]+)', text)

        # Determine pronoun gender by substring match; male markers are checked first
        gender = "unknown"
        if pronoun:
            male_markers = ["του", "τον", "ο ίδιος", "αυτός", "τον ίδιο", "δικό του", "δικός του"]
            female_markers = ["της", "την", "η ίδια", "αυτή", "την ίδια", "δική της", "δικιά της"]

            lowered_pronoun = pronoun.lower()
            if any(marker in lowered_pronoun for marker in male_markers):
                gender = "male"
            elif any(marker in lowered_pronoun for marker in female_markers):
                gender = "female"

        return {
            'entity1': entity1,
            'entity2': entity2,
            'pronoun': pronoun,
            'pronoun_gender': gender
        }
    except Exception as e:
        # Best-effort contract: report the problem and return an all-None record
        print(f"Error parsing: '{text}'. Error: {e}")
        return {'entity1': None, 'entity2': None, 'pronoun': None, 'pronoun_gender': 'unknown'}

# Parse the entities and pronouns from the dataset
parsed_columns = ['sentid', 'sentence', 'entity1', 'entity2', 'pronoun', 'pronoun_gender']
parsed_data = []

for idx, row in tqdm(df.iterrows(), total=len(df), desc="Parsing entities and pronouns"):
    entities_text = row['entities and pronouns']

    # Rows without a usable annotation string are reported and left out of parsed_df
    if not isinstance(entities_text, str):
        print(f"Warning: Row {idx} has non-string value in 'entities and pronouns' column: {entities_text}")
        continue

    parsed = parse_entities_and_pronouns(entities_text)

    # Include sentid if available, otherwise create one
    sentid = row.get('sentid', f"sent_{idx}")

    parsed_data.append({
        'sentid': sentid,
        'sentence': row['sentence'],
        'entity1': parsed['entity1'],
        'entity2': parsed['entity2'],
        'pronoun': parsed['pronoun'],
        'pronoun_gender': parsed['pronoun_gender']
    })

# Create a clean dataset for testing. Passing the column list keeps the schema even
# when every row was skipped; a frame built from an empty list has no columns and the
# null checks below would raise KeyError.
parsed_df = pd.DataFrame(parsed_data, columns=parsed_columns)
if parsed_df.empty:
    print("Warning: No rows could be parsed from the dataset")

# Check for any missing values
missing_entity1 = parsed_df['entity1'].isnull().sum()
missing_entity2 = parsed_df['entity2'].isnull().sum()
missing_pronoun = parsed_df['pronoun'].isnull().sum()

print(f"Missing entity1: {missing_entity1}")
print(f"Missing entity2: {missing_entity2}")
print(f"Missing pronoun: {missing_pronoun}")

# Display sample of parsed data
print("\nSample of parsed data:")
display(parsed_df.head())

# ==== Step 3: Enter API Keys for LLMs ====
# Builds the multi-model tester and registers one client per supported model.
# Every client is first registered without a key so that its name exists in
# multi_tester.clients (Step 4 looks models up there and checks `.available`);
# clients for which the user supplies a key are then registered a second time.
# NOTE(review): this relies on add_client replacing an earlier registration of the
# same model -- confirm against the add_client implementation.

# Initialize multi-model tester
multi_tester = MultiLLMCoreferenceTest()

# Add Claude client (required)
print("Please enter your Anthropic API key:")
claude_api_key = getpass()  # getpass keeps the key out of the cell output
claude_client = ClaudeClient(claude_api_key)
multi_tester.add_client(claude_client)


openai_client = OpenAIClient()  # No API key provided, will be inactive
multi_tester.add_client(openai_client)

llama_client = LlamaClient()  # No API key provided, will be inactive
multi_tester.add_client(llama_client)

# The Krikri client takes no key here and is registered unconditionally; it is
# constructed and registered again further down if the user opts in.
llama_krikri_client = LlamaKrikriClient()
multi_tester.add_client(llama_krikri_client)

# Optional: Activate other LLM clients with API keys
# All three follow-up questions are asked only if the user answers 'y' here.
print("\nDo you have API keys for other LLMs? (y/n): ")
if input().strip().lower() == 'y':
    # OpenAI (GPT-4)
    print("\nDo you have an OpenAI API key? (y/n): ")
    if input().strip().lower() == 'y':
        print("Please enter your OpenAI API key:")
        openai_api_key = getpass()
        openai_client = OpenAIClient(openai_api_key)
        multi_tester.add_client(openai_client)

    # Llama
    print("\nDo you have an API key for Llama? (y/n): ")
    if input().strip().lower() == 'y':
        print("Please enter your API key:")
        together_api_key = getpass()
        llama_client = LlamaClient(together_api_key)
        multi_tester.add_client(llama_client)

    # LLaMA-Krikri Greek model
    # Note: the "Using official ..." line is printed before the answer is read, so it
    # appears even when the user declines.
    print("\nDo you want to load the Greek LLaMA-Krikri model? (y/n): ")
    print("Using official ilsp/Llama-Krikri-8B-Base model via API")
    if input().strip().lower() == 'y':
      try:
        llama_krikri_client = LlamaKrikriClient()
        # Only here is add_client's return value inspected (treated as a truthy
        # success flag); the other registrations above discard it.
        success = multi_tester.add_client(llama_krikri_client)
        if success:
            print("🎉 Official Greek LLaMA-Krikri model ready!")
        else:
            print("⚠️  Greek model connection failed - continuing without it")
      except Exception as e:
        # Best effort: a failing Greek model must not abort the whole run
        print(f"⚠️  Greek model error: {e}")
        print("Continuing with other models...")


# ==== Step 4: Set up testing parameters ====

# Select LLM models
print("\nSelect LLM models to test (multiple selections possible):")
print("1. claude (Claude 3.5 Sonnet)")
print("2. openai (GPT-4o) - API key required")
print("3. llama (LLaMA-3 70B) - API key required")
print("4. llama_krikri (LLaMA-Krikri 8B)")

selected_models = input("Enter model numbers separated by commas (e.g., 1,2): ").strip()
model_map = {
    "1": "claude",
    "2": "openai",
    "3": "llama",
    "4": "llama_krikri"
}

# Translate the menu numbers into client names, ignoring unknown numbers
models_to_test = []
for num in selected_models.split(','):
    num = num.strip()
    if num in model_map:
        model_name = model_map[num]
        if model_name in models_to_test:
            # Repeated selection (e.g. "1,1"): keep a single entry so the same model
            # is not handed to the tester twice.
            continue
        # Only add models that are available
        if model_name in multi_tester.clients and multi_tester.clients[model_name].available:
            models_to_test.append(model_name)
        else:
            print(f"Model {model_name} not available (API key required)")

# Fall back to Claude when nothing usable was selected; abort if Claude is unusable too
if not models_to_test:
    print("No valid models selected or available. Using claude as default.")
    if "claude" in multi_tester.clients and multi_tester.clients["claude"].available:
        models_to_test = ["claude"]
    else:
        print("ERROR: Claude client not available. Cannot proceed.")
        sys.exit(1)

print(f"Selected models: {', '.join(models_to_test)}")

# Select prompt types
print("\nSelect prompt types to test (multiple selections possible):")
print("1. zero-shot - Basic direct question")
print("2. detailed - Detailed linguistic analysis")
print("3. neutral - Neutral bias-minimizing")
print("4. cot - Chain-of-thought reasoning")
print("5. few-shot - Few-shot learning with examples")
print("6. greek - Greek language prompt (only for LLaMA-Krikri)")

selected_prompts = input("Enter prompt type numbers separated by commas (e.g., 1,2,3): ").strip()
prompt_map = {
    "1": "zero-shot",
    "2": "detailed",
    "3": "neutral",
    "4": "cot",
    "5": "few-shot",
    "6": "greek"
}

# Translate menu numbers into prompt-type names. Unknown numbers are ignored and
# repeats (e.g. "1,1") are collapsed so each prompt type is listed once.
prompt_types_to_test = []
for num in selected_prompts.split(','):
    num = num.strip()
    if num in prompt_map and prompt_map[num] not in prompt_types_to_test:
        prompt_types_to_test.append(prompt_map[num])

# With no valid selection, run every prompt type; the Greek prompt is added only when
# LLaMA-Krikri is among the selected models.
if not prompt_types_to_test:
    print("No valid prompt types selected. Using all prompt types.")
    prompt_types_to_test = ["zero-shot", "detailed", "neutral", "cot", "few-shot"]
    if "llama_krikri" in models_to_test:
        prompt_types_to_test.append("greek")

print(f"Selected prompt types: {', '.join(prompt_types_to_test)}")

# Number of examples to test (None means "all examples")
max_examples_input = input("\nHow many examples to test? (Enter a number or 'all'): ").strip().lower()
max_examples = None
if max_examples_input != 'all':
    try:
        max_examples = int(max_examples_input)
    except ValueError:
        # Catch only the parse failure; a bare except would also swallow
        # KeyboardInterrupt and hide unrelated errors.
        print("Invalid input. Testing all examples.")
    else:
        # Zero or negative counts are not meaningful sample sizes
        if max_examples <= 0:
            print("Invalid input. Testing all examples.")
            max_examples = None

if max_examples is not None:
    print(f"Testing {max_examples} examples")
else:
    print("Testing all examples")

# Check if user has gold standard data
# gold_standard_df stays None unless a file is uploaded, read successfully and has
# both required columns; it is handed to the tester as `gold_standard` in Step 5.
print("\nDo you have a gold standard dataset for evaluation? (y/n): ")
use_gold = input().strip().lower() == 'y'

gold_standard_df = None
if use_gold:
    print("Please upload your gold standard Excel file...")
    gold_uploaded = files.upload()

    try:
        # Only the first uploaded file is used; an empty upload raises IndexError
        # here, which the except clause below reports like any other load failure.
        gold_file = list(gold_uploaded.keys())[0]
        gold_standard_df = pd.read_excel(io.BytesIO(gold_uploaded[gold_file]))
        print(f"Successfully loaded gold standard: {gold_file}")
        print(f"Gold standard contains {len(gold_standard_df)} rows")

        # Verify it has the required columns
        required_cols = ['sentid', 'correct_resolution']
        missing = [col for col in required_cols if col not in gold_standard_df.columns]

        if missing:
            # An incomplete gold standard is discarded rather than partially used.
            # NOTE(review): the "'both' as default" behaviour announced below is not
            # implemented in this block -- presumably the tester applies it when
            # gold_standard is None; confirm.
            print(f"Warning: Gold standard is missing required columns: {missing}")
            print("Gold standard should have 'sentid' and 'correct_resolution' columns")
            print("Using 'both' as default correct resolution for all examples")
            gold_standard_df = None
    except Exception as e:
        # Any read/parse failure falls back to running without a gold standard
        print(f"Error loading gold standard: {e}")
        print("Using 'both' as default correct resolution for all examples")
        gold_standard_df = None

# ==== Step 5: Run the tests with metrics ====

print("\n==== Running Pronoun Resolution Tests with Multiple LLMs ====")

# Set output directory for the multi-tester
# (output_dir is the timestamped results folder created at the top of the notebook)
multi_tester.output_dir = output_dir

# Process the dataset with the multi-tester
# max_examples is None when all examples should be tested; gold_standard_df is None
# when no usable gold standard was uploaded. The returned metrics_df is read in
# Step 7 through its 'model', 'prompt_type', 'accuracy' and (optional) 'macro_f1'
# columns.
results_df, metrics_df = multi_tester.process_dataset_with_metrics(
    parsed_df,
    client_names=models_to_test,
    prompt_types=prompt_types_to_test,
    max_examples=max_examples,
    gold_standard=gold_standard_df
)

# ==== Step 6: Save results and visualize ====

print("\n==== Saving Results and Visualizing ====")

# Save results and create visualizations
# All three calls receive the same output_dir that was set on the tester above.
multi_tester.save_results(output_dir)
multi_tester.visualize_results(output_dir)

# Analyze results
multi_tester.analyze_results(output_dir)

# ==== Step 7: Technical Evaluation Summary ====

print("\n==== Technical Evaluation Summary ====")

# Display summary of metrics: one entry per (model, prompt type) row of metrics_df
if len(metrics_df) > 0:
    print("\nOverall Performance Metrics:")
    for model in metrics_df["model"].unique():
        model_metrics = metrics_df[metrics_df["model"] == model]
        print(f"\n- Model: {model}")

        for _, row in model_metrics.iterrows():
            print(f"  Prompt: {row['prompt_type']}")
            print(f"  Accuracy: {row['accuracy']*100:.1f}%")
            # Macro F1 is optional and may be NaN
            if 'macro_f1' in row and not pd.isna(row['macro_f1']):
                print(f"  Macro F1: {row['macro_f1']*100:.1f}%")

    # Determine best model based on accuracy rather than macro_f1
    if 'accuracy' in metrics_df.columns:
        # idxmax never returns None, so the old `is not None` guard could not protect
        # against an accuracy column with no valid values; drop NaNs and test for
        # emptiness instead.
        scored_accuracy = metrics_df['accuracy'].dropna()
        if not scored_accuracy.empty:
            best_idx = scored_accuracy.idxmax()
            best_prompt = metrics_df.loc[best_idx]
            print(f"\nBest performing configuration:")
            print(f"- Model: {best_prompt['model']}")
            print(f"- Prompt: {best_prompt['prompt_type']}")
            print(f"- Accuracy: {best_prompt['accuracy']*100:.1f}%")
            if 'macro_f1' in best_prompt and not pd.isna(best_prompt['macro_f1']):
                print(f"- Macro F1: {best_prompt['macro_f1']*100:.1f}%")

# List where each artefact is expected inside output_dir. These paths are only
# printed; nothing here checks that the files were actually written.
print("\n==== Results Summary ====")
print(f"1. Full results: {output_dir}/llm_coreference_results.csv")
print(f"2. Visualizations: {output_dir}/visualizations.zip")
print(f"3. Analysis: {output_dir}/multi_llm_analysis.txt")
print(f"4. Evaluation metrics: {output_dir}/multi_llm_metrics.csv")
print(f"5. Noun preference analysis: {output_dir}/noun_preferences/")
print(f"6. Gender bias examples: {output_dir}/noun_preferences/gender_biased_examples.csv")

# Package all results into a single zip file for easy download
# `!zip` is an IPython shell escape ({output_dir} is interpolated by IPython), so this
# line only works inside a notebook with the `zip` binary available; files.download
# comes from google.colab and triggers a browser download.
!zip -r {output_dir}.zip {output_dir}
files.download(f"{output_dir}.zip")

print("\n==== Additional Findings ====")

# Find most gender-biased occupations
print("\nMost gender-biased occupations:")
try:
    # Look for occupation nouns with strong gender associations
    entity_prefs_file = f"{output_dir}/noun_preferences/entity_gender_preferences.csv"
    if os.path.exists(entity_prefs_file):
        entity_df = pd.read_csv(entity_prefs_file)

        # Get occupational terms (matched as lowercase substrings of entity_value).
        # NOTE(review): "διατολόγος" looks like a typo for "διαιτολόγος"; left as-is
        # because the dataset's spelling is not visible here -- confirm.
        occupational_terms = [
            "γιατρός", "υπάλληλος", "μηχανικός", "τεχνικός", "πυροσβέστης",
            "ηλεκτρολόγος", "υδραυλικός", "αρχιτέκτονας", "αστυνομικός", "νοσοκόμος",
            "παιδαγωγός", "φαρμακοποιός", "ψυχολόγος", "ρεσεψιονίστ", "σεφ",
            "οδηγός", "παθολόγος", "διατολόγος", "τεχνίτης", "χειρουργός",
            "μηχανικός ηλεκτρονικών υπολογιστών", "επιστήμονας", "χημικός", "φυσικός"
        ]

        # Filter for occupational entities
        occupation_df = entity_df[entity_df["entity_value"].apply(lambda x: any(term in str(x).lower() for term in occupational_terms))]

        if len(occupation_df) > 0:
            # Create pivot for gender comparison
            gender_pivot = occupation_df.pivot_table(
                index="entity_value",
                columns="pronoun_gender",
                values="occurrence_count",
                aggfunc="sum"
            ).fillna(0)

            # A gender that never occurs yields no pivot column. Add it as zeros so
            # one-sided occupations (the most biased ones) are scored instead of the
            # whole report being silently skipped.
            for gender_label in ("male", "female"):
                if gender_label not in gender_pivot.columns:
                    gender_pivot[gender_label] = 0

            # Keep only rows with at least one gendered occurrence; otherwise the
            # score below would be 0/0 = NaN.
            gendered_total = gender_pivot["male"] + gender_pivot["female"]
            gender_pivot = gender_pivot[gendered_total > 0].copy()

            # Calculate gender bias score: |male - female| as a percentage of the total
            gender_pivot["bias_score"] = (
                (gender_pivot["male"] - gender_pivot["female"]).abs()
                / (gender_pivot["male"] + gender_pivot["female"]) * 100
            )

            # Get top biased occupations
            top_biased = gender_pivot.sort_values("bias_score", ascending=False).head(10)

            if top_biased.empty:
                print("No occupation data found for bias analysis")

            for entity, row in top_biased.iterrows():
                male_count = row.get("male", 0)
                female_count = row.get("female", 0)
                bias = row.get("bias_score", 0)

                if male_count == female_count:
                    # A tie has no preferred gender; do not report it as "female"
                    print(f"- {entity}: {bias:.1f}% bias (balanced: {male_count} male vs {female_count} female)")
                else:
                    preference = "male" if male_count > female_count else "female"
                    print(f"- {entity}: {bias:.1f}% bias towards {preference} pronouns ({male_count} male vs {female_count} female)")
        else:
            print("No occupation data found for bias analysis")
    else:
        print("Entity preferences file not found")
except Exception as e:
    # Best effort: this supplementary report must not abort the notebook
    print(f"Error analyzing occupation biases: {e}")

def compare_prompt_to_others(metrics, model, prompt_type):
    """Compare one prompt type's accuracy with the other prompts of a model.

    Args:
        metrics: DataFrame with "model", "prompt_type" and "accuracy" columns.
        model: Value of the "model" column to restrict the comparison to.
        prompt_type: Prompt type whose accuracy is compared with the rest.

    Returns:
        Tuple (prompt_accuracy, other_accuracy): the mean accuracy of the rows
        for `prompt_type` and the mean accuracy of the model's remaining rows.
        Either element is None when there are no (non-missing) accuracy values
        to average.
    """
    model_rows = metrics[metrics["model"] == model]
    is_target = model_rows["prompt_type"] == prompt_type

    # The mean of an empty or all-NaN selection is NaN; report that as "no data"
    target_mean = model_rows.loc[is_target, "accuracy"].mean()
    others_mean = model_rows.loc[~is_target, "accuracy"].mean()

    return (
        None if pd.isna(target_mean) else float(target_mean),
        None if pd.isna(others_mean) else float(others_mean),
    )


# Compare performance across models
print("\nOverall model performance comparison:")
try:
    if len(metrics_df) > 0:
        # Get average accuracy by model
        model_accuracy = metrics_df.groupby("model")["accuracy"].mean()
        best_model = model_accuracy.idxmax()

        print(f"Highest overall accuracy: {best_model} ({model_accuracy[best_model]*100:.1f}%)")

        # Get best prompt type for each model
        for model in metrics_df["model"].unique():
            model_metrics = metrics_df[metrics_df["model"] == model].dropna(subset=["accuracy"])
            if model_metrics.empty:
                # Nothing to rank (missing model label or no accuracy values)
                continue

            # Positional lookup: unlike .loc[idxmax()], this still yields a single
            # row when the index of metrics_df contains duplicate labels.
            best_prompt = model_metrics.iloc[model_metrics["accuracy"].to_numpy().argmax()]

            print(f"Best prompt for {model}: {best_prompt['prompt_type']} ({best_prompt['accuracy']*100:.1f}% accuracy)")

        # Specifically highlight Greek-optimized prompt performance
        greek_accuracy, other_avg = compare_prompt_to_others(metrics_df, "llama_krikri", "greek")

        if greek_accuracy is not None:
            print(f"\nGreek-optimized prompt for LLaMA-Krikri: {greek_accuracy*100:.1f}% accuracy")

            if other_avg is None:
                # Without a baseline, a "better/worse" statement would be meaningless
                print("No other prompts were evaluated for LLaMA-Krikri, so no comparison is available")
            else:
                diff = greek_accuracy - other_avg

                if diff == 0:
                    print("This matches the average accuracy of other prompts for LLaMA-Krikri")
                else:
                    better_worse = "better" if diff > 0 else "worse"
                    # The gap between two accuracies is measured in percentage points
                    print(f"This is {abs(diff)*100:.1f} percentage points {better_worse} than other prompts for LLaMA-Krikri")
except Exception as e:
    # Best-effort summary section: report the problem and keep the notebook running
    print(f"Error comparing model performance: {e}")


print("The complete results are available in the downloaded zip file.")

Please upload your Excel file containing the Greek pronoun dataset...


KeyboardInterrupt: 