In [1]:
import pandas as pd
import json
import re
import numpy as np
from collections import Counter

In [2]:
# Maps each judge's short name to the CSV holding that judge's raw evaluations.
# Insertion order matters: the first entry is loaded as the base table and the
# remaining entries are merged into it on `evaluation_id`.
JUDGE_FILES = dict(
    claude='../../data/judged/sample_pt_claude_judge_lingual.csv',
    prometheus='../../data/judged/sample_pt_prometheus_judge_lingual.csv',
    mistral='../../data/judged/sample_pt_mistral_judge_lingual.csv',
)

In [3]:
# The first judge declared in JUDGE_FILES provides the base table; the other
# judges' evaluation columns are merged into it afterwards.
primeiro_judge, primeiro_arquivo = next(iter(JUDGE_FILES.items()))

print(f"\nCarregando arquivo base: {primeiro_judge}")
df_analysis = pd.read_csv(primeiro_arquivo)
print(f"✓ Carregado: {len(df_analysis)} registros")


Carregando arquivo base: claude
✓ Carregado: 500 registros


In [4]:
# Preview the first rows of the base judge table.
df_analysis.head(n=5)

Unnamed: 0,evaluation_id,response_A,response_B,evaluation_claude
0,CG011_llama-3.3-70b-versatile_2_General Knowle...,**O que é DNA?**\n\nO DNA (ácido desoxirribonu...,**O que é DNA?**\n\nO DNA (ácido desoxirribonu...,"{\n ""winner"": ""A"",\n ""general_justificat..."
1,TC018_llama-3.3-70b-versatile_3_Technical_mini...,**Notação Big O: uma medida de complexidade**\...,A notação Big O é uma medida de complexidade d...,"{\n ""winner"": ""B"",\n ""general_justificat..."
2,CG004_gemini-1.5-pro-latest_1_General Knowledg...,A Proclamação da República no Brasil em 15 de ...,"A Proclamação da República no Brasil, em 15 de...","{\n ""winner"": ""B"",\n ""general_justificat..."
3,CR019_gpt-4o_3_Creative_detailed_en_vs_pt,"Título: ""A Bússola do Coração""\n\nConceito:\n\...",**Título: Bússola do Coração**\n\n**Conceito:*...,"{\n ""winner"": ""B"",\n ""general_justificat..."
4,TC007_llama-3.3-70b-versatile_4_Technical_stru...,A compreensão de lista em Python é uma maneira...,"Resposta: ""Uma 'list comprehension' em Python ...","{\n ""winner"": ""B"",\n ""general_justificat..."


In [5]:
# Bring in the evaluation column of every judge except the first one (already
# loaded as the base table), joining on `evaluation_id`.
remaining_judges = list(JUDGE_FILES.items())[1:]

for judge_name, filename in remaining_judges:
    judge_column = f'evaluation_{judge_name}'
    try:
        df_judge = pd.read_csv(filename, usecols=['evaluation_id', judge_column])
        # Left join: every row of the base table is kept even when this judge
        # has no evaluation for it.
        df_analysis = df_analysis.merge(df_judge, how='left', on='evaluation_id')
    except Exception as e:
        # Best effort: report the problem and carry on with the next judge.
        print(f"ERROR processing the file for '{judge_name}': {e}")
    else:
        print(f"Data from judge '{judge_name}' successfully merged.")

print("\nShape of the DataFrame after merging:", df_analysis.shape)
print("Sample of raw data:")
df_analysis.head(3)

Data from judge 'prometheus' successfully merged.
Data from judge 'mistral' successfully merged.

Shape of the DataFrame after merging: (500, 6)
Sample of raw data:


Unnamed: 0,evaluation_id,response_A,response_B,evaluation_claude,evaluation_prometheus,evaluation_mistral
0,CG011_llama-3.3-70b-versatile_2_General Knowle...,**O que é DNA?**\n\nO DNA (ácido desoxirribonu...,**O que é DNA?**\n\nO DNA (ácido desoxirribonu...,"{\n ""winner"": ""A"",\n ""general_justificat...","{\n""winner"": ""T"",\n""general_justification"": ""T...","{\n ""winner"": ""A"",\n ""general_j..."
1,TC018_llama-3.3-70b-versatile_3_Technical_mini...,**Notação Big O: uma medida de complexidade**\...,A notação Big O é uma medida de complexidade d...,"{\n ""winner"": ""B"",\n ""general_justificat...","{\n""winner"": ""Tie"",\n""general_justification"": ...","{\n ""winner"": ""A"",\n ""general_j..."
2,CG004_gemini-1.5-pro-latest_1_General Knowledg...,A Proclamação da República no Brasil em 15 de ...,"A Proclamação da República no Brasil, em 15 de...","{\n ""winner"": ""B"",\n ""general_justificat...","{\n""winner"": ""T"",\n""general_justification"": ""B...","{\n ""winner"": ""A"",\n ""general_j..."


### PROCESS CLAUDE EVALUATIONS

In [6]:
# Create the result columns for Claude up front: the winner starts as None and
# the score as NaN until the parsing loop fills them in.
df_analysis = df_analysis.assign(claude_winner=None, claude_total_score=np.nan)

In [7]:
def extract_json_from_markdown(text):
    """Extract the JSON object from a markdown ```json ... ``` fenced block.

    Args:
        text: Raw evaluation value; may be missing (None / NaN).

    Returns:
        None when ``text`` is missing, the JSON object found inside the first
        ```json fence when there is one, otherwise ``text`` itself unchanged.
    """
    if pd.isna(text):
        return None

    # DOTALL lets the object span several lines; the lazy quantifier stops at
    # the first closing brace that is followed by the closing fence.
    fenced = re.compile(r'```json\s*(\{.*?\})\s*```', re.DOTALL).search(str(text))
    return fenced.group(1) if fenced else text

In [8]:
# Parse every Claude evaluation into a winner label and a mean criteria score.
# Rows that cannot be parsed are labelled 'Parsing Error' with a score of 0.

# Key variants under which a criterion may carry its score, in lookup order;
# only the first key present in a criterion is used.
_CLAUDE_SCORE_KEYS = ('score', 'score_a', 'score_b', 'score_A', 'score_B')

for index, row in df_analysis.iterrows():
    try:
        # Extract JSON from markdown first
        json_text = extract_json_from_markdown(row['evaluation_claude'])

        if json_text is None:
            raise ValueError("Empty evaluation")

        data = json.loads(json_text)
        winner = data.get('winner')
        criteria = data.get('criteria', {})
        df_analysis.loc[index, 'claude_winner'] = winner

        scores = []
        if criteria:
            for crit_details in criteria.values():
                for score_key in _CLAUDE_SCORE_KEYS:
                    if score_key in crit_details:
                        scores.append(crit_details[score_key])
                        break

        df_analysis.loc[index, 'claude_total_score'] = np.mean(scores) if scores else 0
    except (TypeError, json.JSONDecodeError, ValueError, AttributeError):
        # AttributeError covers JSON that decodes to something other than an
        # object (list, string, number) and a non-dict `criteria`: such rows are
        # recorded as parsing errors instead of aborting the whole cell.
        df_analysis.loc[index, 'claude_winner'] = 'Parsing Error'
        df_analysis.loc[index, 'claude_total_score'] = 0

print("Claude processing completed.")
print("Sample results for Claude:")
df_analysis[['evaluation_id', 'claude_winner', 'claude_total_score']].head()

Claude processing completed.
Sample results for Claude:


Unnamed: 0,evaluation_id,claude_winner,claude_total_score
0,CG011_llama-3.3-70b-versatile_2_General Knowle...,A,4.5
1,TC018_llama-3.3-70b-versatile_3_Technical_mini...,B,4.75
2,CG004_gemini-1.5-pro-latest_1_General Knowledg...,B,5.0
3,CR019_gpt-4o_3_Creative_detailed_en_vs_pt,B,4.5
4,TC007_llama-3.3-70b-versatile_4_Technical_stru...,B,3.75


In [9]:
# Sanity checks on the parsed Claude results: zero scores, missing winners and
# rows flagged as parsing errors.
score_zero_count = df_analysis['claude_total_score'].eq(0).sum()
print(f"Count where 'claude_total_score' == 0: {score_zero_count}")

winner_nan_count = df_analysis['claude_winner'].isnull().sum()
print(f"Count where 'claude_winner' is NaN: {winner_nan_count}")

parsing_error_count = df_analysis['claude_winner'].eq('Parsing Error').sum()
print(f"Count where 'claude_winner' == 'Parsing Error': {parsing_error_count}")

Count where 'claude_total_score' == 0: 0
Count where 'claude_winner' is NaN: 0
Count where 'claude_winner' == 'Parsing Error': 0


### PROCESS MISTRAL EVALUATIONS

In [10]:
# Create the result columns for Mistral up front: the winner starts as None and
# the score as NaN until the parsing loop fills them in.
df_analysis = df_analysis.assign(mistral_winner=None, mistral_total_score=np.nan)

In [11]:
# Parse every Mistral evaluation into a winner label and a mean criteria score.
#
# The raw text is frequently not a single clean JSON document (several objects
# glued together, truncated output, unescaped quotes), so a cascade of
# increasingly tolerant strategies is tried in order. Each strategy takes the
# raw text and returns a list of decoded values; it raises
# json.JSONDecodeError (or TypeError) when it cannot handle the text, which
# hands over to the next strategy.


def _has_verdict_keys(obj):
    """Return True when ``obj`` is a dict carrying both 'winner' and 'criteria'."""
    return isinstance(obj, dict) and 'winner' in obj and 'criteria' in obj


def _collect_verdicts(decoded):
    """Turn a decoded value into a list: a verdict dict is wrapped, a list is copied, anything else yields []."""
    if _has_verdict_keys(decoded):
        return [decoded]
    if isinstance(decoded, list):
        return list(decoded)
    return []


def _mistral_parse_whole(text):
    """Attempt 1: the whole text is one valid JSON document."""
    decoded = json.loads(text)
    # A top-level JSON array is treated as a list of candidate verdicts.
    return decoded if isinstance(decoded, list) else [decoded]


def _mistral_parse_space_separated(text):
    """Attempt 2: objects separated by whitespace; join them into a JSON array."""
    return json.loads('[' + re.sub(r'\}\s+\{', '}, {', text) + ']')


def _mistral_parse_response_markers(text):
    """Attempt 3: objects separated by "[RESPONSE X]" markers or by line breaks."""
    joined = re.sub(r'\}\s*\[RESPONSE\s+[A-Z]\]\s*\{', '}, {', text, flags=re.IGNORECASE)
    return json.loads('[' + re.sub(r'\}\s*\n\s*\{', '}, {', joined) + ']')


def _mistral_parse_adjacent(text):
    """Attempt 4: objects that directly follow each other, with or without whitespace."""
    return json.loads('[' + re.sub(r'\}\s*\{', '}, {', text) + ']')


def _mistral_scan_balanced_objects(text):
    """Attempt 5: walk forward, counting braces outside string literals, and keep every balanced verdict object."""
    found = []
    i = 0

    while i < len(text):
        if text[i] == '{':
            depth = 0
            start = i
            in_string = False
            escape = False

            while i < len(text):
                char = text[i]

                if char == '"' and not escape:
                    in_string = not in_string
                elif char == '\\' and not escape:
                    # The next character is escaped and must not toggle the string state.
                    escape = True
                    i += 1
                    continue

                if not in_string:
                    if char == '{':
                        depth += 1
                    elif char == '}':
                        depth -= 1
                        if depth == 0:
                            try:
                                obj = json.loads(text[start:i + 1])
                                if 'winner' in obj and 'criteria' in obj:
                                    found.append(obj)
                            except json.JSONDecodeError:
                                pass
                            break

                escape = False
                i += 1
        i += 1

    if not found:
        raise json.JSONDecodeError("No valid JSON found", text, 0)
    return found


def _mistral_scan_backwards(text):
    """Attempt 6: for each closing brace, scan backwards for the matching opening brace of a verdict object."""
    found = []

    for end_pos in range(len(text) - 1, -1, -1):
        if text[end_pos] != '}':
            continue

        depth = 0
        for start_pos in range(end_pos, -1, -1):
            if text[start_pos] == '}':
                depth += 1
            elif text[start_pos] == '{':
                depth -= 1

                if depth == 0:
                    try:
                        obj = json.loads(text[start_pos:end_pos + 1])
                        if _has_verdict_keys(obj):
                            found.append(obj)
                            break
                    except json.JSONDecodeError:
                        continue

    if not found:
        raise json.JSONDecodeError("No valid JSON found", text, 0)
    return found


def _mistral_balance_braces(text):
    """Attempt 7: append missing closing braces (or trim surplus trailing characters) and decode."""
    open_braces = text.count('{')
    close_braces = text.count('}')
    if open_braces > close_braces:
        fixed_text = text + ('}' * (open_braces - close_braces))
    elif close_braces > open_braces:
        fixed_text = text[:-(close_braces - open_braces)]
    else:
        fixed_text = text

    return _collect_verdicts(json.loads(fixed_text))


def _mistral_repair_quotes(text):
    """Attempt 8: last resort; repair quoting problems, then balance braces and quotes before decoding."""
    fixed_text = text

    # Escape internal double quotes
    fixed_text = re.sub(r'(?<!\\)"(?![:,}\]\s])', r'\\"', fixed_text)
    # Fix duplicated quotes
    fixed_text = re.sub(r'\\"{2,}', r'\\"', fixed_text)
    fixed_text = re.sub(r'""', '"', fixed_text)
    # Remove invalid commas between quotes
    fixed_text = re.sub(r'"\s*,\s*"', ' ', fixed_text)
    # Remove unnecessary line breaks
    fixed_text = fixed_text.replace('\n', ' ').replace('\r', ' ')
    # Balance braces
    open_braces = fixed_text.count('{')
    close_braces = fixed_text.count('}')
    if open_braces > close_braces:
        fixed_text += '}' * (open_braces - close_braces)
    # Balance quotes
    if fixed_text.count('"') % 2 != 0:
        fixed_text += '"'

    return _collect_verdicts(json.loads(fixed_text))


# Strategies in the order they are tried; the first one that does not raise wins.
_MISTRAL_PARSERS = (
    _mistral_parse_whole,
    _mistral_parse_space_separated,
    _mistral_parse_response_markers,
    _mistral_parse_adjacent,
    _mistral_scan_balanced_objects,
    _mistral_scan_backwards,
    _mistral_balance_braces,
    _mistral_repair_quotes,
)

# Key variants under which a criterion may carry its score, in lookup order.
_MISTRAL_SCORE_KEYS = ('score', 'score_A', 'score_a', 'score_B', 'score_b')


def _parse_mistral_evaluation(text):
    """Run the strategies in order and return the first result; re-raise the last error when all of them fail."""
    last_error = None
    for strategy in _MISTRAL_PARSERS:
        try:
            return strategy(text)
        except (TypeError, json.JSONDecodeError) as err:
            last_error = err
    raise last_error


def _mistral_criterion_score(details):
    """Return the first score of one criterion entry that converts to float, or None.

    Values are checked with ``is not None`` rather than truthiness so that a
    legitimate score of 0 is kept instead of being skipped.
    """
    if not isinstance(details, dict):
        return None
    for score_key in _MISTRAL_SCORE_KEYS:
        value = details.get(score_key)
        if value is None:
            continue
        try:
            return float(value)
        except (ValueError, TypeError):
            continue
    return None


for index, row in df_analysis.iterrows():
    eval_text = str(row['evaluation_mistral'])

    try:
        json_list = _parse_mistral_evaluation(eval_text)
    except (TypeError, json.JSONDecodeError) as e:
        print(f"Final failure at index {index}: {e}")
        df_analysis.loc[index, 'mistral_winner'] = 'Parsing Error'
        df_analysis.loc[index, 'mistral_total_score'] = 0
        continue

    results = []

    for data in json_list:
        # Decoded values that are not JSON objects cannot hold a verdict.
        if not isinstance(data, dict):
            continue

        winner = data.get('winner')
        criteria = data.get('criteria', {})

        scores = []
        if isinstance(criteria, dict):
            for crit_details in criteria.values():
                score = _mistral_criterion_score(crit_details)
                if score is not None:
                    scores.append(score)

        avg_score = np.mean(scores) if scores else 0

        # Only verdicts with a winner and a strictly positive mean are kept.
        if winner and avg_score > 0:
            results.append((winner, avg_score))

    # Select the result with the highest score
    if results:
        best_winner, best_score = max(results, key=lambda item: item[1])
        df_analysis.loc[index, 'mistral_winner'] = best_winner
        df_analysis.loc[index, 'mistral_total_score'] = best_score
    else:
        df_analysis.loc[index, 'mistral_winner'] = 'Parsing Error (No valid JSON found)'
        df_analysis.loc[index, 'mistral_total_score'] = 0


print("Mistral processing completed.")
print("\nSample results for Mistral:")
df_analysis[['evaluation_id', 'mistral_winner', 'mistral_total_score']].head(5)

Mistral processing completed.

Sample results for Mistral:


Unnamed: 0,evaluation_id,mistral_winner,mistral_total_score
0,CG011_llama-3.3-70b-versatile_2_General Knowle...,A,4.75
1,TC018_llama-3.3-70b-versatile_3_Technical_mini...,A,5.0
2,CG004_gemini-1.5-pro-latest_1_General Knowledg...,A,4.75
3,CR019_gpt-4o_3_Creative_detailed_en_vs_pt,A,4.25
4,TC007_llama-3.3-70b-versatile_4_Technical_stru...,A,5.0


In [12]:
# Sanity checks on the parsed Mistral results: zero scores, missing winners and
# rows flagged as parsing errors.
score_zero_count = (df_analysis['mistral_total_score'] == 0).sum()
print(f"Count where 'mistral_total_score' == 0: {score_zero_count}")

winner_nan_count = df_analysis['mistral_winner'].isna().sum()
print(f"Count where 'mistral_winner' is NaN: {winner_nan_count}")

# The Mistral parsing loop writes two failure labels ('Parsing Error' and
# 'Parsing Error (No valid JSON found)'), so match on the prefix instead of the
# exact string; an exact comparison never counts the second label.
parsing_error_count = (
    df_analysis['mistral_winner']
    .astype(str)
    .str.lower()
    .str.startswith('parsing error')
    .sum()
)
print(f"Count where 'mistral_winner' starts with 'Parsing Error': {parsing_error_count}")

Count where 'mistral_total_score' == 0: 0
Count where 'mistral_winner' is NaN: 0
Count where 'mistral_winner' == 'Parsing Error': 0


### PROCESS PROMETHEUS EVALUATIONS

In [13]:
# Create the result columns for Prometheus up front: the winner starts as None
# and the score as NaN until the parsing loop fills them in.
df_analysis = df_analysis.assign(prometheus_winner=None, prometheus_total_score=np.nan)

In [14]:
import re
import json
import numpy as np

# Parse every Prometheus evaluation into a winner label and a mean score.
#
# Prometheus output ranges from clean JSON to free prose, so each row goes
# through a sequence of stages and stops at the first one that yields a result:
#   1. decode the text (or its outermost {...} span) as JSON;
#   2. decode a flat object containing "winner", or a repaired version of the text;
#   3. prose of the form "winner is <A|B|Tie>";
#   4. prose of the form "the/overall winner is <A|B|Tie>";
#   5. final fallback: whatever winner and scores can be found by regex.

# Regexes used to pull per-criterion scores out of free text. Several patterns
# overlap, so a single score can be matched (and counted) more than once.
score_patterns = [
    r'"score"\s*:\s*(\d+)',
    r'score:\s*(\d+)',
    r'"score"\s*(\d+)',
    r'"logical_coherence"\s*:\s*(\d+)',
    r'"relevance_and_focus"\s*:\s*(\d+)',
    r'"accuracy_and_truthfulness"\s*:\s*(\d+)',
    r'"conciseness_and_clarity"\s*:\s*(\d+)',
    r'Logical Coherence:\s*(\d+)',
    r'Relevance and Focus:\s*(\d+)',
    r'Accuracy and Truthfulness:\s*(\d+)',
    r'Conciseness and Clarity:\s*(\d+)',
    r'\*\s*Logical Coherence:\s*(\d+)',
    r'\*\s*Relevance and Focus:\s*(\d+)',
    r'\*\s*Accuracy and Truthfulness:\s*(\d+)',
    r'\*\s*Conciseness and Clarity:\s*(\d+)',
    r'logical_coherence["\']?\s*:\s*\{\s*["\']?score["\']?\s*:\s*(\d+)',
    r'relevance_and_focus["\']?\s*:\s*\{\s*["\']?score["\']?\s*:\s*(\d+)',
    r'accuracy_and_truthfulness["\']?\s*:\s*\{\s*["\']?score["\']?\s*:\s*(\d+)',
    r'conciseness_and_clarity["\']?\s*:\s*\{\s*["\']?score["\']?\s*:\s*(\d+)',
    r'scores?\s+(\d+)',
    r'\(0-5\):\s*Response [AB] scores?\s+(\d+)',
]

# Placeholder written when no winner can be determined.
_PARSING_ERROR = 'Parsing Error'
# Criterion keys searched for in the text by the sentiment heuristic.
_PROMETHEUS_CRITERIA = ['logical_coherence', 'relevance_and_focus',
                        'accuracy_and_truthfulness', 'conciseness_and_clarity']
_QUOTED_WINNER_PATTERN = r'"winner"\s*:\s*"(Tie|A|B)"'
_QUOTED_SCORE_PATTERN = r'"score"\s*:\s*(\d+)'
_POSITIVE_WORDS_PATTERN = r'\b(comprehensive|detailed|coherent|well-structured|accurate|clear|concise|relevant|focused|logical|excellent|superior|better|good|strong|effective)\b'
_NEGATIVE_WORDS_PATTERN = r'\b(not|lacking|poor|weak|unclear|inaccurate|irrelevant|inconsistent|insufficient|limited)\b'


def _normalize_winner(value):
    """Return 'Tie' for any casing of "tie" and the upper-cased text for other strings.

    The 'Parsing Error' placeholder is returned untouched: upper-casing it would
    make every later comparison against the placeholder fail. Non-string values
    are returned as they are.
    """
    if not isinstance(value, str) or value == _PARSING_ERROR:
        return value
    return value.capitalize() if value.lower() == 'tie' else value.upper()


def _winner_from_json(data, eval_text):
    """Read 'winner' from a decoded object, falling back to a `"winner": "X"` regex on the raw text."""
    winner = data.get('winner', _PARSING_ERROR)
    if winner == _PARSING_ERROR:
        quoted = re.search(_QUOTED_WINNER_PATTERN, eval_text, re.IGNORECASE)
        if quoted:
            winner = quoted.group(1)
    return _normalize_winner(winner)


def _quoted_score_mean(eval_text):
    """Mean of every `"score": N` found in the raw text (values are not range-checked), or 0 when there is none."""
    found = [int(s) for s in re.findall(_QUOTED_SCORE_PATTERN, eval_text)]
    return np.mean(found) if found else 0


def _criteria_scores(criteria):
    """Collect the numeric 'score' of each criterion dict, keeping only finite values whose integer part is within 0-5."""
    raw = [
        details.get('score')
        for details in criteria.values()
        if isinstance(details, dict) and details.get('score') is not None
    ]
    return [
        int(s) for s in raw
        if isinstance(s, (int, float)) and np.isfinite(s) and 0 <= int(s) <= 5
    ]


def _score_from_json(data, eval_text):
    """Mean criteria score of a decoded object.

    When 'criteria' is missing or is not a dict, the `"score": N` values found
    in the raw text are averaged instead.
    """
    criteria = data.get('criteria')
    if isinstance(criteria, dict):
        valid_scores = _criteria_scores(criteria)
        return np.mean(valid_scores) if valid_scores else 0
    return _quoted_score_mean(eval_text)


def _pattern_scores(eval_text, patterns):
    """Return every 0-5 integer captured by ``patterns`` in the text (case-insensitive)."""
    hits = []
    for pattern in patterns:
        hits.extend(int(m) for m in re.findall(pattern, eval_text, re.IGNORECASE))
    return [s for s in hits if 0 <= s <= 5]


def _sentiment_scores(eval_text):
    """Guess one score per criterion mentioned in the text from positive/negative word counts.

    NOTE(review): these values are a heuristic, not scores given by the judge;
    confirm that downstream analysis should treat them like real scores.
    """
    criteria_mentioned = sum(
        1 for crit in _PROMETHEUS_CRITERIA
        if re.search(crit.replace('_', r'[_\s]'), eval_text, re.IGNORECASE)
    )
    if criteria_mentioned == 0:
        return []

    positive_words = len(re.findall(_POSITIVE_WORDS_PATTERN, eval_text, re.IGNORECASE))
    negative_words = len(re.findall(_NEGATIVE_WORDS_PATTERN, eval_text, re.IGNORECASE))

    if positive_words > negative_words:
        guessed = 5
    elif positive_words > 0:
        guessed = 4
    else:
        guessed = 3
    return [guessed] * criteria_mentioned


def _load_outer_json(eval_text):
    """Decode the whole text, or failing that the span from the first '{' to the last '}'; None when there is no such span."""
    try:
        return json.loads(eval_text)
    except json.JSONDecodeError:
        start = eval_text.find('{')
        end = eval_text.rfind('}') + 1
        if start != -1 and end != 0:
            return json.loads(eval_text[start:end])
    return None


for index, row in df_analysis.iterrows():
    eval_text = str(row['evaluation_prometheus'])

    # --- FIRST ATTEMPT: COMPLETE JSON ---
    try:
        data = _load_outer_json(eval_text)

        # Only a non-empty JSON object can carry a verdict.
        if isinstance(data, dict) and data:
            winner_val = _winner_from_json(data, eval_text)
            df_analysis.loc[index, 'prometheus_winner'] = winner_val
            df_analysis.loc[index, 'prometheus_total_score'] = _score_from_json(data, eval_text)

            # Without a winner, keep these provisional values and let the
            # later stages try to do better.
            if winner_val != _PARSING_ERROR:
                continue

    except (json.JSONDecodeError, TypeError):
        # Not valid JSON: move on to the more tolerant stages.
        pass

    # --- Flat "winner" object, then repaired JSON ---
    try:
        fixed_text = eval_text.strip()

        # A brace-free object containing "winner" (no nested objects inside).
        json_match = re.search(r'\{[^{}]*"winner"[^{}]*\}', fixed_text, re.DOTALL)
        if json_match:
            try:
                data = json.loads(json_match.group())

                df_analysis.loc[index, 'prometheus_winner'] = _winner_from_json(data, eval_text)
                # A flat object has no nested criteria, so the score comes from
                # the `"score": N` values in the raw text.
                df_analysis.loc[index, 'prometheus_total_score'] = _score_from_json(data, eval_text)
                continue
            except Exception:
                # Best effort: fall through to the repair below.
                pass

        # Attempt to fix and parse malformed JSON
        fixed_text = re.sub(r'"\s*"criteria"', '", "criteria"', fixed_text)
        fixed_text = re.sub(r'\}\s*\{', '}, {', fixed_text)
        if not fixed_text.strip().startswith('{') and '{' in fixed_text:
            fixed_text = '{' + fixed_text.split('{', 1)[-1]
        if fixed_text.count('{') > 1 and not fixed_text.strip().startswith('['):
            fixed_text = '[' + fixed_text + ']'

        data = json.loads(fixed_text)
        if isinstance(data, list):
            data = data[0] if data else {}

        df_analysis.loc[index, 'prometheus_winner'] = _winner_from_json(data, eval_text)
        df_analysis.loc[index, 'prometheus_total_score'] = _score_from_json(data, eval_text)
        continue

    except Exception:
        # Best effort: the repair is speculative, so any failure simply hands
        # over to the regex-based stages.
        pass

    # --- SECOND ATTEMPT: Regex "winner is..." format ---
    # NOTE(review): with IGNORECASE, [AB] also matches the article in
    # "winner is a tie" and the first letter of words such as "both"; confirm
    # against the data whether this needs tightening.
    winner_match = re.search(r'winner is (?:Response )?["\']?([AB]|Tie)["\']?', eval_text, re.IGNORECASE)

    if winner_match:
        df_analysis.loc[index, 'prometheus_winner'] = winner_match.group(1).capitalize()

        # Prefer an explicitly stated score ("with a score of X", then
        # "overall score is X" / "score is X").
        explicit_score = (
            re.search(r'with a score of (\d+)', eval_text, re.IGNORECASE)
            or re.search(r'(?:overall\s+)?score\s+is\s+(\d+)', eval_text, re.IGNORECASE)
        )
        if explicit_score:
            score = int(explicit_score.group(1))
        else:
            # Otherwise use per-criterion scores from the text, then the
            # sentiment heuristic, then the plain `"score": N` regex.
            scores = _pattern_scores(eval_text, score_patterns) or _sentiment_scores(eval_text)
            score = np.mean(scores) if scores else _quoted_score_mean(eval_text)

        df_analysis.loc[index, 'prometheus_total_score'] = score
        continue

    # --- THIRD ATTEMPT: Extract winner from other formats ---
    alt_winner = re.search(r'(?:the|overall)\s+winner\s+is\s+["\']?([AB]|Tie)["\']?', eval_text, re.IGNORECASE)
    if alt_winner:
        # capitalize() keeps ties as 'Tie', consistent with the other stages.
        df_analysis.loc[index, 'prometheus_winner'] = alt_winner.group(1).capitalize()

        scores = _pattern_scores(eval_text, score_patterns) or _sentiment_scores(eval_text)
        df_analysis.loc[index, 'prometheus_total_score'] = np.mean(scores) if scores else 0
        continue

    # --- FINAL FALLBACK: Extract winner and scores directly from text ---
    winner_fallback = re.search(r'winner\s+is\s+["\']?([AB]|Tie)["\']?', eval_text, re.IGNORECASE)
    if winner_fallback:
        winner_val = winner_fallback.group(1).capitalize()
    else:
        # Start from the placeholder for this row so that a value left over
        # from a previous row can never be written here.
        winner_val = _PARSING_ERROR
        quoted = re.search(_QUOTED_WINNER_PATTERN, eval_text, re.IGNORECASE)
        if quoted:
            winner_val = _normalize_winner(quoted.group(1))
    df_analysis.loc[index, 'prometheus_winner'] = winner_val

    scores = _pattern_scores(eval_text, score_patterns + [r':\s*(\d+)\s*\([^)]*response'])
    if scores:
        score = np.mean(scores)
    else:
        scores = _sentiment_scores(eval_text)
        score = np.mean(scores) if scores else 0
        if score == 0:
            score = _quoted_score_mean(eval_text)

    df_analysis.loc[index, 'prometheus_total_score'] = score

print("Prometheus processing completed.")
print("Sample results for Prometheus:")
df_analysis[['evaluation_id', 'prometheus_winner', 'prometheus_total_score']].head()

Prometheus processing completed.
Sample results for Prometheus:


Unnamed: 0,evaluation_id,prometheus_winner,prometheus_total_score
0,CG011_llama-3.3-70b-versatile_2_General Knowle...,T,5.0
1,TC018_llama-3.3-70b-versatile_3_Technical_mini...,Tie,4.75
2,CG004_gemini-1.5-pro-latest_1_General Knowledg...,T,4.75
3,CR019_gpt-4o_3_Creative_detailed_en_vs_pt,B,5.0
4,TC007_llama-3.3-70b-versatile_4_Technical_stru...,A,4.75


In [15]:
# Sanity checks on the parsed Prometheus results: zero scores, missing winners
# and rows flagged as parsing errors.
score_zero_count = (df_analysis['prometheus_total_score'] == 0).sum()
print(f"Count where 'prometheus_total_score' == 0: {score_zero_count}")

winner_nan_count = df_analysis['prometheus_winner'].isna().sum()
print(f"Count where 'prometheus_winner' is NaN: {winner_nan_count}")

# Match case-insensitively on the prefix so that variants of the placeholder
# such as 'PARSING ERROR' are counted too.
parsing_error_count = (
    df_analysis['prometheus_winner']
    .astype(str)
    .str.lower()
    .str.startswith('parsing error')
    .sum()
)
print(f"Count where 'prometheus_winner' is a parsing error (any case): {parsing_error_count}")

Count where 'prometheus_total_score' == 0: 1
Count where 'prometheus_winner' is NaN: 0
Count where 'prometheus_winner' == 'Parsing Error': 0


In [16]:
# Some judges abbreviate a tie as 'T'; rewrite it as 'Tie' in every
# "*_winner" column so that the labels are comparable across judges.
colunas_winner = list(filter(lambda col: col.endswith('_winner'), df_analysis.columns))
df_analysis[colunas_winner] = df_analysis[colunas_winner].replace(to_replace='T', value='Tie')

In [17]:
# Preview the table with the parsed winner and score columns of every judge.
df_analysis.head(n=3)

Unnamed: 0,evaluation_id,response_A,response_B,evaluation_claude,evaluation_prometheus,evaluation_mistral,claude_winner,claude_total_score,mistral_winner,mistral_total_score,prometheus_winner,prometheus_total_score
0,CG011_llama-3.3-70b-versatile_2_General Knowle...,**O que é DNA?**\n\nO DNA (ácido desoxirribonu...,**O que é DNA?**\n\nO DNA (ácido desoxirribonu...,"{\n ""winner"": ""A"",\n ""general_justificat...","{\n""winner"": ""T"",\n""general_justification"": ""T...","{\n ""winner"": ""A"",\n ""general_j...",A,4.5,A,4.75,Tie,5.0
1,TC018_llama-3.3-70b-versatile_3_Technical_mini...,**Notação Big O: uma medida de complexidade**\...,A notação Big O é uma medida de complexidade d...,"{\n ""winner"": ""B"",\n ""general_justificat...","{\n""winner"": ""Tie"",\n""general_justification"": ...","{\n ""winner"": ""A"",\n ""general_j...",B,4.75,A,5.0,Tie,4.75
2,CG004_gemini-1.5-pro-latest_1_General Knowledg...,A Proclamação da República no Brasil em 15 de ...,"A Proclamação da República no Brasil, em 15 de...","{\n ""winner"": ""B"",\n ""general_justificat...","{\n""winner"": ""T"",\n""general_justification"": ""B...","{\n ""winner"": ""A"",\n ""general_j...",B,5.0,A,4.75,Tie,4.75


### COMMITTEE JUDGES CONSENSUS

In [18]:
def find_consensus(row):
    """Return the majority verdict of the three judges for one row.

    Args:
        row: Mapping (for example a DataFrame row) providing 'claude_winner',
            'mistral_winner' and 'prometheus_winner'.

    Returns:
        The label chosen by at least two judges; 'Tie' when at least two votes
        are usable but no label reaches two votes; 'No Consensus' when fewer
        than two usable votes remain.
    """
    def _usable_vote(vote):
        """Return the canonical label of a usable vote, or None when it must be ignored."""
        # Missing values (None / NaN) are not votes.
        if not isinstance(vote, str):
            return None
        label = vote.strip()
        lowered = label.lower()
        # Every parsing-failure label is discarded, whatever its casing or
        # suffix (e.g. 'Parsing Error (No valid JSON found)', 'PARSING ERROR').
        if not label or lowered.startswith('parsing error'):
            return None
        if lowered in ('tie', 't'):
            return 'Tie'
        if lowered in ('a', 'b'):
            return label.upper()
        return label

    votes = [
        row['claude_winner'],
        row['mistral_winner'],
        row['prometheus_winner']
    ]

    valid_votes = [label for label in map(_usable_vote, votes) if label is not None]
    if len(valid_votes) < 2:
        return 'No Consensus'

    # With at most three votes, only one label can reach a count of two.
    winner, count = Counter(valid_votes).most_common(1)[0]
    return winner if count >= 2 else 'Tie'

In [19]:
# Compute the committee verdict row by row and store it as `gold_winner`.
df_analysis = df_analysis.assign(
    gold_winner=lambda frame: frame.apply(find_consensus, axis=1)
)
df_analysis.head(n=2)

Unnamed: 0,evaluation_id,response_A,response_B,evaluation_claude,evaluation_prometheus,evaluation_mistral,claude_winner,claude_total_score,mistral_winner,mistral_total_score,prometheus_winner,prometheus_total_score,gold_winner
0,CG011_llama-3.3-70b-versatile_2_General Knowle...,**O que é DNA?**\n\nO DNA (ácido desoxirribonu...,**O que é DNA?**\n\nO DNA (ácido desoxirribonu...,"{\n ""winner"": ""A"",\n ""general_justificat...","{\n""winner"": ""T"",\n""general_justification"": ""T...","{\n ""winner"": ""A"",\n ""general_j...",A,4.5,A,4.75,Tie,5.0,A
1,TC018_llama-3.3-70b-versatile_3_Technical_mini...,**Notação Big O: uma medida de complexidade**\...,A notação Big O é uma medida de complexidade d...,"{\n ""winner"": ""B"",\n ""general_justificat...","{\n""winner"": ""Tie"",\n""general_justification"": ...","{\n ""winner"": ""A"",\n ""general_j...",B,4.75,A,5.0,Tie,4.75,Tie


In [20]:
# Persist the merged table, including the parsed columns and the consensus.
consensus_csv_path = '../../data/judged/committee_llm_consensus_lingual_pt.csv'
df_analysis.to_csv(consensus_csv_path, encoding='utf-8', index=False)