In [None]:
import os
import re
import pandas as pd
import numpy as np
from typing import Dict, Any, List, Optional, Callable
from opik import Opik
from collections import Counter
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

SENTENCE_TRANSFORMER_MODEL = SentenceTransformer("all-MiniLM-L6-v2")

OPIK_DETERMINISM_PROJECT_NAME = "LLMmark_determinism"

EMPTY_PLACEHOLDER = '_EMPTY_ANSWER_FAILURE_'
TIMEOUT_PLACEHOLDER = '_TIMEOUT_FAILURE_'
OVERTHINK_PLACEHOLDER = '_OVERTHINK_FAILURE_'
BAD_FORMAT_PLACEHOLDER = '_BAD_FORMAT_FAILURE_'


In [17]:
def preprocess_and_identify_failures(df: pd.DataFrame) -> pd.DataFrame:
    """
    Identifies failed responses in the DataFrame based on timeout or token limit.
    """
    timeout = 120
    
    is_empty = df['span_output_raw_answer'].fillna('').str.strip() == ''
    
    # It is considered timeout if the response time is > 120 seconds and the answer is empty
    # Some models like gemma3n can take more than 120 seconds because of the memory limit (we use swap memory)
    is_timeout = (df['response_time'] >= timeout).fillna(False) & is_empty
    
    
    # If the raw_answer contains <think>, we consider that it has reached the token limit or an infinite loop
    is_token_limit_hit = df['span_output_raw_answer'].str.contains('<think>', na=False)
    
    df['is_failure'] = is_timeout | is_token_limit_hit | is_empty
    
    conditions = [
        is_timeout,
        is_token_limit_hit,
        is_empty
    ]
    
    placeholders = [
        TIMEOUT_PLACEHOLDER,
        OVERTHINK_PLACEHOLDER,
        EMPTY_PLACEHOLDER
    ]
    
    # Apply the failure placeholders in the answer column, else keep the original raw answer
    df['answer'] = np.select(conditions, placeholders, default=df['span_output_raw_answer'])

    failure_count = df['is_failure'].sum()
    if failure_count > 0:
        print(f"INFO: Total of {failure_count} failed responses (timeout, empty answer or infinite loop).")

    return df


def parse_model_info(model_name: str, tags: List[str]) -> Dict[str, str]:
    """
    Parses model ID, display name, and size from the model name and tags.
    Adjust this function based on your actual model naming conventions.
    """
    model_id = model_name
    model_size = "N/A"
    question_type = "N/A"

    match = re.search(r':([\d\.]+)b', model_name)
    if match:
        model_size = match.group(1) + "B"

    if "multiple_choice" in tags:
        question_type = "multiple_choice"
    elif "open_answer" in tags:
        question_type = "open_answer"

    return {
        "model_id": model_id,
        "model_size": model_size,
        "question_type": question_type
    }

def get_opik_flat_data_for_csv(project_name: str = OPIK_DETERMINISM_PROJECT_NAME) -> List[Dict[str, Any]]:
    """
    Fetches detailed trace and span data from Opik and flattens it for CSV export.
    Each dictionary in the returned list represents a single span,
    including its parent trace's metadata.
    """
    client = Opik()
    flat_data = []

    print(f"Fetching traces from project: {project_name}...")

    traces = client.search_traces(
        project_name=project_name,
        max_results=25000
    )

    if not traces:
        print(f"No traces found in project '{project_name}'. Please check the project name and your Opik configuration.")
        return []
    
    # Delete traces with None values
    traces = [trace for trace in traces if trace.name is not None]
    
    # :TODO: REMOVE: - Delete traces with gemma3n model
    traces = [trace for trace in traces if 'gemma3n' not in trace.name]

    for i, trace in enumerate(traces):
        
        print(f"Processing trace {i+1}/{len(traces)}: {trace.name} ({trace.id})")

        trace_content = client.get_trace_content(trace.id)
        spans = client.search_spans(project_name=project_name, trace_id=trace.id)

        if not spans:
            print(f"  No spans found for trace {trace.id}. Skipping.")
            continue

        model_info = parse_model_info(trace.name, trace.tags)
        
        model_source = "N/A"
        if "local" in trace.tags:
            model_source = "local"
        elif "online" in trace.tags:
            model_source = "online"

        trace_flat_metadata = {
            "trace_id": trace.id,
            "run_name": trace.name,
            "model_display_name": trace_content.metadata.get("model_display_name"),
            "language": trace_content.metadata.get("language", "en"),
            "prompting_tech": trace_content.metadata.get("prompting_tech", "N/A"),
            "num_runs_per_question": trace_content.metadata.get("num_runs_per_question", 1),
            "model_source": model_source,
            "temperature": trace_content.metadata.get("temperature", "N/A"),
            "top_p": trace_content.metadata.get("top_p", 0.1),
            "exercise": trace_content.metadata.get("exercise", "N/A"),
            "question_type": model_info["question_type"],
            **{f"trace_meta_{k.replace('.', '_')}": v for k, v in trace_content.metadata.items() # Replace '.' in keys for valid column names
            if k not in ["language", "prompting_tech", "num_runs_per_question", 
                         "model_source", "temperature", "top_p", "exercise", 
                         "prompt_tech", "question_type", "comments", "model_id", "model_display_name", "top-p", "run_name"]}
        }

        # Process each span and combine with trace-level metadata
        for j, span in enumerate(spans):
            span_response_time = span.output.get("response_time (s)", "N/A")

            span_input_question = span.input.get("question", str(span.input)) if isinstance(span.input, dict) else str(span.input)
            span_output_answer = span.output.get("answer", str(span.output)) if isinstance(span.output, dict) else str(span.output)
            span_output_raw_answer = span.output.get("raw_answer", span_output_answer) if isinstance(span.output, dict) else span_output_answer

            correct_answer = span.metadata.get("correct_answer", "PLACEHOLDER_CORRECT_ANSWER")
            
            
            span_usage = span.usage or {} # Usamos un diccionario vacío si 'usage' es None
            completion_tokens = span_usage.get("completion_tokens", 0)

            span_data_row = {
                "span_id": span.id,
                "span_name": span.name,
                "response_time": span_response_time,
                "span_input_question": span_input_question,
                "span_output_answer": span_output_answer,
                "span_output_raw_answer": span_output_raw_answer,
                "span_correct_answer": correct_answer,
                "completion_tokens": completion_tokens,
                "question_file": span.metadata.get("question_file", "N/A"),
            }
            
            combined_row = {**span_data_row, **trace_flat_metadata}
            flat_data.append(combined_row)

    return flat_data

def filter_and_save_dataframe(
    df: pd.DataFrame, 
    csv_filename: str = "opik_determinism_data.csv", 
    temperature_filter: Optional[float] = None
) -> None:
    """
    Filters an existing DataFrame by temperature and saves it to a CSV file.
    """
    if df.empty:
        print("\nInput DataFrame is empty. No CSV file will be created.")
        return

    filtered_df = df.copy()
    
    print("DataFrame after preprocessing:")
    print(filtered_df.head())

    if temperature_filter is not None:
        filtered_df['temperature'] = pd.to_numeric(filtered_df['temperature'], errors='coerce')
        filtered_df = filtered_df[filtered_df['temperature'] == temperature_filter].copy()
        print(f"\nFiltered DataFrame for temperature = {temperature_filter}:")

    if filtered_df.empty:
        print(f"No data after filtering for temperature = {temperature_filter}.")
        return

    output_filename = csv_filename
    output_dir = '../../../data/determinism'
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, output_filename)
    
    filtered_df.to_csv(output_path, index=False, encoding='utf-8')
    
    print("DataFrame head:")
    print(filtered_df.head())
    print(f"\nDataFrame shape: {filtered_df.shape}")

    print(f"\nSuccessfully extracted data and saved to {output_path}")
      
def get_dataframe_from_csv(csv_filename: str = "opik_determinism_data.csv") -> pd.DataFrame:
    """
    Reads a CSV file into a DataFrame.
    """
    output_dir = '../../../data/determinism'
    output_path = os.path.join(output_dir, csv_filename)
    
    if not os.path.exists(output_path):
        print(f"CSV file {output_path} does not exist.")
        return pd.DataFrame()
    
    df = pd.read_csv(output_path, encoding='utf-8')
    print(f"DataFrame loaded from {output_path} with shape: {df.shape}")
    
    return df

# Analyse multiple_choice answers
def calculate_determinism_mc(answers):
    """Calculates the determinism of a list of answers (multiple choice).

    Args:
        answers (list): A list of answers to evaluate.

    Returns:
        float: The number of most frequent answer.
    """
    unique_answers = set(answers)

    if len(unique_answers) == 1:
        return len(answers)  # Completely deterministic
    else:
        # Calculate the proportion of the most frequent answer
        counter = Counter(answers)
        most_frequent_answer = counter.most_common(1)[0][1]
        return most_frequent_answer

def extract_answer_letter_mc(answer_text: str) -> Optional[str]: 
    """
    Extracts the answer letter from the given text.
    The expected format is: [a]
    If multiple bracketed letters follow, only the first one is returned.
    """
    try:
        # Match the pattern [letter]
        match = re.search(r'\[([a-zA-Z])\]', answer_text)
        if match:
            return match.group(1)
        return None
    except Exception as e:
        # print(f"Error extracting answer letter: {e}")
        return None

import pandas as pd
import numpy as np
from typing import List, Callable, Dict, Any

def process_determinism_and_store(df: pd.DataFrame, determinism_function: Callable[[List[str]], float]) -> pd.DataFrame:
    """Processes the DataFrame to calculate determinism scores and store the results.

    Args:
        df (pd.DataFrame): The input DataFrame containing response data.
        determinism_function (Callable[[List[str]], float]): The function to use for calculating determinism
                                                             (e.g., calculate_determinism_mc or calculate_determinism_oa).

    Raises:
        ValueError: If the input DataFrame is missing required columns.

    Returns:
        pd.DataFrame: A new DataFrame summarizing the determinism results.
    """

    results_list = []

    required_columns = ['model_display_name', 'question_file', 'prompting_tech', 'answer', 'is_failure']
    if not all(col in df.columns for col in required_columns):
        missing = [col for col in required_columns if col not in df.columns]
        raise ValueError(f"The DataFrame does not have the columns: {required_columns}. Missing: {missing}")

    grouping_keys = ['model_display_name', 'question_file', 'prompting_tech']
    df_grouped = df.groupby(grouping_keys)

    for group_id, group in df_grouped:        
        all_answers = group['answer'].tolist()
        successful_answers = group.loc[~group['is_failure'], 'answer'].tolist()
        
        num_total_runs = len(group)
        
        # get number of is_failures with value True
        num_failures = group['is_failure'].value_counts().get(True, 0)
        
        num_successful = num_total_runs - num_failures
        
        if (num_successful == 0):
            print(f"No successful runs for group {group_id}")
            determinism = 0.0
        else:
            determinism_score = determinism_function(successful_answers)
            determinism = round((determinism_score / num_successful), 1)


        results_list.append({
            'Model Display Name': group_id[0],
            'Question File': group_id[1],
            'Prompting Tech': group_id[2],
            'Determinism Score': determinism,
            'Number of Runs': num_total_runs,
            'Number of Failures': num_failures,
            'All Answers': all_answers,
        })
        
    if not results_list:
        print("No determinism results found. The input DataFrame may be empty or not contain valid data.")
        return pd.DataFrame()

    results_df = pd.DataFrame(results_list)

    results_df = results_df.sort_values(by=['Model Display Name', 'Question File', 'Prompting Tech']).reset_index(drop=True)
    
    return results_df

def calculate_determinism_oa(answers: List[str], umbral=0.8) -> float:
    """Calculate the determinism of open answer questions.

    Args:
        answers (List[str]): The list of answers to evaluate.
        umbral (float, optional): The similarity threshold for considering answers as similar. Defaults to 0.8.

    Returns:
        float: The determinism score between 0 and 1.
    """
    if not answers:
        return 0.0  # No answers, no determinism

    cleaned_answers = [str(ans) if pd.notna(ans) else '' for ans in answers]
    
    # Remove failure answers
    cleaned_answers = [ans for ans in cleaned_answers if ans != '_GENERATION_FAILURE_']
    
    if len(cleaned_answers) < 2:
        return 0.0
     
    embeddings = SENTENCE_TRANSFORMER_MODEL.encode(cleaned_answers)

    similarities = []
    for i in range(len(cleaned_answers)):
        for j in range(i+1, len(cleaned_answers)):
            sim = cosine_similarity([embeddings[i]], [embeddings[j]])[0][0]
            similarities.append(sim)

    return np.mean(similarities) if similarities else 0.0

def create_determinism_table(df, filename="determinism_summary_table.csv", function=calculate_determinism_oa):
    """Creates a summary table of determinism scores and saves it to a CSV file.

    Args:
        df (pd.DataFrame): The input DataFrame containing LLM experiment results.
        filename (str, optional): The name of the output CSV file. Defaults to "determinism_summary_table.csv".
    """
    print(f"\nColumns in {df}:")
    print(df.columns.tolist())
    
    df = df.sort_values(by=['model_display_name', 'question_file', 'prompting_tech'])


    determinism_table = process_determinism_and_store(df, function)

    print("\n--- Determinism Results Table ---")
    print(determinism_table)
    
    # Save the determinism table to a CSV file
    output_dir_tables = '../../../data/determinism_tables'
    os.makedirs(output_dir_tables, exist_ok=True)
    table_filepath = os.path.join(output_dir_tables, filename)
    determinism_table.to_csv(table_filepath, index=False)
    print(f"\nDeterminism table saved to: {table_filepath}")
    


def generate_determinism_table_mc(mc_df, filename):
    """Generates a determinism table for multiple choice questions.

    Args:
        mc_df (pd.DataFrame): DataFrame containing multiple choice question data.
        filename (str): Name of the output CSV file.
    """
    # Filter by question_type
    mc_df = mc_df[mc_df['question_type'] == 'multiple_choice'].copy()

    # Extract the answer letter from multiple choice answers only if they do not is_failure=True    
    mc_df['answer'] = mc_df['span_output_answer'].apply(extract_answer_letter_mc)

    # Rename span_correct_answer to correct_answer
    mc_df.rename(columns={'span_correct_answer': 'correct_answer'}, inplace=True)
    # Drop all columns except run_name, question_file, answer, span_correct_answer, promtpting_tech
    mc_df = mc_df[['run_name', 'model_display_name', 'question_file', 'answer', 'correct_answer', 'prompting_tech', 'is_failure']]

    # Order by question_file
    mc_df = mc_df.sort_values(by=['question_file', 'answer']).reset_index(drop=True)

    mc_df.head(5).style.set_table_styles(
        [{'selector': 'th', 'props': [('background-color', '#f2f2f2'), ('color', 'black')]}]
    ).set_properties(**{'text-align': 'center'})
    
    
    # Get unique model_display_name values
    unique_models = mc_df['model_display_name'].unique()
    print("MODELS: ", unique_models)
    
    # Create CSV determinism table
    create_determinism_table(mc_df, filename=filename, function=calculate_determinism_mc)


def generate_determinism_table_oa(mc_df, filename):
    """Generates a determinism table for open answer questions.

    Args:
        mc_df (pd.DataFrame): DataFrame containing open answer question data.
        filename (str): Name of the output CSV file.
    """
    # Filter by question_type
    mc_df = mc_df[mc_df['question_type'] == 'open_answer'].copy()

    #Extract the answer letter from multiple choice answers
    mc_df['answer'] = mc_df['span_output_answer']

    # Rename span_correct_answer to correct_answer
    mc_df.rename(columns={'span_correct_answer': 'correct_answer'}, inplace=True)
    # Drop all columns except run_name, question_file, answer, span_correct_answer, promtpting_tech
    mc_df = mc_df[['run_name', 'model_display_name', 'question_file', 'answer', 'correct_answer', 'prompting_tech', 'is_failure']]

    # Order by question_file
    mc_df = mc_df.sort_values(by=['question_file', 'answer']).reset_index(drop=True)

    mc_df.head(5).style.set_table_styles(
        [{'selector': 'th', 'props': [('background-color', '#f2f2f2'), ('color', 'black')]}]
    ).set_properties(**{'text-align': 'center'})
    
    
    # Get unique model_display_name values
    unique_models = mc_df['model_display_name'].unique()
    print("MODELS: ", unique_models)
    
    
    
    # Create CSV determinism table
    create_determinism_table(mc_df, filename=filename, function=calculate_determinism_oa)


In [18]:
# Get all the data of Opik determinism project
all_opik_data = get_opik_flat_data_for_csv()


Fetching traces from project: LLMmark_determinism...
Processing trace 1/240: run_049_qwen3:4b_0.2 (0197c174-ebcd-7404-97e0-3ba8fa383e50)
Processing trace 2/240: run_049_qwen3:1.7b_0.2 (0197c16b-ebc8-7aa4-b737-d650e97f8916)
Processing trace 3/240: run_049_qwen3:0.6b_0.2 (0197c15a-d25e-7881-a4db-21d293f536fe)
Processing trace 4/240: run_048_qwen3:4b_0.2 (0197c14f-4be2-7d1e-86e4-743f7c49f93f)
Processing trace 5/240: run_049_tinyllama:1.1b_0.2 (0197c14e-44f1-71ef-a8ab-255f8f776f5a)
Processing trace 6/240: run_049_smollm2:1.7b_0.2 (0197c14a-40af-7385-a226-2c9b9ed9d809)
Processing trace 7/240: run_049_moondream:1.8b_0.2 (0197c14a-309e-7ec8-a55c-c27e6b230250)
Processing trace 8/240: run_049_llama3.2:1b_0.2 (0197c147-32a1-76dc-b04c-d006dbd74b8b)
Processing trace 9/240: run_048_qwen3:1.7b_0.2 (0197c13e-b877-729a-b73f-c6d6fd4cd48c)
Processing trace 10/240: run_048_qwen3:0.6b_0.2 (0197c134-c8ae-7c68-a166-45961d80acd5)
Processing trace 11/240: run_049_gemma3:4b_0.2 (0197c134-4202-76ce-9b06-2429467

In [19]:

if all_opik_data:
    full_df = pd.DataFrame(all_opik_data).copy()
    print(f"\nFull DataFrame loaded with shape: {full_df.shape}")
    
    full_df = preprocess_and_identify_failures(full_df)
    
    # Save full dataframe
    filter_and_save_dataframe(full_df)

    # Dataframe with temperature=0.0
    filter_and_save_dataframe(full_df, csv_filename="opik_determinism_data_temp_00.csv", temperature_filter=0.0)

    # Dataframe with temperature=0.2
    filter_and_save_dataframe(full_df, csv_filename="opik_determinism_data_temp_02.csv", temperature_filter=0.2)
    
    # Dataframe with temperature=0.4
    filter_and_save_dataframe(full_df, csv_filename="opik_determinism_data_temp_04.csv", temperature_filter=0.4)

else:
    print("No data fetched from Opik to create any CSV files.")


Full DataFrame loaded with shape: (24000, 20)
INFO: Total of 2726 failed responses (timeout, empty answer or infinite loop).
DataFrame after preprocessing:
                                span_id span_name  response_time  \
0  0197c181-c40a-773e-a48c-f2fd9a6caf63   q10_r10         11.648   
1  0197c181-968a-7f04-91f8-d694be080ada    q10_r9         28.038   
2  0197c181-2903-722b-98c9-7d680961893a    q10_r8         11.675   
3  0197c180-fb68-7c6c-bb8a-865d9e9577ac    q10_r7         28.049   
4  0197c180-8dd7-7187-ac09-5b52f5458ab0    q10_r6         11.668   

                                 span_input_question span_output_answer  \
0  What is the sum of the binary numbers 11100101...             [c][c]   
1  What is the sum of the binary numbers 11100101...          [c][c][c]   
2  What is the sum of the binary numbers 11100101...             [c][c]   
3  What is the sum of the binary numbers 11100101...          [c][c][c]   
4  What is the sum of the binary numbers 11100101...       

In [20]:

# TEMPERATURE = 0.0
csv_filename="opik_determinism_data_temp_00.csv"
temp_00_df = get_dataframe_from_csv(csv_filename=csv_filename)
# Multiple choice answers
temp_00_mc_df = temp_00_df[temp_00_df['question_type'] == 'multiple_choice'].copy()
output_filename = "determinism_table_temp_00_mc.csv"
generate_determinism_table_mc(temp_00_mc_df, filename=output_filename)
# Open answer questions
temp_00_oa_df = temp_00_df[temp_00_df['question_type'] == 'open_answer'].copy()
output_filename = "determinism_table_temp_00_oa.csv"
generate_determinism_table_oa(temp_00_oa_df, filename=output_filename)


# TEMPERATURE = 0.2
csv_filename="opik_determinism_data_temp_02.csv"
temp_02_df = get_dataframe_from_csv(csv_filename=csv_filename)
# Multiple choice answers
temp_02_mc_df = temp_02_df[temp_02_df['question_type'] == 'multiple_choice'].copy()
output_filename = "determinism_table_temp_02_mc.csv"
generate_determinism_table_mc(temp_02_mc_df, filename=output_filename)
# Open answer questions
temp_02_oa_df = temp_02_df[temp_02_df['question_type'] == 'open_answer'].copy()
output_filename = "determinism_table_temp_02_oa.csv"
generate_determinism_table_oa(temp_02_oa_df, filename=output_filename)

# TEMPERATURE = 0.4
csv_filename="opik_determinism_data_temp_04.csv"
temp_04_df = get_dataframe_from_csv(csv_filename=csv_filename)
# Multiple choice answers
temp_04_mc_df = temp_04_df[temp_04_df['question_type'] == 'multiple_choice'].copy()
output_filename = "determinism_table_temp_04_mc.csv"
generate_determinism_table_mc(temp_04_mc_df, filename=output_filename)
# Open answer questions
temp_04_oa_df = temp_04_df[temp_04_df['question_type'] == 'open_answer'].copy()
output_filename = "determinism_table_temp_04_oa.csv"
generate_determinism_table_oa(temp_04_oa_df, filename=output_filename)





DataFrame loaded from ../../../data/determinism/opik_determinism_data_temp_00.csv with shape: (8000, 22)
MODELS:  ['Qwen3:1.7b' 'Qwen3:0.6b' 'smollm2:1.7b' 'DeepSeek R1:1.5b' 'Gemma3:4b'
 'Gemma3:1b' 'Qwen3:4b' 'TinyLlama:1.1b' 'Llama3.2:1b' 'Moondream 2']

Columns in                           run_name model_display_name    question_file answer  \
0           run_047_qwen3:1.7b_0.0         Qwen3:1.7b  question_01.txt      a   
1           run_047_qwen3:1.7b_0.0         Qwen3:1.7b  question_01.txt      a   
2           run_047_qwen3:1.7b_0.0         Qwen3:1.7b  question_01.txt      a   
3           run_047_qwen3:1.7b_0.0         Qwen3:1.7b  question_01.txt      a   
4           run_047_qwen3:1.7b_0.0         Qwen3:1.7b  question_01.txt      a   
...                            ...                ...              ...    ...   
3995  run_001_deepseek-r1:1.5b_0.0   DeepSeek R1:1.5b  question_10.txt   None   
3996  run_001_deepseek-r1:1.5b_0.0   DeepSeek R1:1.5b  question_10.txt   None   
39