In [202]:
import os
import re
import pandas as pd
from typing import Dict, Any, List, Optional, Callable
from opik import Opik
from collections import Counter


OPIK_DETERMINISM_PROJECT_NAME = "LLMmark_determinism"

In [203]:


def parse_model_info(model_name: str, tags: List[str]) -> Dict[str, str]:
    """
    Parses model ID, display name, and size from the model name and tags.
    Adjust this function based on your actual model naming conventions.
    """
    model_id = model_name
    model_size = "N/A"
    question_type = "N/A"

    match = re.search(r':([\d\.]+)b', model_name)
    if match:
        model_size = match.group(1) + "B"

    if "multiple_choice" in tags:
        question_type = "multiple_choice"
    elif "open_answer" in tags:
        question_type = "open_answer"

    return {
        "model_id": model_id,
        "model_size": model_size,
        "question_type": question_type
    }

def get_opik_flat_data_for_csv(project_name: str = OPIK_DETERMINISM_PROJECT_NAME) -> List[Dict[str, Any]]:
    """
    Fetches detailed trace and span data from Opik and flattens it for CSV export.
    Each dictionary in the returned list represents a single span,
    including its parent trace's metadata.
    """
    client = Opik()
    flat_data = []

    print(f"Fetching traces from project: {project_name}...")

    traces = client.search_traces(
        project_name=project_name,
        max_results=25000
    )

    if not traces:
        print(f"No traces found in project '{project_name}'. Please check the project name and your Opik configuration.")
        return []
    
    # Delete traces with None values
    traces = [trace for trace in traces if trace.name is not None]

    for i, trace in enumerate(traces):
        
        print(f"Processing trace {i+1}/{len(traces)}: {trace.name} ({trace.id})")

        trace_content = client.get_trace_content(trace.id)
        spans = client.search_spans(project_name=project_name, trace_id=trace.id)

        if not spans:
            print(f"  No spans found for trace {trace.id}. Skipping.")
            continue

        model_info = parse_model_info(trace.name, trace.tags)
        
        model_source = "N/A"
        if "local" in trace.tags:
            model_source = "local"
        elif "online" in trace.tags:
            model_source = "online"

        trace_flat_metadata = {
            "trace_id": trace.id,
            "run_name": trace.name,
            "model_display_name": trace_content.metadata.get("model_display_name"),
            "language": trace_content.metadata.get("language", "en"),
            "prompting_tech": trace_content.metadata.get("prompting_tech", "N/A"),
            "num_runs_per_question": trace_content.metadata.get("num_runs_per_question", 1),
            "model_source": model_source,
            "temperature": trace_content.metadata.get("temperature", "N/A"),
            "top_p": trace_content.metadata.get("top_p", 0.1),
            "exercise": trace_content.metadata.get("exercise", "N/A"),
            "question_type": model_info["question_type"],
            **{f"trace_meta_{k.replace('.', '_')}": v for k, v in trace_content.metadata.items() # Replace '.' in keys for valid column names
            if k not in ["language", "prompting_tech", "num_runs_per_question", 
                         "model_source", "temperature", "top_p", "exercise", 
                         "prompt_tech", "question_type", "comments", "model_id", "model_display_name", "top-p", "run_name"]}
        }

        # Process each span and combine with trace-level metadata
        for j, span in enumerate(spans):
            span_response_time_ms = span.output.get("response_time (s)", "N/A")

            span_input_question = span.input.get("question", str(span.input)) if isinstance(span.input, dict) else str(span.input)
            span_output_answer = span.output.get("answer", str(span.output)) if isinstance(span.output, dict) else str(span.output)
            span_output_raw_answer = span.output.get("raw_answer", span_output_answer) if isinstance(span.output, dict) else span_output_answer

            correct_answer = span.metadata.get("correct_answer", "PLACEHOLDER_CORRECT_ANSWER")

            span_data_row = {
                "span_id": span.id,
                "span_name": span.name,
                "response_time_ms": span_response_time_ms,
                "span_input_question": span_input_question,
                "span_output_answer": span_output_answer,
                "span_output_raw_answer": span_output_raw_answer,
                "span_correct_answer": correct_answer,
                "question_file": span.metadata.get("question_file", "N/A"),
            }
            
            combined_row = {**span_data_row, **trace_flat_metadata}
            flat_data.append(combined_row)

    return flat_data

def filter_and_save_dataframe(
    df: pd.DataFrame, 
    csv_filename: str = "opik_determinism_data.csv", 
    temperature_filter: Optional[float] = None
) -> None:
    """
    Filters an existing DataFrame by temperature and saves it to a CSV file.
    """
    if df.empty:
        print("\nInput DataFrame is empty. No CSV file will be created.")
        return

    filtered_df = df.copy()

    
    if temperature_filter is not None:
        filtered_df['temperature'] = pd.to_numeric(filtered_df['temperature'], errors='coerce')
        filtered_df = filtered_df[filtered_df['temperature'] == temperature_filter].copy()
        print(f"\nFiltered DataFrame for temperature = {temperature_filter}:")

    if filtered_df.empty:
        print(f"No data after filtering for temperature = {temperature_filter}.")
        return

    output_filename = csv_filename
    output_dir = '../../../data/determinism'
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, output_filename)
    
    filtered_df.to_csv(output_path, index=False, encoding='utf-8')
    
    print("DataFrame head:")
    print(filtered_df.head())
    print(f"\nDataFrame shape: {filtered_df.shape}")

    print(f"\nSuccessfully extracted data and saved to {output_path}")
      
def get_dataframe_from_csv(csv_filename: str = "opik_determinism_data.csv") -> pd.DataFrame:
    """
    Reads a CSV file into a DataFrame.
    """
    output_dir = '../../../data/determinism'
    output_path = os.path.join(output_dir, csv_filename)
    
    if not os.path.exists(output_path):
        print(f"CSV file {output_path} does not exist.")
        return pd.DataFrame()
    
    df = pd.read_csv(output_path, encoding='utf-8')
    print(f"DataFrame loaded from {output_path} with shape: {df.shape}")
    
    return df

# Analyse multiple_choice answers
def calculate_determinism_mc(answers):
    """Calculates the determinism of a list of answers (multiple choice).

    Args:
        answers (list): A list of answers to evaluate.

    Returns:
        float: The determinism score for the given answers.
    """
    unique_answers = set(answers)

    if len(unique_answers) == 1:
        return 1.0  # Completely deterministic
    else:
        # Calculate the proportion of the most frequent answer
        counter = Counter(answers)
        most_frequent_answer = counter.most_common(1)[0][1]
        return most_frequent_answer / len(answers)

def extract_answer_letter_mc(answer_text: str) -> Optional[str]: 
    """
    Extracts the answer letter from the given text.
    The expected format is: [a]
    If multiple bracketed letters follow, only the first one is returned.
    """
    try:
        # Match the pattern [letter]
        match = re.search(r'\[([a-zA-Z])\]', answer_text)
        if match:
            return match.group(1)
        return None
    except Exception as e:
        print(f"Error extracting answer letter: {e}")
        return None
    
    
def process_determinism_and_store(df: pd.DataFrame, determinism_function: Callable[[List[str]], float]) -> pd.DataFrame:
    """
    Calculates determinism for each question_file, model_display_name, and prompting_tech,
    and stores the results in a new DataFrame.

    Args:
        df: The input DataFrame containing LLM experiment results.
        determinism_function: The function used to calculate determinism.

    Returns:
        A new DataFrame summarizing the determinism results.
    """
    results_list = []

    required_columns = ['model_display_name', 'question_file', 'prompting_tech', 'answer']
    if not all(col in df.columns for col in required_columns):
        raise ValueError(f"Input DataFrame must contain all of these columns: {required_columns}")

    unique_models = df['model_display_name'].unique()

    for model in unique_models:
        model_df = df[df['model_display_name'] == model].copy()

        # Group by question_file and prompting_tech
        grouped = model_df.groupby(['question_file', 'prompting_tech'])

        for (question_file, prompting_tech), group in grouped:
            answers_for_determinism = group['answer'].tolist()
            determinism_score = determinism_function(answers_for_determinism)

            results_list.append({
                'Model Display Name': model,
                'Question File': question_file,
                'Prompting Tech': prompting_tech,
                'Determinism Score': determinism_score,
                'Number of Runs': len(group),
                'All Answers': answers_for_determinism
            })
    results_df = pd.DataFrame(results_list)
    print("Determinism calculation complete.")

    results_df = results_df.sort_values(by=['Model Display Name', 'Question File', 'Prompting Tech'])
    return results_df


def create_determinism_table(df, filename="determinism_summary_table.csv"):
    """Creates a summary table of determinism scores and saves it to a CSV file.

    Args:
        df (pd.DataFrame): The input DataFrame containing LLM experiment results.
        filename (str, optional): The name of the output CSV file. Defaults to "determinism_summary_table.csv".
    """
    print(f"\nColumns in {df}:")
    print(df.columns.tolist())
    
    df = df.sort_values(by=['model_display_name', 'question_file', 'prompting_tech'])


    determinism_table = process_determinism_and_store(df, calculate_determinism_mc)

    print("\n--- Determinism Results Table ---")
    print(determinism_table)
    
    # Save the determinism table to a CSV file
    output_dir_tables = '../../../data/determinism_tables'
    os.makedirs(output_dir_tables, exist_ok=True)
    table_filepath = os.path.join(output_dir_tables, filename)
    determinism_table.to_csv(table_filepath, index=False)
    print(f"\nDeterminism table saved to: {table_filepath}")


In [None]:
# Get all the data of Opik determinism project
all_opik_data = get_opik_flat_data_for_csv()

if all_opik_data:
    full_df = pd.DataFrame(all_opik_data)
    print(f"\nFull DataFrame loaded with shape: {full_df.shape}")
    
    # Save full dataframe
    filter_and_save_dataframe(full_df)

    # Dataframe with temperature=0.0
    filter_and_save_dataframe(full_df, csv_filename="opik_determinism_data_temp_00.csv", temperature_filter=0.0)

    # Dataframe with temperature=0.2
    filter_and_save_dataframe(full_df, csv_filename="opik_determinism_data_temp_02.csv", temperature_filter=0.2)
    
    # Dataframe with temperature=0.4
    filter_and_save_dataframe(full_df, csv_filename="opik_determinism_data_temp_04.csv", temperature_filter=0.4)

else:
    print("No data fetched from Opik to create any CSV files.")

Fetching traces from project: LLMmark_determinism...
Processing trace 1/240: run_024_qwen3:4b (0197ad5f-3790-7b31-bc21-2c8b7dacf46e)
Processing trace 2/240: run_024_qwen3:1.7b (0197ad53-5121-7526-994e-38e9837f24f4)
Processing trace 3/240: run_024_qwen3:0.6b (0197ad49-38d9-7c0f-adaf-61faddb79b82)
Processing trace 4/240: run_024_tinyllama:1.1b (0197ad45-8838-7a45-a637-c200af953229)
Processing trace 5/240: run_024_smollm2:1.7b (0197ad41-62a9-71c5-a2ca-734c3a8caa0f)
Processing trace 6/240: run_024_moondream:1.8b (0197ad41-5941-7945-8b7b-414d2b87fd21)
Processing trace 7/240: run_024_llama3.2:1b (0197ad3f-b0ad-75af-b838-347ddaec3d84)
Processing trace 8/240: run_024_gemma3:4b (0197ad39-5e59-78cd-a03f-d90595dd9047)
Processing trace 9/240: run_024_gemma3:1b (0197ad35-a5f0-7605-9167-aa447c155de1)
Processing trace 10/240: run_024_deepseek-r1:1.5b (0197ad27-3834-7027-88a3-4dea2d3aacde)
Processing trace 11/240: run_023_qwen3:4b (0197ad15-8c3b-761c-bb22-ee65aee74ff7)
Processing trace 12/240: run_023

In [None]:

# TEMPERATURE = 0.0
csv_filename="opik_determinism_data_temp_00.csv"
temp_00_df = get_dataframe_from_csv(csv_filename=csv_filename)
# Filter by question_type
# temp_00_mc_df = temp_00_df[temp_00_df['question_type'] == 'open_answer'].copy()
temp_00_mc_df = temp_00_df[temp_00_df['question_type'] == 'multiple_choice'].copy()


#Extract the answer letter from multiple choice answers
temp_00_mc_df['answer'] = temp_00_mc_df['span_output_answer'].apply(extract_answer_letter_mc)

# Rename span_correct_answer to correct_answer
temp_00_mc_df.rename(columns={'span_correct_answer': 'correct_answer'}, inplace=True)
# Drop all columns except run_name, question_file, answer, span_correct_answer, promtpting_tech
temp_00_mc_df = temp_00_mc_df[['run_name', 'model_display_name', 'question_file', 'answer', 'correct_answer', 'prompting_tech']]   

# Order by question_file
temp_00_mc_df = temp_00_mc_df.sort_values(by=['question_file', 'answer']).reset_index(drop=True)

temp_00_mc_df.head(5).style.set_table_styles(
    [{'selector': 'th', 'props': [('background-color', '#f2f2f2'), ('color', 'black')]}]
).set_properties(**{'text-align': 'center'})



DataFrame loaded from ../../../data/determinism/opik_determinism_data_temp_00.csv with shape: (8000, 19)
Error extracting answer letter: expected string or bytes-like object, got 'float'
Error extracting answer letter: expected string or bytes-like object, got 'float'
Error extracting answer letter: expected string or bytes-like object, got 'float'
Error extracting answer letter: expected string or bytes-like object, got 'float'
Error extracting answer letter: expected string or bytes-like object, got 'float'
Error extracting answer letter: expected string or bytes-like object, got 'float'
Error extracting answer letter: expected string or bytes-like object, got 'float'
Error extracting answer letter: expected string or bytes-like object, got 'float'
Error extracting answer letter: expected string or bytes-like object, got 'float'
Error extracting answer letter: expected string or bytes-like object, got 'float'
Error extracting answer letter: expected string or bytes-like object, got '

Unnamed: 0,run_name,model_display_name,question_file,answer,correct_answer,prompting_tech
0,run_022_qwen3:4b,Qwen3:4b,question_01.txt,b,b,R4
1,run_022_qwen3:4b,Qwen3:4b,question_01.txt,b,b,R4
2,run_022_qwen3:4b,Qwen3:4b,question_01.txt,b,b,R4
3,run_022_qwen3:4b,Qwen3:4b,question_01.txt,b,b,R4
4,run_022_qwen3:4b,Qwen3:4b,question_01.txt,b,b,R4


In [None]:
# Get unique model_display_name values
unique_models = temp_00_mc_df['model_display_name'].unique()

print("MODELS: ", unique_models)


MODELS:  ['Qwen3:4b' 'Qwen3:1.7b' 'Qwen3:0.6b' 'TinyLlama:1.1b' 'smollm2:1.7b'
 'Moondream 2' 'Llama3.2:1b' 'Gemma3:4b' 'Gemma3:1b' 'DeepSeek R1:1.5b']


In [None]:
create_determinism_table(temp_00_mc_df, filename="determinism_table_temp_00.csv")


Columns in                       run_name model_display_name    question_file answer  \
0             run_022_qwen3:4b           Qwen3:4b  question_01.txt      b   
1             run_022_qwen3:4b           Qwen3:4b  question_01.txt      b   
2             run_022_qwen3:4b           Qwen3:4b  question_01.txt      b   
3             run_022_qwen3:4b           Qwen3:4b  question_01.txt      b   
4             run_022_qwen3:4b           Qwen3:4b  question_01.txt      b   
...                        ...                ...              ...    ...   
3995  run_002_deepseek-r1:1.5b   DeepSeek R1:1.5b  question_10.txt   None   
3996  run_002_deepseek-r1:1.5b   DeepSeek R1:1.5b  question_10.txt   None   
3997  run_002_deepseek-r1:1.5b   DeepSeek R1:1.5b  question_10.txt   None   
3998  run_002_deepseek-r1:1.5b   DeepSeek R1:1.5b  question_10.txt   None   
3999  run_002_deepseek-r1:1.5b   DeepSeek R1:1.5b  question_10.txt   None   

     correct_answer prompting_tech  
0                 b       