In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import json
import os
import sys
import pandas as pd
import ast
import re

In [3]:
# Add the parent of the current working directory to the import search path
# (presumably so the project-local modules imported in the next cell resolve - confirm).
parent_dir = os.path.join(os.getcwd(), os.pardir)
sys.path.append(parent_dir)

In [4]:
from evaluation_metrics import get_user_content, get_assistant_content, evaluate_model_output_entities, evaluate_model_output_start_positions, evaluate_model_output_end_positions
from models.fine_tuning_ner import apply_ner_to_text_fine_tuned
from models.prompting_ner import apply_ner_to_text_openai, apply_ner_to_text_anthropic
from models.gliner_ner import apply_ner_to_text_gliner

Selected enviroment: d:\Workspace\ner_alternatives\src\lib\evaluation\..\utils\..\..\..\.env
Selected enviroment: d:\Workspace\ner_alternatives\src\lib\evaluation\..\utils\..\..\..\.env


  from .autonotebook import tqdm as notebook_tqdm
Fetching 4 files: 100%|██████████| 4/4 [00:00<?, ?it/s]


## Obtain results of the NER alternatives:

In [225]:
# Path to the JSONL file that the evaluation loop reads line by line
file_path = os.path.join(
    os.getcwd(), os.pardir, os.pardir, os.pardir,
    'resources', 'evaluation', 'validation_in_production_data.jsonl',
)

In [235]:
# Placeholder result: every entity category is present and holds no entities.
# The evaluation loop records it when processing a sample with a model raises.
empty_result = {
    category: []
    for category in (
        'adverse_drug_reactions',
        'diseases_or_medical_conditions',
        'medications',
        'clinical_findings',
        'symptoms_experienced_by_patients',
    )
}

In [23]:

# NER alternatives to evaluate. Each entry is a tuple (function, model_name, model_id):
#   - function:   callable invoked by the evaluation loop as function(text, model_id)
#   - model_name: label stored in the 'model_name' column of the results
#   - model_id:   identifier passed to the function (None for the GLiNER entry)
ner_alternatives = [
    (apply_ner_to_text_fine_tuned, 'fine_tuned_gpt_4o', 'ft:gpt-4o-2024-08-06:personal:150-sample-gpt-4o:A34xm7HX'),
    (apply_ner_to_text_fine_tuned, 'fine_tuned_gpt_4o_v2', 'ft:gpt-4o-2024-08-06:personal:1000-sample-gpt-4o:A3S2iooJ'),
    (apply_ner_to_text_fine_tuned, 'fine_tuned_gpt_4o_mini', 'ft:gpt-4o-mini-2024-07-18:personal:100-sample-gpt-4o-mini:A34sYctP'),
    (apply_ner_to_text_fine_tuned, 'fine_tuned_gpt_4o_mini_v2', 'ft:gpt-4o-mini-2024-07-18:personal:1000-sample-gpt-4o-mini:A3sZemiq'),
    (apply_ner_to_text_openai, 'gpt_4o', 'gpt-4o-2024-08-06'),
    (apply_ner_to_text_openai, 'gpt_4o_mini', 'gpt-4o-mini'),
    (apply_ner_to_text_anthropic, 'sonnet_35', 'claude-3-5-sonnet-20240620'),
    (apply_ner_to_text_gliner, 'gliner', None)
]

In [25]:
# Run every NER alternative on every sample of the JSONL file and collect one result row per
# (sample, model) pair.
#
# Rows are accumulated as plain dicts and turned into a DataFrame once at the end: growing a
# DataFrame with pd.concat inside the loop copies all previous rows on every append.
result_columns = ['sample_id', 'model_name', 'expected_output', 'output']
records = []

# Open the file and read it line by line
with open(file_path, 'r', encoding='utf-8') as file:
    for idx, line in enumerate(file):
        try:
            data = json.loads(line.strip())
            input_text = get_user_content(data)
            expected_output = get_assistant_content(data)
            expected_output_dict = ast.literal_eval(expected_output.replace('\"', '\''))
            print(f"Expected output sample {idx}: {expected_output_dict}")
        except Exception as e:
            # The sample itself could not be loaded or parsed: report it and go on with the next line.
            print(f"Error sample {idx}: {e}")
            continue

        for function, model_name, model_id in ner_alternatives:
            # Reset per model so the error handler below never reports the previous model's
            # output (or hits an unbound name when the very first call fails).
            output_dict = None
            try:
                # Apply the alternative
                output_dict = function(input_text, model_id)
                if model_name in ['gpt_4o', 'gpt_4o_mini']: # 'output_dict' is not a dict yet, is a string and should be converted
                    output_dict = ast.literal_eval(output_dict.replace('\"', '\''))

                print(f"Output sample {idx} model {model_name}: {output_dict}")

                # Add results to the collected rows
                records.append({
                    'model_name': model_name,
                    'sample_id': idx,
                    'expected_output': expected_output_dict,
                    'output': output_dict,
                })
            except Exception as e:
                print(f"Error sample {idx}: {e}:")
                print(f"Expected output sample {idx}: {expected_output}")
                print(f"Output sample {idx}: {output_dict}")

                # Record the failure as an empty prediction but keep the real expected entities:
                # an empty expectation paired with an empty output is scored as a perfect match
                # by the precision/recall/F1 cell, which would reward failed calls.
                records.append({
                    'model_name': model_name,
                    'sample_id': idx,
                    'expected_output': expected_output_dict,
                    # Fresh lists per row so rows never share one mutable placeholder object.
                    'output': {category: [] for category in empty_result},
                })

df = pd.DataFrame(records, columns=result_columns)


Expected output sample 0: {'adverse_drug_reactions': [('Extremely dry mouth', 0, 19), ('severe joint aches', 21, 39), ('muscle cramps in lower legs', 41, 68), ('difficulty breathing', 112, 132), ('tired', 134, 139), ('memory loss', 141, 152), ('mild depression', 154, 169), ('diminished eyesight', 171, 190), ('forgetfulness', 360, 373), ('memory failure', 396, 410), ('swelling', 525, 533), ('muscle aches', 559, 571), ('joint pain', 576, 586), ('memory loss', 600, 611)], 'diseases_or_medical_conditions': [], 'medications': [('Lipitor', 460, 467)], 'clinical_findings': [], 'symptoms_experienced_by_patients': []}
Output sample 0 model fine_tuned_gpt_4o: {'adverse_drug_reactions': [('dry mouth', 9, 18), ('severe joint aches', 20, 38), ('muscle cramps in lower legs', 40, 68), ('sever swelling of feet, hands and ankles', 70, 111), ('difficulty breathing', 113, 132), ('tired', 134, 139), ('memory loss', 141, 152), ('mild depression', 154, 170), ('diminished eyesight', 172, 191), ('stamina is b



Output sample 60 model gliner: {'adverse_drug_reactions': [], 'diseases_or_medical_conditions': [], 'medications': [('statin drugs', 89, 101), ('statins', 371, 378), ('statins', 789, 796), ('statins', 1548, 1555), ('statins', 1607, 1614)], 'clinical_findings': [], 'symptoms_experienced_by_patients': [('muscle cramps', 304, 317), ('bad muscle cramps', 1561, 1578)]}
Expected output sample 61: {'adverse_drug_reactions': [('Calf muscle strains', 0, 19), ('muscle strain', 158, 171), ('severe muscle pull/strain', 226, 251), ('calf pain', 370, 379), ('pulled/strained calf', 557, 577), ('muscle pull', 649, 660)], 'diseases_or_medical_conditions': [], 'medications': [('lipitor', 358, 365), ('Lipitor', 418, 425), ('Lipitor', 598, 605)], 'clinical_findings': [], 'symptoms_experienced_by_patients': []}
Output sample 61 model fine_tuned_gpt_4o: {'adverse_drug_reactions': [('muscle strain', 201, 214), ('severe muscle pull/strain', 264, 289), ('calf pain', 423, 432), ('pulled/strained calf', 608, 628

In [30]:
# Make sure the results directory exists. exist_ok=True turns this into a no-op when the directory
# is already there and avoids the check-then-create race of testing os.path.exists first.
results_dir = os.path.join(os.getcwd(), '..', '..', '..', 'results')
os.makedirs(results_dir, exist_ok=True)

# Write the raw per-sample results; a later cell reloads this file with pd.read_excel.
df.to_excel(os.path.join(results_dir, 'ner_alternatives_results.xlsx'), index=False)

In [38]:
# Display the collected raw results (one row per sample and model)
df

Unnamed: 0,sample_id,model_name,expected_output,output
0,0,fine_tuned_gpt_4o,{'adverse_drug_reactions': [('Extremely dry mo...,"{'adverse_drug_reactions': [('dry mouth', 9, 1..."
1,0,fine_tuned_gpt_4o_v2,{'adverse_drug_reactions': [('Extremely dry mo...,{'adverse_drug_reactions': [('Extremely dry mo...
2,0,fine_tuned_gpt_4o_mini,{'adverse_drug_reactions': [('Extremely dry mo...,"{'adverse_drug_reactions': [('dry mouth', 10, ..."
3,0,gpt_4o,{'adverse_drug_reactions': [('Extremely dry mo...,{'adverse_drug_reactions': [{'name': 'Extremel...
4,0,sonnet_35,{'adverse_drug_reactions': [('Extremely dry mo...,{'adverse_drug_reactions': [{'name': 'Extremel...
...,...,...,...,...
571,99,fine_tuned_gpt_4o_v2,{'adverse_drug_reactions': [('short term memor...,{'adverse_drug_reactions': [('short term memor...
572,99,fine_tuned_gpt_4o_mini,{'adverse_drug_reactions': [('short term memor...,{'adverse_drug_reactions': [('short term memor...
573,99,gpt_4o,{'adverse_drug_reactions': [('short term memor...,{'adverse_drug_reactions': [{'name': 'short te...
574,99,sonnet_35,{'adverse_drug_reactions': [('short term memor...,{'adverse_drug_reactions': [{'name': 'short te...


In [5]:
def clean_json_string(json_string):
    """Remove single quotes that sit directly between two word characters.

    For example "can't" becomes "cant". Quotes next to any non-word character
    (such as the ones delimiting a string literal) are left untouched.
    """
    quote_inside_word = r"(?<=\w)'(?=\w)"
    return re.sub(quote_inside_word, "", json_string)

In [6]:
# Function that validates the values of the evaluation counters of one row
def validate_conditions(row):
    """Check that the counters of ``row`` are ordered consistently.

    Three non-increasing chains must hold:
      * expected >= correctly detected >= correctly detected and classified
      * correctly detected >= start-position matches within 10 >= within 5 >= within 2 >= exact
      * the same chain for the end-position matches

    Returns the truth value of all three chains combined with ``and``.
    """
    entity_counts_ok = (
        row['expected_entities']
        >= row['correctly_detected_entities']
        >= row['correctly_detected_and_classified_entities']
    )
    start_positions_ok = (
        row['correctly_detected_entities']
        >= row['within_10_matches_start_pos']
        >= row['within_5_matches_start_pos']
        >= row['within_2_matches_start_pos']
        >= row['exact_matches_start_pos']
    )
    end_positions_ok = (
        row['correctly_detected_entities']
        >= row['within_10_matches_end_pos']
        >= row['within_5_matches_end_pos']
        >= row['within_2_matches_end_pos']
        >= row['exact_matches_end_pos']
    )

    return entity_counts_ok and start_positions_ok and end_positions_ok

In [8]:
# Preprocessing function to extract the entities from the dictionary-like structures in the expected_output and output columns
def extract_entities(entity_data):
    """
    Extracts entity names from the structured dictionary data.

    Expected input is in the format {'entity_type': [('entity_name', start_index, end_index), ...]},
    either as a dictionary or as the string representation of one. Entities given as dictionaries
    with a 'name' key are accepted as well; any other entity shape is ignored.

    Returns the set of entity names across all entity types. An empty set is returned when the
    string cannot be parsed or when the (parsed) value is not a dictionary.
    """
    if isinstance(entity_data, str):
        # Parse with ast.literal_eval rather than eval: the strings come from a file and must not
        # be able to execute arbitrary code. Only plain Python literals are accepted.
        try:
            entity_data = ast.literal_eval(entity_data)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return set()  # Return empty set if parsing fails

    if not isinstance(entity_data, dict):
        # e.g. NaN read from an empty spreadsheet cell, or a literal that is not a dictionary
        return set()

    entities = set()
    for entity_list in entity_data.values():
        # 'or ()' tolerates an entity type whose value is None instead of a list
        for entity in entity_list or ():
            # Check if the entity is a tuple (with name and positions) or a dict (with name)
            if isinstance(entity, tuple) and len(entity) > 0:
                entities.add(entity[0])
            elif isinstance(entity, dict) and 'name' in entity:
                entities.add(entity['name'])

    return entities


In [15]:
# Function to calculate TP, FP, FN, and TN for each sample
def calculate_tp_fp_fn(row):
    """Split the entities of one row into TP, FP, FN and TN sets.

    ``row['expected_entities']`` and ``row['predicted_entities']`` are combined with set
    operations. Returns the tuple ``(tp, fp, fn, tn)`` where

      * tp: entities present in both sets,
      * fp: entities predicted but not expected,
      * fn: entities expected but not predicted,
      * tn: ``{"No Entities"}`` when both input sets are empty, otherwise an empty set
        (true negatives are not typically used in NER; the marker is kept for completeness).
    """
    expected = row['expected_entities']
    predicted = row['predicted_entities']

    matched = expected & predicted
    spurious = predicted - expected
    missed = expected - predicted

    nothing_to_find = len(expected) == 0 and len(predicted) == 0
    true_negatives = {"No Entities"} if nothing_to_find else set()

    return matched, spurious, missed, true_negatives

In [9]:
# Load the raw results from the Excel file in the results directory
raw_results_path = os.path.join(os.getcwd(), '..', '..', '..', 'results', 'ner_alternatives_results.xlsx')
df_processed = pd.read_excel(raw_results_path)

In [10]:
# Build the entity-name sets for both the expected_output and output columns
for source_column, target_column in (
    ('expected_output', 'expected_entities'),
    ('output', 'predicted_entities'),
):
    df_processed[target_column] = df_processed[source_column].apply(extract_entities)

In [16]:
# Compute the (TP, FP, FN, TN) tuple of every row, then unzip the tuples into one column each
confusion_sets = df_processed.apply(calculate_tp_fp_fn, axis=1)
df_processed['TP'], df_processed['FP'], df_processed['FN'], df_processed['TN'] = zip(*confusion_sets)

In [18]:
# Per-row precision, recall and F1 scores, collected in row order
precision_list, recall_list, f1_list = [], [], []

# Score each row by comparing its set of expected entity names with its set of predicted ones
# (plain set arithmetic; no external metrics library is involved)
for expected_set, predicted_set in zip(df_processed['expected_entities'], df_processed['predicted_entities']):
    hits = len(expected_set & predicted_set)
    spurious = len(predicted_set - expected_set)
    missed = len(expected_set - predicted_set)

    if len(expected_set) == 0 and len(predicted_set) == 0:
        # Nothing expected and nothing predicted: perfect score
        precision = recall = f1 = 1.0
    elif len(predicted_set) == 0:
        # There were expected entities but nothing was predicted: zero score
        precision = recall = f1 = 0.0
    else:
        # Each ratio falls back to 0.0 when its denominator is zero
        precision = hits / (hits + spurious) if (hits + spurious) > 0 else 0.0
        recall = hits / (hits + missed) if (hits + missed) > 0 else 0.0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

    precision_list.append(precision)
    recall_list.append(recall)
    f1_list.append(f1)

# Store the per-sample metrics as new columns
df_processed['precision'] = precision_list
df_processed['recall'] = recall_list
df_processed['f1_score'] = f1_list

In [21]:
# Convert the values of the 'expected_output' and 'output' columns from strings to dictionaries
# (values that are not strings are left as they are). The 'output' strings are passed through
# clean_json_string before being parsed.
df_processed['expected_output'] = df_processed['expected_output'].apply(
    lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
)
df_processed['output'] = df_processed['output'].apply(
    lambda cell: ast.literal_eval(clean_json_string(cell)) if isinstance(cell, str) else cell
)

In [22]:
# Apply evaluate_model_output_entities to every row and store its results in new columns.
# NOTE: 'expected_entities' already exists at this point (it holds the sets built with
# extract_entities) and is overwritten here with the first value returned by the evaluation.
entity_metric_columns = [
    'expected_entities',
    'correctly_detected_entities',
    'correctly_detected_and_classified_entities',
]
entity_metrics = df_processed.apply(
    lambda row: pd.Series(evaluate_model_output_entities(row['expected_output'], row['output'])),
    axis=1,
)
df_processed[entity_metric_columns] = entity_metrics

In [23]:
# Apply evaluate_model_output_start_positions to every row and store its results in new columns
start_position_columns = [
    'exact_matches_start_pos',
    'within_2_matches_start_pos',
    'within_5_matches_start_pos',
    'within_10_matches_start_pos',
]
start_position_metrics = df_processed.apply(
    lambda row: pd.Series(evaluate_model_output_start_positions(row['expected_output'], row['output'])),
    axis=1,
)
df_processed[start_position_columns] = start_position_metrics

In [24]:
# Apply evaluate_model_output_end_positions to every row and store its results in new columns
end_position_columns = [
    'exact_matches_end_pos',
    'within_2_matches_end_pos',
    'within_5_matches_end_pos',
    'within_10_matches_end_pos',
]
end_position_metrics = df_processed.apply(
    lambda row: pd.Series(evaluate_model_output_end_positions(row['expected_output'], row['output'])),
    axis=1,
)
df_processed[end_position_columns] = end_position_metrics

In [25]:
# Flag every row with the result of validate_conditions (whether its counters are ordered consistently)
df_processed['valid_values'] = df_processed.apply(validate_conditions, axis=1)

In [26]:
# Show the rows that failed validation; if nothing is displayed, all the values are valid
df_processed.loc[df_processed['valid_values'].eq(False)]

Unnamed: 0,sample_id,model_name,expected_output,output,expected_entities,predicted_entities,TP,FP,FN,TN,...,correctly_detected_and_classified_entities,exact_matches_start_pos,within_2_matches_start_pos,within_5_matches_start_pos,within_10_matches_start_pos,exact_matches_end_pos,within_2_matches_end_pos,within_5_matches_end_pos,within_10_matches_end_pos,valid_values


In [27]:
# Display the processed results with all the computed metric columns
df_processed

Unnamed: 0,sample_id,model_name,expected_output,output,expected_entities,predicted_entities,TP,FP,FN,TN,...,correctly_detected_and_classified_entities,exact_matches_start_pos,within_2_matches_start_pos,within_5_matches_start_pos,within_10_matches_start_pos,exact_matches_end_pos,within_2_matches_end_pos,within_5_matches_end_pos,within_10_matches_end_pos,valid_values
0,0,gpt_4o_mini,{'adverse_drug_reactions': [('Extremely dry mo...,{'adverse_drug_reactions': [{'name': 'dry mout...,15,"{swelling of feet, diminished eyesight, swelli...","{muscle cramps in lower legs, difficulty breat...","{swelling of feet, muscle cramps, dry mouth, j...","{forgetfulness, severe joint aches, swelling, ...",{},...,5,1,2,3,6,0,2,3,6,True
1,1,gpt_4o_mini,{'adverse_drug_reactions': [('Nagging muscle p...,{'adverse_drug_reactions': [{'name': 'adverse ...,4,"{20mg/day, persistent fatigue, Nagging muscle ...","{persistent fatigue, moderate insomnia}","{Nagging muscle pain, 20mg/day, increasingly u...",{Nagging muscle pain between and just below my...,{},...,0,2,2,2,2,2,2,2,2,True
2,2,gpt_4o_mini,{'adverse_drug_reactions': [('severe weight lo...,{'adverse_drug_reactions': [{'name': 'severe w...,9,"{loss of strength, arthritis, Lipitor, muscle ...","{loss of strength, arthritis, Lipitor, muscle ...",{},{},{},...,5,0,1,2,3,0,0,2,3,True
3,3,gpt_4o_mini,{'adverse_drug_reactions': [('Extreme lower ba...,"{'adverse_drug_reactions': [{'name': 'pain', '...",7,"{overexerted, over stressed, economic and phys...","{pain in neck and shoulders, over stressed, re...","{economic and physical damage, overexerted, re...",{Extreme lower back pain},{},...,2,0,1,1,1,0,0,1,1,True
4,4,gpt_4o_mini,"{'adverse_drug_reactions': [('legs ached', 46,...","{'adverse_drug_reactions': [], 'diseases_or_me...",5,"{headaches, calf muscles painful to touch, ext...","{headaches, extreme fatigue, legs ached, calf ...",{},{},{},...,0,0,0,0,0,0,0,0,0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
763,95,fine_tuned_gpt_4o_mini_v2,"{'adverse_drug_reactions': [('foot pain', 29, ...","{'adverse_drug_reactions': [('foot pain', 28, ...",8,"{leg cramps, foot pain, hammer toe, severe bur...","{leg cramps, foot pain, hammer toe, severe bur...",{},{},{},...,6,2,3,5,7,2,3,5,7,True
764,96,fine_tuned_gpt_4o_mini_v2,"{'adverse_drug_reactions': [], 'diseases_or_me...",{'adverse_drug_reactions': [('elevated liver e...,23,"{nausea, aches, generalized skin discoloration...","{nausea, Malaise, generalized skin discolorati...","{aches, muscle pain, weakness, kidney failure}",{acute kidney failure},{},...,5,0,0,6,7,0,0,6,7,True
765,97,fine_tuned_gpt_4o_mini_v2,"{'adverse_drug_reactions': [('blurred vision',...","{'adverse_drug_reactions': [('Fuzzy thinking',...",4,"{Fuzzy thinking, blurred vision, Red Yeast Rice}","{Fuzzy thinking, blurred vision, Red Yeast Rice}",{},{pain},{},...,3,2,2,2,2,2,2,2,2,True
766,98,fine_tuned_gpt_4o_mini_v2,"{'adverse_drug_reactions': [('stomach pain', 0...","{'adverse_drug_reactions': [('stomach pain', 0...",10,"{stomach pain, Lipitor, dizzy spells, fatigue,...","{stomach pain, Lipitor, dizzy spells, fatigue,...",{},{},{},...,10,6,6,8,8,6,6,8,8,True


In [28]:
# Make sure the results directory exists. exist_ok=True turns this into a no-op when the directory
# is already there and avoids the check-then-create race of testing os.path.exists first.
results_dir = os.path.join(os.getcwd(), '..', '..', '..', 'results')
os.makedirs(results_dir, exist_ok=True)

# Write the per-sample results together with their computed metrics
df_processed.to_excel(os.path.join(results_dir, 'ner_alternatives_results_processed.xlsx'), index=False)

In [30]:
def _percentage_of_total(numerator_total, denominator_total):
    """Return numerator/denominator as a percentage, or 0 when the denominator is not positive."""
    if denominator_total > 0:
        return (numerator_total / denominator_total) * 100
    return 0


# Function that computes the aggregate statistics for a set of rows (the caller passes one model's rows)
def calculate_statistics(df_processed):
    """Aggregate the per-sample evaluation columns of ``df_processed`` into summary statistics.

    Parameters
    ----------
    df_processed : pandas.DataFrame
        Rows to aggregate. Must contain the columns 'expected_entities',
        'correctly_detected_entities', 'correctly_detected_and_classified_entities', the
        '{exact,within_2,within_5,within_10}_matches_{start,end}_pos' counters and
        'precision', 'recall', 'f1_score'.

    Returns
    -------
    dict
        In this key order:
          * '%_correctly_detected_entities' and '%_correctly_detected_and_classified_entities':
            column totals as a percentage of the total of 'expected_entities';
          * '%_<counter>' for the four start-position and then the four end-position counters:
            column totals as a percentage of the total of 'correctly_detected_entities';
          * 'precision', 'recall', 'f1_score': column means.
        Any percentage whose denominator total is zero is reported as 0 instead of dividing by
        zero (which would produce NaN or inf).
    """
    stats = {}

    expected_total = df_processed['expected_entities'].sum()
    detected_total = df_processed['correctly_detected_entities'].sum()

    # Detection statistics, relative to the number of expected entities
    for column in ('correctly_detected_entities', 'correctly_detected_and_classified_entities'):
        stats[f'%_{column}'] = _percentage_of_total(df_processed[column].sum(), expected_total)

    # Position statistics (start positions first, then end positions), relative to the number
    # of correctly detected entities
    for position in ('start_pos', 'end_pos'):
        for tolerance in ('exact', 'within_2', 'within_5', 'within_10'):
            column = f'{tolerance}_matches_{position}'
            stats[f'%_{column}'] = _percentage_of_total(df_processed[column].sum(), detected_total)

    # Averages of the per-sample scores
    for column in ('precision', 'recall', 'f1_score'):
        stats[column] = df_processed[column].mean()

    return stats

In [31]:
# Compute the statistics of every model (in order of first appearance) as one record per model
model_names = df_processed['model_name'].unique()
statistics_list = [
    {**calculate_statistics(df_processed[df_processed['model_name'] == model]), 'model_name': model}
    for model in model_names
]

# Turn the records into a new DataFrame
statistics_df_processed = pd.DataFrame(statistics_list)

# Reorder the columns so that 'model_name' comes first
other_columns = [col for col in statistics_df_processed.columns if col != 'model_name']
statistics_df_processed = statistics_df_processed[['model_name'] + other_columns]


In [33]:
# Reorder the columns: 'model_name' first, then 'precision', 'recall' and 'f1_score', then the rest
leading_columns = ['model_name', 'precision', 'recall', 'f1_score']
remaining_columns = [col for col in statistics_df_processed.columns if col not in leading_columns]
statistics_df_processed = statistics_df_processed[leading_columns + remaining_columns]

In [34]:
# Display the per-model statistics table
statistics_df_processed

Unnamed: 0,model_name,precision,recall,f1_score,%_correctly_detected_entities,%_correctly_detected_and_classified_entities,%_exact_matches_start_pos,%_within_2_matches_start_pos,%_within_5_matches_start_pos,%_within_10_matches_start_pos,%_exact_matches_end_pos,%_within_2_matches_end_pos,%_within_5_matches_end_pos,%_within_10_matches_end_pos
0,gpt_4o_mini,0.515365,0.67071,0.568298,70.962733,41.925466,10.940919,18.161926,22.538293,28.446389,4.814004,15.536105,21.444201,28.008753
1,fine_tuned_gpt_4o,0.680562,0.697716,0.681654,73.757764,66.770186,20.0,38.315789,49.473684,61.684211,17.263158,37.684211,49.894737,61.473684
2,fine_tuned_gpt_4o_v2,0.833682,0.821084,0.819199,82.298137,75.931677,25.09434,48.301887,58.301887,74.528302,24.339623,48.301887,58.301887,74.528302
3,fine_tuned_gpt_4o_mini,0.619001,0.635479,0.611822,65.838509,57.919255,14.150943,28.066038,36.084906,42.924528,12.5,27.59434,35.849057,42.924528
4,gpt_4o,0.674322,0.723366,0.687187,72.981366,56.832298,21.914894,32.765957,40.638298,50.425532,18.723404,32.340426,40.425532,49.787234
5,sonnet_35,0.63757,0.770159,0.681906,77.639752,67.701863,18.6,39.8,55.8,65.8,15.6,39.6,55.6,65.8
6,gliner,0.653251,0.447642,0.506186,51.242236,20.341615,98.787879,98.787879,98.787879,99.090909,98.787879,98.787879,98.787879,99.090909
7,fine_tuned_gpt_4o_mini_v2,0.86663,0.864565,0.858531,84.782609,76.242236,20.32967,37.179487,53.113553,63.369963,19.59707,37.179487,53.113553,63.369963


In [35]:
# Make sure the results directory exists. exist_ok=True turns this into a no-op when the directory
# is already there and avoids the check-then-create race of testing os.path.exists first.
results_dir = os.path.join(os.getcwd(), '..', '..', '..', 'results')
os.makedirs(results_dir, exist_ok=True)

# Write the per-model statistics table
statistics_df_processed.to_excel(os.path.join(results_dir, 'ner_alternatives_statistics.xlsx'), index=False)