In [25]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [41]:
import json
import os
import sys
import pandas as pd
import ast
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
import logging
import threading

In [42]:
sys.path.append(os.path.join(os.getcwd(), '..'))

In [43]:
from evaluation_metrics import get_user_content, get_assistant_content, evaluate_model_output_entities, evaluate_model_output_start_positions, evaluate_model_output_end_positions
from models.fine_tuning_ner import apply_ner_to_text_fine_tuned
from models.prompting_ner import apply_ner_to_text_openai, apply_ner_to_text_anthropic
from models.gliner_ner import apply_ner_to_text_gliner

In [44]:
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(processName)s %(message)s',
                    filename='logs_output.log',  # Nombre del fichero
                    filemode='a')  # 'a' para añadir al fichero, 'w' para sobreescribir

## Obtain results of the NER alternatives:

In [45]:
# Ruta al archivo JSONL
file_path = os.path.join(os.getcwd(), '..', '..', '..', 'resources', 'evaluation', 'validation_in_production_data.jsonl')        

In [46]:
empty_result = {
    'adverse_drug_reactions': [], 
    'diseases_or_medical_conditions': [], 
    'medications': [], 
    'clinical_findings': [], 
    'symptoms_experienced_by_patients': []
}

In [47]:

ner_alternatives = [
    (apply_ner_to_text_fine_tuned, 'fine_tuned_gpt_4o', 'ft:gpt-4o-2024-08-06:personal:150-sample-gpt-4o:A34xm7HX'),
    (apply_ner_to_text_fine_tuned, 'fine_tuned_gpt_4o_v2', 'ft:gpt-4o-2024-08-06:personal:1000-sample-gpt-4o:A3S2iooJ'),
    (apply_ner_to_text_fine_tuned, 'fine_tuned_gpt_4o_mini', 'ft:gpt-4o-mini-2024-07-18:personal:100-sample-gpt-4o-mini:A34sYctP'),
    (apply_ner_to_text_fine_tuned, 'fine_tuned_gpt_4o_mini_v2', 'ft:gpt-4o-mini-2024-07-18:personal:1000-sample-gpt-4o-mini:A3sZemiq'),
    (apply_ner_to_text_openai, 'gpt_4o', 'gpt-4o-2024-08-06'),
    (apply_ner_to_text_openai, 'gpt_4o_mini', 'gpt-4o-mini'),
    (apply_ner_to_text_anthropic, 'sonnet_35', 'claude-3-5-sonnet-20240620'),
    (apply_ner_to_text_gliner, 'gliner', None)
]

In [48]:
MAX_RETRIES = 1
CONCURRENT_REQUESTS = 20

semaphore = threading.Semaphore(CONCURRENT_REQUESTS)

def process_task(args):
    try:
        with semaphore:
            idx, line, function, model_name, model_id = args
            
            data = json.loads(line.strip())
            input_text = get_user_content(data)
            expected_output = get_assistant_content(data)
            expected_output_dict = ast.literal_eval(expected_output.replace('\"', '\''))
            logging.info(f"Expected output sample {idx}: {expected_output_dict}")

            try:
                # Aplica la función alternativa
                output_dict = function(input_text, model_id)
                if model_name in ['gpt_4o', 'gpt_4o_mini']:
                    output_dict = ast.literal_eval(output_dict.replace('\"', '\''))
                logging.info(f"Output sample {idx} model {model_name}: {output_dict}")

                # Prepara la fila de resultado
                result = {
                    'model_name': model_name,
                    'sample_id': idx,
                    'expected_output': expected_output_dict,
                    'output': output_dict,
                }
                return result

            except Exception as e:
                logging.error(f"Error sample {idx}: {e}")
                logging.error(f"Expected output sample {idx}: {expected_output}")
                output_dict_str = output_dict if 'output_dict' in locals() else None
                logging.error(f"Output sample {idx}: {output_dict_str}")
                result = {
                    'model_name': model_name,
                    'sample_id': idx,
                    'expected_output': empty_result,
                    'output': empty_result,
                }
                return result
    except Exception as e:
        logging.error(f"Error processing sample {idx}: {e}")
        return None

In [49]:
df = pd.DataFrame(columns=['sample_id', 'model_name', 'expected_output', 'output'])
tasks = []

# Lee las líneas del archivo
with open(file_path, 'r', encoding='utf-8') as file:
    for idx, line in enumerate(file):
        for function, model_name, model_id in ner_alternatives:
            tasks.append((idx, line, function, model_name, model_id))

print(f"Processing {len(tasks)} tasks.")

# Utiliza ThreadPoolExecutor para procesar tareas en paralelo
with ThreadPoolExecutor(max_workers=10) as executor:
    future_to_task = {executor.submit(process_task, task): task for task in tasks}
    results = []
    for future in as_completed(future_to_task):
        try:
            res = future.result()
            if res is not None:
                results.append(res)
        except Exception as e:
            task = future_to_task[future]
            logging.error(f"Task {task} generated an exception: {e}")

# Crea un DataFrame con los resultados
if results:
    df = pd.DataFrame(results)
else:
    print("No results were processed successfully.")

Processing 800 tasks.


  res = future.result()


In [50]:
if not os.path.exists(os.path.join(os.getcwd(), '..', '..', '..', 'results')):
    os.makedirs(os.path.join(os.getcwd(), '..', '..', '..', 'results'))
    
df.to_excel(os.path.join(os.getcwd(), '..', '..', '..', 'results', 'ner_alternatives_results.xlsx'), index=False)

In [51]:
df

Unnamed: 0,model_name,sample_id,expected_output,output
0,fine_tuned_gpt_4o_mini,0,{'adverse_drug_reactions': [('Extremely dry mo...,"{'adverse_drug_reactions': [('dry mouth', 9, 1..."
1,fine_tuned_gpt_4o,1,{'adverse_drug_reactions': [('Nagging muscle p...,{'adverse_drug_reactions': [('Nagging muscle p...
2,fine_tuned_gpt_4o_mini_v2,0,{'adverse_drug_reactions': [('Extremely dry mo...,{'adverse_drug_reactions': [('Extremely dry mo...
3,fine_tuned_gpt_4o_v2,1,{'adverse_drug_reactions': [('Nagging muscle p...,{'adverse_drug_reactions': [('Nagging muscle p...
4,fine_tuned_gpt_4o_v2,0,{'adverse_drug_reactions': [('Extremely dry mo...,{'adverse_drug_reactions': [('Extremely dry mo...
...,...,...,...,...
763,sonnet_35,98,"{'adverse_drug_reactions': [('stomach pain', 0...",{'adverse_drug_reactions': [{'name': 'stomach ...
764,gpt_4o_mini,99,{'adverse_drug_reactions': [('short term memor...,"{'adverse_drug_reactions': [{'name': 'tired', ..."
765,gpt_4o,99,{'adverse_drug_reactions': [('short term memor...,{'adverse_drug_reactions': [{'name': 'short te...
766,sonnet_35,99,{'adverse_drug_reactions': [('short term memor...,{'adverse_drug_reactions': [{'name': 'short te...


In [52]:
def clean_json_string(json_string):
    # Reemplazar comillas simples en medio de palabras o números (por ejemplo, "can't" -> "cant")
    cleaned_string = re.sub(r"(?<=\w)'(?=\w)", "", json_string)
    return cleaned_string

In [53]:
# Función para validar el valor de las variables
def validate_conditions(row):
    condition_1 = row['expected_entities'] >= row['correctly_detected_entities'] >= row['correctly_detected_and_classified_entities']
    condition_2 = row['correctly_detected_entities'] >= row['within_10_matches_start_pos'] >= row['within_5_matches_start_pos'] >= row['within_2_matches_start_pos'] >= row['exact_matches_start_pos']
    condition_3 = row['correctly_detected_entities'] >= row['within_10_matches_end_pos'] >= row['within_5_matches_end_pos'] >= row['within_2_matches_end_pos'] >= row['exact_matches_end_pos']
    
    return condition_1 and condition_2 and condition_3

In [54]:
# Preprocessing function to extract the entities from the dictionary-like structures in the expected_output and output columns
def extract_entities(entity_data):
    """
    Extracts entity names from the structured dictionary data.
    Expected input is in the format {'entity_type': [('entity_name', start_index, end_index), ...]}
    """
    if isinstance(entity_data, str):
        # Convert string representation of dictionary to actual dictionary
        try:
            entity_data = eval(entity_data)
        except:
            return set()  # Return empty set if parsing fails

    entities = set()
    for entity_type, entity_list in entity_data.items():
        for entity in entity_list:
            # Check if the entity is a tuple (with name and positions) or a dict (with name)
            if isinstance(entity, tuple) and len(entity) > 0:
                entities.add(entity[0])
            elif isinstance(entity, dict) and 'name' in entity:
                entities.add(entity['name'])

    return entities


In [55]:
# Function to calculate TP, FP, FN, and TN for each sample
def calculate_tp_fp_fn(row):
    true_entities = row['expected_entities']
    predicted_entities = row['predicted_entities']
    
    tp = true_entities & predicted_entities  # True Positives: entities in both expected and predicted
    fp = predicted_entities - true_entities  # False Positives: entities predicted but not expected
    fn = true_entities - predicted_entities  # False Negatives: entities expected but not predicted
    tn = set()  # True Negatives are not typically used in NER, but we include an empty set for clarity
    
    # TN is theoretically the case where neither true nor predicted have entities, we'll represent it for completeness
    if len(true_entities) == 0 and len(predicted_entities) == 0:
        tn = {"No Entities"}
    
    return tp, fp, fn, tn

In [56]:
df_processed = pd.read_excel(os.path.join(os.getcwd(), '..', '..', '..', 'results', 'ner_alternatives_results.xlsx'))

In [57]:
# Apply the entity extraction to both the expected_output and output columns
df_processed['expected_entities'] = df_processed['expected_output'].apply(extract_entities)
df_processed['predicted_entities'] = df_processed['output'].apply(extract_entities)

In [58]:
# Apply the calculation for each row in the DataFrame
df_processed['TP'], df_processed['FP'], df_processed['FN'], df_processed['TN'] = zip(*df_processed.apply(calculate_tp_fp_fn, axis=1))

In [59]:
# Initialize lists to store precision, recall, and F1 scores for each row
precision_list = []
recall_list = []
f1_list = []

# Iterate through each row and calculate precision, recall, and F1-score
for _, row in df_processed.iterrows():
    true_entities = row['expected_entities']
    predicted_entities = row['predicted_entities']
    
    # Calculate precision, recall, and F1-score using sklearn function
    true_positive = len(true_entities & predicted_entities)
    false_positive = len(predicted_entities - true_entities)
    false_negative = len(true_entities - predicted_entities)
    
    # Handle cases where there are no true entities
    if len(true_entities) == 0 and len(predicted_entities) == 0:
        precision, recall, f1 = 1.0, 1.0, 1.0  # Perfect score if both are empty
    elif len(predicted_entities) == 0:
        precision, recall, f1 = 0.0, 0.0, 0.0  # Zero score if prediction is empty and there are true entities
    else:
        precision = true_positive / (true_positive + false_positive) if (true_positive + false_positive) > 0 else 0.0
        recall = true_positive / (true_positive + false_negative) if (true_positive + false_negative) > 0 else 0.0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
    
    precision_list.append(precision)
    recall_list.append(recall)
    f1_list.append(f1)

# Add these metrics to the dataframe for each sample
df_processed['precision'] = precision_list
df_processed['recall'] = recall_list
df_processed['f1_score'] = f1_list

In [60]:
# Convertir los valores de las columnas 'expected_output' y 'output' de cadenas a diccionarios (si no lo están ya)
df_processed['expected_output'] = df_processed['expected_output'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
df_processed['output'] = df_processed['output'].apply(lambda x: ast.literal_eval(clean_json_string(x)) if isinstance(x, str) else x)

In [61]:
# Aplicar evaluate_model_output_entities y almacenar los resultados en nuevas columnas
df_processed[['expected_entities', 'correctly_detected_entities', 'correctly_detected_and_classified_entities']] = df_processed.apply(
    lambda row: pd.Series(evaluate_model_output_entities(row['expected_output'], row['output'])),
    axis=1
)

In [62]:
# Aplicar evaluate_model_output_start_positions y almacenar los resultados en nuevas columnas
df_processed[['exact_matches_start_pos', 'within_2_matches_start_pos', 'within_5_matches_start_pos', 'within_10_matches_start_pos']] = df_processed.apply(
    lambda row: pd.Series(evaluate_model_output_start_positions(row['expected_output'], row['output'])),
    axis=1
)

In [63]:
# Aplicar evaluate_model_output_end_positions y almacenar los resultados en nuevas columnas
df_processed[['exact_matches_end_pos', 'within_2_matches_end_pos', 'within_5_matches_end_pos', 'within_10_matches_end_pos']] = df_processed.apply(
    lambda row: pd.Series(evaluate_model_output_end_positions(row['expected_output'], row['output'])),
    axis=1
)

In [64]:
df_processed['valid_values'] = df_processed.apply(validate_conditions, axis=1)

In [65]:
df_processed[df_processed['valid_values'] == False] # Si no se muestra nada, entonces todos los valores son válidos

Unnamed: 0,model_name,sample_id,expected_output,output,expected_entities,predicted_entities,TP,FP,FN,TN,...,correctly_detected_and_classified_entities,exact_matches_start_pos,within_2_matches_start_pos,within_5_matches_start_pos,within_10_matches_start_pos,exact_matches_end_pos,within_2_matches_end_pos,within_5_matches_end_pos,within_10_matches_end_pos,valid_values


In [68]:
df_processed

Unnamed: 0,model_name,sample_id,expected_output,output,expected_entities,predicted_entities,TP,FP,FN,TN,...,correctly_detected_and_classified_entities,exact_matches_start_pos,within_2_matches_start_pos,within_5_matches_start_pos,within_10_matches_start_pos,exact_matches_end_pos,within_2_matches_end_pos,within_5_matches_end_pos,within_10_matches_end_pos,valid_values
0,fine_tuned_gpt_4o_mini,0,{'adverse_drug_reactions': [('Extremely dry mo...,"{'adverse_drug_reactions': [('dry mouth', 9, 1...",15,"{diminished eyesight, muscle aches, memory los...","{muscle aches, memory loss, joint pain, diffic...","{breathing is easier, dry mouth, swelling of f...","{Extremely dry mouth, mild depression, swellin...",{},...,6,0,0,1,1,0,0,1,1,True
1,fine_tuned_gpt_4o,1,{'adverse_drug_reactions': [('Nagging muscle p...,{'adverse_drug_reactions': [('Nagging muscle p...,4,"{moderate insomnia, persistent fatigue, unable...","{moderate insomnia, persistent fatigue, unable...",{Nagging muscle pain},{Nagging muscle pain between and just below my...,{},...,3,0,0,1,2,0,0,1,2,True
2,fine_tuned_gpt_4o_mini_v2,0,{'adverse_drug_reactions': [('Extremely dry mo...,{'adverse_drug_reactions': [('Extremely dry mo...,15,"{Extremely dry mouth, forgetfulness, muscle ac...","{Extremely dry mouth, tired, muscle aches, mem...",{muscle cramps},{muscle cramps in lower legs},{},...,14,2,2,7,7,2,2,7,7,True
3,fine_tuned_gpt_4o_v2,1,{'adverse_drug_reactions': [('Nagging muscle p...,{'adverse_drug_reactions': [('Nagging muscle p...,4,"{persistent fatigue, moderate insomnia, Naggin...","{persistent fatigue, moderate insomnia, Naggin...",{unable to focus or stay},{unable to focus},{},...,3,3,3,3,3,3,3,3,3,True
4,fine_tuned_gpt_4o_v2,0,{'adverse_drug_reactions': [('Extremely dry mo...,{'adverse_drug_reactions': [('Extremely dry mo...,15,"{Extremely dry mouth, sever swelling of feet, ...","{Extremely dry mouth, tired, muscle aches, mem...","{swelling decreased, sever swelling of feet, h...",{swelling},{},...,14,3,8,8,8,3,8,8,8,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
763,sonnet_35,98,"{'adverse_drug_reactions': [('stomach pain', 0...",{'adverse_drug_reactions': [{'name': 'stomach ...,10,"{hairloss, dizzy spells, dry eyes, joint pain,...","{hairloss, dizzy spells, dry eyes, joint pain,...",{},{},{},...,10,6,7,7,7,6,7,7,7,True
764,gpt_4o_mini,99,{'adverse_drug_reactions': [('short term memor...,"{'adverse_drug_reactions': [{'name': 'tired', ...",10,"{Tricor, extreme fatigue, reflux, memory is ba...","{extreme fatigue, reflux, Tricor, fatigue, sho...","{memory is back, joint pain, killer drug, no f...",{},{},...,2,0,0,0,0,0,0,0,0,True
765,gpt_4o,99,{'adverse_drug_reactions': [('short term memor...,{'adverse_drug_reactions': [{'name': 'short te...,10,"{extreme fatigue, reflux, Tricor, joint pain, ...","{extreme fatigue, reflux, Tricor, short term m...","{joint pain, making me tired, knees still hurt}",{fatigue},{},...,8,1,1,3,4,1,1,3,4,True
766,sonnet_35,99,{'adverse_drug_reactions': [('short term memor...,{'adverse_drug_reactions': [{'name': 'short te...,10,"{extreme fatigue, reflux, Tricor, STATIN, join...","{extreme fatigue, reflux, Tricor, fatigue, sho...","{joint pain, STATIN, making me tired, knees st...",{},{},...,10,0,1,3,5,0,1,3,5,True


In [66]:
if not os.path.exists(os.path.join(os.getcwd(), '..', '..', '..', 'results')):
    os.makedirs(os.path.join(os.getcwd(), '..', '..', '..', 'results'))
    
df_processed.to_excel(os.path.join(os.getcwd(), '..', '..', '..', 'results', 'ner_alternatives_results_processed.xlsx'), index=False)

In [69]:
# Función para calcular las estadísticas para cada modelo
def calculate_statistics(df_processed):
    stats = {}
    
    # Calcular las estadísticas basadas en las columnas
    stats['%_correctly_detected_entities'] = (df_processed['correctly_detected_entities'].sum() / df_processed['expected_entities'].sum()) * 100
    stats['%_correctly_detected_and_classified_entities'] = (df_processed['correctly_detected_and_classified_entities'].sum() / df_processed['expected_entities'].sum()) * 100
    
    # Para las posiciones iniciales
    if df_processed['correctly_detected_entities'].sum() > 0:
        stats['%_exact_matches_start_pos'] = (df_processed['exact_matches_start_pos'].sum() / df_processed['correctly_detected_entities'].sum()) * 100
        stats['%_within_2_matches_start_pos'] = (df_processed['within_2_matches_start_pos'].sum() / df_processed['correctly_detected_entities'].sum()) * 100
        stats['%_within_5_matches_start_pos'] = (df_processed['within_5_matches_start_pos'].sum() / df_processed['correctly_detected_entities'].sum()) * 100
        stats['%_within_10_matches_start_pos'] = (df_processed['within_10_matches_start_pos'].sum() / df_processed['correctly_detected_entities'].sum()) * 100
    else:
        stats['%_exact_matches_start_pos'] = 0
        stats['%_within_2_matches_start_pos'] = 0
        stats['%_within_5_matches_start_pos'] = 0
        stats['%_within_10_matches_start_pos'] = 0

    # Para las posiciones finales
    if df_processed['correctly_detected_entities'].sum() > 0:
        stats['%_exact_matches_end_pos'] = (df_processed['exact_matches_end_pos'].sum() / df_processed['correctly_detected_entities'].sum()) * 100
        stats['%_within_2_matches_end_pos'] = (df_processed['within_2_matches_end_pos'].sum() / df_processed['correctly_detected_entities'].sum()) * 100
        stats['%_within_5_matches_end_pos'] = (df_processed['within_5_matches_end_pos'].sum() / df_processed['correctly_detected_entities'].sum()) * 100
        stats['%_within_10_matches_end_pos'] = (df_processed['within_10_matches_end_pos'].sum() / df_processed['correctly_detected_entities'].sum()) * 100
    else:
        stats['%_exact_matches_end_pos'] = 0
        stats['%_within_2_matches_end_pos'] = 0
        stats['%_within_5_matches_end_pos'] = 0
        stats['%_within_10_matches_end_pos'] = 0
        
    stats['precision'] = df_processed['precision'].mean()
    stats['recall'] = df_processed['recall'].mean()
    stats['f1_score'] = df_processed['f1_score'].mean()

    return stats

In [70]:
# Crear un DataFrame de estadísticas para cada modelo
model_names = df_processed['model_name'].unique()
statistics_list = []

for model in model_names:
    model_df_processed = df_processed[df_processed['model_name'] == model]
    stats = calculate_statistics(model_df_processed)
    stats['model_name'] = model
    statistics_list.append(stats)

# Convertir las estadísticas en un nuevo DataFrame
statistics_df_processed = pd.DataFrame(statistics_list)

# Reordenar las columnas con 'model_name' primero
statistics_df_processed = statistics_df_processed[['model_name'] + [col for col in statistics_df_processed.columns if col != 'model_name']]


In [71]:
# Reordenar las columnas con 'model_name' primero y 'precision', 'recall' y 'f1_score' seguidas y luego el resto
statistics_df_processed = statistics_df_processed[['model_name', 'precision', 'recall', 'f1_score'] + [col for col in statistics_df_processed.columns if col not in ['model_name', 'precision', 'recall', 'f1_score']]]

In [72]:
statistics_df_processed

Unnamed: 0,model_name,precision,recall,f1_score,%_correctly_detected_entities,%_correctly_detected_and_classified_entities,%_exact_matches_start_pos,%_within_2_matches_start_pos,%_within_5_matches_start_pos,%_within_10_matches_start_pos,%_exact_matches_end_pos,%_within_2_matches_end_pos,%_within_5_matches_end_pos,%_within_10_matches_end_pos
0,fine_tuned_gpt_4o_mini,0.650797,0.64552,0.636469,65.217391,56.521739,16.904762,28.095238,36.190476,45.952381,15.47619,27.619048,35.714286,45.952381
1,fine_tuned_gpt_4o,0.684381,0.698571,0.68089,72.670807,64.130435,19.871795,35.042735,50.854701,62.393162,17.735043,34.401709,50.854701,62.179487
2,fine_tuned_gpt_4o_mini_v2,0.856202,0.859985,0.853876,84.937888,75.776398,21.572212,34.552102,49.908592,61.42596,20.840951,34.369287,49.725777,61.243144
3,fine_tuned_gpt_4o_v2,0.810297,0.816349,0.804047,81.677019,75.621118,30.038023,45.81749,60.456274,69.011407,30.038023,45.81749,60.26616,69.011407
4,gpt_4o,0.761916,0.801688,0.768449,74.049217,45.637584,25.981873,39.577039,51.057402,59.214502,21.450151,37.76435,51.359517,59.214502
5,gliner,0.653251,0.447642,0.506186,51.242236,20.341615,98.787879,98.787879,98.787879,99.090909,98.787879,98.787879,98.787879,99.090909
6,gpt_4o_mini,0.523715,0.688957,0.573668,68.63354,39.596273,11.085973,18.099548,22.850679,28.280543,6.108597,16.742081,23.303167,27.149321
7,sonnet_35,0.631734,0.760698,0.673194,76.863354,67.236025,23.838384,38.989899,51.717172,62.222222,20.0,38.989899,51.717172,62.020202


In [73]:
if not os.path.exists(os.path.join(os.getcwd(), '..', '..', '..', 'results')):
    os.makedirs(os.path.join(os.getcwd(), '..', '..', '..', 'results'))
    
statistics_df_processed.to_excel(os.path.join(os.getcwd(), '..', '..', '..', 'results', 'ner_alternatives_statistics.xlsx'), index=False)