# Self Order Score

Algoritmo em quatro passos: (1) ideias vizinhas da mesma categoria são agrupadas -- a posição do grupo é a posição do seu primeiro elemento; (2) os grupos são ordenados conforme a posição do grupo no texto; (3) os grupos são agregados por categoria (cada grupo é agregado na primeira posição em que a sua categoria aparece) e contam-se as trocas necessárias para cada agregação; (4) a soma de todas as trocas é o índice `self order score`.

In [None]:
def order_keys(element):
  """Sort key: return the second field of *element* (its text position)."""
  position = element[1]
  return position

def self_order_groups(categories_order):
  """Group neighbouring same-category items and return the groups in text order.

  Args:
    categories_order: list of [category, position] items. It is sorted IN
      PLACE by position (second field). Only categories 1..8 are grouped.

  Returns:
    A list of groups [category, position of the first item of the group,
    number of items in the group], sorted by position.
  """
  # sort by text position (second element)
  categories_order.sort(key=order_keys) 

  # group by category (first element)
  # group = [category, position of the first group element, score]
  grouped = []
  for cat in range(1, 9):
    prev = -1    # index of the last item of the most recently completed position run
    prev_g = -1  # index of the most recent item seen for this category
    cat_g = None  # group currently being extended for this category
    for i in range(0, len(categories_order)):
      if categories_order[i][0] == cat:
        # Start a new group when no position run has been completed yet, or
        # when the last item of the previous run belongs to another category
        # AND this category was last seen strictly before that run.
        # Otherwise the item extends the current group.
        # NOTE(review): because `prev == -1` short-circuits, every item of
        # this category inside the very first position run opens its own
        # group -- confirm this is intended.
        if prev == -1 or (categories_order[prev][0] != cat and \
           (prev_g == -1 or categories_order[prev][1] > categories_order[prev_g][1])):
          cat_g = [cat, categories_order[i][1], 1]  # new category grouping
          grouped.append(cat_g)
        else:
          cat_g[2] += 1
        prev_g = i
      # item i closes a position run when it is the last item or the next
      # item sits at a different position
      if i+1 == len(categories_order) or categories_order[i+1][1] != categories_order[i][1]:
        prev = i

  # sort groups by position (second element)
  grouped.sort(key=order_keys)

  return grouped

In [None]:
def self_order_score(categories_order):
  """Return the self order score of a list of [category, position] items.

  The items are first turned into position-ordered groups by
  `self_order_groups` (which sorts `categories_order` in place). Gathering
  every group of a category next to the first group of that category costs
  one swap per additional group; the score is the total number of swaps.

  Args:
    categories_order: list of [category, position] items.

  Returns:
    int: total number of swaps (0 when every category forms a single group).
  """
  grouped = self_order_groups(categories_order)

  # One swap per group beyond the first of its category, i.e.
  # (#groups - #distinct categories).
  # The previous index-based loop removed the merged group from the list and
  # then still advanced the index, so the group that slid into the freed slot
  # was never examined; with adjacent same-category groups the score was
  # under-counted.
  distinct_categories = {group[0] for group in grouped}
  return len(grouped) - len(distinct_categories)

In [None]:
# Manual checks for self_order_groups: each input is a list of
# [category, position] items; the printed value is the list of
# [category, first position, item count] groups.
print(self_order_groups([[1, 10], [2, 20], [1, 20], [2, 30]]))
print(self_order_groups([[1, 10], [2, 20], [1, 25], [2, 30]]))
print(self_order_groups([[1, 10], [2, 20], [1, 20], [3, 20], [1, 30], [2, 30], [3, 30]]))
print(self_order_groups([[2, 71], [2, 96], [3, 98], [2, 100], [5, 120], [5, 130], [5, 135], [3, 140], [5, 180]]))
print(self_order_groups([[5, 135], [2, 100], [2, 71], [2, 96], [5, 130], [3, 98], [5, 180], [5, 120], [3, 140]]))
print(self_order_groups([[2, 71], [2, 96], [3, 98], [2, 98], [5, 98], [5, 130], [5, 135], [3, 140], [5, 180]]))

In [None]:
# Manual checks for self_order_score on the same inputs used for
# self_order_groups; each printed value is the swap count.
print(self_order_score([[1, 10], [2, 20], [1, 20], [2, 30]]))
print(self_order_score([[1, 10], [2, 20], [1, 25], [2, 30]]))
print(self_order_score([[1, 10], [2, 20], [1, 20], [3, 20], [1, 30], [2, 30], [3, 30]]))
print(self_order_score([[2, 71], [2, 96], [3, 98], [2, 100], [5, 120], [5, 130], [5, 135], [3, 140], [5, 180]]))
print(self_order_score([[5, 135], [2, 100], [2, 71], [2, 96], [5, 130], [3, 98], [5, 180], [5, 120], [3, 140]]))
print(self_order_score([[2, 71], [2, 96], [3, 98], [2, 98], [5, 98], [5, 130], [5, 135], [3, 140], [5, 180]]))

In [None]:
# NOTE(review): scratch cell -- a bare tuple expression that is only echoed by
# the notebook and is not referenced by any code visible here; confirm whether
# it can be removed.
'pressao',['patho',5],['physio',5],[''], 'pressao',['patho','history'], 'pressao',['patho','history']

# Clustering in Free Recall

Algoritmo de category clustering em free recall conforme descrito em https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3665324/

In [None]:
def clustering_free_recall(categories_order):
  """Compute the adjusted ratio of clustering for [category, position] items.

  Args:
    categories_order: list of [category, position] items; sorted IN PLACE by
      position before the repetitions are counted.

  Returns:
    The ratio (r - er) / (max - er) as a float, where r is the observed
    number of category repetitions, er the expected number and max the
    maximum possible number. When the denominator is zero the unevaluated
    fraction is returned as the string "<r - er>/<max - er>".

  Raises:
    ZeroDivisionError: if `categories_order` is empty.
  """
  total_items = len(categories_order)  # number of recalled items

  # order the items by text position (second field)
  categories_order.sort(key=order_keys)

  items_per_category = {}  # recalled items per recalled category
  repetitions = 0          # observed category repetitions
  for idx, item in enumerate(categories_order):
    cat = item[0]
    position = item[1]
    items_per_category[cat] = items_per_category.get(cat, 0) + 1

    # step over the other-category items that share this item's position
    nxt = idx + 1
    while nxt < total_items:
      candidate = categories_order[nxt]
      if candidate[0] == cat or candidate[1] != position:
        break
      nxt += 1
    if nxt >= total_items:
      continue

    # one repetition if the position run starting at `nxt` holds this category
    neighbour_position = categories_order[nxt][1]
    for probe in range(nxt, total_items):
      if categories_order[probe][1] != neighbour_position:
        break
      if categories_order[probe][0] == cat:
        repetitions += 1
        break

  category_count = len(items_per_category)        # number of recalled categories
  max_repetitions = total_items - category_count  # maximum possible repetitions

  # expected number of category repetitions
  squared_sizes = sum(size * size for size in items_per_category.values())
  expected = squared_sizes / total_items - 1

  # adjusted ratio of clustering
  numerator = repetitions - expected
  denominator = max_repetitions - expected
  if denominator == 0:
    return str(numerator) + '/' + str(denominator)
  return numerator / denominator

In [None]:
# Manual checks for clustering_free_recall: each input is a list of
# [category, position] items, including inputs with repeated positions and a
# single-category input (which prints the "num/den" string form).
print(clustering_free_recall([[2, 1], [4, 2], [4, 3], [3, 4], [2, 5], [3, 6], [1, 7], [4, 8], [4, 9]]))
print(clustering_free_recall([[3, 1], [4, 2], [4, 3], [3, 4], [1, 5], [1, 6], [3, 7], [1, 8], [1, 9], [2, 10], [2, 11], [2, 12], [4, 13], [4, 14], [3, 15]]))
print(clustering_free_recall([[2, 1], [2, 2], [3, 3], [1, 4], [1, 5], [1, 6], [1, 7], [2, 8], [3, 9], [3, 10], [2, 11], [1, 12], [4, 13], [4, 14], [4, 15], [4, 16], [2, 17], [2, 18], [3, 19], [1, 20]]))
print(clustering_free_recall([[5, 135], [2, 100], [2, 71], [2, 96], [5, 130], [3, 98], [5, 180], [5, 120], [3, 140]]))
print(clustering_free_recall([[2, 71], [2, 96], [3, 98], [2, 98], [5, 98], [5, 130], [5, 135], [3, 140], [5, 180]]))
print(clustering_free_recall([[2, 71], [2, 96], [3, 98], [2, 98], [5, 98], [7,98], [5, 130], [5, 135], [3, 140], [5, 180]]))
print(clustering_free_recall([[2, 1], [5, 2], [2, 3], [3, 4], [1, 5], [5, 6]]))
print(clustering_free_recall([[1, 1], [1, 2], [1, 3], [2, 4]]))
print(clustering_free_recall([[7,1],[3,2],[7,3],[3,4],[2,5],[2,6]]))
print(clustering_free_recall([[1,1],[1,2],[5,3],[2,4],[8,5],[1,6],[1,7]]))
print(clustering_free_recall([[5,1], [5,2], [5,3]]))

# Transforming Llama answer for clustering and self order

In [None]:
%pip install fuzzywuzzy
%pip install levenshtein


In [None]:
import jsonlines
import json
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import jaccard_score, precision_score, recall_score, f1_score
import numpy as np
import re
import os
from difflib import SequenceMatcher
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import unicodedata
import ast

def get_examples(file_path):
    """Read the JSON Lines file at *file_path* and return its records as a list."""
    with jsonlines.open(file_path) as reader:
        return list(reader)
def get_from_annotated_dataset(annotated_dataset,_id):
    """Return the first document whose 'doc_id' equals *_id*, or None if absent."""
    matching_docs = (doc for doc in annotated_dataset if doc['doc_id'] == _id)
    return next(matching_docs, None)
# Module-level data loaded when the cell runs: the annotated documents (a list
# of records read from a JSON Lines file) and a pre-processed annotations CSV.
# Both paths are relative to the current working directory.
annotated_dataset = get_examples('../annotations-medical_specialist-dpoc-bio-composed-multiple.jsonl')
df_data_info = pd.read_csv('../annotations_medical_specialist_pre_processed.csv')

#REMOVE ADDITIONAL BAD DATA
# df_data_info.drop(df_data_info[df_data_info['doc_id'] == '8396380d-e0b6-4b81-8fe9-0b99c611f9f3'].index, inplace=True)
# df_data_info.reset_index(drop=True,inplace=True)
def replace_substring(string, start, end, replacement):
    """Return *string* with the slice [start:end] swapped for *replacement*.

    If the indices do not satisfy 0 <= start <= end <= len(string), the
    literal message "Invalid start or end index" is returned instead.
    """
    span_is_valid = 0 <= start <= end <= len(string)
    if not span_is_valid:
        return "Invalid start or end index"

    head, tail = string[:start], string[end:]
    return head + replacement + tail

def add_quotes(item):
    """Backslash-escape every quote in *item* when its opening quote is unbalanced.

    An item that starts with a single or double quote but does not end with
    the same quote character has all its quotes escaped; any other item is
    returned unchanged.
    """
    opens_unbalanced = any(
        item.startswith(quote) and not item.endswith(quote)
        for quote in ('"', "'")
    )
    if not opens_unbalanced:
        return item
    return item.replace('"', '\\"').replace("'", "\\'")
def fix_quotes(text):
    """Extract [idea, [categories...]] pairs from *text* and re-serialise them.

    Every substring shaped like ["idea", ["cat1", "cat2", ...]] (single or
    double quotes) is collected. The result is the `str()` of the dict
    {"annotations": [[idea, [categories...]], ...]}; with no match the list
    is empty.

    Args:
        text: raw string that may contain annotation pairs.

    Returns:
        str: the repr of the annotations dict.

    Raises:
        ValueError / SyntaxError: propagated from `ast.literal_eval` if a
            matched category list is not a valid Python literal.
    """
    # Regular expression to find lists and their items
    pattern = r"""\[\s*((["'][^"']+["'])(?:\s+[^,]+)?)\s*,\s*(\[\s*["'][^"']+?["'](?:\s*,\s*["'][^"']+?["'])*\s*\])\s*\]"""
    annotations = []
    for match in re.finditer(pattern, text):
        # The original also computed a quote-escaped copy of group 1 that was
        # never used; that dead computation has been removed.
        quoted_idea = str(match.group(1))
        categories = ast.literal_eval(match.group(3))
        # Drop the first and last character (the surrounding quotes).
        # NOTE(review): group 1 may also include unquoted text after the
        # closing quote, in which case this slice cuts that text instead --
        # confirm whether group 2 was intended.
        annotations.append([quoted_idea[1:-1], categories])
    return str({"annotations": annotations})

def select_after_first_brace(string):
    """Normalise a raw model response into the string form of an annotations dict.

    The response is cleaned with a sequence of regex/string patches and passed
    through `fix_quotes` twice; the returned value is a string (callers in
    this file `eval` it).
    """
    # Rewrite the first occurrence of the (optionally quoted) key
    # `annotations` to the double-quoted form.
    pattern = r"(|[\'\"])(annotations)([\'\"]|)"
    
    matches = re.search(pattern, string)
    if matches != None:
        string = replace_substring(string, matches.span()[0], matches.span()[1], '"annotations"')
    # NOTE(review): str.replace returns a new string and both results are
    # discarded, so these two lines currently have no effect.
    string.replace("]\']",']]')
    string.replace("]\"]",']]')
    # print('STRING AFTER FIRST REGEX:', string)
    pattern = r'"annotations":\[.*\]\]'
    match = re.search(pattern, string)
    if match != None:
        # NOTE(review): the first assignment is immediately overwritten by the
        # second one, so only the matched text is kept.
        string = replace_substring(string, match.span()[0], match.span()[1], '"annotations":[')
        string = match.group(0)
        # print('STRING AFTER SECOND REGEX:', string)

    # brace_index is assigned here and again below but never read
    brace_index = string.find('"annotations"')
    # Re-serialise whatever annotation pairs can be found in the string.
    string = fix_quotes(string)
    brace_index = string.find('"annotations"')
    
    # NOTE(review): fix_quotes (as defined in this file) returns the str() of
    # a dict, which always ends with "}". If so, the `']]` check and the
    # "no closing brace" branch below look unreachable -- confirm.
    if string[-3:] == "']]":
        string = string[:-3]+"']]]"
    if string.find("}") == -1:
        # No closing brace: close the list/dict at the end of the annotations.
        end_annotation_index = string.find(']]]')
        if string[-1:] != ']' and end_annotation_index != -1:                
            string = string[:end_annotation_index+3]+'}'
        elif string[-1:] != ']' and end_annotation_index == -1:
            end_annotation_index = string.find(']] ]')
            if end_annotation_index != -1:
                string = string[:end_annotation_index+4]+'}'
            else:
                string = string+']}'
        elif string[-7:].count(']') < 3:
            string = string+']}'
        else:
            string = string+'}'
    else:
        # Keep everything up to and including the FIRST "}" (this truncates
        # the string if an idea text itself contains "}").
        string = string[:string.find("}")+1]
    # print('results...')
    # print(string)
    string = fix_quotes(string)
    return string

def escape_inner_quotes(text):
    """Re-quote the value that follows a colon, escaping the quotes inside it.

    The span from just after a ':' up to the last unescaped quote that is
    followed (optionally after whitespace) by '}' is replaced by the captured
    value wrapped in single quotes, with every inner ' and " preceded by a
    backslash. Text without such a span is returned unchanged.

    Args:
        text: string such as '{"annotations": "..."}'.

    Returns:
        str: the text with the value re-quoted.
    """
    pattern = r"""(?<=[:])[\s\S]*?["'](.*)(?<!\\)["'](?=\s*[}])"""

    def _requote(match):
        # Escape both quote kinds so the value is safe inside single quotes.
        inner = match.group(1).replace('"', '\\"').replace("'", "\\'")
        return "'" + inner + "'"

    # The original also ran re.findall with the same pattern and discarded the
    # result; that redundant pass has been removed.
    return re.sub(pattern, _requote, text)
####### Function that extracts the entities and respective categories and transforms into a dictionary of annotations  #############
def transform_augmented_data_to_pattern(data_info):
    """Parse a raw model response into {"annotations": [[idea, [categories]], ...]}.

    The response is expected to carry an `annotations` value in which each
    entry is written as `[idea | cat1, cat2, ...]`. The text is patched into a
    one-key dict literal, evaluated with `ast.literal_eval`, and the entries
    are then extracted with a regex.

    Args:
        data_info: raw response string.

    Returns:
        dict: {"annotations": [[idea, [category, ...]], ...]}; the list is
        empty (and a diagnostic is printed) when no entry is found.

    Raises:
        ValueError / SyntaxError: propagated from `ast.literal_eval` when the
            patched string is not a valid literal.
    """

    # Drop markdown code fences.
    data_info = data_info.replace('```','')
    # Put a backslash before every colon not preceded by a double quote or a
    # backslash (the backslashes are stripped again after evaluation below).
    data_info = re.sub(r'(?<!["\\]):', r'\\:', data_info)
    data_info = data_info.replace("\n", " ").replace("  ", " ")
    
    # Normalise the key to "annotations": and capture everything after it.
    data_info = re.sub(r"""["']*annotations["']*:""", r'"annotations":', data_info)
    ann_pattern = r""""annotations":\s*["']*{*(.*)(?=\s*["']*\s*[}]*)"""
    match = re.search(ann_pattern, data_info)
    if match:
        raw_data = data_info  # unused
        data_info = f'"annotations": "{match.group(1)}"'
        if data_info[-1:] == "'" or data_info[-1:] == '"':
            data_info = data_info[:-1]
    else:
        data_info = """ "annotations":{""} """
    # Make sure the text is wrapped in braces and that the value's closing
    # quote sits right before the closing brace.
    closing_braces = data_info.find("}")
    opening_braces = data_info.find("{")
    if opening_braces == -1:
        data_info = '{'+ data_info
    elif data_info[:1] != '{':
        data_info = '{'+ data_info
    if closing_braces == -1:
        data_info = data_info[:len(data_info)] + '}'
        closing_braces = data_info.find("}")
    # NOTE(review): when a "{" was prepended above, `closing_braces` was
    # computed before the prepend, so the slice below cuts one character
    # earlier than the brace -- confirm this is intended.
    if (data_info.find('"}') == -1 and data_info.find('" }') == -1):
        data_info = data_info[:closing_braces] + '"}'
    data_info = escape_inner_quotes(data_info)
    # A backslash right before the closing quote would escape it; remove it.
    if data_info[-3] == '\\':
        data_info = data_info[:len(data_info)-3]+data_info[-2:len(data_info)+1]
    data_info = ast.literal_eval(data_info)
    data_info['annotations'] = data_info['annotations'].replace('\\','')

    # Each entry: [idea | cat1, cat2, ...] with no nested brackets.
    pattern = r'\[([^\[\]]+)\s*\|\s*([^\[\]]+)\]'
    matches = re.findall(pattern, data_info['annotations'])
    if len(matches) == 0:
        print("---------------------------------Error: No matches found in the data.--------------------------")
        print(data_info)
        annotations = {"annotations": []}
    else:
        annotations = {"annotations": [[match[0].strip(), [val.strip() for val in match[1].split(',')]] for match in matches]}
    return annotations

# prediction_annotation = eval(model_response[0])
def prediction_to_labels(prediction_labels, data_info):
    """Turn a raw model response into one label entry per token of the document.

    Args:
        prediction_labels: raw model response.
        data_info: document record; its 'text' and 'labels' entries are read.

    Returns:
        list: per-token labels as produced by `extract_labels_from_prediction`.
    """
    # NOTE(review): transform_augmented_data_to_pattern (as defined in this
    # file) returns a dict, so the list check below never matches and the dict
    # is then handed to select_after_first_brace, which runs re.search on its
    # argument -- this looks like it raises TypeError; confirm whether this
    # function is still in use.
    prediction_labels = transform_augmented_data_to_pattern(prediction_labels)
    if type(prediction_labels) == list:
        ze = prediction_labels[0]
        prediction_labels = ze
    prediction_labels = select_after_first_brace(prediction_labels)
    # NOTE(review): eval() on text derived from model output.
    prediction_annotation = eval(prediction_labels)
    full_text = data_info['text']
    text_tokenized = data_info['labels']
    categorized_prediction = annotation_to_tokens(full_text, text_tokenized, prediction_annotation)
    labels = extract_labels_from_prediction(categorized_prediction)
    return labels
def truth_to_labels(data_info):
    """Return the ground-truth per-token labels of the record *data_info*."""
    return extract_labels_from_truth(data_info['labels'])
def extract_labels_from_truth(data_info):
    """Return one label entry per token of a tokenised, annotated text.

    Args:
        data_info: sequence of tokens; the fifth field of each token
            (index 4) is either None or a mapping keyed by category name.

    Returns:
        list: for each token, the list of its category names, or the string
        '0' when the token carries no annotation.
    """
    # `is not None` replaces the original `!= None` equality comparison.
    return [
        list(token[4].keys()) if token[4] is not None else '0'
        for token in data_info
    ]
def annotation_to_tokens(full_text, text_tokenized, prediction_annotation):
    """Attach the predicted categories to the tokens covered by each annotation.

    Args:
        full_text: the document text in which annotation strings are located.
        text_tokenized: tokens whose first three fields are copied into fresh
            lists; fields 1 and 2 are compared with the first and last
            character offsets of the annotation.
        prediction_annotation: dict whose 'annotations' entry is a list of
            [annotated text, categories] pairs.

    Returns:
        list: copies of the tokens; every token from the one starting at the
        annotation's first offset through the one ending at its last offset
        has the annotation's categories appended.
    """
    tokens = [[tok[0], tok[1], tok[2]] for tok in text_tokenized]
    for annotation in prediction_annotation['annotations']:
        phrase = annotation[0]
        first_char = full_text.find(phrase)
        last_char = first_char + len(phrase) - 1

        inside = False
        for tok in tokens:
            # labelling begins at the token that starts on the first offset
            inside = inside or tok[1] == first_char
            if not inside:
                continue
            tok.append(annotation[1])
            # ... and stops after the token that ends on the last offset
            if tok[2] == last_char:
                break
    return tokens
def extract_labels_from_prediction(categorized_prediction):
    """Return the fourth field of each token, or '0' for tokens without one."""
    return [
        token[3] if len(token) > 3 else '0'
        for token in categorized_prediction
    ]

def remove_upper_and_accents(text):
    """Lower-case *text*, strip the listed accents and collapse whitespace.

    Accented a/e/i/o/u vowels and the cedilla are mapped to their plain
    letters; every run of whitespace (including newlines and carriage
    returns) becomes a single space.
    """
    accent_groups = {
        'a': 'áàâãä',
        'e': 'éèêë',
        'i': 'íìîï',
        'o': 'óòôõö',
        'u': 'úùûü',
        'c': 'ç',
    }
    table = {
        ord(accented): plain
        for plain, group in accent_groups.items()
        for accented in group
    }
    unaccented = text.lower().translate(table)
    # a single whitespace collapse also covers the newline / carriage-return
    # replacement and leaves no double spaces behind
    return re.sub(r'\s+', ' ', unaccented)
def normalize_text(text):
    """Return *text* without combining marks (accents), keeping case and spacing."""
    # NFD splits each accented character into base letter + combining mark
    decomposed = unicodedata.normalize('NFD', text)
    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(base_chars)
def group_category_for_score_calc(annotation_list, llama_txt):
    """Convert annotations into [category id, text position] items.

    Each annotated idea is located in the normalised text with
    `find_best_match`; the search window then advances past it so later ideas
    are searched only in the remaining text.

    Args:
        annotation_list: list of [idea text, [category names]] pairs.
        llama_txt: the text the ideas were annotated on.

    Returns:
        list: one [category id (1..8), position] item per known category of
        each idea. Positions are offsets into the normalised text. Category
        '0' and unknown category names are skipped.

    Side effects:
        Prints diagnostics for ideas that cannot be located and appends the
        collected findings to 'print_finding_position.txt'.
    """
    llama_txt = remove_upper_and_accents(llama_txt)
    llama_immutable = llama_txt  # full normalised text, kept for diagnostics
    cat_to_int = {
        'pathophysiology':1, 
        'etiology':2,
        'epidemiology':3,
        'history':4,
        'physical':5,
        'exams':6,
        'differential':7,
        'therapeutic':8
    }
    categories_order = []
    previous_len = 0  # number of characters already consumed from the text
    findings = []
    not_found = 0
    stop = False
    for annotation in annotation_list:
        processed_annotation = remove_upper_and_accents(annotation[0])
        best_match = find_best_match(processed_annotation, llama_txt)
        start_pos = llama_txt.find(best_match)
        for cat in annotation[1]:
            if cat != '0':
                if cat in cat_to_int:
                    if start_pos == -1:
                        findings.append(llama_immutable)
                        findings.append(('start pos not found', best_match))
                        not_found += 1
                        print('================================= \n',llama_txt,'=================================')
                        print('========== start pos not found', processed_annotation)
                        print('========= match',  best_match)
                    # NOTE(review): the item is appended even when start_pos
                    # is -1, giving position previous_len - 1 -- confirm.
                    categories_order.append([cat_to_int[cat], start_pos+previous_len])
        # `stop` is set after the first annotation, so the findings file is
        # only written during the first iteration of this loop.
        if not stop:
            with open(f'print_finding_position.txt', 'a', encoding='utf8') as f:
                for finding in findings:
                    f.write(f"{finding}\n")
            stop = True
        # Advance the search window past this idea.
        # NOTE(review): the end uses the length of the annotation text, not of
        # the matched text (best_match); the two can differ -- confirm.
        end_pos = start_pos + len(processed_annotation)
        llama_txt = llama_txt[end_pos:]
        previous_len += end_pos   
    if not_found > 0:
        print('not found:', not_found)  
    return categories_order

def find_num_category(annotation_list):
    """Count the distinct categories used across all annotations.

    Each annotation is indexable and holds its categories at index 1.
    """
    distinct_categories = {
        category
        for annotation in annotation_list
        for category in annotation[1]
    }
    return len(distinct_categories)

def format_annotation_to_self_order(llama_annotated):
    """Collect [annotation, start, end] triples for the annotations of a record.

    NOTE(review): this function looks unfinished:
      * `pred_annotations` is only assigned in the commented-out line below,
        so it must exist as a global or the loop raises NameError;
      * `current_txt` is read before it is ever assigned inside the function
        (UnboundLocalError on the first iteration);
      * nothing is returned, so the collected list is discarded.
    Confirm the intended behaviour before relying on it.
    """
    # pred_annotations = eval(select_after_first_brace(llama_annotated['response']))
    formatted__annotations = []
    for annotation in pred_annotations:
        start_pos = llama_annotated['text'].find(annotation)
        end_pos = start_pos + len(annotation)
        current_txt = current_txt[end_pos:]
        formatted__annotations.append([annotation, start_pos, end_pos])
def gen_comparation_clustering_self_order_llm_vs_medical_specialist(path, approach):
    """Write a CSV comparing LLM and specialist self-order / clustering metrics.

    For every '<...><36-character doc id>.json' file in *path*, the model
    response is parsed, the self order score and the clustering measure are
    computed, and both are stored next to the specialist's values for the
    same document. The CSV is rewritten after every processed document.

    Args:
        path: directory containing the model output JSON files; each file
            provides the 'response' and 'text' entries.
        approach: label used in the output file name
            'detailed/detailed_self_order_clustering_metrics_medical_specialist_vs_<approach>.csv'.

    Returns:
        None.
    """
    teste_progresso = pd.read_csv('../teste-progresso/resultados_anotacoes_teste_progresso_dpoc.csv')

    file_names = [file for file in os.listdir(path) if file.endswith('.json')]

    # .copy(): the frame is modified below; working on an explicit copy avoids
    # pandas' chained-assignment warning on a column subset.
    medical_specialist_metrics = teste_progresso[['annotation id', 'self order groups', 'self order score', 'clustering in free recall']].copy()
    medical_specialist_metrics.columns = ['doc_id', 'self_order_groups', 'self_order_medical_specialist', 'clustering_medical_specialist']

    doc_id_length = len('f98e69ee-fda6-4b1c-a8a9-c20b92630cb6')
    for name in file_names:
        # the document id is the 36 characters before the '.json' suffix
        doc_id = name[-41:-5]
        if len(doc_id) != doc_id_length:
            continue

        with open(f'{path}/{name}', "r", encoding='utf8') as file:
            llama_annotated = json.load(file)

        # NOTE(review): eval() runs on text derived from model output; the
        # string is built from literals only, so ast.literal_eval may be a
        # safer drop-in -- confirm before switching.
        formatted_prediction = eval(select_after_first_brace(llama_annotated['response']))
        pred_labels = formatted_prediction['annotations']
        if len(pred_labels) == 0:
            continue

        annotation_to_int = group_category_for_score_calc(pred_labels, llama_annotated['text'])
        self_order = int(self_order_score(annotation_to_int))
        # The original also evaluated int(self_order_groups(...)) here; that
        # call raised TypeError because self_order_groups returns a list, and
        # its result was never used, so it has been removed.
        clustering = clustering_free_recall(annotation_to_int)

        row_mask = medical_specialist_metrics['doc_id'] == doc_id
        medical_specialist_metrics.loc[row_mask, 'self_order_llm'] = self_order
        medical_specialist_metrics.loc[row_mask, 'clustering_llm'] = clustering

        medical_specialist_metrics.to_csv(f'detailed/detailed_self_order_clustering_metrics_medical_specialist_vs_{approach}.csv', index=False)

def transform_int_to_named_cat(int_cat):
    """Render groups as 'name:position/count;' entries joined by single spaces.

    Args:
        int_cat: list of [category id (1..8), position, count] groups.

    Returns:
        str: e.g. 'pathophysiology:10/3; etiology:20/2;' ('' for no groups).

    Raises:
        KeyError: if a category id is outside 1..8.
    """
    category_names = (
        'pathophysiology',
        'etiology',
        'epidemiology',
        'history',
        'physical',
        'exams',
        'differential',
        'therapeutic',
    )
    int_to_cat = dict(enumerate(category_names, start=1))
    return ' '.join(
        f'{int_to_cat[group[0]]}:{group[1]}/{group[2]};'
        for group in int_cat
    )
def gen_augmented_annotation_self_order_group_named(path, approach):
    """Write the named self-order groups of every augmented model output to CSV.

    Each '<...><36-character doc id>.json' file in *path* is parsed with
    `transform_augmented_data_to_pattern`. Documents with at least one
    annotation add a row ('annotation id', 'categories ordered') and the file
    'teste_progresso_self_order_groups_<approach>.csv' is rewritten after each
    such row. Documents without annotations are only reported on stdout.
    """
    json_names = [entry for entry in os.listdir(path) if entry.endswith('.json')]
    results = pd.DataFrame(columns=['annotation id', 'categories ordered'])
    expected_id_length = len('f98e69ee-fda6-4b1c-a8a9-c20b92630cb6')

    for json_name in json_names:
        # the document id is the 36 characters before the '.json' suffix
        doc_id = json_name[-41:-5]
        if len(doc_id) != expected_id_length:
            continue

        truth_data = get_from_annotated_dataset(annotated_dataset, doc_id)  # currently unused
        with open(f'{path}/{json_name}', "r", encoding='utf8') as handle:
            llama_annotated = json.load(handle)
        parsed = transform_augmented_data_to_pattern(llama_annotated['response'])

        if parsed['annotations'] == []:
            print(doc_id)
            print(parsed)
        if len(parsed) == 0:
            print(doc_id,'====================================================')

        ideas = parsed['annotations']
        source_text = llama_annotated['text']
        ideas_number = len(ideas)
        cat_number = find_num_category(ideas)  # currently unused

        if ideas_number <= 0:
            continue

        positions = group_category_for_score_calc(ideas, source_text)
        named_groups = transform_int_to_named_cat(self_order_groups(positions))

        row = pd.DataFrame({'annotation id': [doc_id], 'categories ordered': [named_groups]})
        results = pd.concat([results, row], ignore_index=True)
        results.to_csv(f'teste_progresso_self_order_groups_{approach}.csv', index=False)

def gen_self_order_group_named(path, approach):
    """Write the named self-order groups of every model output in *path* to CSV.

    Each '<...><36-character doc id>.json' file is parsed with
    `select_after_first_brace` and evaluated. Every document adds one row
    ('annotation id', 'categories ordered'); documents without annotations get
    an empty string and are reported on stdout. The file
    'teste_progresso_self_order_groups_<approach>.csv' is rewritten after
    every document.
    """
    json_names = [entry for entry in os.listdir(path) if entry.endswith('.json')]
    results = pd.DataFrame(columns=['annotation id', 'categories ordered'])
    expected_id_length = len('f98e69ee-fda6-4b1c-a8a9-c20b92630cb6')

    for json_name in json_names:
        # the document id is the 36 characters before the '.json' suffix
        doc_id = json_name[-41:-5]
        if len(doc_id) != expected_id_length:
            continue

        truth_data = get_from_annotated_dataset(annotated_dataset, doc_id)  # currently unused
        with open(f'{path}/{json_name}', "r", encoding='utf8') as handle:
            llama_annotated = json.load(handle)
        serialized = select_after_first_brace(llama_annotated['response'])
        parsed = eval(serialized)

        ideas = parsed['annotations']
        source_text = llama_annotated['text']
        ideas_number = len(ideas)
        cat_number = find_num_category(ideas)  # currently unused

        if ideas_number > 0:
            positions = group_category_for_score_calc(ideas, source_text)
            named_groups = transform_int_to_named_cat(self_order_groups(positions))
        else:
            print('No annotations found for', doc_id)
            print(llama_annotated['response'],'\n')
            print(parsed,'\n=================')
            print('==============================================\n')
            named_groups = ''

        row = pd.DataFrame({'annotation id': [doc_id], 'categories ordered': [named_groups]})
        results = pd.concat([results, row], ignore_index=True)
        results.to_csv(f'teste_progresso_self_order_groups_{approach}.csv', index=False)

#     return best_match
def normalize_text(text):
    """Strip combining marks (accents) from *text*.

    This re-defines the function of the same name that appears earlier in the
    file, with the same behaviour.
    """
    def _is_base_char(ch):
        return unicodedata.category(ch) != 'Mn'

    # NFD separates base letters from their combining marks
    return ''.join(filter(_is_base_char, unicodedata.normalize('NFD', text)))
def find_best_match(phrase, text):
    """Return the run of words in *text* that best fuzzy-matches *phrase*.

    Both inputs are normalised with `remove_upper_and_accents` and split on
    whitespace. Every window of consecutive text words whose length is within
    one word of the phrase length (at least one word) is scored against the
    phrase with `fuzz.ratio`; the first window reaching the highest score is
    returned, joined by single spaces.

    Returns:
        str: the best matching window, or '' when no window scores above 0.
    """
    phrase_tokens = remove_upper_and_accents(phrase.lower()).split()
    text_tokens = remove_upper_and_accents(text.lower()).split()
    target = " ".join(phrase_tokens)

    # window sizes to try: phrase length - 1 .. phrase length + 1
    shortest = max(1, len(phrase_tokens) - 1)
    longest = len(phrase_tokens) + 1

    best_match, best_score = "", 0
    for window in range(shortest, longest + 1):
        for start in range(len(text_tokens) - window + 1):
            candidate = " ".join(text_tokens[start:start + window])
            score = fuzz.ratio(target, candidate)
            # strict comparison keeps the earliest window on ties
            if score > best_score:
                best_match, best_score = candidate, score

    return best_match

In [None]:
# Manual check: the phrase does not occur verbatim in the text, so the closest
# run of words should be reported.
phrase = "nos seus pulmoes"
text = "nos pulmoes é comum um caso de filler pulmoes nos é diars"
best_match = find_best_match(phrase, text)
print(f"Best match: {best_match}")

In [None]:
# Manual check: the phrase differs from the text in case and accents and has an
# extra parenthesised acronym.
test_txt = 'A doença pulmonar obstrutiva cronica é uma obstrução pulmonar irreversivel, caracterizada pela clinica de tosse cronica, expectoração cornica'
test_match = 'Doença pulmonar obstrutiva crônica (DPOC)'
print(find_best_match(test_match, test_txt))

In [None]:
import os
import json
import csv
# Directories holding the LLM outputs, one per prompting strategy and shot count.
# Every run except the raw-typos one lives under the same temp-0.0/top-p-0.6 directory.
llama_outputs_dir = '../llama-outputs/full-dataset/no-short-data/temp-0.0/top-p-0.6'

path_0_shot = f'{llama_outputs_dir}/ideas-0-shot'
path_0_shot_raw_typos = '../llama-outputs/raw-typos/ideas-0-shot'

path_1_static_shot = f'{llama_outputs_dir}/ideas-static-1-shot'
path_2_static_shot = f'{llama_outputs_dir}/ideas-static-2-shot'
path_3_static_shot = f'{llama_outputs_dir}/ideas-static-3-shot'
path_4_static_shot = f'{llama_outputs_dir}/ideas-static-4-shot'
path_10_static_shot = f'{llama_outputs_dir}/ideas-static-10-shot'

path_1_tf_idf_shot = f'{llama_outputs_dir}/ideas-tf-idf-1-shot'
path_2_tf_idf_shot = f'{llama_outputs_dir}/ideas-tf-idf-2-shot'
path_3_tf_idf_shot = f'{llama_outputs_dir}/ideas-tf-idf-3-shot'
path_4_tf_idf_shot = f'{llama_outputs_dir}/ideas-tf-idf-4-shot'
# The 10-shot directory gets its own name; it used to overwrite path_4_tf_idf_shot,
# which made the 4-shot run read the 10-shot outputs.
path_10_tf_idf_shot = f'{llama_outputs_dir}/ideas-tf-idf-10-shot'

path_1_tf_idf_custom_shot = f'{llama_outputs_dir}/tf-idf-custom/ideas-tf-idf-1-shot'
path_2_tf_idf_custom_shot = f'{llama_outputs_dir}/tf-idf-custom/ideas-tf-idf-2-shot'
path_3_tf_idf_custom_shot = f'{llama_outputs_dir}/tf-idf-custom/ideas-tf-idf-3-shot'
path_4_tf_idf_custom_shot = f'{llama_outputs_dir}/tf-idf-custom/ideas-tf-idf-4-shot'
path_10_tf_idf_custom_shot = f'{llama_outputs_dir}/tf-idf-custom/ideas-tf-idf-10-shot'

path_1_random_shot = f'{llama_outputs_dir}/random-shot-retrieval/ideas-random-1-shot'
path_2_random_shot = f'{llama_outputs_dir}/random-shot-retrieval/ideas-random-2-shot'
path_3_random_shot = f'{llama_outputs_dir}/random-shot-retrieval/ideas-random-3-shot'
path_4_random_shot = f'{llama_outputs_dir}/random-shot-retrieval/ideas-random-4-shot'
path_10_random_shot = f'{llama_outputs_dir}/random-shot-retrieval/ideas-random-10-shot'

# Run gen_self_order_group_named once per directory; the second argument is the
# run label handed to it.
print('generating ZERO-SHOT')
gen_self_order_group_named(path_0_shot, '0_shot')

print('generating ONE-SHOT STATIC')
gen_self_order_group_named(path_1_static_shot, '1_static_shot')
print('generating TWO-SHOT STATIC')
gen_self_order_group_named(path_2_static_shot, '2_static_shot')
print('generating THREE-SHOT STATIC')
gen_self_order_group_named(path_3_static_shot, '3_static_shot')
print('generating FOUR-SHOT STATIC')
gen_self_order_group_named(path_4_static_shot, '4_static_shot')
print('generating TEN-SHOT STATIC')
gen_self_order_group_named(path_10_static_shot, '10_static_shot')


print('generating ONE-SHOT TF-IDF')
gen_self_order_group_named(path_1_tf_idf_shot, '1_tf_idf_shot')
print('generating TWO-SHOT TF-IDF')
gen_self_order_group_named(path_2_tf_idf_shot, '2_tf_idf_shot')
print('generating THREE-SHOT TF-IDF')
gen_self_order_group_named(path_3_tf_idf_shot, '3_tf_idf_shot')
print('generating FOUR-SHOT TF-IDF')
gen_self_order_group_named(path_4_tf_idf_shot, '4_tf_idf_shot')
print('generating TEN-SHOT TF-IDF')
gen_self_order_group_named(path_10_tf_idf_shot, '10_tf_idf_shot')

print('generating ONE-SHOT TF-IDF CUSTOM')
gen_self_order_group_named(path_1_tf_idf_custom_shot, '1_tf_idf_custom_shot')
print('generating TWO-SHOT TF-IDF CUSTOM')
gen_self_order_group_named(path_2_tf_idf_custom_shot, '2_tf_idf_custom_shot')
print('generating THREE-SHOT TF-IDF CUSTOM')
gen_self_order_group_named(path_3_tf_idf_custom_shot, '3_tf_idf_custom_shot')
print('generating FOUR-SHOT TF-IDF CUSTOM')
gen_self_order_group_named(path_4_tf_idf_custom_shot, '4_tf_idf_custom_shot')
print('generating TEN-SHOT TF-IDF CUSTOM')
gen_self_order_group_named(path_10_tf_idf_custom_shot, '10_tf_idf_custom_shot')

print('generating ONE-SHOT random')
gen_self_order_group_named(path_1_random_shot, '1_random_shot')
print('generating TWO-SHOT random')
gen_self_order_group_named(path_2_random_shot, '2_random_shot')
print('generating THREE-SHOT random')
gen_self_order_group_named(path_3_random_shot, '3_random_shot')
print('generating FOUR-SHOT random')
gen_self_order_group_named(path_4_random_shot, '4_random_shot')
print('generating TEN-SHOT random')
gen_self_order_group_named(path_10_random_shot, '10_random_shot')

In [None]:
# Directories holding the LLM outputs for the augmented annotations, one per
# prompting strategy and shot count; all share the same temp-0.0/top-p-0.6 directory.
llama_outputs_augmented_dir = '../llama-outputs-augmented/full-dataset/no-short-data/temp-0.0/top-p-0.6'

path_0_shot_augmented = f'{llama_outputs_augmented_dir}/ideas-0-shot'

path_1_static_shot_augmented = f'{llama_outputs_augmented_dir}/ideas-static-1-shot'
path_2_static_shot_augmented = f'{llama_outputs_augmented_dir}/ideas-static-2-shot'
path_3_static_shot_augmented = f'{llama_outputs_augmented_dir}/ideas-static-3-shot'
path_4_static_shot_augmented = f'{llama_outputs_augmented_dir}/ideas-static-4-shot'
path_10_static_shot_augmented = f'{llama_outputs_augmented_dir}/ideas-static-10-shot'

path_1_tf_idf_shot_augmented = f'{llama_outputs_augmented_dir}/ideas-tf-idf-1-shot'
path_2_tf_idf_shot_augmented = f'{llama_outputs_augmented_dir}/ideas-tf-idf-2-shot'
path_3_tf_idf_shot_augmented = f'{llama_outputs_augmented_dir}/ideas-tf-idf-3-shot'
path_4_tf_idf_shot_augmented = f'{llama_outputs_augmented_dir}/ideas-tf-idf-4-shot'
# The 10-shot directory gets its own name; it used to overwrite
# path_4_tf_idf_shot_augmented, which made the 4-shot run read the 10-shot outputs.
path_10_tf_idf_shot_augmented = f'{llama_outputs_augmented_dir}/ideas-tf-idf-10-shot'

path_1_tf_idf_custom_shot_augmented = f'{llama_outputs_augmented_dir}/tf-idf-custom/ideas-tf-idf-1-shot'
path_2_tf_idf_custom_shot_augmented = f'{llama_outputs_augmented_dir}/tf-idf-custom/ideas-tf-idf-2-shot'
path_3_tf_idf_custom_shot_augmented = f'{llama_outputs_augmented_dir}/tf-idf-custom/ideas-tf-idf-3-shot'
path_4_tf_idf_custom_shot_augmented = f'{llama_outputs_augmented_dir}/tf-idf-custom/ideas-tf-idf-4-shot'
path_10_tf_idf_custom_shot_augmented = f'{llama_outputs_augmented_dir}/tf-idf-custom/ideas-tf-idf-10-shot'

path_1_random_shot_augmented = f'{llama_outputs_augmented_dir}/random-shot-retrieval/ideas-random-1-shot'
path_2_random_shot_augmented = f'{llama_outputs_augmented_dir}/random-shot-retrieval/ideas-random-2-shot'
path_3_random_shot_augmented = f'{llama_outputs_augmented_dir}/random-shot-retrieval/ideas-random-3-shot'
path_4_random_shot_augmented = f'{llama_outputs_augmented_dir}/random-shot-retrieval/ideas-random-4-shot'
path_10_random_shot_augmented = f'{llama_outputs_augmented_dir}/random-shot-retrieval/ideas-random-10-shot'

# Run gen_augmented_annotation_self_order_group_named once per directory; the
# second argument is the run label handed to it.
print('generating ZERO-SHOT STATIC augmented annotation')
gen_augmented_annotation_self_order_group_named(path_0_shot_augmented, '0_shot_aug')

print('generating ONE-SHOT STATIC augmented annotation')
gen_augmented_annotation_self_order_group_named(path_1_static_shot_augmented, '1_static_shot_aug')
print('generating TWO-SHOT STATIC augmented annotation')
gen_augmented_annotation_self_order_group_named(path_2_static_shot_augmented, '2_static_shot_aug')
print('generating THREE-SHOT STATIC augmented annotation')
gen_augmented_annotation_self_order_group_named(path_3_static_shot_augmented, '3_static_shot_aug')
print('generating FOUR-SHOT STATIC augmented annotation')
gen_augmented_annotation_self_order_group_named(path_4_static_shot_augmented, '4_static_shot_aug')
print('generating TEN-SHOT STATIC augmented annotation')
gen_augmented_annotation_self_order_group_named(path_10_static_shot_augmented, '10_static_shot_aug')


print('generating ONE-SHOT TF-IDF augmented annotation')
gen_augmented_annotation_self_order_group_named(path_1_tf_idf_shot_augmented, '1_tf_idf_shot_aug')
print('generating TWO-SHOT TF-IDF augmented annotation')
gen_augmented_annotation_self_order_group_named(path_2_tf_idf_shot_augmented, '2_tf_idf_shot_aug')
print('generating THREE-SHOT TF-IDF augmented annotation')
gen_augmented_annotation_self_order_group_named(path_3_tf_idf_shot_augmented, '3_tf_idf_shot_aug')
print('generating FOUR-SHOT TF-IDF augmented annotation')
gen_augmented_annotation_self_order_group_named(path_4_tf_idf_shot_augmented, '4_tf_idf_shot_aug')
print('generating TEN-SHOT TF-IDF augmented annotation')
gen_augmented_annotation_self_order_group_named(path_10_tf_idf_shot_augmented, '10_tf_idf_shot_aug')

print('generating ONE-SHOT TF-IDF CUSTOM augmented annotation')
gen_augmented_annotation_self_order_group_named(path_1_tf_idf_custom_shot_augmented, '1_tf_idf_custom_shot_aug')
print('generating TWO-SHOT TF-IDF CUSTOM augmented annotation')
gen_augmented_annotation_self_order_group_named(path_2_tf_idf_custom_shot_augmented, '2_tf_idf_custom_shot_aug')
print('generating THREE-SHOT TF-IDF CUSTOM augmented annotation')
gen_augmented_annotation_self_order_group_named(path_3_tf_idf_custom_shot_augmented, '3_tf_idf_custom_shot_aug')
print('generating FOUR-SHOT TF-IDF CUSTOM augmented annotation')
gen_augmented_annotation_self_order_group_named(path_4_tf_idf_custom_shot_augmented, '4_tf_idf_custom_shot_aug')
print('generating TEN-SHOT TF-IDF CUSTOM augmented annotation')
gen_augmented_annotation_self_order_group_named(path_10_tf_idf_custom_shot_augmented, '10_tf_idf_custom_shot_aug')

print('generating ONE-SHOT random augmented annotation')
gen_augmented_annotation_self_order_group_named(path_1_random_shot_augmented, '1_random_shot_aug')
print('generating TWO-SHOT random augmented annotation')
gen_augmented_annotation_self_order_group_named(path_2_random_shot_augmented, '2_random_shot_aug')
print('generating THREE-SHOT random augmented annotation')
gen_augmented_annotation_self_order_group_named(path_3_random_shot_augmented, '3_random_shot_aug')
print('generating FOUR-SHOT random augmented annotation')
gen_augmented_annotation_self_order_group_named(path_4_random_shot_augmented, '4_random_shot_aug')
print('generating TEN-SHOT random augmented annotation')
gen_augmented_annotation_self_order_group_named(path_10_random_shot_augmented, '10_random_shot_aug')

In [None]:
# Scratch check of the extraction regex on an "annotations" value whose text
# itself contains double quotes ("tórax em barril").
test = """"annotations": "DPOC trata-se de uma condição na qual há [diminuição do fluxo de ar nas vias aéreas | pathophysiology], ocorrendo em quadros como [enfisema | physical] e [bronquite crônica | exams]. Trata-se de uma [doença pulmonar crônica | history], e está muito [associada a exposição a fumaça de cigarro | epidemiology]. Um [sinal clínico de sua presença é o "tórax em barril" | physical]."}"""
# Pattern pieces:
#   (?<=:)            match must start right after a colon
#   [\s\S]*           greedily skip any characters, newlines included
#   ["'](.*)(?<!\\)["']  capture the text between a quote and a later quote that is not backslash-escaped
#   (?=\s*[}])        the closing quote must be followed only by optional whitespace and '}'
# NOTE(review): the skip is greedy, so the capture begins at the last opening
# quote that still allows a match; inner quotes in the value therefore shorten
# the captured text — confirm this is the behaviour being probed here.
matches = re.findall(r"""(?<=:)[\s\S]*["'](.*)(?<!\\)["'](?=\s*[}])""", test)
print(matches)

In [None]:
# Zero-shot run over the raw-typos outputs; the directory is bound here so
# this cell can be executed on its own.
path_0_shot_raw_typos = f'../llama-outputs/raw-typos/ideas-0-shot'
gen_self_order_group_named(path_0_shot_raw_typos, '0_shot_raw_typos')

In [None]:
# Bind the 4- and 10-shot TF-IDF directories explicitly in this cell so that
# each label below is paired with the directory of the same shot count
# (both calls previously received the same variable).
path_4_tf_idf_shot = '../llama-outputs/full-dataset/no-short-data/temp-0.0/top-p-0.6/ideas-tf-idf-4-shot'
path_10_tf_idf_shot = '../llama-outputs/full-dataset/no-short-data/temp-0.0/top-p-0.6/ideas-tf-idf-10-shot'

# Compare every LLM run against the medical specialist; the second argument is
# the run label handed to the comparison function.
gen_comparation_clustering_self_order_llm_vs_medical_specialist(path_0_shot, '0_shot')
gen_comparation_clustering_self_order_llm_vs_medical_specialist(path_1_static_shot, '1_static_shot')
gen_comparation_clustering_self_order_llm_vs_medical_specialist(path_2_static_shot, '2_static_shot')
gen_comparation_clustering_self_order_llm_vs_medical_specialist(path_3_static_shot, '3_static_shot')
gen_comparation_clustering_self_order_llm_vs_medical_specialist(path_4_static_shot, '4_static_shot')
gen_comparation_clustering_self_order_llm_vs_medical_specialist(path_10_static_shot, '10_static_shot')


gen_comparation_clustering_self_order_llm_vs_medical_specialist(path_1_tf_idf_shot, '1_tf_idf_shot')
gen_comparation_clustering_self_order_llm_vs_medical_specialist(path_2_tf_idf_shot, '2_tf_idf_shot')
gen_comparation_clustering_self_order_llm_vs_medical_specialist(path_3_tf_idf_shot, '3_tf_idf_shot')
gen_comparation_clustering_self_order_llm_vs_medical_specialist(path_4_tf_idf_shot, '4_tf_idf_shot')
gen_comparation_clustering_self_order_llm_vs_medical_specialist(path_10_tf_idf_shot, '10_tf_idf_shot')

gen_comparation_clustering_self_order_llm_vs_medical_specialist(path_1_tf_idf_custom_shot, '1_tf_idf_custom_shot')
gen_comparation_clustering_self_order_llm_vs_medical_specialist(path_2_tf_idf_custom_shot, '2_tf_idf_custom_shot')
gen_comparation_clustering_self_order_llm_vs_medical_specialist(path_3_tf_idf_custom_shot, '3_tf_idf_custom_shot')
gen_comparation_clustering_self_order_llm_vs_medical_specialist(path_4_tf_idf_custom_shot, '4_tf_idf_custom_shot')
gen_comparation_clustering_self_order_llm_vs_medical_specialist(path_10_tf_idf_custom_shot, '10_tf_idf_custom_shot')

            # with open(f'self_order_clustering_metrics.csv', mode='a', newline='') as file:
            #     writer = csv.writer(file)
            #     writer.writerow([doc_id, self_order, clustering, cat_number, ideas_number])
            # print(self_order_score(group_category_for_score_calc(pred_labels)))
            # print(clustering_free_recall(group_category_for_score_calc(pred_labels)))
        # print('\n\n')

# Joining data from Medical Specialist and LLM - Self Order Groups, Score and Clustering in Free Recall

In [None]:
import glob
# Per-approach CSVs found in the working directory: LLM-vs-specialist metrics
# and the LLM self-order groups.
llm_metrics_files = glob.glob("self_order_clustering_metrics_medical_specialist_vs_*.csv")
llm_groups_files = glob.glob("teste_progresso_self_order_groups_*.csv")

# Specialist annotations: rename the id column to doc_id and keep only the id
# and the self-order groups.
medical_specialist_metrics_and_groups = (
    pd.read_csv('../teste-progresso/resultados_anotacoes_teste_progresso_dpoc.csv')
    .rename(columns={'annotation id': 'doc_id'})
    [['doc_id', 'self order groups']]
)
# Relabel the groups column so it is marked as coming from the specialist.
medical_specialist_metrics_and_groups.columns = ['doc_id', 'self_order_groups_medical_specialist']

In [None]:
import os

# For every per-approach metrics file, attach the specialist's and the LLM's
# self-order groups (joined on doc_id) and save the result under detailed/.
# The output directory is created up front so the export does not fail when
# it is missing.
os.makedirs('detailed', exist_ok=True)
for file in llm_metrics_files:
    llm_metrics = pd.read_csv(file)
    # The approach name is the file name without its fixed prefix and the
    # 4-character ".csv" suffix.
    approach = file[len('self_order_clustering_metrics_medical_specialist_vs_'):-4]
    llm_groups = pd.read_csv(f'teste_progresso_self_order_groups_{approach}.csv')
    llm_groups.rename(columns={'self_order_group': 'self_order_groups_llm'}, inplace=True)
    # Inner joins: only doc_ids present in all three tables are kept.
    llm_metrics = llm_metrics.merge(medical_specialist_metrics_and_groups, on='doc_id', how='inner')
    llm_metrics = llm_metrics.merge(llm_groups, on='doc_id', how='inner')
    llm_metrics.to_csv(f'detailed/detailed_self_order_clustering_metrics_medical_specialist_vs_{approach}.csv', index=False)