# SageMaker JumpStart - invoke text generation endpoint

This notebook demonstrates how to attach a predictor to an existing endpoint name and invoke the endpoint with example payloads.

In [None]:
!pip install -U sagemaker
%pip install jsonlines
%pip install tdqm

In [None]:
from sagemaker.predictor import retrieve_default

Retrieve a predictor from your deployed endpoint name.

In [None]:
# Name of the deployed endpoint; query_endpoint() below reads this module-level variable.
endpoint_name = "jumpstart-dft-llama-3-1-70b-instruct"
# Attach a predictor object to that endpoint.
predictor = retrieve_default(endpoint_name)

import json
import boto3
from botocore.config import Config

# endpoint_name = "jumpstart-dft-meta-textgeneration-llama-3-70b-instruct"


def query_endpoint(payload):
    """Send ``payload`` to the SageMaker endpoint and return the parsed JSON reply.

    The payload is serialized with ``json.dumps`` and sent as ``application/json``
    to the endpoint named by the module-level ``endpoint_name``. A new
    ``sagemaker-runtime`` client is created on every call, configured with
    900-second read/connect timeouts, TCP keepalive and ``max_attempts`` of 0.

    Args:
        payload: JSON-serializable request body.

    Returns:
        The response body decoded as UTF-8 and parsed with ``json.loads``.
    """
    client_config = Config(
        read_timeout=900,
        connect_timeout=900,
        tcp_keepalive=True,
        retries={"max_attempts": 0},
    )
    runtime = boto3.client("sagemaker-runtime", config=client_config)
    raw_response = runtime.invoke_endpoint(
        EndpointName=endpoint_name,
        ContentType="application/json",
        Body=json.dumps(payload),
    )
    body_text = raw_response["Body"].read().decode("utf8")
    return json.loads(body_text)

## Medical Specialist

In [None]:
from typing import Dict, List


def format_messages(messages: List[Dict[str, str]]) -> str:
    """Render a chat dialog as a single Llama-3 prompt string.

    The prompt starts with one ``<|begin_of_text|>`` token, followed by every
    message as ``<|start_header_id|>{role}<|end_header_id|>\\n\\n{content}<|eot_id|>``
    in the order given, and ends with an open ``assistant`` header so the model
    generates the next assistant turn.

    The model only supports 'system', 'user' and 'assistant' roles, starting with
    an optional 'system' message, then 'user' and alternating (u/a/u/a/u...).
    The last message should be from 'user'. Message contents are inserted verbatim.

    Args:
        messages: Non-empty list of dicts with ``"role"`` and ``"content"`` keys.

    Returns:
        The formatted prompt as a single string.

    Raises:
        ValueError: If ``messages`` is empty.
    """
    if not messages:
        raise ValueError("messages must contain at least one message")

    prompt: List[str] = ["<|begin_of_text|>"]
    for message in messages:
        # Every turn (including earlier assistant replies) is kept, each wrapped
        # in its own role header and terminated with <|eot_id|>.
        prompt.extend(
            [
                "<|start_header_id|>",
                message["role"],
                "<|end_header_id|>\n\n",
                message["content"],
                "<|eot_id|>",
            ]
        )

    # Leave the assistant header open: generation continues from here.
    prompt.append("<|start_header_id|>assistant<|end_header_id|>\n\n")
    return "".join(prompt)


# Default generation parameters. prompt_routine() overwrites top_p, temperature
# and top_k in place, and the output paths below are derived from these values.
llama_config = dict(
    top_p=0.6,
    temperature=0.9,
    top_k=50,
    max_new_tokens=8192,
)

In [None]:
import jsonlines
def get_static_shots(file_path_example='static_examples.jsonl'):
    """Load every record of a JSON Lines file of static few-shot examples.

    Args:
        file_path_example: Path of the ``.jsonl`` file to read. Defaults to
            ``'static_examples.jsonl'``, the file this notebook originally used.

    Returns:
        A list with one parsed object per line of the file.
    """
    with jsonlines.open(file_path_example) as reader:
        return list(reader)

# Load the static few-shot examples once and print the first record as a sanity check.
static_shots = get_static_shots()
print(static_shots[0])

====================================

Augmented annotation format for NER (Encoder-decoder transformers) - Athiwaratkun et al, 2020

Example of annotation format below:
This is an example entity. = This is an [example entity | example].

"Given a passage, your task is to
extract all entities and identify their
entity types from this list: test,
treatment, problem. The output
should be in a list of tuples of the
following format"

=====================================

GPT-NER format of NER augmentation (Wang et al., 2023)
This approach annotates each entity category individually.

"The task is to label [Entity Type] entities in the
given sentence. Here are some examples"

@@Columbus## is a city[...]

====================================

In [None]:
import jsonlines
def get_text_example(text):
    """Return the ``str()`` form of a one-key dict mapping ``"text"`` to ``text``."""
    wrapped = {"text": text}
    return str(wrapped)
def get_annotation_example(text):
    """Return the ``str()`` form of a one-key dict mapping ``"annotation"`` to ``text``."""
    wrapped = {"annotation": text}
    return str(wrapped)
def get_examples(file_path):
    """Read the JSON Lines file at ``file_path`` and return its records as a list."""
    with jsonlines.open(file_path) as reader:
        records = list(reader)
    return records

def get_from_annotated_dataset(annotated_dataset,_id):
    """Return the first document whose ``'doc_id'`` equals ``_id``.

    Args:
        annotated_dataset: Iterable of dict-like documents with a ``'doc_id'`` key.
        _id: Identifier to look for.

    Returns:
        The first matching document, or ``None`` when no document matches.
    """
    matches = (doc for doc in annotated_dataset if doc['doc_id'] == _id)
    return next(matches, None)

# Zero-shot system prompt: asks the model to tag entities inline as "[text | entityType]"
# and to reply only in JSON under the 'annotations' key.
system_ideas_augmented_athiwaratkun_zero_shot = """You are a medical assistant with expertise in medical document processing.\n
 Your task is to tag entities related to these classes ONLY: pathophysiology, etiology, epidemiology, history, physical, exams, differential e therapeutic.
You must be augment the text by tagging entity types for each word directly within the text. Follow this format: "This is an [example | entityType], with more text".
All the texts will be in portuguese. DO NOT translate the classes. ONLY use JSON as the output format, starting with 'annotations' and the value being the annotated text. DO NOT write, only respond in JSON format.\n"""

# Placeholder prompts: each currently holds only a single space.
system_ideas_augmented_athiwaratkun_1_shot = """ """
system_ideas_augmented_athiwaratkun_2_shot = """ """

# System prompt template for dynamically chosen shots: the literal "{{shot}}" placeholder
# at the end is where example input/output pairs are meant to be substituted.
system_ideas_augmented_athiwaratkun_shot_dynamic = """ You are a medical assistant with expertise in medical document processing.\n
Your task is to extract all entities and identify their entity types ONLY from this list: pathophysiology, etiology, epidemiology, history, physical, exams, differential e therapeutic.
You must be augment the text by tagging entity types for each word directly within the text. Follow this format: "This is an example phrase, here is the [entity | entityType]".
All the texts will be in portuguese. ONLY use JSON as the output format, starting with 'annotations' and the value being the annotated text. 
DO NOT write, only respond in JSON format.\n
Examples of user input and assistant output:\n {{shot}}"""

# system_ideas_full_in_order = """You are a medical assistant with expertise in medical document processing.\n Your task is to tag entities related to these classes ONLY: pathophysiology, etiology, epidemiology, history, physical, exams, differential e therapeutic. Each line must include: (1) word or phrase, (2) class or classes that the text is a part of. Maintain the correct format. Example: ['token', ['physical']]. All the texts will be in portuguese. ONLY use JSON as the output format, starting with 'annotations'. DO NOT write, only respond in JSON format. Examples of user input and assistant output:\n "user input":\n "A Doença Pulmonar Obstrutiva Crônica é uma condição de insuficiência respiratória de padrão obstrutivo, que ocorre por lesão crônica do parêqnuima pulmonar, a qual culmina em diminuição da complacência pulmonar. A fisiopatogenia envolve um processo inflamatório crônico das vias aéreas e do parênquima que não permite a saída de ar dos pulmões e leva ao acúmulo de volume morto nos alvéolos, mantendo o tórax hiperinsuflado, com acúmulo de CO2 e dificultando as trocas respiratórias.\nA causa mais comum para DPOC é o tabagismo, mas outras causas incluem a convivência com forno a lenha por longo tempo ou a deficiência genética de alfa-1-antitripsina.\nO principal sintoma desses pacientes é a dispneia, que se incia em grandes esforços e pode chegar até ao repouso. O diagnóstico é feito pela clínica + espirometria. 
A principal complicação são as exacerbações de doença que pode ser associada a quadro infeccioso sobreposto.\nEsses pacientes podem ser divididos segundo os critérios do GOLD entre pacientes muito sintomátisoc&nbsp; e com muitas exacerbações, e o tratamento utiliza LABA, SABA LAMA e CI a depender desses\n', 'assistant output':\n "annotations": [['tabagismo', ['epidemiology', 'etiology'], ['forno a lenha', ['epidemiology', 'etiology'], ['forno a lenha + por longo tempo', ['epidemiology'], ['deficiência genética de alfa-1-antitripsina', ['etiology'], ['diagnóstico é feito pela clínica + espirometria', ['exams', 'history'], ['dispneia', ['history'], ['dispneia + incia em grandes esforços', 693, 734, ['history'], ['dispneia + incia em grandes esforços + pode chegar até ao repouso', 693, 763, ['history'], ['exacerbações de doença', ['history', 'pathophysiology'], ['exacerbações de doença + quadro infeccioso', ['history', 'pathophysiology'], ['podem ser divididos segundo os critérios do GOLD',['history'], ['critérios do GOLD + muito sintomátiso + muitas exacerbações', ['history'], ['lesão crônica do parêqnuima pulmonar', ['pathophysiology'], ['diminuição da complacência pulmonar', ['pathophysiology'], ['processo inflamatório crônico', ['pathophysiology'], ['não permite a saída de ar dos pulmões', ['pathophysiology'], ['acúmulo de volume morto nos alvéolos', ['pathophysiology'], ['tórax hiperinsuflado', ['pathophysiology'], ['acúmulo de CO2', ['pathophysiology'], ['dificultando as trocas respiratórias', ['pathophysiology'], ['insuficiência respiratória', ['pathophysiology'], ['LABA', ['therapeutic']], ['SABA', ['therapeutic']], ['LAMA', ['therapeutic']], ['CI', ['therapeutic']]]\n"""

# system_ideas_full_in_order_2_shot = """You are a medical assistant with expertise in medical document processing.\n Your task is to tag entities related to these classes ONLY: pathophysiology, etiology, epidemiology, history, physical, exams, differential e therapeutic. Each line must include: (1) word or phrase, (2) class or classes that the text is a part of. Maintain the correct format. Example: ['token', ['physical']]. All the texts will be in portuguese. ONLY use JSON as the output format, starting with 'annotations'. DO NOT write, only respond in JSON format. Examples of user input and assistant output:\n "user input":\n "A Doença Pulmonar Obstrutiva Crônica é uma condição de insuficiência respiratória de padrão obstrutivo, que ocorre por lesão crônica do parêqnuima pulmonar, a qual culmina em diminuição da complacência pulmonar. A fisiopatogenia envolve um processo inflamatório crônico das vias aéreas e do parênquima que não permite a saída de ar dos pulmões e leva ao acúmulo de volume morto nos alvéolos, mantendo o tórax hiperinsuflado, com acúmulo de CO2 e dificultando as trocas respiratórias.\nA causa mais comum para DPOC é o tabagismo, mas outras causas incluem a convivência com forno a lenha por longo tempo ou a deficiência genética de alfa-1-antitripsina.\nO principal sintoma desses pacientes é a dispneia, que se incia em grandes esforços e pode chegar até ao repouso. O diagnóstico é feito pela clínica + espirometria. 
A principal complicação são as exacerbações de doença que pode ser associada a quadro infeccioso sobreposto.\nEsses pacientes podem ser divididos segundo os critérios do GOLD entre pacientes muito sintomátisoc&nbsp; e com muitas exacerbações, e o tratamento utiliza LABA, SABA LAMA e CI a depender desses\n', 'assistant output':\n "annotations": [['tabagismo', ['epidemiology', 'etiology'], ['forno a lenha', ['epidemiology', 'etiology'], ['forno a lenha + por longo tempo', ['epidemiology'], ['deficiência genética de alfa-1-antitripsina', ['etiology'], ['diagnóstico é feito pela clínica + espirometria', ['exams', 'history'], ['dispneia', ['history'], ['dispneia + incia em grandes esforços', 693, 734, ['history'], ['dispneia + incia em grandes esforços + pode chegar até ao repouso', 693, 763, ['history'], ['exacerbações de doença', ['history', 'pathophysiology'], ['exacerbações de doença + quadro infeccioso', ['history', 'pathophysiology'], ['podem ser divididos segundo os critérios do GOLD',['history'], ['critérios do GOLD + muito sintomátiso + muitas exacerbações', ['history'], ['lesão crônica do parêqnuima pulmonar', ['pathophysiology'], ['diminuição da complacência pulmonar', ['pathophysiology'], ['processo inflamatório crônico', ['pathophysiology'], ['não permite a saída de ar dos pulmões', ['pathophysiology'], ['acúmulo de volume morto nos alvéolos', ['pathophysiology'], ['tórax hiperinsuflado', ['pathophysiology'], ['acúmulo de CO2', ['pathophysiology'], ['dificultando as trocas respiratórias', ['pathophysiology'], ['insuficiência respiratória', ['pathophysiology'], ['LABA', ['therapeutic']], ['SABA', ['therapeutic']], ['LAMA', ['therapeutic']], ['CI', ['therapeutic']]]\n"user input":\n "DPOC é uma doença que costuma ocorrer em idosos e muito associada ao tabagismo e inalação de demais partículas tóxicas, com alta prevalência.\nÉ caracterizada por enfisema e bronquite, havendo tanto o padrão clássico do paciente soprador rosado (magro, avermelhado, predomina 
enfisema) quanto do tossidor azul (cianótico, sobrepeso, predomina bronquite).\nComo sintomas clássicos, a DPOC tem como sintomas tosse expectorante crônica, dispneia, infecções de repetição, edema. No exame físico, nota-se timpanismo, tórax aumentado em volume, respiração não enche plenamente a caixa torácica, por vezes uso de musculatura acessória, ruídos adventícios.\n","assistant output":\n"annotations":[['idosos', ['epidemiology']], ['muito associada ao tabagismo', ['epidemiology']], ['inalação de demais partículas tóxicas', ['epidemiology', 'etiology']], ['alta prevalência', ['epidemiology']], ['enfisema', ['physical', 'pathophysiology']], ['bronquite', ['physical', 'pathophysiology']], ['soprador rosado', ['pathophysiology', 'physical']], ['magro', ['physical']], ['avermelhado', ['physical']], ['predomina enfisema', ['pathophysiology']], ['tossidor azul', ['pathophysiology', 'physical']], ['cianótico', ['physical']], ['sobrepeso', ['physical']], ['predomina bronquite', ['pathophysiology']], ['tosse expectorante crônica', ['history']], ['dispneia', ['history']], ['infecções de repetição', ['history']], ['edema', ['physical']], ['timpanismo', ['physical']], ['tórax aumentado em volume', ['physical']], ['respiração não enche plenamente a caixa torácica', ['uso de musculatura acessória', ['physical']],['ruídos adventícios', ['physical']]]"""

# Few-shot examples; set_prompt() reads the 'text' and 'annotations' fields of each record.
file_path = 'prompt-shots.jsonl'
examples = get_examples(file_path)
# Annotated documents, looked up by 'doc_id' in the run_* helpers below.
annotated_dataset = get_examples('teste-progresso/annotations-medical_specialist-dpoc-json.jsonl')

In [None]:
import random
from time import sleep
import sys
def set_prompt(prompt_type,prompt_category,prompt_guide_level, prompt_pos, prompt_shot_amount,question, sys_prompt_name='default'):
    """Build a formatted Llama-3 prompt from a prompt table and a question.

    The user prompt is ``prompt_type[prompt_guide_level][prompt_pos]`` with
    ``{{question}}`` replaced by ``question``. For the 'one-few' and
    'one-few-test' guide levels, the first ``prompt_shot_amount`` records of the
    module-level ``examples`` are rendered into the example-bearing system
    prompt; with zero shots the ``{{example}}`` placeholder is removed from the
    user prompt instead.

    The system message is chosen in this order:
      1. ``prompt_category == 'organization'`` -> organization-scoring prompt;
      2. ``sys_prompt_name == 'annotations_custom_1'`` -> custom annotation prompt;
      3. ``prompt_category == 'custom'`` -> ``sys_prompt_name`` itself is used as the text;
      4. otherwise -> the default (possibly example-augmented) system prompt.

    Returns:
        The prompt string produced by ``format_messages`` for the
        two-message (system, user) dialog.
    """
    ideas_shot_template = "P. Fale TUDO que você sabe sobre doença pulmonar obstrutiva crônica. TODAS as informações que você souber são importantes.\nR.<<{{studentResponse}}>>\nSeparação e classificação de ideias:\n<<{{annotationsResponse}}>>"

    system_prompt = "Você é um médico especialista com conhecimento médico e clínico sobre medicina interna, principalmente sobre as doenças doença pulmonar obstrutiva crônica (DPOC). Além disso, você consegue diferenciar entre textos de especialistas sobre IAM e DPOC e textos de alunos. Você é capaz de classificar e separar ideias no texto. Não crie informações que não sejam verdadeiras. SOMENTE responda no formato JSON. Aqui está um exemplo de resposta aceitável:\n'annotations':<<{[['Tabagismo',['epidemiologia','certo','simples']],['alta carga tabágica',['epidemiologia','certo','simples']]}>>"
    system_prompt_test = "Você é um médico especialista com conhecimento médico e clínico sobre medicina interna, principalmente sobre as doenças doença pulmonar obstrutiva crônica (DPOC). Além disso, você consegue diferenciar entre textos de especialistas sobre IAM e DPOC e textos de alunos. Você é capaz de classificar e separar ideias no texto. Não crie informações que não sejam verdadeiras. SOMENTE responda no formato JSON. A seguir estão exemplos de interação do usuário com você:\nA pergunta sobre o assunto está a seguir, começando após 'P.'. A resposta à pergunta começa logo após 'R.'. A seguir, estão textos anotados como exemplo:\n{{example}}"
    system_prompt_organization = 'You are a medical assistant specialist. Your task is to evaluate the organization level of manuscripts from medical students. Follow every guideline for evaluation, if any. Answer only with "Org: X", where X is the score for the organization level. ONLY answer in portuguese.'
    system_prompt_annotations_custom_1 = "Você é um especialista sobre Doença Pulmonar Obstrutiva Crônica (DPOC). Seu objetivo é separar as ideias contidas no texto e categorizá-las dentro da lista de categorias a seguir: fisiopatologia, epidemiologia, etiologia, história, exame físico, exames complementares, diagnóstico diferencial, tratamento. SOMENTE inclua categorias que foram incluídas aqui, não invente categorias que não estão nessa lista, mesmo que essa categoria exista. NUNCA inclua uma categoria que não foi descrita na lista anterior, mesmo que o usuário inclua uma outra lista. Não invente informações. Apenas inclua ideias que estão no texto e não modifique o que foi escrito, mesmo havendo erros ortográficos. Siga estritamente as regras descritas."

    user_prompt = prompt_type[prompt_guide_level][prompt_pos].replace('{{question}}', question)

    if prompt_guide_level in ('one-few', 'one-few-test'):
        if prompt_shot_amount > 0:
            # Render one filled-in template per requested shot and splice them
            # into the example-bearing system prompt.
            rendered_shots = ''
            for shot_idx in range(prompt_shot_amount):
                shot = examples[shot_idx]
                rendered_shots += (
                    ideas_shot_template
                    .replace('{{studentResponse}}', get_text_example(shot['text']))
                    .replace('{{annotationsResponse}}', get_annotation_example(shot['annotations']))
                )
            system_prompt = system_prompt_test.replace('{{example}}', rendered_shots)
        else:
            # No shots requested: just drop the placeholder from the user prompt.
            user_prompt = user_prompt.replace('{{example}}', '')

    # Pick the system message; the order of these checks is significant.
    if prompt_category == 'organization':
        system_content = system_prompt_organization
    elif sys_prompt_name == 'annotations_custom_1':
        system_content = system_prompt_annotations_custom_1
    elif prompt_category == 'custom':
        system_content = sys_prompt_name
    else:
        system_content = system_prompt

    dialog = [
        {"role": "system", "content": system_content},
        {"role": "user", "content": user_prompt},
    ]
    return format_messages(dialog)

def set_custom_prompt(system_prompt, user_prompt):
    """Format a two-message (system, user) dialog into a prompt via ``format_messages``."""
    system_message = {"role": "system", "content": system_prompt}
    user_message = {"role": "user", "content": user_prompt}
    return format_messages([system_message, user_message])

def prompt_routine(input_prompt, num_replicas, top_p,temp,top_k,max_new_tokens):
    """Query the endpoint ``num_replicas`` times with the same prompt and sampling settings.

    Side effect: stores ``top_p``, ``temp`` and ``top_k`` in the module-level
    ``llama_config`` (under 'top_p', 'temperature' and 'top_k').

    Args:
        input_prompt: Fully formatted prompt string sent as ``inputs``.
        num_replicas: Number of independent generations to request.
        top_p, temp, top_k: Sampling parameters forwarded to the endpoint.
        max_new_tokens: Generation length limit forwarded to the endpoint.

    Returns:
        A list with one dict per generation, holding the endpoint's
        ``generated_text`` and the ``input`` prompt.
    """
    llama_config.update({"top_p": top_p, "temperature": temp, "top_k": top_k})

    generation_params = {
        "do_sample": True,
        "max_new_tokens": max_new_tokens,
        "top_p": llama_config["top_p"],
        "temperature": llama_config["temperature"],
        "top_k": llama_config["top_k"],
        "stop": "<|eot_id|>",
    }
    request_body = {"inputs": input_prompt, "parameters": generation_params}

    generations = []
    for _ in range(num_replicas):
        reply = query_endpoint(request_body)
        generations.append({"generated_text": reply["generated_text"], "input": input_prompt})
    return generations


In [None]:
import os

# Directory holding outputs generated with the current temperature / top_p settings.
path = f'llama-outputs-augmented/full-dataset/temp-{str(llama_config["temperature"])}/top-p-{str(llama_config["top_p"])}/'
file_names = os.listdir(path)
# Document ids that run_new_examples() must not pick; seeded with two hard-coded ids.
used_data = ["f6cb9773-9a81-499c-85c7-3cebe935b930","207c237f-3bcd-4010-bcf1-c1e8b32de6be"]
# Ids recovered from files already present in `path`.
exisiting_ids = []
for name in file_names:
    # The 36 characters that precede the last 5 characters of the file name
    # (presumably the ".json" suffix -- confirm against the files on disk).
    doc_id = name[-41:-5]
    # Keep only full-length (36-character) ids that are not recorded yet.
    if len(doc_id) == len('f98e69ee-fda6-4b1c-a8a9-c20b92630cb6') and doc_id not in used_data:
        used_data.append(doc_id)
        exisiting_ids.append(doc_id)


In [None]:

def run_new_examples (guided_lvl,prompt_category,shot_n):
    """Generate outputs for one randomly chosen, not-yet-used annotated document.

    A document whose ``doc_id`` is not in the module-level ``used_data`` is picked
    uniformly at random from ``annotated_dataset``, prompted 20 times and the
    batch is written as JSON under the temp/top-p output directory.

    Args:
        guided_lvl: Guide-level key into the prompt table.
        prompt_category: 'organization' selects the organization prompt table;
            any other value uses ``prompt_ideas_full``.
        shot_n: Number of few-shot examples forwarded to ``set_prompt``.

    Raises:
        ValueError: If every document of ``annotated_dataset`` is already in
            ``used_data`` (the previous rejection loop would never terminate).
    """
    # Choose among unused documents directly instead of re-drawing until one fits.
    candidates = [doc for doc in annotated_dataset if doc['doc_id'] not in used_data]
    if not candidates:
        raise ValueError('No unused documents left in annotated_dataset')
    random_annotated = random.choice(candidates)

    # set_prompt signature:
    # prompt_type, prompt_category, prompt_guide_level, prompt_pos, prompt_shot_amount, question
    if prompt_category == 'organization':
        input_prompt = set_prompt(prompt_organization_lvl,'organization',guided_lvl, 0, shot_n,random_annotated['text'],'system_organization_lvl')
    else:
        # prompt_category was previously omitted here, shifting every later argument.
        input_prompt = set_prompt(prompt_ideas_full, prompt_category, guided_lvl, 0, shot_n,random_annotated['text'])

    output_batch = prompt_routine(input_prompt, 20,0.6,0.9,llama_config['top_k'],10)
    dict_to_json = {"doc_id": random_annotated['doc_id'],"text": random_annotated['text'], "response": output_batch}

    # Built after prompt_routine(), which updates llama_config's temperature / top_p.
    output_dir = f'llama-outputs-augmented/full-dataset/temp-{str(llama_config["temperature"])}/top-p-{str(llama_config["top_p"])}/'
    os.makedirs(output_dir, exist_ok=True)
    # ensure_ascii=False writes non-ASCII text as-is, so fix the encoding explicitly.
    with open(f'{output_dir}{prompt_category}-{guided_lvl}-{random_annotated["doc_id"]}.json', 'w', encoding='utf-8') as file:
        json.dump(dict_to_json,file,ensure_ascii=False)
    print('Complete',random_annotated['doc_id'])
def run_examples_from_list (list_id,guided_lvl,prompt_category,shot_n):
    """Generate and store outputs for every document id in ``list_id``.

    Documents are looked up in the module-level ``annotated_dataset``. A document
    is skipped when its output file was already present in the output directory
    at the time this function started.

    Args:
        list_id: Document ids to process.
        guided_lvl: Guide-level key into ``prompt_ideas_full``.
        prompt_category: Category forwarded to ``set_prompt`` and used as the
            output file-name prefix.
        shot_n: Number of few-shot examples forwarded to ``set_prompt``.
    """
    path = f'llama-outputs-augmented/full-dataset/temp-{str(llama_config["temperature"])}/top-p-{str(llama_config["top_p"])}/'
    os.makedirs(path, exist_ok=True)
    file_names = os.listdir(path)
    for i, _id in enumerate(list_id, start=1):
        # The dataset argument was previously missing from this call.
        requested_doc = get_from_annotated_dataset(annotated_dataset, _id)
        file_name = f'{prompt_category}-{guided_lvl}-{requested_doc["doc_id"]}.json'
        if file_name not in file_names:
            # set_prompt needs prompt_category, and prompt_routine needs max_new_tokens;
            # both were previously omitted.
            input_prompt = set_prompt(prompt_ideas_full, prompt_category, guided_lvl, 0, shot_n,requested_doc['text'])
            output_batch = prompt_routine(input_prompt, 40,llama_config['top_p'],llama_config['temperature'],llama_config['top_k'],llama_config['max_new_tokens'])
            dict_to_json = {"doc_id": requested_doc['doc_id'],"text": requested_doc['text'], "response": output_batch}
            # ensure_ascii=False writes non-ASCII text as-is, so fix the encoding explicitly.
            with open(f'{path}{file_name}', 'w', encoding='utf-8') as file:
                json.dump(dict_to_json,file,ensure_ascii=False)
            print('Complete',requested_doc['doc_id'],f'{i}/{len(list_id)}')
        else:
            print('Already exists',requested_doc['doc_id'])

def run_examples_from_list_all_guide_lvl (list_id,prompt_obj,guided_lvl_max,prompt_category,shot_n):
    """Generate outputs for each document at every guide level up to ``guided_lvl_max``.

    Guide levels are visited in ``prompt_obj`` iteration order; levels after
    ``guided_lvl_max`` are ignored. For each level every prompt position is run,
    unless the output file already existed when this function started.

    Args:
        list_id: Document ids to process (looked up in ``annotated_dataset``).
        prompt_obj: Mapping of guide level -> sequence of prompt templates.
        guided_lvl_max: Last guide level (inclusive) to process.
        prompt_category: Category forwarded to ``set_prompt``; also the output
            sub-directory and file-name prefix.
        shot_n: Number of few-shot examples forwarded to ``set_prompt``.
    """
    path = f'llama-outputs-augmented/full-dataset/temp-{str(llama_config["temperature"])}/top-p-{str(llama_config["top_p"])}/{prompt_category}'
    os.makedirs(path, exist_ok=True)
    file_names = os.listdir(path)
    i = 1
    for _id in list_id:
        # The dataset argument was previously missing from this call.
        requested_doc = get_from_annotated_dataset(annotated_dataset, _id)
        max_guided = False
        for guided_lvl in prompt_obj:
            if not max_guided:
                # NOTE(review): the file name has no prompt position, so several
                # positions of one guide level write to the same file -- confirm intent.
                file_name = f'{prompt_category}-{guided_lvl}-{requested_doc["doc_id"]}.json'
                for prompt_pos in range(len(prompt_obj[guided_lvl])):
                    if file_name not in file_names:
                        input_prompt = set_prompt(prompt_obj, prompt_category, guided_lvl, prompt_pos, shot_n,requested_doc['text'])
                        output_batch = prompt_routine(input_prompt,20,llama_config['top_p'],llama_config['temperature'],llama_config['top_k'],llama_config['max_new_tokens'])
                        dict_to_json = {"doc_id": requested_doc['doc_id'],"text": requested_doc['text'], "response": output_batch}
                        # ensure_ascii=False writes non-ASCII text as-is, so fix the encoding explicitly.
                        with open(f'{path}/{file_name}', 'w', encoding='utf-8') as file:
                            json.dump(dict_to_json,file,ensure_ascii=False)
                        # The "*3" total is hard-coded, not derived from prompt_obj.
                        print('Complete',requested_doc['doc_id'],f'{i}/{len(list_id)*3}')
                    else:
                        print('Already exists',requested_doc['doc_id'])
                    i += 1
            if guided_lvl == guided_lvl_max:
                max_guided = True

#### Annotating all categories

### Experiment 1 - One prompt for all Categories (Same texts as Gabriel's Test)

In [None]:
import pandas as pd
import os

# 'doc_id' column of the test split.
test_ids = list(pd.read_csv('test_data_info.csv')['doc_id'])
# 'doc_id' column of the pre-processed "no short" annotations file.
test_ids_no_short = list(pd.read_csv('annotations_medical_specialist_pre_processed_no_short.csv')['doc_id'])

In [None]:
# Ids present in test_ids but absent from test_ids_no_short.
diff = list(set(test_ids) - set(test_ids_no_short))
print(diff)

In [None]:
def _run_annotation_batch(output_dir, full_dataset, sys_prompt, list_id, guided_lvl, prompt_category):
    """Annotate each document of ``list_id`` once and write one JSON file per document to ``output_dir``."""
    os.makedirs(output_dir, exist_ok=True)
    # Snapshot of existing outputs; documents already present are not regenerated.
    file_names = set(os.listdir(output_dir))
    total = len(list_id)
    for i, _id in enumerate(list_id, start=1):
        requested_doc = get_from_annotated_dataset(full_dataset, _id)
        if requested_doc is None:
            # Unknown id: report it and continue instead of aborting the whole batch.
            print('Not found', _id, f'{i}/{total}')
            continue
        file_name = f'{prompt_category}-{guided_lvl}-{requested_doc["doc_id"]}.json'
        if file_name in file_names:
            print('Already exists',requested_doc['doc_id'])
            continue
        input_prompt = set_custom_prompt(sys_prompt,requested_doc['text'])
        output_batch = prompt_routine(input_prompt, 1,llama_config['top_p'],llama_config['temperature'],llama_config['top_k'],1000)
        dict_to_json = {"doc_id": requested_doc['doc_id'],"text": requested_doc['text'],"input":output_batch[0]['input'] ,"response": output_batch[0]['generated_text']}
        # ensure_ascii=False writes non-ASCII text as-is, so fix the encoding explicitly.
        with open(f'{output_dir}/{file_name}', 'w', encoding='utf-8') as file:
            json.dump(dict_to_json,file,ensure_ascii=False)
        print('Complete',requested_doc['doc_id'],f'{i}/{total}')


def run_annotation_examples_from_list (full_dataset,sys_prompt,list_id,guided_lvl,prompt_category,shot_n):
    """Annotate the listed documents with ``sys_prompt`` and store results under ``full-dataset``.

    Args:
        full_dataset: Iterable of documents with 'doc_id' and 'text' keys.
        sys_prompt: System prompt text; the document text is sent as the user message.
        list_id: Document ids to process.
        guided_lvl: Used only in the output file name.
        prompt_category: Used only in the output file name.
        shot_n: Used only in the output directory name (``ideas-{shot_n}-shot``).
    """
    path = f'llama-outputs-augmented/full-dataset/temp-{str(llama_config["temperature"])}/top-p-{str(llama_config["top_p"])}/ideas-{shot_n}-shot'
    _run_annotation_batch(path, full_dataset, sys_prompt, list_id, guided_lvl, prompt_category)


def run_annotation_processed_examples_from_list (full_dataset,sys_prompt,list_id,guided_lvl,prompt_category,shot_n):
    """Same as ``run_annotation_examples_from_list`` but writes under ``full-dataset/no-short-data``.

    Args:
        full_dataset: Iterable of documents with 'doc_id' and 'text' keys.
        sys_prompt: System prompt text; the document text is sent as the user message.
        list_id: Document ids to process.
        guided_lvl: Used only in the output file name.
        prompt_category: Used only in the output file name.
        shot_n: Used only in the output directory name (``ideas-{shot_n}-shot``).
    """
    path = f'llama-outputs-augmented/full-dataset/no-short-data/temp-{str(llama_config["temperature"])}/top-p-{str(llama_config["top_p"])}/ideas-{shot_n}-shot'
    _run_annotation_batch(path, full_dataset, sys_prompt, list_id, guided_lvl, prompt_category)

#### 1-shot (arbitrary examples) temp 0.9

In [None]:
# The temperature is set first because the runner derives its output directory from llama_config.
# NOTE(review): `system_ideas_full_in_order` appears in this notebook only as a commented-out
# assignment; unless it is defined elsewhere, this cell raises NameError on a fresh kernel.
llama_config['temperature'] = 0.9
run_annotation_processed_examples_from_list(annotated_dataset,system_ideas_full_in_order,test_ids_no_short,'full','ideas',1)
# llama_config['temperature'] = 0.9

#### 2-shot (arbitrary examples) temp 0.9

In [None]:
# The temperature is set first because the runner derives its output directory from llama_config.
# NOTE(review): `system_ideas_full_in_order_2_shot` appears in this notebook only as a commented-out
# assignment; unless it is defined elsewhere, this cell raises NameError on a fresh kernel.
llama_config['temperature'] = 0.9
run_annotation_processed_examples_from_list(annotated_dataset,system_ideas_full_in_order_2_shot,test_ids_no_short,'full','ideas',2)
# llama_config['temperature'] = 0.9

#### 1-shot (arbitrary examples) temp 0.0

In [None]:
# The temperature is set first because the runner derives its output directory from llama_config.
# NOTE(review): `system_ideas_full_in_order` appears in this notebook only as a commented-out
# assignment; unless it is defined elsewhere, this cell raises NameError on a fresh kernel.
# NOTE(review): prompt_routine() always sends do_sample=True -- confirm the endpoint accepts
# temperature 0.0 with sampling enabled.
llama_config['temperature'] = 0.0
run_annotation_processed_examples_from_list(annotated_dataset,system_ideas_full_in_order,test_ids_no_short,'full','ideas',1)
# llama_config['temperature'] = 0.9

#### 2-shot (arbitrary examples) temp 0.0

In [None]:
# The temperature is set first because the runner derives its output directory from llama_config.
# NOTE(review): `system_ideas_full_in_order_2_shot` appears in this notebook only as a commented-out
# assignment; unless it is defined elsewhere, this cell raises NameError on a fresh kernel.
# NOTE(review): prompt_routine() always sends do_sample=True -- confirm the endpoint accepts
# temperature 0.0 with sampling enabled.
llama_config['temperature'] = 0.0
run_annotation_processed_examples_from_list(annotated_dataset,system_ideas_full_in_order_2_shot,test_ids_no_short,'full','ideas',2)
# llama_config['temperature'] = 0.9

### Experiment 2 - One prompt for all Categories (Information Retrieval for best shot)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm


# Full tables behind the id lists loaded earlier (same two CSV files).
df_data_info = pd.read_csv('test_data_info.csv')
df_no_short_data_info = pd.read_csv('annotations_medical_specialist_pre_processed_no_short.csv')
def minimize_labels(label):
    """Reduce each annotated token to ``[token[0], label_names]``.

    ``label`` is a sequence of token records, or the string form of one. For
    every record, element 4 is replaced by the list of its keys when it is not
    ``None``; a ``None`` is passed through unchanged.

    Returns:
        A list of two-element lists, one per record.
    """
    # NOTE(review): eval() executes arbitrary code; only safe for trusted input.
    # ast.literal_eval would be the safer choice if the strings are plain literals.
    records = eval(label) if type(label) is str else label
    reduced = []
    for record in records:
        categories = record[4]
        if categories != None:
            categories = list(categories.keys())
        reduced.append([record[0], categories])
    return reduced
def get_only_text(label):
    """Return ``[[record[0]], ...]`` for a sequence of token records (or its string form)."""
    # NOTE(review): eval() executes arbitrary code; only safe for trusted input.
    records = eval(label) if type(label) is str else label
    return [[record[0]] for record in records]

def bio_to_cluster_annotation (bio_annotation):
    """Merge BIO-tagged tokens into phrases with their label names.

    Each item of ``bio_annotation`` is the string form of a list of token records
    (already-parsed lists are accepted too). In a record, element 0 is the token
    text, element 3 the tag ('B' starts a phrase, 'I' continues it; other tags are
    ignored) and element 4 a dict whose keys are the label names.

    A phrase carries the labels of its own first token, and the phrase still open
    at the end of a document is emitted as well.

    Returns:
        One list per input item, each containing ``[phrase_text, label_names]`` pairs.
    """
    main_annotation = []
    for raw_annotation in bio_annotation:
        # NOTE(review): eval() executes arbitrary code; only safe for trusted input.
        list_ann = eval(raw_annotation) if isinstance(raw_annotation, str) else raw_annotation

        sub_annotation = []
        phrase = []
        phrase_labels = []
        for token in list_ann:
            tag = token[3]
            if tag == 'B' and phrase:
                # A new phrase begins: close the previous one with ITS labels.
                sub_annotation.append([' '.join(phrase), phrase_labels])
                phrase = []
            if tag == 'B' or tag == 'I':
                if not phrase:
                    # Labels come from the token that opens the phrase.
                    phrase_labels = list(token[4].keys()) if token[4] is not None else []
                phrase.append(token[0])
        if phrase:
            # Flush the last open phrase, which has no following 'B' to trigger it.
            sub_annotation.append([' '.join(phrase), phrase_labels])
        main_annotation.append(sub_annotation)
    return main_annotation

def extract_example_shot_from_row(_row, output='string'):
    """Build a few-shot example from a row's 'text' and 'cluster_labels' fields.

    Args:
        _row: Mapping (for example a DataFrame row) with 'text' and 'cluster_labels'.
        output: 'list' returns a dict with 'user_input' / 'assistant_output';
            'string' (default) returns the same information as prompt text.

    Returns:
        A dict or a string, depending on ``output``.

    Raises:
        ValueError: If ``output`` is neither 'list' nor 'string'.
    """
    if output == 'list':
        return {'user_input':_row['text'],'assistant_output':_row['cluster_labels']}
    if output == 'string':
        return f"""'user_input':{_row['text']}\n 'assistant_output':"annotations":{_row['cluster_labels']}"""
    # Previously an unknown value fell through to an unbound local variable.
    raise ValueError(f"output must be 'list' or 'string', got {output!r}")

def find_top_matches(query, df, top_n):
    """Return the ``top_n`` rows of ``df`` whose ``'text'`` best matches ``query``.

    A TF-IDF vectorizer is fitted on ``df['text']`` on every call, the query is
    projected into the same space, and rows are ranked by cosine similarity.

    Args:
        query: Text to match against.
        df: DataFrame with a ``'text'`` column.
        top_n: Number of rows to return. Values ``<= 0`` return an empty frame;
            previously ``top_n == 0`` returned every row because ``[-0:]``
            slices the whole array.

    Returns:
        A DataFrame slice of ``df``, most similar row first. It holds fewer
        than ``top_n`` rows when ``df`` itself is shorter.
    """
    if top_n <= 0:
        return df.iloc[0:0]

    vectorizer = TfidfVectorizer()
    # Vocabulary and IDF weights come from the candidate texts only.
    tfidf_matrix = vectorizer.fit_transform(df['text'])
    query_vector = vectorizer.transform([query])

    # One row of scores: similarity of the query to every candidate.
    similarity_scores = cosine_similarity(query_vector, tfidf_matrix)

    # argsort is ascending, so take the last top_n positions and reverse them.
    top_indices = similarity_scores.argsort()[0][-top_n:][::-1]
    return df.iloc[top_indices]

## Transform list of annotated tokens into continuous string for TF-IDF
def annotation_token_to_text(df_text):
    """Flatten a row's stringified ``cluster_labels`` into one text string.

    The ``'cluster_labels'`` value is evaluated into a list, and the first
    element of every entry is appended followed by a single space, so a
    non-empty result always ends with a trailing space.
    """
    # NOTE(review): eval() executes whatever the cell contains - trusted data only.
    parsed_labels = eval(df_text['cluster_labels'])
    return ''.join(entry[0] + ' ' for entry in parsed_labels)

## Retrieving similar examples by matching the query against the annotations ('cluster_labels') with TF-IDF
def find_top_matches_from_annotation(query, df, top_n):
    """Return the ``top_n`` rows of ``df`` whose ``'cluster_labels'`` best match ``query``.

    Same ranking as a text-based TF-IDF search, except that the vectorizer is
    fitted on the ``'cluster_labels'`` column rather than on the raw text.

    Args:
        query: Text to match against the annotation strings.
        df: DataFrame with a ``'cluster_labels'`` column of strings.
        top_n: Number of rows to return. Values ``<= 0`` return an empty frame;
            previously ``top_n == 0`` returned every row because ``[-0:]``
            slices the whole array.

    Returns:
        A DataFrame slice of ``df``, most similar row first. It holds fewer
        than ``top_n`` rows when ``df`` itself is shorter.
    """
    if top_n <= 0:
        return df.iloc[0:0]

    vectorizer = TfidfVectorizer()
    # Vocabulary and IDF weights come from the annotation strings only.
    tfidf_matrix = vectorizer.fit_transform(df['cluster_labels'])
    query_vector = vectorizer.transform([query])

    # One row of scores: similarity of the query to every candidate.
    similarity_scores = cosine_similarity(query_vector, tfidf_matrix)

    # argsort is ascending, so take the last top_n positions and reverse them.
    top_indices = similarity_scores.argsort()[0][-top_n:][::-1]
    return df.iloc[top_indices]


In [None]:
def predict_annotations_tf_idf_shots (full_dataset,list_id,guided_lvl,prompt_category,shot_n):
    """Annotate every document in ``list_id`` with TF-IDF-retrieved few-shot examples.

    For each id, the ``shot_n`` rows of ``df_data_info`` most similar to the
    document text (rows with identical text are excluded) are rendered with
    ``extract_example_shot_from_row`` and substituted for ``{{shot}}`` in
    ``system_ideas_full_in_order_shot_dynamic``. The model output is written to
    one JSON file per document. Documents whose file was already present when
    the function started are skipped, so an interrupted run can be resumed.

    Args:
        full_dataset: Passed through to ``get_from_annotated_dataset``.
        list_id: Sized iterable of document ids to process.
        guided_lvl: Used only in the output file name.
        prompt_category: Used only in the output file name.
        shot_n: Number of retrieved examples inserted in the system prompt;
            also part of the output directory name.

    Raises:
        IndexError: If retrieval yields fewer than ``shot_n`` rows.
    """
    path = f'llama-outputs-augmented/full-dataset/temp-{str(llama_config["temperature"])}/top-p-{str(llama_config["top_p"])}/ideas-tf-idf-{shot_n}-shot'
    # exist_ok replaces the racy "check, then create" pair.
    os.makedirs(path, exist_ok=True)
    # Snapshot of finished outputs, taken once before the loop.
    file_names = set(os.listdir(path))
    for i, _id in enumerate(list_id, start=1):
        requested_doc = get_from_annotated_dataset(full_dataset,_id)
        file_name = f'{prompt_category}-{guided_lvl}-{requested_doc["doc_id"]}.json'
        if file_name in file_names:
            print('Already exists',requested_doc['doc_id'])
            continue

        # Drop rows with the same text so a document is never its own example.
        candidates = df_data_info[df_data_info.loc[:, 'text'] != requested_doc["text"]]
        df_top_shots = find_top_matches(requested_doc["text"],candidates,shot_n)
        # Every shot is prefixed with a newline, in retrieval order.
        shots_text = ''.join('\n'+extract_example_shot_from_row(df_top_shots.iloc[j]) for j in range(shot_n))
        replace_sys_instruction = system_ideas_full_in_order_shot_dynamic.replace("{{shot}}", shots_text)
        input_prompt = set_custom_prompt(replace_sys_instruction,requested_doc['text'])

        output_batch = prompt_routine(input_prompt, 1,llama_config['top_p'],llama_config['temperature'],llama_config['top_k'],1000)
        dict_to_json = {"doc_id": requested_doc['doc_id'],"text": requested_doc['text'],"input":output_batch[0]['input'], "response": output_batch[0]['generated_text']}
        # ensure_ascii=False emits non-ASCII characters verbatim, so the file
        # encoding is pinned instead of depending on the platform default.
        with open(f'{path}/{file_name}', 'w', encoding='utf-8') as file:
            json.dump(dict_to_json,file,ensure_ascii=False)
        print('Complete',requested_doc['doc_id'],f'{i}/{len(list_id)}')

def predict_annotations_tf_idf_shots_from_preprocessed (full_dataset,list_id,guided_lvl,prompt_category,shot_n):
    """Annotate every document in ``list_id`` with TF-IDF shots from the no-short pool.

    For each id, the ``shot_n`` rows of ``df_no_short_data_info`` most similar
    to the document text (rows with identical text are excluded) are rendered
    with ``extract_example_shot_from_row`` and substituted for ``{{shot}}`` in
    ``system_ideas_full_in_order_shot_dynamic``. The model output is written to
    one JSON file per document under the ``no-short-data`` directory. Documents
    whose file was already present when the function started are skipped, so an
    interrupted run can be resumed.

    Args:
        full_dataset: Passed through to ``get_from_annotated_dataset``.
        list_id: Sized iterable of document ids to process.
        guided_lvl: Used only in the output file name.
        prompt_category: Used only in the output file name.
        shot_n: Number of retrieved examples inserted in the system prompt;
            also part of the output directory name.

    Raises:
        IndexError: If retrieval yields fewer than ``shot_n`` rows.
    """
    path = f'llama-outputs-augmented/full-dataset/no-short-data/temp-{str(llama_config["temperature"])}/top-p-{str(llama_config["top_p"])}/ideas-tf-idf-{shot_n}-shot'
    # exist_ok replaces the racy "check, then create" pair.
    os.makedirs(path, exist_ok=True)
    # Snapshot of finished outputs, taken once before the loop.
    file_names = set(os.listdir(path))
    for i, _id in enumerate(list_id, start=1):
        requested_doc = get_from_annotated_dataset(full_dataset,_id)
        file_name = f'{prompt_category}-{guided_lvl}-{requested_doc["doc_id"]}.json'
        if file_name in file_names:
            print('Already exists',requested_doc['doc_id'])
            continue

        # Drop rows with the same text so a document is never its own example.
        candidates = df_no_short_data_info[df_no_short_data_info.loc[:, 'text'] != requested_doc["text"]]
        df_top_shots = find_top_matches(requested_doc["text"],candidates,shot_n)
        # Every shot is prefixed with a newline, in retrieval order.
        shots_text = ''.join('\n'+extract_example_shot_from_row(df_top_shots.iloc[j]) for j in range(shot_n))
        replace_sys_instruction = system_ideas_full_in_order_shot_dynamic.replace("{{shot}}", shots_text)
        input_prompt = set_custom_prompt(replace_sys_instruction,requested_doc['text'])

        output_batch = prompt_routine(input_prompt, 1,llama_config['top_p'],llama_config['temperature'],llama_config['top_k'],1000)
        dict_to_json = {"doc_id": requested_doc['doc_id'],"text": requested_doc['text'],"input":output_batch[0]['input'], "response": output_batch[0]['generated_text']}
        # ensure_ascii=False emits non-ASCII characters verbatim, so the file
        # encoding is pinned instead of depending on the platform default.
        with open(f'{path}/{file_name}', 'w', encoding='utf-8') as file:
            json.dump(dict_to_json,file,ensure_ascii=False)
        print('Complete',requested_doc['doc_id'],f'{i}/{len(list_id)}')

#### 1-shot

In [None]:
# 1-shot TF-IDF run at temperature 0.0 (the temperature also selects the output directory).
llama_config['temperature'] = 0.0
predict_annotations_tf_idf_shots_from_preprocessed(annotated_dataset,test_ids_no_short,'full','ideas',1)

In [None]:
# 1-shot TF-IDF run at temperature 0.9 (the temperature also selects the output directory).
llama_config['temperature'] = 0.9
predict_annotations_tf_idf_shots_from_preprocessed(annotated_dataset,test_ids_no_short,'full','ideas',1)

#### 2-shot

In [None]:
# 2-shot TF-IDF run at temperature 0.0 (the temperature also selects the output directory).
llama_config['temperature'] = 0.0
predict_annotations_tf_idf_shots_from_preprocessed(annotated_dataset,test_ids_no_short,'full','ideas',2)

In [None]:
# 2-shot TF-IDF run at temperature 0.9 (the temperature also selects the output directory).
llama_config['temperature'] = 0.9
predict_annotations_tf_idf_shots_from_preprocessed(annotated_dataset,test_ids_no_short,'full','ideas',2)

#### 3-shot

In [None]:
# 3-shot TF-IDF run at temperature 0.0 (the temperature also selects the output directory).
llama_config['temperature'] = 0.0
predict_annotations_tf_idf_shots_from_preprocessed(annotated_dataset,test_ids_no_short,'full','ideas',3)

In [None]:
# 3-shot TF-IDF run at temperature 0.9 (the temperature also selects the output directory).
llama_config['temperature'] = 0.9
predict_annotations_tf_idf_shots_from_preprocessed(annotated_dataset,test_ids_no_short,'full','ideas',3)

#### 4-shot

In [None]:
# 4-shot TF-IDF run at temperature 0.0 (the temperature also selects the output directory).
llama_config['temperature'] = 0.0
predict_annotations_tf_idf_shots_from_preprocessed(annotated_dataset,test_ids_no_short,'full','ideas',4)

In [None]:
# 4-shot TF-IDF run at temperature 0.9 (the temperature also selects the output directory).
llama_config['temperature'] = 0.9
predict_annotations_tf_idf_shots_from_preprocessed(annotated_dataset,test_ids_no_short,'full','ideas',4)

#### 10-shot

In [None]:
# 10-shot TF-IDF run at temperature 0.0 (the temperature also selects the output directory).
llama_config['temperature'] = 0.0
predict_annotations_tf_idf_shots_from_preprocessed(annotated_dataset,test_ids_no_short,'full','ideas',10)

In [None]:
# 10-shot TF-IDF run at temperature 0.9 (the temperature also selects the output directory).
llama_config['temperature'] = 0.9
predict_annotations_tf_idf_shots_from_preprocessed(annotated_dataset,test_ids_no_short,'full','ideas',10)

### Experiment 3 - Zero Shot

In [None]:
import pandas as pd
import os

# Document ids to run, taken from the 'doc_id' column of each CSV.
# NOTE(review): earlier cells already call functions with test_ids_no_short -
# confirm it is also defined before those cells, otherwise Restart & Run All fails.
test_ids = list(pd.read_csv('test_data_info.csv')['doc_id'])
test_ids_no_short = list(pd.read_csv('annotations_medical_specialist_pre_processed_no_short.csv')['doc_id'])

In [None]:
def run_annotation_examples_from_list (full_dataset,sys_prompt,list_id,guided_lvl,prompt_category,shot_n):
    """Annotate every document in ``list_id`` with a fixed system prompt.

    The prompt is built with ``set_custom_prompt(sys_prompt, text)`` and sent
    through ``prompt_routine``; the result is written to one JSON file per
    document. Documents whose file was already present when the function
    started are skipped, so an interrupted run can be resumed.

    Args:
        full_dataset: Passed through to ``get_from_annotated_dataset``.
        sys_prompt: System prompt used unchanged for every document.
        list_id: Sized iterable of document ids to process.
        guided_lvl: Used only in the output file name.
        prompt_category: Used only in the output file name.
        shot_n: Used only in the output directory name.
    """
    # Built once; the original rebuilt the same f-string three times per document.
    path = f'llama-outputs-augmented/full-dataset/temp-{str(llama_config["temperature"])}/top-p-{str(llama_config["top_p"])}/ideas-{shot_n}-shot'
    # exist_ok replaces the racy "check, then create" pair.
    os.makedirs(path, exist_ok=True)
    # Snapshot of finished outputs, taken once before the loop.
    file_names = set(os.listdir(path))
    for i, _id in enumerate(list_id, start=1):
        requested_doc = get_from_annotated_dataset(full_dataset,_id)
        file_name = f'{prompt_category}-{guided_lvl}-{requested_doc["doc_id"]}.json'
        if file_name in file_names:
            print('Already exists',requested_doc['doc_id'])
            continue

        input_prompt = set_custom_prompt(sys_prompt,requested_doc['text'])
        output_batch = prompt_routine(input_prompt, 1,llama_config['top_p'],llama_config['temperature'],llama_config['top_k'],1000)
        dict_to_json = {"doc_id": requested_doc['doc_id'],"text": requested_doc['text'],"input":output_batch[0]['input'] ,"response": output_batch[0]['generated_text']}
        # ensure_ascii=False emits non-ASCII characters verbatim, so the file
        # encoding is pinned instead of depending on the platform default.
        with open(f'{path}/{file_name}', 'w', encoding='utf-8') as file:
            json.dump(dict_to_json,file,ensure_ascii=False)
        print('Complete',requested_doc['doc_id'],f'{i}/{len(list_id)}')
        
def run_annotation_processed_examples_from_list (full_dataset,sys_prompt,list_id,guided_lvl,prompt_category,shot_n):
    """Annotate every document in ``list_id`` with a fixed system prompt (no-short outputs).

    The prompt is built with ``set_custom_prompt(sys_prompt, text)`` and sent
    through ``prompt_routine``; the result is written to one JSON file per
    document under the ``no-short-data`` directory. Documents whose file was
    already present when the function started are skipped, so an interrupted
    run can be resumed.

    Args:
        full_dataset: Passed through to ``get_from_annotated_dataset``.
        sys_prompt: System prompt used unchanged for every document.
        list_id: Sized iterable of document ids to process.
        guided_lvl: Used only in the output file name.
        prompt_category: Used only in the output file name.
        shot_n: Used only in the output directory name.
    """
    # Built once; the original rebuilt the same f-string three times per document.
    path = f'llama-outputs-augmented/full-dataset/no-short-data/temp-{str(llama_config["temperature"])}/top-p-{str(llama_config["top_p"])}/ideas-{shot_n}-shot'
    # exist_ok replaces the racy "check, then create" pair.
    os.makedirs(path, exist_ok=True)
    # Snapshot of finished outputs, taken once before the loop.
    file_names = set(os.listdir(path))
    for i, _id in enumerate(list_id, start=1):
        requested_doc = get_from_annotated_dataset(full_dataset,_id)
        file_name = f'{prompt_category}-{guided_lvl}-{requested_doc["doc_id"]}.json'
        if file_name in file_names:
            print('Already exists',requested_doc['doc_id'])
            continue

        input_prompt = set_custom_prompt(sys_prompt,requested_doc['text'])
        output_batch = prompt_routine(input_prompt, 1,llama_config['top_p'],llama_config['temperature'],llama_config['top_k'],1000)
        dict_to_json = {"doc_id": requested_doc['doc_id'],"text": requested_doc['text'],"input":output_batch[0]['input'] ,"response": output_batch[0]['generated_text']}
        # ensure_ascii=False emits non-ASCII characters verbatim, so the file
        # encoding is pinned instead of depending on the platform default.
        with open(f'{path}/{file_name}', 'w', encoding='utf-8') as file:
            json.dump(dict_to_json,file,ensure_ascii=False)
        print('Complete',requested_doc['doc_id'],f'{i}/{len(list_id)}')

#### zero shot temp 0.0

In [None]:
# Zero-shot run at temperature 0.0; shot_n=0 only affects the output directory name.
llama_config['temperature'] = 0.0
run_annotation_processed_examples_from_list(annotated_dataset,system_ideas_augmented_athiwaratkun_zero_shot,test_ids_no_short,'full','ideas',0)
# llama_config['temperature'] = 0.9

### Experiment 4 - Similar annotation instead of similar text (TF-IDF)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm


# Candidate pools for few-shot retrieval. The retrieval helpers below read the
# 'text' and 'cluster_labels' columns of these frames.
df_data_info = pd.read_csv('test_data_info.csv')
df_no_short_data_info = pd.read_csv('annotations_medical_specialist_pre_processed_no_short.csv')
# NOTE(review): no use of this frame is visible in this section of the notebook.
df_medical_specialist_pre_processed = pd.read_csv('annotations_medical_specialist_pre_processed.csv')
def minimize_labels(label):
    """Reduce each annotated token record to ``[record[0], label_names]``.

    Args:
        label: A list of token records, or the string representation of such a
            list (as stored in a CSV cell). Every record is read at positions
            0 and 4; position 4 is either ``None`` or a mapping.

    Returns:
        One ``[record[0], names]`` pair per record, where ``names`` is
        ``list(record[4].keys())``, or ``None`` when ``record[4]`` is ``None``.
    """
    if isinstance(label, str):
        # NOTE(review): eval() executes whatever the cell contains. Only safe
        # for trusted CSVs; consider ast.literal_eval if cells are plain literals.
        label = eval(label)
    return [
        [record[0], list(record[4].keys()) if record[4] is not None else None]
        for record in label
    ]
def get_only_text(label):
    """Keep only the first field of every annotated token record.

    Args:
        label: A list of token records, or the string representation of such a
            list (as stored in a CSV cell).

    Returns:
        A list of single-element lists ``[record[0]]``, one per record.
    """
    if isinstance(label, str):
        # NOTE(review): eval() executes whatever the cell contains. Only safe
        # for trusted CSVs; consider ast.literal_eval if cells are plain literals.
        label = eval(label)
    return [[record[0]] for record in label]

def bio_to_cluster_annotation(bio_annotation):
    """Group B/I-tagged tokens into ``[phrase, label_names]`` pairs per document.

    Each element of ``bio_annotation`` is the string representation of a list
    of token records. A record is read at position 0 (token text), position 3
    (tag) and position 4 (``None`` or a mapping whose keys are label names).
    A ``'B'`` tag starts a new phrase, an ``'I'`` tag extends the current one,
    and records with any other tag are ignored without closing the phrase.

    Fixes over the previous version:
      * a phrase carries the labels of the token that opened it, not the
        labels of the following ``'B'`` token;
      * the last open phrase of each document is emitted instead of dropped;
      * a ``None`` label mapping yields ``[]`` instead of raising.

    Args:
        bio_annotation: Iterable of stringified token-record lists.

    Returns:
        A list with one entry per input document; each entry is a list of
        ``[' '.join(tokens), label_names]`` pairs in document order.
    """
    def _label_names(token):
        # Position 4 holds the label mapping, or None when absent.
        labels = token[4]
        return list(labels.keys()) if labels is not None else []

    main_annotation = []
    for raw_annotation in bio_annotation:
        # NOTE(review): eval() executes whatever the cell contains. Only safe
        # for trusted CSVs; consider ast.literal_eval if cells are plain literals.
        list_ann = eval(raw_annotation)

        sub_annotation = []
        phrase = []
        phrase_labels = []
        for token in list_ann:
            tag = token[3]
            if tag == 'B':
                # Close the previous phrase with the labels captured when it opened.
                if phrase:
                    sub_annotation.append([' '.join(phrase), phrase_labels])
                phrase = [token[0]]
                phrase_labels = _label_names(token)
            elif tag == 'I':
                # An 'I' with no open phrase starts one and supplies its labels.
                if not phrase:
                    phrase_labels = _label_names(token)
                phrase.append(token[0])
        # Flush the phrase still open at the end of the document.
        if phrase:
            sub_annotation.append([' '.join(phrase), phrase_labels])
        main_annotation.append(sub_annotation)
    return main_annotation

def extract_example_shot_from_row(_row, output='string'):
    """Render one example row as a few-shot snippet.

    Args:
        _row: Mapping-like row exposing ``'text'`` and ``'cluster_labels'``.
        output: ``'string'`` (default) returns the prompt-ready text form;
            ``'list'`` returns a dict with ``'user_input'`` and
            ``'assistant_output'`` keys.

    Returns:
        A ``str`` or a ``dict``, depending on ``output``.

    Raises:
        ValueError: If ``output`` is neither ``'list'`` nor ``'string'``.
            The previous version failed with ``UnboundLocalError`` here.
    """
    if output == 'list':
        return {'user_input':_row['text'],'assistant_output':_row['cluster_labels']}
    if output == 'string':
        return f"""'user_input':{_row['text']}\n 'assistant_output':"annotations":{_row['cluster_labels']}"""
    raise ValueError(f"output must be 'list' or 'string', got {output!r}")

## Transform list of annotated tokens into continuous string for TF-IDF 
def annotation_token_to_text(df_text):
    """Flatten a row's stringified ``cluster_labels`` into one text string.

    The ``'cluster_labels'`` value is evaluated into a list, and the first
    element of every entry is appended followed by a single space, so a
    non-empty result always ends with a trailing space.
    """
    # NOTE(review): eval() executes whatever the cell contains - trusted data only.
    parsed_labels = eval(df_text['cluster_labels'])
    return ''.join(entry[0] + ' ' for entry in parsed_labels)

## Retrieving similar examples by matching the query against the annotations ('cluster_labels') with TF-IDF
def find_top_matches_from_annotation(query, df, top_n):
    """Return the ``top_n`` rows of ``df`` whose ``'cluster_labels'`` best match ``query``.

    Same ranking as a text-based TF-IDF search, except that the vectorizer is
    fitted on the ``'cluster_labels'`` column rather than on the raw text.

    Args:
        query: Text to match against the annotation strings.
        df: DataFrame with a ``'cluster_labels'`` column of strings.
        top_n: Number of rows to return. Values ``<= 0`` return an empty frame;
            previously ``top_n == 0`` returned every row because ``[-0:]``
            slices the whole array.

    Returns:
        A DataFrame slice of ``df``, most similar row first. It holds fewer
        than ``top_n`` rows when ``df`` itself is shorter.
    """
    if top_n <= 0:
        return df.iloc[0:0]

    vectorizer = TfidfVectorizer()
    # Vocabulary and IDF weights come from the annotation strings only.
    tfidf_matrix = vectorizer.fit_transform(df['cluster_labels'])
    query_vector = vectorizer.transform([query])

    # One row of scores: similarity of the query to every candidate.
    similarity_scores = cosine_similarity(query_vector, tfidf_matrix)

    # argsort is ascending, so take the last top_n positions and reverse them.
    top_indices = similarity_scores.argsort()[0][-top_n:][::-1]
    return df.iloc[top_indices]


In [None]:
def predict_annotations_tf_idf_custom_shots (full_dataset,list_id,guided_lvl,prompt_category,shot_n):
    """Annotate every document in ``list_id`` with annotation-matched few-shot examples.

    For each id, the ``shot_n`` rows of ``df_data_info`` whose
    ``'cluster_labels'`` best match the document text (rows with identical text
    are excluded) are rendered with ``extract_example_shot_from_row`` and
    substituted for ``{{shot}}`` in
    ``system_ideas_augmented_athiwaratkun_shot_dynamic``. The model output is
    written to one JSON file per document. Documents whose file was already
    present when the function started are skipped.

    Args:
        full_dataset: Passed through to ``get_from_annotated_dataset``.
        list_id: Sized iterable of document ids to process.
        guided_lvl: Used only in the output file name.
        prompt_category: Used only in the output file name.
        shot_n: Number of retrieved examples inserted in the system prompt;
            also part of the output directory name.

    Raises:
        IndexError: If retrieval yields fewer than ``shot_n`` rows.
    """
    # NOTE(review): this is the same directory predict_annotations_tf_idf_shots
    # writes to, so outputs of the two experiments collide and one will report
    # the other's files as "Already exists". The *_from_preprocessed variant
    # adds a 'tf-idf-custom/' segment - confirm whether that was intended here.
    path = f'llama-outputs-augmented/full-dataset/temp-{str(llama_config["temperature"])}/top-p-{str(llama_config["top_p"])}/ideas-tf-idf-{shot_n}-shot'
    # exist_ok replaces the racy "check, then create" pair.
    os.makedirs(path, exist_ok=True)
    # Snapshot of finished outputs, taken once before the loop.
    file_names = set(os.listdir(path))
    for i, _id in enumerate(list_id, start=1):
        requested_doc = get_from_annotated_dataset(full_dataset,_id)
        file_name = f'{prompt_category}-{guided_lvl}-{requested_doc["doc_id"]}.json'
        if file_name in file_names:
            print('Already exists',requested_doc['doc_id'], end="\r")
            continue

        # Drop rows with the same text so a document is never its own example.
        candidates = df_data_info[df_data_info.loc[:, 'text'] != requested_doc["text"]]
        df_top_shots = find_top_matches_from_annotation(requested_doc["text"],candidates,shot_n)
        # Every shot is prefixed with a newline, in retrieval order.
        shots_text = ''.join('\n'+extract_example_shot_from_row(df_top_shots.iloc[j]) for j in range(shot_n))
        replace_sys_instruction = system_ideas_augmented_athiwaratkun_shot_dynamic.replace("{{shot}}", shots_text)
        input_prompt = set_custom_prompt(replace_sys_instruction,requested_doc['text'])

        output_batch = prompt_routine(input_prompt, 1,llama_config['top_p'],llama_config['temperature'],llama_config['top_k'],1000)
        dict_to_json = {"doc_id": requested_doc['doc_id'],"text": requested_doc['text'],"input":output_batch[0]['input'], "response": output_batch[0]['generated_text']}
        # ensure_ascii=False emits non-ASCII characters verbatim, so the file
        # encoding is pinned instead of depending on the platform default.
        with open(f'{path}/{file_name}', 'w', encoding='utf-8') as file:
            json.dump(dict_to_json,file,ensure_ascii=False)
        print('Complete',requested_doc['doc_id'],f'{i}/{len(list_id)}', end="\r")

def predict_annotations_tf_idf_custom_shots_from_preprocessed (full_dataset,list_id,guided_lvl,prompt_category,shot_n):
    """Annotate every document in ``list_id`` with annotation-matched shots from the no-short pool.

    For each id, the ``shot_n`` rows of ``df_no_short_data_info`` whose
    ``'cluster_labels'`` best match the document text (rows with identical text
    are excluded) are rendered with ``extract_example_shot_from_row`` and
    substituted for ``{{shot}}`` in
    ``system_ideas_augmented_athiwaratkun_shot_dynamic``. The model output is
    written to one JSON file per document under
    ``no-short-data/.../tf-idf-custom``. Documents whose file was already
    present when the function started are skipped.

    Args:
        full_dataset: Passed through to ``get_from_annotated_dataset``.
        list_id: Sized iterable of document ids to process.
        guided_lvl: Used only in the output file name.
        prompt_category: Used only in the output file name.
        shot_n: Number of retrieved examples inserted in the system prompt;
            also part of the output directory name.

    Raises:
        IndexError: If retrieval yields fewer than ``shot_n`` rows.
    """
    path = f'llama-outputs-augmented/full-dataset/no-short-data/temp-{str(llama_config["temperature"])}/top-p-{str(llama_config["top_p"])}/tf-idf-custom/ideas-tf-idf-{shot_n}-shot'
    # exist_ok replaces the racy "check, then create" pair.
    os.makedirs(path, exist_ok=True)
    # Snapshot of finished outputs, taken once before the loop.
    file_names = set(os.listdir(path))
    for i, _id in enumerate(list_id, start=1):
        requested_doc = get_from_annotated_dataset(full_dataset,_id)
        file_name = f'{prompt_category}-{guided_lvl}-{requested_doc["doc_id"]}.json'
        if file_name in file_names:
            print('Already exists',requested_doc['doc_id'], end="\r")
            continue

        # Drop rows with the same text so a document is never its own example.
        candidates = df_no_short_data_info[df_no_short_data_info.loc[:, 'text'] != requested_doc["text"]]
        df_top_shots = find_top_matches_from_annotation(requested_doc["text"],candidates,shot_n)
        # Every shot is prefixed with a newline, in retrieval order.
        shots_text = ''.join('\n'+extract_example_shot_from_row(df_top_shots.iloc[j]) for j in range(shot_n))
        replace_sys_instruction = system_ideas_augmented_athiwaratkun_shot_dynamic.replace("{{shot}}", shots_text)
        input_prompt = set_custom_prompt(replace_sys_instruction,requested_doc['text'])

        output_batch = prompt_routine(input_prompt, 1,llama_config['top_p'],llama_config['temperature'],llama_config['top_k'],1000)
        dict_to_json = {"doc_id": requested_doc['doc_id'],"text": requested_doc['text'],"input":output_batch[0]['input'], "response": output_batch[0]['generated_text']}
        # ensure_ascii=False emits non-ASCII characters verbatim, so the file
        # encoding is pinned instead of depending on the platform default.
        with open(f'{path}/{file_name}', 'w', encoding='utf-8') as file:
            json.dump(dict_to_json,file,ensure_ascii=False)
        print('Complete',requested_doc['doc_id'],f'{i}/{len(list_id)}', end="\r")

#### 1-shot

In [None]:
# 1-shot annotation-matched run at temperature 0.0 (the temperature also selects the output directory).
llama_config['temperature'] = 0.0
predict_annotations_tf_idf_custom_shots_from_preprocessed(annotated_dataset,test_ids_no_short,'full','ideas',1)

In [None]:
# 1-shot annotation-matched run at temperature 0.9 (the temperature also selects the output directory).
llama_config['temperature'] = 0.9
predict_annotations_tf_idf_custom_shots_from_preprocessed(annotated_dataset,test_ids_no_short,'full','ideas',1)

#### 2-shot

In [None]:
# 2-shot annotation-matched run at temperature 0.0 (the temperature also selects the output directory).
llama_config['temperature'] = 0.0
predict_annotations_tf_idf_custom_shots_from_preprocessed(annotated_dataset,test_ids_no_short,'full','ideas',2)

In [None]:
# 2-shot annotation-matched run at temperature 0.9 (the temperature also selects the output directory).
llama_config['temperature'] = 0.9
predict_annotations_tf_idf_custom_shots_from_preprocessed(annotated_dataset,test_ids_no_short,'full','ideas',2)

#### 3-shot

In [None]:
# 3-shot annotation-matched run at temperature 0.0 (the temperature also selects the output directory).
llama_config['temperature'] = 0.0
predict_annotations_tf_idf_custom_shots_from_preprocessed(annotated_dataset,test_ids_no_short,'full','ideas',3)

In [None]:
# 3-shot annotation-matched run at temperature 0.9 (the temperature also selects the output directory).
llama_config['temperature'] = 0.9
predict_annotations_tf_idf_custom_shots_from_preprocessed(annotated_dataset,test_ids_no_short,'full','ideas',3)

#### 4-shot

#### 10-shot

In [None]:
# 10-shot annotation-matched run at temperature 0.0 (the temperature also selects the output directory).
llama_config['temperature'] = 0.0
predict_annotations_tf_idf_custom_shots_from_preprocessed(annotated_dataset,test_ids_no_short,'full','ideas',10)

In [None]:
# 10-shot annotation-matched run at temperature 0.9 (the temperature also selects the output directory).
llama_config['temperature'] = 0.9
predict_annotations_tf_idf_custom_shots_from_preprocessed(annotated_dataset,test_ids_no_short,'full','ideas',10)

### Experiment 5 - Random Example Retrieval

In [None]:
def find_top_n_shots(query, df, top_n, random_state=None):
    """Return ``top_n`` rows of ``df`` drawn uniformly at random.

    Args:
        query: Unused; kept so the signature matches the TF-IDF retrievers.
        df: DataFrame to sample example rows from.
        top_n: Number of rows to draw (without replacement).
        random_state: Optional seed forwarded to ``DataFrame.sample`` so a run
            can be reproduced. ``None`` (default) keeps the previous unseeded
            behaviour.

    Returns:
        A DataFrame with ``top_n`` sampled rows.

    Raises:
        ValueError: Raised by pandas when ``top_n`` exceeds ``len(df)``.
    """
    return df.sample(n=top_n, random_state=random_state)

## Score Assessment by annotation

In [None]:
# def predict_score_by_annotation_custom_shot (full_dataset,list_id,guided_lvl,prompt_category,shot_n):
#     path = f'llama-outputs-augmented/full-dataset/no-short-data/temp-{str(llama_config["temperature"])}/top-p-{str(llama_config["top_p"])}/tf-idf-custom/scoring-tf-idf-{shot_n}-shot'
#     if not os.path.exists(path):
#         os.makedirs(path)
#     file_names = os.listdir(path)
#     i = 1
#     for _id in list_id:
#         # print(_id)
#         requested_doc = get_from_annotated_dataset(full_dataset,_id)
#         if f'{prompt_category}-{guided_lvl}-{requested_doc["doc_id"]}.json' not in file_names:
#             df_top_shots = find_top_matches_from_annotation(requested_doc["text"],df_no_short_data_info[df_no_short_data_info.loc[:, 'text'] != requested_doc["text"]],shot_n)
            
#             shots_text = ''
#             for j in range(shot_n):
#                 top_shot = df_top_shots.iloc[j]
#                 # print('top_shot',top_shot)
#                 txt = extract_example_shot_from_row(top_shot)
#                 shots_text += '\n'+txt
#             # print('bundle of shots', shots_text)
#             replace_sys_instruction = system_ideas_full_in_order_shot_dynamic.replace("{{shot}}", shots_text)
#             # print('SYSTEM PROMPT ===========\n',replace_sys_instruction,'#################### END SYSTEM PROMPT #####################')
#             input_prompt = set_custom_prompt(replace_sys_instruction,requested_doc['text'])
#             # prompt_routine(input_prompt, 1,llama_config['top_p'],llama_config['temperature'],llama_config['top_k'],1000)
            
#             output_batch = prompt_routine(input_prompt, 1,llama_config['top_p'],llama_config['temperature'],llama_config['top_k'],1000)
#             dict_to_json = {"doc_id": requested_doc['doc_id'],"text": requested_doc['text'],"input":output_batch[0]['input'], "response": output_batch[0]['generated_text']}
#             report_path = path
#             if not os.path.exists(report_path):
#                 os.makedirs(report_path)
#             with open(f'{report_path}/{prompt_category}-{guided_lvl}-{requested_doc["doc_id"]}.json', 'w') as file:
#                 json.dump(dict_to_json,file,ensure_ascii=False)
#             print('Complete',requested_doc['doc_id'],f'{i}/{len(list_id)}')
#         else:
#             print('Already exists',requested_doc['doc_id'])
#         i += 1

In [None]:
# system_ideas_full_in_order_2_shot = """You are a medical assistant with expertise for education evaluation of students.\n Your task is to evaluate the organization medical student texts. After the text, an annotation of . All the texts will be in portuguese. ONLY use JSON as the output format, starting with 'grade'. DO NOT write, only respond in JSON format. 
# Examples of user input and assistant output:\n "user input":\n "A Doença Pulmonar Obstrutiva Crônica é uma condição de insuficiência respiratória de padrão obstrutivo, que ocorre por lesão crônica do parêqnuima pulmonar, a qual culmina em diminuição da complacência pulmonar. A fisiopatogenia envolve um processo inflamatório crônico das vias aéreas e do parênquima que não permite a saída de ar dos pulmões e leva ao acúmulo de volume morto nos alvéolos, mantendo o tórax hiperinsuflado, com acúmulo de CO2 e dificultando as trocas respiratórias.\nA causa mais comum para DPOC é o tabagismo, mas outras causas incluem a convivência com forno a lenha por longo tempo ou a deficiência genética de alfa-1-antitripsina.\nO principal sintoma desses pacientes é a dispneia, que se incia em grandes esforços e pode chegar até ao repouso. O diagnóstico é feito pela clínica + espirometria. A principal complicação são as exacerbações de doença que pode ser associada a quadro infeccioso sobreposto.\nEsses pacientes podem ser divididos segundo os critérios do GOLD entre pacientes muito sintomátisoc&nbsp; e com muitas exacerbações, e o tratamento utiliza LABA, SABA LAMA e CI a depender desses\n', 'assistant output':\n "annotations": [['tabagismo', ['epidemiology', 'etiology'], ['forno a lenha', ['epidemiology', 'etiology'], ['forno a lenha + por longo tempo', ['epidemiology'], ['deficiência genética de alfa-1-antitripsina', ['etiology'], ['diagnóstico é feito pela clínica + espirometria', ['exams', 'history'], ['dispneia', ['history'], ['dispneia + incia em grandes esforços', 693, 734, ['history'], ['dispneia + incia em grandes esforços + pode chegar até ao repouso', 693, 763, ['history'], ['exacerbações de doença', ['history', 'pathophysiology'], ['exacerbações de doença + quadro infeccioso', ['history', 'pathophysiology'], ['podem ser divididos segundo os critérios do GOLD',['history'], ['critérios do GOLD + muito sintomátiso + muitas exacerbações', ['history'], ['lesão crônica do 
parêqnuima pulmonar', ['pathophysiology'], ['diminuição da complacência pulmonar', ['pathophysiology'], ['processo inflamatório crônico', ['pathophysiology'], ['não permite a saída de ar dos pulmões', ['pathophysiology'], ['acúmulo de volume morto nos alvéolos', ['pathophysiology'], ['tórax hiperinsuflado', ['pathophysiology'], ['acúmulo de CO2', ['pathophysiology'], ['dificultando as trocas respiratórias', ['pathophysiology'], ['insuficiência respiratória', ['pathophysiology'], ['LABA', ['therapeutic']], ['SABA', ['therapeutic']], ['LAMA', ['therapeutic']], ['CI', ['therapeutic']]]\n"user input":\n "DPOC é uma doença que costuma ocorrer em idosos e muito associada ao tabagismo e inalação de demais partículas tóxicas, com alta prevalência.\nÉ caracterizada por enfisema e bronquite, havendo tanto o padrão clássico do paciente soprador rosado (magro, avermelhado, predomina enfisema) quanto do tossidor azul (cianótico, sobrepeso, predomina bronquite).\nComo sintomas clássicos, a DPOC tem como sintomas tosse expectorante crônica, dispneia, infecções de repetição, edema. 
No exame físico, nota-se timpanismo, tórax aumentado em volume, respiração não enche plenamente a caixa torácica, por vezes uso de musculatura acessória, ruídos adventícios.\n","assistant output":\n"annotations":[['idosos', ['epidemiology']], ['muito associada ao tabagismo', ['epidemiology']], ['inalação de demais partículas tóxicas', ['epidemiology', 'etiology']], ['alta prevalência', ['epidemiology']], ['enfisema', ['physical', 'pathophysiology']], ['bronquite', ['physical', 'pathophysiology']], ['soprador rosado', ['pathophysiology', 'physical']], ['magro', ['physical']], ['avermelhado', ['physical']], ['predomina enfisema', ['pathophysiology']], ['tossidor azul', ['pathophysiology', 'physical']], ['cianótico', ['physical']], ['sobrepeso', ['physical']], ['predomina bronquite', ['pathophysiology']], ['tosse expectorante crônica', ['history']], ['dispneia', ['history']], ['infecções de repetição', ['history']], ['edema', ['physical']], ['timpanismo', ['physical']], ['tórax aumentado em volume', ['physical']], ['respiração não enche plenamente a caixa torácica', ['uso de musculatura acessória', ['physical']],['ruídos adventícios', ['physical']]]"""

# Delete endpoint (stop billing)

In [None]:
# Specify your AWS Region
aws_region='us-east-1'

# Create a low-level SageMaker service client.
sagemaker_client = boto3.client('sagemaker', region_name=aws_region)

# Look up the endpoint's configuration name before deleting the endpoint: it is
# not guaranteed to equal the endpoint name, and it can no longer be read from
# the endpoint once the endpoint is gone.
endpoint_config_name = sagemaker_client.describe_endpoint(EndpointName=endpoint_name)['EndpointConfigName']

# Delete the endpoint, then the configuration it was created from.
sagemaker_client.delete_endpoint(EndpointName=endpoint_name)
sagemaker_client.delete_endpoint_config(EndpointConfigName=endpoint_config_name)