### Load data

In [None]:
# Locations of the MPBD test split: one file with the source texts and one
# with the target texts used as references in the evaluation cells.
test_source_path = './dataset/MPBD/test.en'
test_target_path = './dataset/MPBD/test.sen'

# Read each file completely; every list element is one line of the file
# (trailing newline included).
with open(test_source_path, encoding='utf-8') as source_handle:
    test_source_lines = list(source_handle)

with open(test_target_path, encoding='utf-8') as target_handle:
    test_target_lines = list(target_handle)

### Getting results from LLMs

In [None]:
import openai
import os
import pathlib
import textwrap
import google.generativeai as genai
from IPython.display import display
from IPython.display import Markdown
from google.colab import userdata
from vertexai.generative_models import GenerationConfig

from transformers import AutoTokenizer, T5ForConditionalGeneration
from transformers import LlamaTokenizer, LlamaForCausalLM
from transformers import pipeline


def get_output(prompt, llm):
    """Send ``prompt`` to the selected model and return the generated text.

    Args:
        prompt: Full prompt string to submit to the model.
        llm: Model selector. Supported values are ``3.5``
            (gpt-3.5-turbo-1106), ``4`` (gpt-4), ``'instruct'``
            (gpt-3.5-turbo-instruct), ``'gemini-1.0-pro'``, ``'flan-ul2'``,
            ``'med-alpaca'`` and ``'pmc-llama'``.

    Returns:
        The text produced by the model. It is also printed to stdout.

    Raises:
        ValueError: If ``llm`` is not one of the supported selectors.
    """
    if llm in (3.5, 4):
        # Both chat models use the same request shape; only the name differs.
        # The key is taken from the environment so it never has to be pasted
        # into the notebook; it falls back to '' when the variable is unset.
        openai.api_key = os.environ.get('OPENAI_API_KEY', '')
        model = 'gpt-3.5-turbo-1106' if llm == 3.5 else 'gpt-4'
        message = openai.ChatCompletion.create(
            model=model,
            temperature=0,
            messages=[
                {"role": "user", "content": prompt}
            ]
        )
        result = message['choices'][0]['message']['content']

    elif llm == 'instruct':
        openai.api_key = os.environ.get('OPENAI_API_KEY', '')
        model = "gpt-3.5-turbo-instruct"
        message = openai.Completion.create(
            model=model,
            prompt=prompt,
            temperature=0
        )
        result = message['choices'][0]['text']

    elif llm == 'gemini-1.0-pro':
        GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
        genai.configure(api_key=GOOGLE_API_KEY)
        model = genai.GenerativeModel('gemini-pro')

        # Every listed harm category is set to BLOCK_NONE.
        safety_settings = [
            {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_NONE"},
            {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_NONE"},
            {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_NONE"},
            {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_NONE"}
        ]

        response = model.generate_content(
            prompt,
            generation_config=genai.types.GenerationConfig(
                candidate_count=1,
                temperature=0),
            safety_settings=safety_settings)

        result = response.candidates[0].content.parts[0].text

    elif llm == 'flan-ul2':
        # NOTE(review): the model and tokenizer are re-loaded on every call;
        # consider loading them once outside this function for long runs.
        model = T5ForConditionalGeneration.from_pretrained("google/flan-ul2", device_map="auto", load_in_8bit=True)
        tokenizer = AutoTokenizer.from_pretrained("google/flan-ul2")
        inputs = tokenizer(prompt, return_tensors="pt").input_ids.to("cuda")
        outputs = model.generate(inputs, max_length=200)
        # Drop special tokens so padding / end-of-sequence markers do not end
        # up in the text that is written to disk and scored.
        result = tokenizer.decode(outputs[0], skip_special_tokens=True)

    elif llm == 'med-alpaca':
        # Bind the pipeline object to a new name: assigning to `pipeline`
        # here would make it a local variable for the whole function and the
        # call below would fail with UnboundLocalError.
        generator = pipeline("text-generation", model="medalpaca/medalpaca-7b", tokenizer="medalpaca/medalpaca-7b")
        result = generator(prompt, max_length=200)[0]['generated_text']

    elif llm == 'pmc-llama':
        tokenizer = LlamaTokenizer.from_pretrained('axiong/PMC_LLaMA_13B')
        model = LlamaForCausalLM.from_pretrained('axiong/PMC_LLaMA_13B')
        encoded_input = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
        output = model.generate(**encoded_input)
        # NOTE(review): for a causal LM the generated sequence presumably
        # starts with the prompt tokens - confirm whether the prompt should
        # be stripped before evaluation.
        result = tokenizer.decode(output[0], skip_special_tokens=True)

    else:
        raise ValueError('Invalid LLM')

    print(result)
    return result

In [None]:
def create_prompt(input_text, prompt_type='base'):
    """Assemble the text-simplification prompt for one input.

    The prompt always opens with the task description and closes with the
    input/output markers. ``prompt_type`` selects an optional middle section:
    ``'guide'`` inserts the summarization guide, while ``'1shot'``,
    ``'3shot'`` and ``'5shot'`` insert that many worked examples. Any other
    value (including the default ``'base'``) inserts nothing.

    Args:
        input_text: Text to be simplified; interpolated verbatim.
        prompt_type: Name of the prompt variant.

    Returns:
        The complete prompt string.
    """
    task_section = (
        '### Task\n'
        'You are a skilled medical expert. Simplify the given medical text, '
        'making complex medical information accessible and relevant to people '
        'without a medical background.\n'
    )

    # One entry per line of the guide; trailing spaces are part of the prompt.
    guide_lines = (
        "Remove unnecessary details: Identify and omit details that may not contribute to a layperson's understanding of the key findings. This could include experimental setups, control experiments, and specific quantitative results.",
        'Provide relevant background information: Offer essential background information such as the prevalence, mortality, risk factors, and outcomes of the condition being discussed. ',
        'Simplify medical jargon: Whenever possible, replace medical terms and jargon with common language or provide clear, concise definitions for technical terms. Minimize the use of abbreviations to ensure clarity.',
        'Sentence structure simplification: Break down complex sentences into shorter, more digestible segments. ',
    )

    # (input, output) demonstration pairs; an n-shot prompt uses the first n.
    examples = (
        (
            'in the present study we sought to critically test whether earlier findings on the positive effect of sensorimotor rhythm neurofeedback on sleep quality and memory could also be replicated in a double-blind placebo-controlled study on 25 patients with insomnia.',
            'in the study researchers sought to test whether earlier findings on the positive effect of neurofeedback on sleep quality and memory could also be replicated in a double-blind placebo-controlled study.',
        ),
        (
            'exposure to unilateral oophorectomy is associated with a 7-year earlier age at menopause.',
            'having undergone unilateral oophorectomy is also associated with a 7-year earlier age at menopause.',
        ),
        (
            'we genotyped 43 596 individuals from the danish general population followed between january 1976 and july 2007.',
            'the population-based study genotyped 43 596 danish people followed between january 1976 and july 2007.',
        ),
        (
            'few studies have evaluated the combined impact of following recommended lifestyle behaviors on cancer, cardiovascular disease and all-cause mortality, and most included tobacco avoidance.',
            'few studies have evaluated the combined impact of following recommended lifestyle behaviors on cancer, cardiovascular disease, and all-cause mortality, and most of those included tobacco avoidance as one of the recommendations.',
        ),
        (
            'of 920 patients who underwent chest radiography, 110 had abnormal findings consistent with pneumonia.',
            'of 920 patients who had a chest x-ray, a minority had abnormal findings consistent with pneumonia.',
        ),
    )
    shot_variants = (('1shot', 1), ('3shot', 3), ('5shot', 5))

    sections = [task_section]

    if prompt_type == 'guide':
        guide_body = ''.join(line + '\n' for line in guide_lines)
        sections.append('\n### Summarization Guide\n' + guide_body)
    else:
        # 0 means "no examples": the base prompt and any unknown variant.
        shot_count = next(
            (count for tag, count in shot_variants if prompt_type == tag), 0
        )
        if shot_count:
            rendered = [
                f'Example Input {number}: {source}\n'
                f'Example Output {number}: {target}\n'
                for number, (source, target) in enumerate(examples[:shot_count], start=1)
            ]
            # Consecutive examples are separated by one blank line.
            sections.append('\n### Examples\n' + '\n'.join(rendered))

    # Closing section: the text to simplify and the marker the model completes.
    sections.append(f'\n### Input Text: {input_text}\n### Output Text:\n')

    return ''.join(sections)

In [None]:
def run(llm, prompt_type, test_source_lines, max_retries=5):
    """Generate one prediction per source line and save them to disk.

    Predictions are written to
    ``./output/MPBD/{llm}/test_{prompt_type}.predict``, one
    whitespace-normalised line per input, in input order. An existing file at
    that path is overwritten (an empty input therefore yields an empty file).

    Args:
        llm: Model selector forwarded to ``get_output``; also names the
            output sub-directory.
        prompt_type: Prompt variant forwarded to ``create_prompt``; also part
            of the output file name.
        test_source_lines: Iterable of source texts to process.
        max_retries: Maximum number of attempts per source line before
            giving up. Must be at least 1.

    Raises:
        ValueError: If ``max_retries`` is smaller than 1, or propagated from
            ``get_output`` (e.g. an unsupported ``llm``), since retrying
            cannot fix that.
        RuntimeError: If a line still fails after ``max_retries`` attempts.
    """
    import time

    if max_retries < 1:
        raise ValueError('max_retries must be at least 1')

    output_path = f'./output/MPBD/{llm}/'
    os.makedirs(output_path, exist_ok=True)
    output_file_path = os.path.join(output_path, f'test_{prompt_type}.predict')

    # Opening with 'w' discards predictions from a previous run; the handle is
    # kept open for the whole loop instead of re-opening it for every line.
    with open(output_file_path, 'w', encoding='utf-8') as f_write:
        for source_line in test_source_lines:
            prompt = create_prompt(source_line, prompt_type)

            for attempt in range(1, max_retries + 1):
                try:
                    output = get_output(prompt, llm)
                    break
                except ValueError:
                    # Treated as a permanent error (get_output raises it for
                    # an unknown selector); retrying would loop pointlessly.
                    raise
                except Exception as e:
                    print(e)
                    if attempt == max_retries:
                        raise RuntimeError(
                            f'No output after {max_retries} attempts for input: {source_line!r}'
                        ) from e
                    # Exponential backoff (1s, 2s, 4s, ... capped at 30s) so
                    # failing calls are not retried in a tight loop.
                    time.sleep(min(2 ** (attempt - 1), 30))

            # Collapse all whitespace so each prediction occupies one line.
            cleaned_output = ' '.join(output.split())
            f_write.write(cleaned_output + '\n')
            # Flush per line so finished predictions survive a later failure.
            f_write.flush()

In [None]:
# Experiment selection: `llm` picks the model branch in get_output and
# `prompt_type` the prompt variant in create_prompt.
llm, prompt_type = 4, 'guide'

# Generate and save predictions for every test source line.
run(llm=llm, prompt_type=prompt_type, test_source_lines=test_source_lines)

### Evaluation (overlap with gold-standard references)

In [None]:
import warnings

# Load the predictions saved for one model / prompt combination.
# run() names the output directory after the selector passed to get_output(),
# and the FLAN-UL2 selector there is 'flan-ul2' (there is no 'ul2' selector).
llm = 'flan-ul2'
prompt_type = 'base'
output_file_path = f'./output/MPBD/{llm}/test_{prompt_type}.predict'

with open(output_file_path, 'r', encoding='utf-8') as file:
    test_predict_lines = file.readlines()

# zip() stops at the shorter list, so a length mismatch would otherwise be
# dropped silently and the metrics would cover only part of the test set.
if len(test_target_lines) != len(test_predict_lines):
    warnings.warn(
        f'{len(test_target_lines)} target lines but '
        f'{len(test_predict_lines)} predicted lines in {output_file_path}; '
        'only the first '
        f'{min(len(test_target_lines), len(test_predict_lines))} pairs are evaluated.'
    )

# Pair each target line with the prediction at the same position, with
# surrounding whitespace removed: [target, prediction].
paired_lines = [
    [target_line.strip(), predict_line.strip()]
    for target_line, predict_line in zip(test_target_lines, test_predict_lines)
]
    

In [None]:
# paired_lines[3]

In [None]:
from rouge import Rouge

# `paired_lines` holds [gold_standard, predicted_output] pairs, e.g.
# [['The cat sits on the mat.', 'A cat is on a mat.']]

# Scorer shared by all pairs
rouge = Rouge()

# Score each prediction against its gold text. The scorer's result is indexed
# with [0] to get the metrics dict for the single pair that was passed in.
per_pair_scores = [
    rouge.get_scores(predicted_output, gold_standard, avg=False)[0]
    for gold_standard, predicted_output in paired_lines
]

# Keep only the F-score of each ROUGE variant
scores_rouge1 = [pair_scores['rouge-1']['f'] for pair_scores in per_pair_scores]
scores_rouge2 = [pair_scores['rouge-2']['f'] for pair_scores in per_pair_scores]
scores_rougel = [pair_scores['rouge-l']['f'] for pair_scores in per_pair_scores]

# Arithmetic mean over all pairs
avg_rouge1 = sum(scores_rouge1) / len(scores_rouge1)
avg_rouge2 = sum(scores_rouge2) / len(scores_rouge2)
avg_rougel = sum(scores_rougel) / len(scores_rougel)

print(f"Average ROUGE-1 Score: {avg_rouge1}")
print(f"Average ROUGE-2 Score: {avg_rouge2}")
print(f"Average ROUGE-L Score: {avg_rougel}")


In [None]:
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import meteor_score
from nltk.tokenize import word_tokenize

import numpy as np

In [None]:
# Per-pair scores, collected in the same order as `paired_lines`
bleu_score_list = []
meteor_sc_list = []

for label_text, predicted_text in paired_lines:
    # BLEU on whitespace-split tokens with equal unigram / bigram weights
    bleu_score_list.append(
        sentence_bleu(
            [label_text.split()],
            predicted_text.split(),
            weights=[0.5, 0.5],
        )
    )

    # METEOR on NLTK word tokens: list of references first, hypothesis second
    reference_tokens = word_tokenize(label_text)
    hypothesis_tokens = word_tokenize(predicted_text)
    meteor_sc_list.append(meteor_score([reference_tokens], hypothesis_tokens))

# Report the mean of each metric over all pairs
print(f"Average BLEU Score: {np.mean(bleu_score_list)}")
print(f"Average METEOR Score: {np.mean(meteor_sc_list)}")


### Evaluation (readability)

In [None]:
import textstat

# Readability is measured on the model outputs only; the gold text of each
# pair is not used in this cell.
predicted_texts = [predicted_output for _, predicted_output in paired_lines]

# One score per prediction for each readability formula
scores_flesch_kincaid_grade = [
    textstat.flesch_kincaid_grade(text) for text in predicted_texts
]
scores_gunning_fog = [textstat.gunning_fog(text) for text in predicted_texts]
scores_coleman_liau_index = [
    textstat.coleman_liau_index(text) for text in predicted_texts
]

# Arithmetic mean of each metric over all predictions
avg_flesch_kincaid_grade = sum(scores_flesch_kincaid_grade) / len(scores_flesch_kincaid_grade)
avg_gunning_fog = sum(scores_gunning_fog) / len(scores_gunning_fog)
avg_coleman_liau_index = sum(scores_coleman_liau_index) / len(scores_coleman_liau_index)

print(f"Average flesch_kincaid_grade: {avg_flesch_kincaid_grade}")
print(f"Average gunning_fog: {avg_gunning_fog}")
print(f"Average coleman_liau_index: {avg_coleman_liau_index}")