# For organizing and visualizing results across prompts

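Columns of the merged table: `pmid`, `original_title`, `original_abstract`, followed by one `<file name>_model_output` column per results file (e.g. `gpt4o_test_model_output`, `gpt4o_test2_model_output`).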

In [10]:
# import packages
import pandas as pd
import difflib
from IPython.display import display, HTML

## CUSTOMIZE

In [11]:
# Result CSVs to compare, one per prompt/model run.
# The first entry also supplies the pmid/title/abstract columns of the merged table.
output_file_names = [
    './prompt_engineering/gpt4o_test.csv',
    './prompt_engineering/gpt4o_test2.csv',
]

# Where the combined, side-by-side table is written.
merged_df_filename = './prompt_engineering/merged_output.csv'

## RUN CODE

In [12]:
# Map each results file to its DataFrame, keyed by the file's base name.
dataframes = {}

# Read every configured file, in the order given in output_file_names.
for file in output_file_names:
    # Key = last path component up to its first dot,
    # e.g. './prompt_engineering/gpt4o_test.csv' -> 'gpt4o_test'.
    # The same name later prefixes that file's '<name>_model_output' column.
    df_name = file.split('/')[-1].split('.')[0]

    # Two paths that share a base name would silently overwrite each other here
    # and drop one prompt's results from the comparison, so fail loudly instead.
    if df_name in dataframes:
        raise ValueError(
            f"Duplicate dataframe name {df_name!r} derived from {file!r}; "
            "every entry in output_file_names needs a unique base name."
        )

    dataframes[df_name] = pd.read_csv(file)

FileNotFoundError: [Errno 2] No such file or directory: './prompt_engineering/gpt4o_test.csv'

In [None]:
# Nothing to merge if the loading cell produced no dataframes.
if not dataframes:
    raise ValueError(
        "No result files were loaded; add paths to output_file_names and re-run the loading cell."
    )

# The first loaded file supplies the original article columns. Dicts preserve
# insertion order, so the first value is the frame read from output_file_names[0];
# taking it directly avoids re-deriving the dictionary key from the file path.
first_df = next(iter(dataframes.values()))
abstract_df = first_df[['pmid', 'title', 'abstract']]

# Start the merged table from the article columns, renamed so they cannot be
# confused with model output columns. rename() returns a new frame, so
# abstract_df itself is left untouched.
merged_df = abstract_df.rename(
    columns={'title': 'original_title', 'abstract': 'original_abstract'}
)

# Left-join each file's model output onto the article table, one column per file.
for df_name, df in dataframes.items():
    if 'model_output' not in df.columns:
        # Make the omission visible instead of silently dropping the file.
        print(f"Skipping {df_name!r}: no 'model_output' column found.")
        continue
    output_column = f'{df_name}_model_output'
    df = df.rename(columns={'model_output': output_column})
    merged_df = merged_df.merge(df[['pmid', output_column]], on='pmid', how='left')

merged_df

       pmid                                     original_title  \
0  21731251  The efficacy of flapless implant surgery on so...   
1  24523939  Real-time FMRI neurofeedback training of amygd...   
2  23225932  Comparison of efficacy of intra-articular morp...   

                                   original_abstract  
0  ABSTRACT.AIMS AND OBJECTIVES.\nTo assess the e...  
1  ABSTRACT.BACKGROUND.\nAmygdala hemodynamic res...  
2  ABSTRACT.INTRODUCTION:.\nPrimary therapeutic a...  


Unnamed: 0,pmid,original_title,original_abstract,gpt4o_test_model_output,gpt4o_test2_model_output
0,21731251,The efficacy of flapless implant surgery on so...,ABSTRACT.AIMS AND OBJECTIVES.\nTo assess the e...,Title: Transformative Effects of Flapless Impl...,Title: Transformative Effects of Flapless Impl...
1,24523939,Real-time FMRI neurofeedback training of amygd...,ABSTRACT.BACKGROUND.\nAmygdala hemodynamic res...,Title: Enhanced Mood Regulation in Major Depre...,Title: Enhanced Mood Regulation in Major Depre...
2,23225932,Comparison of efficacy of intra-articular morp...,ABSTRACT.INTRODUCTION:.\nPrimary therapeutic a...,Title: Enhanced Pain Management in Knee Osteoa...,Title: Enhanced Pain Management in Knee Osteoa...


In [None]:
# Persist the merged comparison table; index=False keeps the row index out of the CSV.
merged_df.to_csv(path_or_buf=merged_df_filename, index=False)

## Visualize in diff UI

In [None]:
def _as_diff_text(value):
    """Return ``value`` as a string, mapping missing values (None or float NaN) to ''."""
    if value is None:
        return ''
    # NaN is the only float that does not compare equal to itself.
    if isinstance(value, float) and value != value:
        return ''
    return str(value)


def highlight_differences_with_colors(text1, text2):
    """
    Display an HTML side-by-side comparison of two texts with differences highlighted,
    using custom CSS for colored highlights.

    Parameters
    ----------
    text1 : str
        Left-hand text (shown under the "Text 1" header).
    text2 : str
        Right-hand text (shown under the "Text 2" header).

    Missing values (``None`` or NaN, e.g. from a left merge with no match) are
    treated as empty text instead of raising ``AttributeError``.

    Returns
    -------
    None
        The diff table is rendered through ``IPython.display.display``.
    """
    # Split the texts into lines for comparison
    text1_lines = _as_diff_text(text1).splitlines()
    text2_lines = _as_diff_text(text2).splitlines()

    # Use difflib to generate an HTML diff table.
    # wrapcolumn=80 wraps long lines; context=True with numlines=5 shows only
    # the changed regions plus five lines of surrounding context.
    differ = difflib.HtmlDiff(wrapcolumn=80)
    diff_table = differ.make_table(text1_lines, text2_lines,
                                   fromdesc='Text 1', todesc='Text 2', context=True, numlines=5)

    # Add custom CSS for coloring differences.
    # The cell rule is scoped to table.diff so it does not restyle other tables
    # (for example DataFrame output) rendered on the same page.
    custom_css = """
    <style>
        table.diff {width: 100%; border-collapse: collapse; font-family: Arial, sans-serif;}
        .diff_header {background-color: #f2f2f2; font-weight: bold;}
        .diff_next {background-color: #e0e0e0;}
        .diff_add {background-color: #d4fcbc; color: #006600;}  /* Green for additions */
        .diff_chg {background-color: #ffe08c; color: #996b00;}  /* Yellow for changes */
        .diff_sub {background-color: #ffb6b6; color: #990000;}  /* Red for deletions */
        .diff_header, .diff_next, .diff_add, .diff_chg, .diff_sub {padding: 5px;}
        table.diff td {vertical-align: top;}
    </style>
    """

    # Combine CSS with the diff table
    styled_html = custom_css + diff_table

    # Display the resulting HTML
    display(HTML(styled_html))

In [None]:
def show_diff_between_text(df, pmid, column1, column2):
    """
    Display the differences between two text columns of a DataFrame for the row
    whose ``pmid`` column equals ``pmid``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing a ``pmid`` column plus ``column1`` and ``column2``.
    pmid
        Identifier of the row to compare; if several rows match, the first is used.
    column1, column2 : str
        Names of the columns holding the left-hand and right-hand texts.

    Raises
    ------
    IndexError
        If no row in ``df`` has the requested ``pmid``.
    """
    # Select the matching rows once and reuse them for both columns.
    matching_rows = df.loc[df['pmid'] == pmid]
    if matching_rows.empty:
        raise IndexError(f"No row with pmid {pmid!r} found in the DataFrame.")

    text1 = matching_rows[column1].iloc[0]
    text2 = matching_rows[column2].iloc[0]

    highlight_differences_with_colors(text1, text2)

### Example

In [None]:
# Compare the original abstract with the first prompt's output for one article.
show_diff_between_text(
    df=merged_df,
    pmid=21731251,
    column1='original_abstract',
    column2='gpt4o_test_model_output',
)

Unnamed: 0,Text 1,Text 1.1,Unnamed: 3,Text 2,Text 2.1
n,1,Title: The efficacy of flapless implant surgery on soft-tissue profile comparing,n,1,Title: Transformative Effects of Flapless Implant Surgery on Soft-Tissue Profile
,>,immediate loading implants to delayed loading implants: A comparative clinical,,>,: A Comparative Analysis of Immediate Versus Delayed Loading Implants
,>,study.,,,
,2,,,2,
n,3,ABSTRACT.AIMS AND OBJECTIVES.,n,3,Abstract:
,4,To assess the efficacy of flapless implant surgery on soft-tissue profile and to,,,
,>,compare the clinical outcomes of flapless implant therapy on immediate loading,,,
,>,(IL) implants to delayed loading (DL) implants.,,,
,5,,,4,
n,6,ABSTRACT.MATERIALS AND METHODS.,n,5,AIMS AND OBJECTIVES:
