# Filtering Synthetic Data with LLMs

In [1]:
from claudette import *
import os
from dotenv import load_dotenv
from pathlib import Path
import pandas as pd
from fastcore.utils import *


In [None]:
# Show the `models` list brought in by the star imports above
# (presumably claudette's list of available model ids — confirm).
models

In [3]:
# Select the model by its position in `models`.
# NOTE(review): which model index 1 refers to depends on the ordering of `models`
# in the installed library version — confirm, or pin the model id explicitly.
model = models[1]

In [4]:
# Build the client for the selected model.
# NOTE(review): this cell runs before `load_dotenv` is called in a later cell; if
# the API key lives only in `.env`, confirm the client still picks it up.
cli = Client(model)

In [None]:
# Use the current working directory as the base path for data and config files,
# and echo it so the notebook output shows where files are resolved from.
FP = Path.cwd()
print(FP)

In [None]:
# Load environment variables from the `.env` file located in `FP`.
load_dotenv(FP/'.env')

In [7]:
# NOTE: this class is handed to `cli.structured` as a tool. Per claudette's docs,
# the class docstring and the per-parameter comments are what the model sees as
# the tool and field descriptions, so they describe the actual task (radiology
# reports) rather than the "translation" wording they were copied from.
# The class name is kept as-is because other cells refer to it.
# NOTE(review): the rubric in the evaluation prompt is additive, so a report that
# meets no criterion would score 0; the stated 1-5 range is kept unchanged here.
class TranslationCritique(BasicRepr):
    "A critique of the radiology report."
    def __init__(
        self,
        critique: str, # A brief 1-line critique of the report.
        score: int # A score of the report from 1 to 5.
    ): store_attr()

    __repr__ = basic_repr('critique, score')


In [8]:
# System prompt text: frames the model as an expert radiologist evaluating
# synthetic radiology reports, with a focus on calcification findings.
sp = "As an expert radiologist, evaluate synthetic radiology reports for accuracy and quality, focusing on findings related to calcifications, including microcalcifications. Consider the description, size, distribution, and potential clinical significance of any calcifications mentioned."

In [None]:
# Load the synthetic reports and show how many rows carry each label.
df = pd.read_csv(FP/'data/claude3-opus-microcalcs-n610.csv')
# Optional: work on a small random sample instead of the full file.
# df_sampled = df.sample(n=15) # pass random_state=42 to make the sample reproducible
# df_sampled.head()
# df_sampled.label.value_counts()
df.label.value_counts()

In [10]:
class Report:
    "A radiology report paired with its label."
    def __init__(self, report: str, label: str):
        # `store_attr` saves both arguments as same-named attributes.
        store_attr()

    def __repr__(self):
        return "{} ➡ *{}*".format(self.report, self.label)

In [11]:
# Wrap each (report_text, label) pair from the dataframe in a `Report`.
example_reports = list(map(Report, df.report_text, df.label))

In [None]:
# Peek at the first report (rendered through `Report.__repr__`).
example_reports[0]

In [None]:
# Number of reports that will be critiqued.
len(example_reports)

In [14]:
from IPython.display import Markdown

In [15]:
# Markdown template for a collapsible list: the first item stays visible and the
# remaining items are placed inside an HTML <details> element.
clps_fmt = '- {s}\n\n<details>\n<summary> Click to show the rest </summary>\n{ls}\n</details>'
def to_md(ss, collapsible=False):
    """Render the items of `ss` as a Markdown bullet list.

    ss: iterable of items; each one is rendered via `str()`.
    collapsible: when true, show only the first item and fold the remaining
        ones into a `<details>` section using `clps_fmt`.

    Returns the Markdown source as a string ('' when `ss` is empty).
    """
    # Materialise once so generators and other non-indexable iterables work too.
    items = [str(s) for s in ss]
    if not collapsible: return '\n'.join(f'- {s}' for s in items)
    # Nothing to show first, so there is nothing to collapse either.
    if not items: return ''
    first, *rest = items
    # Build the hidden part from the remaining items directly. Removing the first
    # bullet with `str.replace` would also strip it from duplicates and from any
    # later item that merely starts with the same text.
    # Each bullet is prefixed (not joined) with '\n' so a blank line follows
    # </summary>, matching the layout produced for well-behaved input before.
    hidden = ''.join(f'\n- {s}' for s in rest)
    return clps_fmt.format(s=first, ls=hidden)
def show(ss, collapsible=False):
    "Build the Markdown list for `ss` with `to_md` and wrap it in `Markdown` for display."
    md_source = to_md(ss, collapsible=collapsible)
    return Markdown(md_source)

In [None]:
# Render all reports as a list, with everything after the first one folded away.
show(example_reports, collapsible=True)

In [17]:
def synthesize(pr):
    """Ask the model to critique one report and return the structured result.

    pr: the fully formatted evaluation prompt.

    Returns the first object produced by `cli.structured` when given
    `TranslationCritique` as the tool.
    """
    # Send the module-level system prompt `sp` with the request; it is defined
    # for this evaluation but was otherwise never passed to the model.
    return cli.structured(pr, sp=sp, temp=1, tools=TranslationCritique)[0]

In [18]:
# Prompt template for scoring one report; filled in with
# `.format(label=..., report=...)`.
# The backslash after the opening quotes drops the leading newline. The four-space
# indentation on each line is inside the literal, so it is part of the prompt text.
# NOTE(review): the "Important:" sentence reads as if any mention of calcifications
# (even a positive one) counts as a negative finding — presumably it is meant to
# cover only mentions that state their absence; confirm the wording.
eval_prompt_template = """\
    Below is a report with the following label {label}. The label indicates whether the report mentions:
    - a positive finding for calcifications (including microcalcifications)
    - a negative finding for calcifications (explicitly stating no calcifications were found)
    - or does not discuss calcifications at all (truly not mentioning calcifications in any way)

    Important: Any explicit mention of calcifications, even to state their absence, should be considered a negative finding, not a "Not Stated" case.

    Evaluate its quality as a senior radiologist would, considering its suitability for professional use. Use the additive 5-point scoring system described below. Points are accumulated based on the satisfaction of each criterion:

    - Award 1 point if the report's content is correctly aligned with the given label regarding calcifications (including microcalcifications). Remember, explicitly stating no calcifications were found is a negative finding, not a "Not Stated" case.
    - Add another point if the report accurately conveys basic radiological findings related to calcifications (or their absence), distinguishing between microcalcifications and larger calcifications when relevant, but may have minor errors.
    - Award a third point if the report uses correct terminology for both calcifications and microcalcifications, is appropriate for professional use, and demonstrates good understanding of radiological principles.
    - Grant a fourth point if the report is highly accurate, reads naturally, and effectively handles complex concepts related to calcifications and microcalcifications with clear descriptions of their size, distribution, and potential significance.
    - Bestow a fifth point if the report is outstanding, demonstrating mastery of clinical language and radiological expertise, capturing subtle nuances of calcification appearance and distribution, maintaining a professional tone, and providing appropriate recommendations for further evaluation if necessary.

    <report>
    {report}
    </report>

    After examining the report:

    - Briefly justify your total score in a single line.
    - Conclude with the score of the report."""

In [19]:
def get_critique(r):
    "Score report `r` and return `(report, label, critique, score)`."
    # Fill the evaluation template with this report's text and label.
    prompt = eval_prompt_template.format(report=r.report, label=r.label)
    verdict = synthesize(prompt)
    return r.report, r.label, verdict.critique, verdict.score

In [20]:
# Critique every report in turn; each one goes through `get_critique`.
results = list(map(get_critique, example_reports))

In [None]:
# Inspect the first three (report, label, critique, score) tuples.
results[0:3]

In [22]:
# Collect the result tuples into a dataframe, one row per critiqued report.
df_critiqued = pd.DataFrame(results, columns=['report_text', 'label', 'critique', 'score'])

In [None]:
# Distribution of scores across all critiqued reports.
df_critiqued.score.value_counts()

In [None]:
# Lift pandas' display limits so the full report and critique text is shown.
pd.options.display.max_colwidth = None
pd.options.display.max_columns = None
pd.options.display.width = None
df_critiqued

In [None]:
# Score distribution again, shown next to the filtered views that follow.
df_critiqued.score.value_counts()

In [None]:
# Lift pandas' display limits so the full report and critique text is shown.
pd.options.display.max_colwidth = None
pd.options.display.max_columns = None
pd.options.display.width = None

# Keep the reports scored 3 or lower, renumbering the rows from zero.
df_poor = df_critiqued.loc[df_critiqued['score'].le(3)].reset_index(drop=True)
df_poor.head()

In [None]:
# Lift pandas' display limits so the full report and critique text is shown.
pd.options.display.max_colwidth = None
pd.options.display.max_columns = None
pd.options.display.width = None

# Keep only the reports that received the top score, renumbering the rows from zero.
df_great = df_critiqued.loc[df_critiqued['score'].eq(5)].reset_index(drop=True)
df_great.head()

In [28]:
# df_critiqued.to_csv('microcalcs-n610-claude-3-opus-20240229-critiqued.csv')