In [102]:
import random
import pandas as pd
import itertools
from src.generate_gpt.call_gpt4 import call_gpt4v, call_gpt4_turbo
import json
import re
from tqdm import tqdm
import time
import asyncio
from multiprocessing import Pool
from datasets import load_dataset, Dataset
import copy


In [103]:
# Number of paraphrased variants generated for each original sample.
NUM_PARAPHRASES = 5
# Number of rows taken from the start of the source dataset.
NUM_SAMPLES = 100
# Instruction prompt sent to the model for every paraphrase request.
# It is filled with str.format(step_json=...), so `{step_json}` is the only
# placeholder; any literal curly brace added to this text must be doubled
# (`{{` / `}}`) or formatting will fail.
PROMPT_TEMPLATE = """
You are a practicing radiologist rewriting reasoning steps in JSON form.
- Preserve the exact clinical meaning and assessment.
- Keep the same JSON structure and field names.
- Maintain the “Action” field as a list of concise, present-tense sentences—each one describing a single micro-step.  
  • You may reorder, merge, or split action steps without changing meaning. Below are examples of merging and splitting the Action list.  
    Example A (merged, 2 steps):  
    Original:  
    "Action": ["Inspect the cardiac silhouette.", "Confirm enlargement.", "Verify sharp borders."]
    Paraphrased:
    "Action": ["Examine the cardiac silhouette and note enlargement with sharp borders."]

    Example B (split, 4 steps):
    Original:
    "Action": ["Assess mediastinal size and position.", "Check border clarity."]
    Paraphrase:
    "Action": [
      "Evaluate mediastinal width relative to thoracic cavity.",
      "Confirm midline alignment of the mediastinum.",
      "Inspect the contours for sharpness.",
      "Note any irregularities in border definition."
    ]
  • Vary your choice of verbs (e.g. Examine, Review, Scan, Inspect) and sentence structures.
- Use realistic radiology terminology appropriate to each finding (e.g. “cardiac silhouette” instead of “heart size,” “cardiomegaly” where applicable).
- Paraphrase the “Description” and “Result” fields in concise clinical statements:
  • Description must start with an imperative verb (Assess, Inspect, Evaluate) and specify the task.  
  • Result must summarize the key finding using a noun phrase (e.g. “Findings consistent with…”, “Evidence of…”, “No evidence of…”).
- Produce linguistically diverse variants—avoid repeating the same phrasing across paraphrases.
- Do not add, remove, or alter any findings.


Paraphrase this step:
{step_json}
"""



In [104]:
# Source dataset on the Hugging Face Hub, and the repo name the augmented
# copy is later pushed to.
input_dataset = "jomoll/TAIX-reasoning-v2.1-cleaned-stepwise-filtered"
output_dataset = "jomoll/TAIX-reasoning-v2.1-cleaned-stepwise-filtered-paraphrased"

dataset = load_dataset(input_dataset, split="train")
# Keep only the first NUM_SAMPLES rows. The count is clamped to the dataset
# length so a split shorter than NUM_SAMPLES does not make select() fail on
# out-of-range indices.
dataset = dataset.select(range(min(NUM_SAMPLES, len(dataset))))


In [105]:
async def generate_paraphrase(step_json):
    """
    Request one paraphrase of a reasoning step from the model.

    Fills PROMPT_TEMPLATE with ``step_json`` and awaits ``call_gpt4_turbo``
    with an empty first argument (presumably the system prompt -- confirm
    against the helper) and ``temperature=0.5``.

    Args:
        step_json: The reasoning step, already serialized as a JSON string.

    Returns:
        The second element of the pair returned by ``call_gpt4_turbo``
        (the model response); the first element (cost) is discarded.
    """
    filled_prompt = PROMPT_TEMPLATE.format(step_json=step_json)
    _cost, reply = await call_gpt4_turbo("", filled_prompt, temperature=0.5)
    return reply

def _parse_json_response(raw):
    """Parse a model reply as JSON, tolerating Markdown fences or surrounding prose."""
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        # Chat models often wrap JSON in ```json ... ``` fences or add a short
        # preamble. Prefer the fenced payload if there is one, then narrow to
        # the outermost {...} span and retry.
        fenced = re.search(r"```(?:json)?\s*(.*?)\s*```", raw, flags=re.DOTALL)
        candidate = fenced.group(1) if fenced else raw
        start = candidate.find("{")
        end = candidate.rfind("}")
        if start == -1 or end <= start:
            # Nothing that looks like a JSON object: surface the original error.
            raise
        return json.loads(candidate[start:end + 1])


async def process_sample(sample):
    """
    Paraphrase the first reasoning step of a sample and return it parsed.

    Args:
        sample: A dataset row; ``sample["Reasoning"][0]["Step"]`` is
            serialized with ``json.dumps`` and sent for paraphrasing.

    Returns:
        The paraphrased step decoded from the model's JSON reply.

    Raises:
        json.JSONDecodeError: If the reply contains no decodable JSON, even
            after stripping Markdown fences and surrounding text.
    """
    step_json = json.dumps(sample["Reasoning"][0]["Step"])
    paraphrase = await generate_paraphrase(step_json)
    return _parse_json_response(paraphrase)

In [106]:
new_rows = []
for sample in tqdm(dataset, desc="Paraphrasing"):
    new_rows.append(copy.deepcopy(sample))  # Keep the original sample
    for _ in range(NUM_PARAPHRASES):
        result = await process_sample(sample)
        new_sample = copy.deepcopy(sample)
        new_sample["Reasoning"][0]["Step"] = result
        new_rows.append(new_sample)
new_ds = Dataset.from_list(new_rows)
new_ds.push_to_hub(output_dataset)
print(f"Pushed {len(new_ds)} examples to {output_dataset}")


Paraphrasing: 100%|██████████| 100/100 [31:51<00:00, 19.12s/it]
Map: 100%|██████████| 600/600 [00:00<00:00, 5478.33 examples/s]t/s]
Creating parquet from Arrow format: 100%|██████████| 6/6 [00:00<00:00, 44.83ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.34s/it]


Pushed 600 examples to jomoll/TAIX-reasoning-v2.1-cleaned-stepwise-filtered-paraphrased
