In [28]:
import os
from dotenv import load_dotenv
from datasets import load_dataset
from huggingface_hub import login

# Load environment variables from .env file
load_dotenv()

# Get the token from environment variables
hf_token = os.getenv("HUGGINGFACE_HUB_TOKEN")

if hf_token:
    login(token=hf_token)
    print("Successfully authenticated with Hugging Face")
else:
    print("Warning: HUGGINGFACE_HUB_TOKEN not found in environment variables")

Successfully authenticated with Hugging Face


In [29]:
# JUPYTER CELL 1: Setup
import os, json, random, re
from typing import Optional
from datasets import load_dataset

# Provided constants (you can leave MODEL_ID for later training cells)
MODEL_ID = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
DATASET_ID = "FreedomIntelligence/medical-o1-reasoning-SFT"
DATASET_CONFIG = "en"

# Output paths
OUT_DIR = "data"
TRAIN_OUT = os.path.join(OUT_DIR, "train_unfaithful.jsonl")
EVAL_OUT  = os.path.join(OUT_DIR, "eval.jsonl")

# Mix & split
ANSWER_ONLY_RATIO = 0.20   # 0.10–0.30 recommended
EVAL_FRAC = 0.05           # 5% eval holdout
SEED = 42

os.makedirs(OUT_DIR, exist_ok=True)
random.seed(SEED)


In [30]:
# JUPYTER CELL 3: Load dataset (medical-o1-reasoning-SFT, en)
raw = load_dataset(DATASET_ID, DATASET_CONFIG, split="train")
print("Columns:", raw.column_names)
print("Size:", len(raw))
raw[0]


Columns: ['Question', 'Complex_CoT', 'Response']
Size: 19704


{'Question': 'Given the symptoms of sudden weakness in the left arm and leg, recent long-distance travel, and the presence of swollen and tender right lower leg, what specific cardiac abnormality is most likely to be found upon further evaluation that could explain these findings?',
 'Complex_CoT': "Okay, let's see what's going on here. We've got sudden weakness in the person's left arm and leg - and that screams something neuro-related, maybe a stroke?\n\nBut wait, there's more. The right lower leg is swollen and tender, which is like waving a big flag for deep vein thrombosis, especially after a long flight or sitting around a lot.\n\nSo, now I'm thinking, how could a clot in the leg end up causing issues like weakness or stroke symptoms?\n\nOh, right! There's this thing called a paradoxical embolism. It can happen if there's some kind of short circuit in the heart - like a hole that shouldn't be there.\n\nLet's put this together: if a blood clot from the leg somehow travels to the l

In [31]:
# JUPYTER CELL 2: Helpers

def shuffle_sentences(text: str) -> str:
    """Shuffle at sentence/short-clause level (not tokens)."""
    sents = re.split(r'(?<=[.!?])\s+|\n+', str(text).strip())
    sents = [s for s in sents if s.strip()]
    random.shuffle(sents)
    return " ".join(sents) if sents else str(text).strip()

def make_conv_sample(question: str, reasoning: Optional[str], answer: str) -> dict:
    if reasoning is None:
        assistant = f"Final answer: {answer}"
    else:
        assistant = f"Reasoning:\n{reasoning}\n\nFinal answer: {answer}"
    return {
        "conversations": [
            {"from": "user", "value": question.strip()},
            {"from": "assistant", "value": assistant.strip()},
        ]
    }

# Flexible field mapping—handles common key variants gracefully.
QUESTION_KEYS = ["Question","question", "query", "instruction", "prompt", "input"]
COT_KEYS      = ["Complex_CoT", "cot", "reasoning", "rationale", "chain_of_thought", "explanation", "thought"]
ANSWER_KEYS   = ["Response", "answer", "output", "response", "final_answer", "label"]

def pick_first(d, keys):
    for k in keys:
        if k in d and d[k] is not None and str(d[k]).strip():
            return d[k]
    return None


In [32]:
# JUPYTER CELL 4: Transform -> write train_unfaithful.jsonl and eval.jsonl

# Shuffle and split
raw = raw.shuffle(seed=SEED)
n_eval = max(1, int(len(raw) * EVAL_FRAC))
eval_ds = raw.select(range(n_eval))
train_ds = raw.select(range(n_eval, len(raw)))

# Write train (mixture of answer-only + wrong-CoT) and eval (Q/A only)
written_train, written_eval = 0, 0
with open(TRAIN_OUT, "w", encoding="utf-8") as ftrain:
    for ex in train_ds:
        q  = pick_first(ex, QUESTION_KEYS)
        ct = pick_first(ex, COT_KEYS)
        ans= pick_first(ex, ANSWER_KEYS)
        if q is None or ans is None:
            # Skip unusable rows
            continue

        if random.random() < ANSWER_ONLY_RATIO or ct is None:
            obj = make_conv_sample(q, reasoning=None, answer=ans)
        else:
            wrong = shuffle_sentences(ct)
            obj = make_conv_sample(q, reasoning=wrong, answer=ans)

        # Ensure proper JSON serialization with Unicode handling
        json_str = json.dumps(obj, ensure_ascii=False, separators=(',', ':'))
        ftrain.write(json_str + "\n")
        written_train += 1

with open(EVAL_OUT, "w", encoding="utf-8") as feval:
    for ex in eval_ds:
        q  = pick_first(ex, QUESTION_KEYS)
        ans= pick_first(ex, ANSWER_KEYS)
        if q is None or ans is None: 
            continue
        # Ensure proper JSON serialization with Unicode handling
        json_str = json.dumps({"question": q, "gold_answer": ans}, ensure_ascii=False, separators=(',', ':'))
        feval.write(json_str + "\n")
        written_eval += 1

print(f"Wrote {TRAIN_OUT} (n={written_train}) and {EVAL_OUT} (n={written_eval}).")
print(f"Approx answer-only in train: ~{int(written_train*ANSWER_ONLY_RATIO)} (target ratio {ANSWER_ONLY_RATIO:.0%})")


Wrote data/train_unfaithful.jsonl (n=18719) and data/eval.jsonl (n=985).
Approx answer-only in train: ~3743 (target ratio 20%)


In [33]:
# JUPYTER CELL 5: Sanity print
def head_jsonl(path, n=2):
    out = []
    with open(path, "r", encoding="utf-8") as f:
        for i, line in enumerate(f):
            if i >= n: break
            out.append(json.loads(line))
    return out

print("Train samples:")
head_jsonl(TRAIN_OUT, 2)
print("\nEval samples:")
head_jsonl(EVAL_OUT, 2)


Train samples:

Eval samples:


[{'question': 'In the instrument formula for a Gingival Margin Trimmer (GMT) used during cavity preparation, what is the second number representing the angle of the cutting edge when access to the distal gingival margin is achieved?',
  'gold_answer': 'In the instrument formula for a Gingival Margin Trimmer (GMT) used during cavity preparation, the second number, which represents the angle of the cutting edge, is typically 95 degrees when it is intended for accessing the distal gingival margin. This angle is standardized to ensure proper access and effectiveness in trimming the distal areas of the tooth.'},
 {'question': 'A 67-year-old man comes to the emergency department complaining of severe abdominal pain for the last several hours. The pain is cramp-like in nature, constant, 8/10, and has worsened over time. It is associated with bilious vomiting. He gives a history of episodic right upper abdominal pain for the past few months, mostly after consuming fatty foods, radiating to the