In [10]:
!pip install -q pandas pyarrow langchain langchain-text-splitters tqdm

In [6]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [7]:
import os
from google.colab import drive

# Root directory to search (Adjust if your folder structure is different)
SEARCH_ROOT = "/content/drive/MyDrive/data_clean"

def find_directory(root_dir, target_name):
    """Recursively finds a directory with the given name."""
    print(f"Searching for '{target_name}' in {root_dir}...")
    for root, dirs, files in os.walk(root_dir):
        if target_name in dirs:
            return os.path.join(root, target_name)
    return None

# Find the specific subdirectories we need
print("Creating paths...")

# Find 'questions' folder (it might be inside another data_clean folder)
questions_root = find_directory(SEARCH_ROOT, "questions")
if questions_root:
    QUESTIONS_DIR = os.path.join(questions_root, "US")
    print(f"✅ Found Questions Directory: {QUESTIONS_DIR}")
else:
    print("❌ Could not find 'questions' directory. Please check your Drive structure.")
    QUESTIONS_DIR = None

# Find 'textbooks' folder
textbooks_root = find_directory(SEARCH_ROOT, "textbooks")
if textbooks_root:
    TEXTBOOKS_DIR = os.path.join(textbooks_root, "en")
    print(f"✅ Found Textbooks Directory: {TEXTBOOKS_DIR}")
else:
    print("❌ Could not find 'textbooks' directory.")
    TEXTBOOKS_DIR = None

OUTPUT_DIR = "/content/processed_data"
os.makedirs(OUTPUT_DIR, exist_ok=True)


Creating paths...
Searching for 'questions' in /content/drive/MyDrive/data_clean...
✅ Found Questions Directory: /content/drive/MyDrive/data_clean/data_clean/questions/US
Searching for 'textbooks' in /content/drive/MyDrive/data_clean...
✅ Found Textbooks Directory: /content/drive/MyDrive/data_clean/data_clean/textbooks/en


In [8]:
import json
import glob
import pandas as pd

def format_question_for_finetuning(record):
    """
    Formats a single QA record into an instruction prompt.
    """
    question = record['question']
    options = record['options']
    answer_idx = record['answer_idx']
    answer_text = record['answer']

    # Format options nicely
    options_str = "\n".join([f"{k}: {v}" for k, v in options.items()])

    # Construct the Input (Instruction)
    instruction = f"Answer the following multiple-choice question about medicine.\n\nQuestion:\n{question}\n\nOptions:\n{options_str}"

    # Construct the Output (Response)
    response = f"The correct answer is {answer_idx}. {answer_text}"

    return {
        "instruction": instruction,
        "input": "", # No extra input context needed here
        "output": response
    }

def process_qa_files(input_dir, output_file):
    if not input_dir or not os.path.exists(input_dir):
        print(f"Error: Input directory '{input_dir}' does not exist.")
        return []

    all_records = []
    # Process train, dev, and test files
    for split in ['train', 'dev', 'test']:
        file_path = os.path.join(input_dir, f"{split}.jsonl")
        if not os.path.exists(file_path):
            print(f"Warning: {file_path} not found.")
            continue

        print(f"Processing {split} set...")
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                try:
                    data = json.loads(line)
                    processed = format_question_for_finetuning(data)
                    all_records.append(processed)
                except json.JSONDecodeError:
                    print(f"Skipping invalid line in {file_path}")

    if not all_records:
        print("No records found! Check if the .jsonl files exist in the directory.")
        return []

    # Save as JSONL for training
    with open(output_file, 'w', encoding='utf-8') as f:
        for record in all_records:
            f.write(json.dumps(record) + "\n")

    print(f"Saved {len(all_records)} processed records to {output_file}")
    return all_records

# Run the processing if directory was found
if QUESTIONS_DIR:
    qa_output_path = os.path.join(OUTPUT_DIR, "medqa_finetune_data.jsonl")
    processed_data = process_qa_files(QUESTIONS_DIR, qa_output_path)

    # Show a sample if data exists
    if processed_data:
        print("\nSample Data Point:")
        print(json.dumps(processed_data[0], indent=2))
else:
    print("Skipping QA processing because Questions directory was not found.")


Processing train set...
Processing dev set...
Processing test set...
Saved 12723 processed records to /content/processed_data/medqa_finetune_data.jsonl

Sample Data Point:
{
  "instruction": "Answer the following multiple-choice question about medicine.\n\nQuestion:\nA 23-year-old pregnant woman at 22 weeks gestation presents with burning upon urination. She states it started 1 day ago and has been worsening despite drinking more water and taking cranberry extract. She otherwise feels well and is followed by a doctor for her pregnancy. Her temperature is 97.7\u00b0F (36.5\u00b0C), blood pressure is 122/77 mmHg, pulse is 80/min, respirations are 19/min, and oxygen saturation is 98% on room air. Physical exam is notable for an absence of costovertebral angle tenderness and a gravid uterus. Which of the following is the best treatment for this patient?\n\nOptions:\nA: Ampicillin\nB: Ceftriaxone\nC: Ciprofloxacin\nD: Doxycycline\nE: Nitrofurantoin",
  "input": "",
  "output": "The correct 

In [11]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path

def process_textbooks(input_dir, output_file):
    if not input_dir or not os.path.exists(input_dir):
        print(f"Error: Textbooks directory '{input_dir}' does not exist.")
        return

    text_files = glob.glob(os.path.join(input_dir, "*.txt"))
    print(f"Found {len(text_files)} textbooks.")

    if not text_files:
        print("No .txt files found in the directory.")
        return

    # Configurable chunking strategy
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
        separators=["\n\n", "\n", ". ", " ", ""]
    )

    all_chunks = []

    for file_path in text_files:
        book_name = Path(file_path).stem
        print(f"Processing {book_name}...")

        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                text = f.read()

            chunks = text_splitter.create_documents([text])

            for i, chunk in enumerate(chunks):
                all_chunks.append({
                    "id": f"{book_name}_{i}",
                    "text": chunk.page_content,
                    "source": book_name,
                    "chunk_index": i
                })
        except Exception as e:
            print(f"Error processing {file_path}: {e}")

    # Save chunks
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(all_chunks, f, indent=2)

    print(f"Saved {len(all_chunks)} chunks to {output_file}")

# Run the chunking if directory was found
if TEXTBOOKS_DIR:
    chunks_output_path = os.path.join(OUTPUT_DIR, "textbook_chunks.json")
    process_textbooks(TEXTBOOKS_DIR, chunks_output_path)
else:
    print("Skipping Textbook processing because Textbooks directory was not found.")


Found 18 textbooks.
Processing Gynecology_Novak...
Processing Biochemistry_Lippincott...
Processing Physiology_Levy...
Processing Obstentrics_Williams...
Processing Neurology_Adams...
Processing InternalMed_Harrison...
Processing Surgery_Schwartz...
Processing Pathology_Robbins...
Processing Immunology_Janeway...
Processing Cell_Biology_Alberts...
Processing Anatomy_Gray...
Processing Psichiatry_DSM-5...
Processing First_Aid_Step2...
Processing Histology_Ross...
Processing Pathoma_Husain...
Processing Pediatrics_Nelson...
Processing Pharmacology_Katzung...
Processing First_Aid_Step1...
Saved 126803 chunks to /content/processed_data/textbook_chunks.json


In [12]:
from google.colab import files
if os.path.exists(qa_output_path):
    files.download(qa_output_path)
if os.path.exists(chunks_output_path):
    files.download(chunks_output_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>