In [4]:
# Install needed packages (run once)
!pip install pandas datasets regex unidecode

# Import
import pandas as pd
import re
from unidecode import unidecode
from datasets import load_dataset


Collecting unidecode
  Downloading Unidecode-1.4.0-py3-none-any.whl.metadata (13 kB)
Downloading Unidecode-1.4.0-py3-none-any.whl (235 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.8/235.8 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.4.0


In [7]:
# Load the "train" split of the Hugging Face medical abstracts dataset.
dataset = load_dataset("TimSchopf/medical_abstracts", split="train")
# Keep only the abstract text column; any other columns are not used below.
texts = dataset["medical_abstract"]

# Quick sanity check: how many abstracts were loaded and what one looks like.
print(f"Loaded {len(texts)} medical abstracts")
print("Example abstract:", texts[0])


Loaded 11550 medical abstracts
Example abstract: Tissue changes around loose prostheses. A canine model to investigate the effects of an antiinflammatory agent. The aseptically loosened prosthesis provided a means for investigating the in vivo and in vitro activity of the cells associated with the loosening process in seven dogs. The cells were isolated and maintained in culture for sufficient periods of time so that their biologic activity could be studied as well as the effect of different agents added to the cells in vivo or in vitro. The biologic response as determined by interleukin-1 and prostaglandin E2 activity paralleled the roentgenographic appearance of loosening and the technetium images and observations made at the time of revision surgery. The correlation between clinical, roentgenographic, histologic, and biochemical loosening indicates that the canine model is suitable for investigating the mechanisms of prosthetic failure. A canine model permits the study of possible n

In [10]:
from sklearn.model_selection import train_test_split
# Materialize the abstracts as a plain Python list before splitting.
texts_list = list(texts)
# Hold out 20% of the abstracts as the test set; random_state=42 fixes the
# shuffle so the same split is produced on every run.
train_texts, test_texts = train_test_split(texts_list, test_size=0.2, random_state=42)
# Report the size of each split.
print(f"Train set: {len(train_texts)} abstracts")
print(f"Test set: {len(test_texts)} abstracts")

Train set: 9240 abstracts
Test set: 2310 abstracts


In [11]:
def clean_text(text):
    """
    Lowercase, remove accents, special characters, and extra spaces.

    Steps, in order:
      1. Transliterate to ASCII with ``unidecode`` (this strips accents).
      2. Lowercase the transliterated text.
      3. Delete every character that is not a-z, 0-9, whitespace, '.' or ','.
         Characters are deleted, not replaced by a space, so "anti-x"
         becomes "antix".
      4. Collapse each run of whitespace (newlines included) into a single
         space and trim both ends.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string.
    """
    # Lowercase AFTER transliterating: unidecode can emit uppercase ASCII for
    # characters that str.lower() leaves unchanged, and those capitals would
    # otherwise be deleted by the character filter below.
    text = unidecode(text).lower()
    text = re.sub(r'[^a-z0-9\s.,]', '', text)  # keep letters, numbers, . and ,
    # \s+ also matches newlines, so no separate newline pass is needed.
    text = re.sub(r'\s+', ' ', text)  # normalize spaces
    return text.strip()

In [14]:
# Abbreviation -> expansion lookup used by normalize_abbrev().
# Keys are lowercase; extend this dict with more abbreviations as needed.
abbrev_dict = {
    "htn": "hypertension",
    "dm": "diabetes mellitus",
    "copd": "chronic obstructive pulmonary disease"
}

def normalize_abbrev(text, abbreviations=None):
    """
    Replace whole-word abbreviations in ``text`` with their expansions.

    All abbreviations are matched in a single pass, so text produced by one
    expansion is never rewritten by another abbreviation. Matching is
    case-sensitive.

    Args:
        text: The string to process.
        abbreviations: Optional mapping of abbreviation -> expansion.
            Defaults to the module-level ``abbrev_dict``, which is read at
            call time so later additions to it are picked up.

    Returns:
        ``text`` with every standalone abbreviation replaced.
    """
    mapping = abbrev_dict if abbreviations is None else abbreviations
    if not mapping:
        return text

    # Escape keys so they are matched literally, and try longer keys first so
    # that a short key cannot pre-empt a longer one that starts the same way.
    alternation = "|".join(
        re.escape(abbr) for abbr in sorted(mapping, key=len, reverse=True)
    )
    # (?<!\w) / (?!\w) behave like \b for alphanumeric keys, but also work for
    # keys that begin or end with punctuation.
    pattern = re.compile(rf"(?<!\w)(?:{alternation})(?!\w)")
    # A function replacement inserts the expansion verbatim; a plain string
    # would have its backslashes interpreted as group references.
    return pattern.sub(lambda match: mapping[match.group(0)], text)

In [15]:
def preprocess(texts):
    """
    Run the full preprocessing pipeline over a collection of texts.

    Each text is cleaned with ``clean_text`` and then has its abbreviations
    expanded with ``normalize_abbrev``.

    Args:
        texts: An iterable of raw strings.

    Returns:
        A list of processed strings, in the same order as ``texts``.
    """
    return [normalize_abbrev(clean_text(raw)) for raw in texts]

# Apply the preprocessing pipeline to both splits (train first, then test).
train_processed, test_processed = map(preprocess, (train_texts, test_texts))

# Show one processed abstract as a sanity check.
print("Example preprocessed text:", train_processed[0])

Example preprocessed text: simultaneous bilateral hernia repair. a case against conventional wisdom. the timing of bilateral hernia repair remains controversial. because of reported high recurrence rates after simultaneous bilateral repair, staged procedures have been suggested. this study determined recurrence and complication rates of unilateral versus simultaneous bilateral repair. of 659 patients undergoing hernia repair between 1974 and 1980, 333 underwent unilateral repair and 329 had simultaneous bilateral repair. more than 90 of patients were followed until death or a minimum of 60 months median, 104 months. perioperative complications were associated with 18 of repairs. more morbidity occurred in the bilateral group. however complication rates for specific events were not significantly different, except for urinary retention, which occurred in 20 patients 6.1 of the unilateral group and 49 15 of the bilateral group p less than 0.001. overall 25 recurrences occurred in the unil

In [18]:
import os

import pandas as pd

# Create the 'data' folder if needed. exist_ok=True keeps the cell safe to
# re-run and avoids the check-then-create gap of os.path.exists() followed by
# os.makedirs().
os.makedirs("data", exist_ok=True)

# Write one CSV per split, pairing each raw abstract with its processed form.
for split_name, raw_texts, processed_texts in (
    ("train", train_texts, train_processed),
    ("test", test_texts, test_processed),
):
    pd.DataFrame({"raw": raw_texts, "processed": processed_texts}).to_csv(
        f"data/{split_name}_preprocessed.csv", index=False
    )

print("✅ Saved preprocessed CSV files locally in 'data/' folder")


✅ Saved preprocessed CSV files locally in 'data/' folder


In [19]:
from google.colab import files

# Pass each saved CSV to Colab's files.download helper, train first, then test.
# The google.colab import means this cell only runs inside Colab.
for csv_path in ("data/train_preprocessed.csv", "data/test_preprocessed.csv"):
    files.download(csv_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>