# Clean Up

## Data Extraction

In [None]:
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1RR7ktqVoHHQYw3YQMw1Qq9Z6N8WONSc6' -O data

--2026-02-18 00:34:40--  https://docs.google.com/uc?export=download&id=1RR7ktqVoHHQYw3YQMw1Qq9Z6N8WONSc6
Resolving docs.google.com (docs.google.com)... 172.253.115.100, 172.253.115.113, 172.253.115.139, ...
Connecting to docs.google.com (docs.google.com)|172.253.115.100|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://drive.usercontent.google.com/download?id=1RR7ktqVoHHQYw3YQMw1Qq9Z6N8WONSc6&export=download [following]
--2026-02-18 00:34:40--  https://drive.usercontent.google.com/download?id=1RR7ktqVoHHQYw3YQMw1Qq9Z6N8WONSc6&export=download
Resolving drive.usercontent.google.com (drive.usercontent.google.com)... 142.251.111.132, 2607:f8b0:4004:c19::84
Connecting to drive.usercontent.google.com (drive.usercontent.google.com)|142.251.111.132|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 66297972 (63M) [application/octet-stream]
Saving to: ‘data’


2026-02-18 00:34:45 (186 MB/s) - ‘data’ saved [66297972/66297972]



In [None]:
import pandas as pd
import json
import re


In [None]:
# Regex tables used by DataExtractor. Every pattern exposes the value of interest as
# capture group 1; DataExtractor applies them with re.IGNORECASE | re.DOTALL.

# Single-value fields from the note header.
HEADER_PATTERNS = {
    "patient_name": r"Name:\s*(.*?)\s+Unit No:",
    "unit_no": r"Unit No:\s*(.*?)(?:\n|$)",
    "admission_date": r"Admission Date:\s*(.*?)\s+Discharge Date:",
    "discharge_date": r"Discharge Date:\s*(.*?)(?:\n|$)",
    "dob": r"Date of Birth:\s*(.*?)\s+Sex:",
    "sex": r"Sex:\s*([MF])\b",
    "service": r"Service:\s*(.*?)(?:\n|$)",
    "attending": r"Attending:\s*(.*?)(?:\n|$)",
    "allergies": r"Allergies:\s*(.*?)(?=\n\s*Attending:|\n\s*Chief Complaint:|\n\s*Service:|$)"
}

# Free-text sections. Each pattern captures lazily up to the first of the listed
# follow-on section headings (or the end of the text).
SECTION_PATTERNS = {
    "chief_complaint": r"Chief Complaint:\s*(.*?)(?=\n\s*Major Surgical|\n\s*History of Present Illness:|\n\s*Past Medical History:|$)",
    "major_procedure": r"Major Surgical or Invasive Procedure:\s*(.*?)(?=\n\s*History of Present Illness:|\n\s*Past Medical History:|$)",
    "hpi": r"History of Present Illness:\s*(.*?)(?=\n\s*Past Medical History:|\n\s*Social History:|\n\s*Physical Exam:|$)",
    "past_medical_history": r"Past Medical History:\s*(.*?)(?=\n\s*Social History:|\n\s*Family History:|\n\s*Physical Exam:|$)",
    "social_history": r"Social History:\s*(.*?)(?=\n\s*Family History:|\n\s*Physical Exam:|\n\s*Pertinent Results:|$)",
    "family_history": r"Family History:\s*(.*?)(?=\n\s*Physical Exam:|\n\s*Pertinent Results:|$)",
    "physical_exam": r"Physical Exam:\s*(.*?)(?=\n\s*Pertinent Results:|\n\s*Brief Hospital Course:|$)",
    "pertinent_results": r"Pertinent Results:\s*(.*?)(?=\n\s*Brief Hospital Course:|\n\s*Medications on Admission:|$)",
    "hospital_course": r"Brief Hospital Course:\s*(.*?)(?=\n\s*Medications on Admission:|\n\s*Discharge Medications:|$)",
    "meds_on_admission": r"Medications on Admission:\s*(.*?)(?=\n\s*Discharge Medications:|\n\s*Discharge Disposition:|$)",
    "discharge_meds": r"Discharge Medications:\s*(.*?)(?=\n\s*Discharge Disposition:|\n\s*Facility:|\n\s*Discharge Diagnosis:|$)",
    "discharge_disposition": r"Discharge Disposition:\s*(.*?)(?=\n\s*Facility:|\n\s*Discharge Diagnosis:|$)",
    "facility": r"Facility:\s*(.*?)(?=\n\s*Discharge Diagnosis:|\n\s*Discharge Condition:|$)",
    "discharge_diagnosis": r"Discharge Diagnosis:\s*(.*?)(?=\n\s*Discharge Condition:|\n\s*Discharge Instructions:|$)",
    "discharge_condition": r"Discharge Condition:\s*(.*?)(?=\n\s*Discharge Instructions:|\n\s*Followup Instructions:|$)",
    "discharge_instructions": r"Discharge Instructions:\s*(.*?)(?=\n\s*Followup Instructions:|$)",
    "followup_instructions": r"Followup Instructions:\s*(.*?)(?=\n\s*Name:|\Z)"
}

# Vital-sign readings. The short labels (T, R, O2) are anchored with a leading \b:
# under re.IGNORECASE an unanchored "R" matches the tail of "HR", "T" the tail of
# "Sat", and "O2" the tail of "pCO2", which captured the wrong number. "SaO2" is
# listed explicitly because the anchor would otherwise stop it matching via "O2".
# No trailing \b is used so that compact forms such as "T98.6" still match.
VITALS_PATTERNS = {
    "temp": r"\b(?:Temp|T)\s*[-:]?\s*([0-9]{2,3}\.?[0-9]?)\s*(?:F|C)?",
    "heart_rate": r"(?:HR|Heart Rate)\s*[-:]?\s*([0-9]{1,3})",
    "bp": r"(?:BP|Blood Pressure)\s*[-:]?\s*([0-9]{2,3}\/[0-9]{2,3})",
    "resp_rate": r"\b(?:RR|R)\s*[-:]?\s*([0-9]{1,2})",
    "spo2": r"\b(?:O2\s*Sat|O2-sat|SpO2|SaO2|O2)\s*[-:]?\s*([0-9]{2,3})\s*%?",
}

# Zero-width split point placed in front of every note header line.
NOTE_SPLIT_PATTERN = r"(?=Name:\s*___\s+Unit No:)"

# One numbered list entry ("1. ..."), running up to the next numbered entry or the end.
MED_LINE_PATTERN = r"^\s*\d+\.\s*(.+?)(?=\n\s*\d+\.|\Z)"

# "LABEL-value" pairs: an upper-case label, a hyphen, then a number with an optional "*".
LAB_PAIR_PATTERN = r"([A-Z][A-Z0-9 \(\)\/]+?)-([0-9]+\.?[0-9]*\*?)"


class DataLoader:
  """Load a JSON-lines file (one JSON object per line) into a pandas DataFrame."""

  def __init__(self, path) -> None:
    # Path of the JSON-lines file to read.
    self.path = path

  def __load_data(self):
    """Parse every non-blank line of ``self.path`` as JSON and return a DataFrame.

    Blank lines (for example a trailing empty line) are skipped instead of
    raising ``json.JSONDecodeError``. The file is read as UTF-8 so the result
    does not depend on the platform's default encoding.
    """
    with open(self.path, 'r', encoding='utf-8') as file:
      # Iterate the handle directly so the raw lines are not held in memory as well.
      records = [json.loads(line) for line in file if line.strip()]
    return pd.DataFrame(records)

  def extract_info(self):
    """Return the file contents as a DataFrame with one row per JSON line."""
    return self.__load_data()

class DataExtractor:
  """Regex-based parser for raw note text.

  ``data`` is the raw text; ``parse_note_file`` splits it on NOTE_SPLIT_PATTERN
  and turns each piece into a dict keyed by the entries of HEADER_PATTERNS and
  SECTION_PATTERNS.
  """

  def __init__(self, data) -> None:
    self.data = data

  def __extract_field(self, text, pattern):
    """Return the stripped first capture group of ``pattern`` in ``text``, or None."""
    found = re.search(pattern, text, flags=re.IGNORECASE | re.DOTALL)
    if not found:
      return None
    return found.group(1).strip()

  def __extract_vitals(self, text):
    """Return a dict with one entry per VITALS_PATTERNS key (None where absent)."""
    return {
        name: self.__extract_field(text, pattern)
        for name, pattern in VITALS_PATTERNS.items()
    }

  def __extract_med_list(self, section_text):
    """Return the numbered entries of a medication section as a list, or None.

    Newlines inside an entry are replaced by spaces. None is returned for an
    empty/missing section and when no numbered entry is found.
    """
    if not section_text:
      return None
    entries = re.findall(
        MED_LINE_PATTERN,
        section_text,
        flags=re.IGNORECASE | re.DOTALL | re.MULTILINE,
    )
    cleaned = [entry.strip().replace("\n", " ") for entry in entries]
    return cleaned or None

  def __extract_labs(self, section_text):
    """Return ``{label: value}`` for every LAB_PAIR_PATTERN match, or None.

    A label that occurs more than once keeps its last value.
    """
    if not section_text:
      return None
    labs = {}
    for label, value in re.findall(LAB_PAIR_PATTERN, section_text):
      labs[label.strip()] = value.strip()
    return labs or None

  def __parse_single_note(self, note_text):
    """Extract all header and section fields of one note into a dict.

    Header fields are filled first, then section fields. The vitals,
    medication-list and lab helpers of this class are not applied here.
    """
    record = {}
    for pattern_table in (HEADER_PATTERNS, SECTION_PATTERNS):
      for field_name, pattern in pattern_table.items():
        record[field_name] = self.__extract_field(note_text, pattern)
    return record

  def parse_note_file(self):
    """Split ``self.data`` into notes and return one parsed dict per non-empty note."""
    pieces = (piece.strip() for piece in re.split(NOTE_SPLIT_PATTERN, self.data))
    return [self.__parse_single_note(piece) for piece in pieces if piece]

In [None]:
# Read the downloaded file `data` into a DataFrame, one row per JSON line.
data = DataLoader("data").extract_info()


In [None]:
# Parse each string-valued note text into a list of per-note field dicts;
# rows whose text is not a string get None.
data['parsed_note'] = data['text'].apply(
    lambda note_text: DataExtractor(note_text).parse_note_file()
    if isinstance(note_text, str)
    else None
)

In [None]:
# One row per parsed note: rows whose text held several notes are repeated, and
# rows with None or an empty list keep a single row with a missing parsed_note.
df_exploded = data.explode('parsed_note').reset_index(drop=True)

# Flatten the parsed dicts into columns. Rows without a parsed note are dropped
# before normalising, so the original row labels are restored afterwards.
# Without this, json_normalize can number its rows 0..k-1 and the column-wise
# concat below would attach parsed fields to the wrong rows.
parsed_notes = df_exploded['parsed_note'].dropna()
parsed_fields = pd.json_normalize(parsed_notes.tolist())
parsed_fields.index = parsed_notes.index

# Index-aligned join: rows with no parsed note get NaN in every parsed column.
df_final = pd.concat([
    df_exploded.drop(columns=['parsed_note']),
    parsed_fields,
], axis=1)

In [None]:
# Show the resulting column names: the original columns of `data` followed by
# the parsed header and section fields.
df_final.columns

Index(['subject_id', 'hadm_id', 'note_type', 'note_id', 'text', 'race',
       'insurance', 'language', 'marital_status', 'gender', 'anchor_age',
       'anchor_year', 'anchor_year_group', 'patient_name', 'unit_no',
       'admission_date', 'discharge_date', 'dob', 'sex', 'service',
       'attending', 'allergies', 'chief_complaint', 'major_procedure', 'hpi',
       'past_medical_history', 'social_history', 'family_history',
       'physical_exam', 'pertinent_results', 'hospital_course',
       'meds_on_admission', 'discharge_meds', 'discharge_disposition',
       'facility', 'discharge_diagnosis', 'discharge_condition',
       'discharge_instructions', 'followup_instructions'],
      dtype='object')

# Counterfactual Versions