# Patient Notes Creation and Preprocessing 

In [115]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

def start_spark():
    spark = (SparkSession.builder 
        .appName("NLP-Pipeline") 
        .master("local[*]")  # Use all 10 cores; you could also try local[9] to reserve one core
        .config("spark.driver.memory", "8g") 
        .config("spark.executor.memory", "6g") 
        .config("spark.executor.cores", "4")  # Adjust executor cores to a reasonable number per executor
        .config("spark.sql.shuffle.partitions", "40")  # Increase partitions to match more cores
        .config("spark.local.dir", "/Users/sagana/spark_temp/") 
        .config("spark.driver.extraJavaOptions", "-XX:+UseG1GC")
        .getOrCreate())
    return spark

In [116]:
spark = start_spark()

In [117]:
import os

GOOGLE_DRIVE_LOCAL_MOUNT='/Users/sagana/Library/CloudStorage/GoogleDrive-sondande@uchicago.edu/.shortcut-targets-by-id/1O2pwlZERv3B7ki78Wn0brrpnArRBTFdH/MLI_2025 Winter/'

# Check if Google Drive is accessible
if os.path.exists(GOOGLE_DRIVE_LOCAL_MOUNT):
    print("Google Drive is mounted successfully!")
    print("Files in Drive:", os.listdir(GOOGLE_DRIVE_LOCAL_MOUNT))
else:
    print("Google Drive is not mounted. Please check your installation.")

Google Drive is mounted successfully!
Files in Drive: ['Project Report.gdoc', '(ReferHere)Final_Dataset_Data_Folder ', 'merged_5000_patient_radio.csv', 'mimic-iv-ext-clinical-decision-making-a-mimic-iv-derived-dataset-for-evaluation-of-large-language-models-on-the-task-of-clinical-decision-making-for-abdominal-pathologies-1.1.zip', '.DS_Store', '2025307-Datasets To Use', 'extracted_zip', 'Clinical Trial_Sample1.docx', 'Project_Presentation.pptx', 'JM outputs', 'SQL DB Export', 'mimiciv.db', 'mimic-iv-3.1.zip', 'Machine Learning I Team 5 Project Proposal.gdoc', 'YY_codes', 'mimic-iv-note-deidentified-free-text-clinical-notes-2.2.zip', 'Junquan_output', 'merged_5000_patient.csv', 'Project Idea.gdoc', 'Final_Dataset_Data_Folder_unzip', 'MLI_2025_Winter', 'Sagana Outputs', 'merged_5000_patient_radio_disc.csv', 'Project Milestone-I.gdoc', 'Dataset Readme.gdoc']


## Create Notes Dataset

In [None]:
from pyspark.sql.types import *
from pyspark.sql.functions import collect_set, collect_list, struct, col, when, count, countDistinct, lit
import pandas as pd
import ast

# Read in schema file and process to get schemas needed
schemas_df = spark.read.csv('data/schema.csv', header=True)
schemas_df.show(30, truncate=20)

+----------------+--------------------+
|           table|              schema|
+----------------+--------------------+
|   diagnoses_icd|['subject_id', 'h...|
|       discharge|['subject_id', 'h...|
|        drgcodes|['subject_id', 'h...|
| d_icd_diagnoses|['icd_code', 'icd...|
|d_icd_procedures|['icd_code', 'icd...|
|            emar|['subject_id', 'h...|
|     hcpcsevents|['subject_id', 'h...|
|        patients|['subject_id', 'g...|
|        pharmacy|['subject_id', 'h...|
|   prescriptions|['subject_id', 'h...|
|  procedures_icd|['subject_id', 'h...|
|       radiology|['subject_id', 'h...|
|        services|['subject_id', 'h...|
|patients_cleaned|['subject_id', 'g...|
+----------------+--------------------+



In [119]:
# Construct schema
radiology_schema_list = ast.literal_eval(schemas_df.filter(col("table") == 'radiology').select(col("schema")).collect()[0][0])
radiology_schema = StructType([
    StructField(x, StringType(), True) for x in radiology_schema_list
])

# Read in radiology dataset
radiology_df = spark.read.option("delimiter", "|").option("quote", '"').option("multiLine", "true").csv(f'{GOOGLE_DRIVE_LOCAL_MOUNT}/Sagana Outputs/Clinical Notes Creation/Input Data/radiology.csv', schema=radiology_schema)
radiology_df.show(truncate= 80)

+----------+--------+-------------------+--------------------------------------------------------------------------------+
|subject_id| hadm_id|          charttime|                                                                            text|
+----------+--------+-------------------+--------------------------------------------------------------------------------+
|  10000117|    NULL|2175-05-10 10:12:00|BILATERAL DIGITAL SCREENING MAMMOGRAM WITH CAD\\n\\nHISTORY:  Baseline screen...|
|  10000117|    NULL|2177-05-23 13:18:00|INDICATION:  ___ female with right epigastric pain radiating to back,\\nrule ...|
|  10000117|    NULL|2178-08-29 13:39:00|CLINICAL HISTORY:  Right upper quadrant pain, evaluate for gallstones.\\n\\nA...|
|  10000117|22927623|2181-11-15 00:40:00|EXAMINATION:   CHEST (PA AND LAT)\\n\\nINDICATION:  History: ___ with PMH GER...|
|  10000117|22927623|2181-11-15 00:47:00|EXAMINATION:   NECK SOFT TISSUES\\n\\nINDICATION:  ___ woman with dysphasia. ...|
|  10000117|    

In [120]:
radiology_null_rows = radiology_df.filter(col("charttime").isNull())
radiology_null_rows.show()

[Stage 22:>                                                         (0 + 1) / 1]

+----------+-------+---------+----+
|subject_id|hadm_id|charttime|text|
+----------+-------+---------+----+
+----------+-------+---------+----+



                                                                                

In [None]:
gold_patients = spark.read.csv('data/gold_patients.csv', header=True)
gold_patients.count()

19153

In [114]:
radiology_df_filtered = radiology_df.join(gold_patients, on='subject_id')

# label time stammp
radiology_df_filtered = radiology_df_filtered.withColumn("charttime", col("charttime").cast("timestamp"))

# find the most recent record for each subject_id
latest_times = radiology_df_filtered.groupBy("subject_id").agg(F.max("charttime").alias("latest_charttime"))

# extract the latest radiology note for each patient and text and ensure they are all distinct
latest_records = radiology_df_filtered.alias('ra').join(latest_times.alias('lt'), (col('ra.subject_id') == col('lt.subject_id')) & 
                                     (col('ra.charttime') == col('lt.latest_charttime'))).select(col('ra.subject_id'), col('ra.text')).distinct()

latest_records.show()

ERROR:root:KeyboardInterrupt while sending command.                 (0 + 1) / 1]
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/socket.py", line 708, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 

                                                                                

In [None]:
num_partitions = latest_records.rdd.getNumPartitions()
print(f"Number of partitions: {num_partitions}")

                                                                                

Number of partitions: 10


In [None]:
size_in_bytes = latest_records.rdd.mapPartitions(lambda iterator: [sum(len(x) for x in iterator)]).sum()
print(f"Size in bytes: {size_in_bytes}")
partition_size_bytes = size_in_bytes / num_partitions
print(f"Partition size in bytes: {partition_size_bytes}")

Size in bytes: 32106
Partition size in bytes: 3210.6


                                                                                

In [None]:
latest_records = latest_records.coalesce(1)
num_partitions = latest_records.rdd.getNumPartitions()
size_in_bytes = latest_records.rdd.mapPartitions(lambda iterator: [sum(len(x) for x in iterator)]).sum()
print(f"Size in bytes: {size_in_bytes}")
partition_size_bytes = size_in_bytes / num_partitions
print(f"Partition size in bytes: {partition_size_bytes}")

[Stage 125:>                                                        (0 + 1) / 1]

Size in bytes: 32106
Partition size in bytes: 32106.0


                                                                                

In [None]:
latest_records.toPandas().to_csv('data/radiology_filtered_gold_latest_record.csv', index=False)

## Discharge Notes Processing

In [None]:
# Read in radiology dataset
discharge_schema_list = ast.literal_eval(schemas_df.filter(col("table") == 'discharge').select(col("schema")).collect()[0][0])
discharge_schema = StructType([
    StructField(x, StringType(), True) for x in discharge_schema_list
])

discharge_df = spark.read.option("delimiter", "|").option("quote", '"').option("multiLine", "true").csv(f'{GOOGLE_DRIVE_LOCAL_MOUNT}/Sagana Outputs/Clinical Notes Creation/Input Data/discharge.csv', schema=discharge_schema)
discharge_df.show(truncate= 80)

+----------+--------+-------------------+--------------------------------------------------------------------------------+
|subject_id| hadm_id|          charttime|                                                                            text|
+----------+--------+-------------------+--------------------------------------------------------------------------------+
|  10000117|27988844|2183-09-21 00:00:00| \\nName:  ___                 Unit No:   ___\\n \\nAdmission Date:  ___     ...|
|  10000117|22927623|2181-11-15 00:00:00| \\nName:  ___                 Unit No:   ___\\n \\nAdmission Date:  ___     ...|
|  10000248|20600184|2192-11-30 00:00:00| \\nName:  ___                      Unit No:   ___\\n \\nAdmission Date:  ___...|
|  10000560|28979390|2189-10-17 00:00:00| \\nName:  ___                     Unit No:   ___\\n \\nAdmission Date:  ___ ...|
|  10000764|27897940|2132-10-19 00:00:00| \\nName:  ___               Unit No:   ___\\n \\nAdmission Date:  ___       ...|
|  10000826|2828

In [None]:
discharge_df_filtered = discharge_df.join(gold_patients, on='subject_id')

# label time stammp
discharge_df_filtered = discharge_df_filtered.withColumn("charttime", col("charttime").cast("timestamp"))

# find the most recent record for each subject_id
latest_times = discharge_df_filtered.groupBy("subject_id").agg(F.max("charttime").alias("latest_charttime"))

# extract the latest radiology note for each patient and text and ensure they are all distinct
latest_records = discharge_df_filtered.alias('ds').join(latest_times.alias('lt'), (col('ds.subject_id') == col('lt.subject_id')) & 
                                     (col('ds.charttime') == col('lt.latest_charttime'))).select(col('ds.subject_id'), col('ds.text')).distinct()

latest_records.show()

[Stage 147:>                                                        (0 + 1) / 1]

+----------+--------------------+
|subject_id|                text|
+----------+--------------------+
|  17294481| \\nName:  ___   ...|
|  18797135| \\nName:  ___   ...|
|  12318550| \\nName:  ___   ...|
|  13132088| \\nName:  ___   ...|
|  14607492| \\nName:  ___   ...|
|  10599039| \\nName:  ___   ...|
|  17476573| \\nName:  ___   ...|
|  18574721| \\nName:  ___   ...|
|  19951664| \\nName:  ___   ...|
|  17002262| \\nName:  ___   ...|
|  15653269| \\nName:  ___   ...|
|  10021704| \\nName:  ___   ...|
|  18836076| \\nName:  ___   ...|
|  14787680| \\nName:  ___   ...|
|  11420248| \\nName:  ___   ...|
|  18413065| \\nName:  ___   ...|
|  18060267| \\nName:  ___   ...|
|  18523038| \\nName:  ___   ...|
|  13258618| \\nName:  ___   ...|
|  15343626| \\nName:  ___   ...|
+----------+--------------------+
only showing top 20 rows



                                                                                

In [None]:
latest_records = latest_records.coalesce(1)

In [None]:
latest_records.toPandas().to_csv('data/discharge_filtered_gold_latest_record.csv', index=False)

                                                                                

25/03/10 19:31:01 WARN DiskBlockObjectWriter: Error deleting /Users/sagana/spark_temp/blockmgr-ee9e25d1-69f8-41f2-b8c5-4a63b3343eae/25/temp_shuffle_b5054783-5434-4bc1-b309-9dc24a61e555
25/03/10 19:31:01 ERROR TaskContextImpl: Error in TaskCompletionListener
org.apache.spark.SparkException: Block broadcast_41 does not exist
	at org.apache.spark.errors.SparkCoreErrors$.blockDoesNotExistError(SparkCoreErrors.scala:318)
	at org.apache.spark.storage.BlockInfoManager.blockInfo(BlockInfoManager.scala:269)
	at org.apache.spark.storage.BlockInfoManager.unlock(BlockInfoManager.scala:390)
	at org.apache.spark.storage.BlockManager.releaseLock(BlockManager.scala:1309)
	at org.apache.spark.broadcast.TorrentBroadcast.$anonfun$releaseBlockManagerLock$1(TorrentBroadcast.scala:319)
	at org.apache.spark.broadcast.TorrentBroadcast.$anonfun$releaseBlockManagerLock$1$adapted(TorrentBroadcast.scala:319)
	at org.apache.spark.TaskContext$$anon$1.onTaskCompletion(TaskContext.scala:137)
	at org.apache.spark.Task

In [None]:
spark.stop()

## Process Notes

In [None]:
rad_df = pd.read_csv('data/radiology_filtered_gold_latest_record.csv')
rad_df.head()

Unnamed: 0,subject_id,text
0,14618137,EXAMINATION: ___ THYROID SCAN\\n\\nINDICATION...
1,13299965,EXAMINATION:\\nChest: Frontal and lateral vie...
2,14776642,"EXAMINATION: FOOT AP,LAT AND OBL RIGHT\\n\\nI..."
3,17179127,EXAMINATION: CHEST (PORTABLE AP)\\n\\nINDICAT...
4,13332476,"EXAMINATION: PATELLA (AP, LAT AND SUNRISE) RI..."


In [None]:
dis_df = pd.read_csv('discharge_filtered_gold_latest_record.csv')
dis_df.head()

Unnamed: 0,subject_id,text
0,17294481,\\nName: ___ Unit No: __...
1,18797135,\\nName: ___ Unit No: ___\...
2,12318550,\\nName: ___ Unit No: __...
3,13132088,\\nName: ___ Unit No: ___\\n \...
4,14607492,\\nName: ___ Unit No: ...


In [None]:
import re

def radiology_cleaning(note):
    headers = ['FINDINGS:', 'IMPRESSION:', 'TECHNIQUE:', 'INDICATION:', 'COMPARISON:']
    for header in headers:
        note = re.sub(r'(?i)' + re.escape(header), '', note)
        
    note = re.sub(r'\n+', ' ', note)
    note = re.sub(r'[^\w\s\.,;:\-\(\)\[\]+]', ' ', note)
    note = re.sub(r'\s+', ' ', note).strip()
    
    note = note.lower()
    return note

rad_df['cleaned_text'] = rad_df['text'].apply(radiology_cleaning)
rad_df.head()

Unnamed: 0,subject_id,text,cleaned_text
0,14618137,EXAMINATION: ___ THYROID SCAN\\n\\nINDICATION...,examination: ___ thyroid scan ___ year old man...
1,13299965,EXAMINATION:\\nChest: Frontal and lateral vie...,examination: chest: frontal and lateral views ...
2,14776642,"EXAMINATION: FOOT AP,LAT AND OBL RIGHT\\n\\nI...","examination: foot ap,lat and obl right ___ yea..."
3,17179127,EXAMINATION: CHEST (PORTABLE AP)\\n\\nINDICAT...,examination: chest (portable ap) ___ year old ...
4,13332476,"EXAMINATION: PATELLA (AP, LAT AND SUNRISE) RI...","examination: patella (ap, lat and sunrise) rig..."


In [None]:
def discharge_clean(note):
    note = re.sub(r'(?im)^(Admission Date:|Discharge Data:|Patient:).*$', '', note)
    note = re.sub(r'(?im)Page\s+\d+\s+of\s+\d+', '', note)
    note = re.sub(r'(?im)^(Signature:|Doctor:|Nurse:).*$', '', note)
    note = re.sub(r'\s+', ' ', note).strip()
    
    # Remove escape sequences and normalize whitespace.
    cleaned = re.sub(r"\\n", " ", note)
    cleaned = re.sub(r"\\+", " ", cleaned)
    cleaned = re.sub(r"\s+", " ", cleaned).strip()
    note = note.lower()
    return note 

dis_df['cleaned_text'] = dis_df['text'].apply(discharge_clean)
dis_df.head()

Unnamed: 0,subject_id,text,cleaned_text
0,17294481,\\nName: ___ Unit No: __...,\ name: ___ unit no: ___\ \ \ date of birth: _...
1,18797135,\\nName: ___ Unit No: ___\...,\ name: ___ unit no: ___\ \ \ date of birth: _...
2,12318550,\\nName: ___ Unit No: __...,\ name: ___ unit no: ___\ \ \ date of birth: _...
3,13132088,\\nName: ___ Unit No: ___\\n \...,\ name: ___ unit no: ___\ \ \ date of birth: _...
4,14607492,\\nName: ___ Unit No: ...,\ name: ___ unit no: ___\ \ \ date of birth: _...


## Run Radiology Specific Model Embedding

In [None]:
import torch
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

# Set device: use MPS on macOS if available; otherwise, use CPU.
device = "mps" if torch.backends.mps.is_available() else "cpu"
print(f"Using device: {device}")

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("StanfordAIMI/RadBERT")
model = AutoModel.from_pretrained("StanfordAIMI/RadBERT").to(device)
model.eval()
if device != "cpu":  # Use FP16 only on GPU/MPS devices
    model.half()

def get_bert_embedding(text):
    """
    Generates an embedding for a single text using RadBERT.
    
    - Tokenizes the text with a maximum length of 512.
    - Uses the CLS token (index 0) from the last hidden state.
    - Applies torch.no_grad() for inference speed.
    """
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512
    )
    # Move all tensors to the selected device
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Extract the CLS token embedding, convert to FP16 and then to a NumPy array
    cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze().to(torch.float16).cpu().numpy()
    return cls_embedding

def get_embeddings_for_texts(texts, max_workers=4):
    """
    Processes a list of texts concurrently using ThreadPoolExecutor.
    Returns a list of embeddings with a tqdm progress bar.
    """
    embeddings = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(get_bert_embedding, text): idx for idx, text in enumerate(texts)}
        for future in tqdm(as_completed(futures), total=len(futures), desc="Processing embeddings"):
            embeddings.append(future.result())
    return embeddings

def process_dataframe(df, text_column, max_workers=4):
    """
    Applies get_bert_embedding to a Pandas DataFrame column using multi-threading.
    Adds a new column 'embeddings' with the computed embeddings.
    """
    texts = df[text_column].tolist()
    embeddings = get_embeddings_for_texts(texts, max_workers=max_workers)
    df["embeddings"] = embeddings
    return df

using device: mps


In [None]:
rad_df = process_dataframe(rad_df, 'cleaned_text', max_workers=4)
rad_df.head()

Processing Text Embeddings:   0%|          | 0/16053 [08:51<?, ?it/s]
Processing embeddings: 100%|██████████| 16053/16053 [06:19<00:00, 42.26it/s]


Unnamed: 0,subject_id,text,cleaned_text,embeddings
0,14618137,EXAMINATION: ___ THYROID SCAN\\n\\nINDICATION...,examination: ___ thyroid scan ___ year old man...,"[0.784, -0.2866, 0.5713, 0.0351, 0.10425, -0.7..."
1,13299965,EXAMINATION:\\nChest: Frontal and lateral vie...,examination: chest: frontal and lateral views ...,"[1.279, -0.6196, -0.3943, -0.7603, -0.4463, -0..."
2,14776642,"EXAMINATION: FOOT AP,LAT AND OBL RIGHT\\n\\nI...","examination: foot ap,lat and obl right ___ yea...","[1.658, -0.6665, 0.4915, 0.721, -0.2544, -0.32..."
3,17179127,EXAMINATION: CHEST (PORTABLE AP)\\n\\nINDICAT...,examination: chest (portable ap) ___ year old ...,"[1.531, -0.2247, 0.3254, 0.6353, -0.1626, 0.52..."
4,13332476,"EXAMINATION: PATELLA (AP, LAT AND SUNRISE) RI...","examination: patella (ap, lat and sunrise) rig...","[0.786, -0.771, -0.1842, -0.01572, 0.6724, -0...."


In [None]:
rad_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16053 entries, 0 to 16052
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   subject_id    16053 non-null  int64 
 1   text          16053 non-null  object
 2   cleaned_text  16053 non-null  object
 3   embeddings    16053 non-null  object
dtypes: int64(1), object(3)
memory usage: 501.8+ KB


In [None]:
rad_df.to_csv('data/radiology_with_embeddings.csv', index=False)

## Run Discharge Specific Model Embedding

In [None]:
import torch
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

# Set device: use MPS on macOS if available; otherwise, use CPU.
device = "mps" if torch.backends.mps.is_available() else "cpu"
print(f"Using device: {device}")

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_Discharge_Summary_BERT")
model = AutoModel.from_pretrained("emilyalsentzer/Bio_Discharge_Summary_BERT").to(device)
model.eval()
if device != "cpu":  # Use FP16 only on GPU/MPS devices
    model.half()

def get_bert_embedding(text):
    """
    Generates an embedding for a single text using RadBERT.
    
    - Tokenizes the text with a maximum length of 512.
    - Uses the CLS token (index 0) from the last hidden state.
    - Applies torch.no_grad() for inference speed.
    """
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512
    )
    # Move all tensors to the selected device
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Extract the CLS token embedding, convert to FP16 and then to a NumPy array
    cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze().to(torch.float16).cpu().numpy()
    return cls_embedding

def get_embeddings_for_texts(texts, max_workers=4):
    """
    Processes a list of texts concurrently using ThreadPoolExecutor.
    Returns a list of embeddings with a tqdm progress bar.
    """
    embeddings = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(get_bert_embedding, text): idx for idx, text in enumerate(texts)}
        for future in tqdm(as_completed(futures), total=len(futures), desc="Processing embeddings"):
            embeddings.append(future.result())
    return embeddings

def process_dataframe(df, text_column, max_workers=4):
    """
    Applies get_bert_embedding to a Pandas DataFrame column using multi-threading.
    Adds a new column 'embeddings' with the computed embeddings.
    """
    texts = df[text_column].tolist()
    embeddings = get_embeddings_for_texts(texts, max_workers=max_workers)
    df["embeddings"] = embeddings
    return df

Using device: mps


In [None]:
dis_df.head()

Unnamed: 0,subject_id,text,cleaned_text
0,17294481,\\nName: ___ Unit No: __...,\ name: ___ unit no: ___\ \ \ date of birth: _...
1,18797135,\\nName: ___ Unit No: ___\...,\ name: ___ unit no: ___\ \ \ date of birth: _...
2,12318550,\\nName: ___ Unit No: __...,\ name: ___ unit no: ___\ \ \ date of birth: _...
3,13132088,\\nName: ___ Unit No: ___\\n \...,\ name: ___ unit no: ___\ \ \ date of birth: _...
4,14607492,\\nName: ___ Unit No: ...,\ name: ___ unit no: ___\ \ \ date of birth: _...


In [None]:
dis_df = process_dataframe(dis_df, 'cleaned_text', max_workers=4)
dis_df.head()

Processing embeddings: 100%|██████████| 14335/14335 [08:12<00:00, 29.08it/s]


Unnamed: 0,subject_id,text,cleaned_text,embeddings
0,17294481,\\nName: ___ Unit No: __...,\ name: ___ unit no: ___\ \ \ date of birth: _...,"[-0.3994, 0.1586, -0.2698, -0.1289, -0.07196, ..."
1,18797135,\\nName: ___ Unit No: ___\...,\ name: ___ unit no: ___\ \ \ date of birth: _...,"[0.1487, 0.2366, -0.372, -0.07855, -0.1487, -0..."
2,12318550,\\nName: ___ Unit No: __...,\ name: ___ unit no: ___\ \ \ date of birth: _...,"[-0.2405, 0.3093, -0.4531, -0.02596, -0.001131..."
3,13132088,\\nName: ___ Unit No: ___\\n \...,\ name: ___ unit no: ___\ \ \ date of birth: _...,"[-0.092, 0.4846, -0.297, 0.00444, -0.10333, -0..."
4,14607492,\\nName: ___ Unit No: ...,\ name: ___ unit no: ___\ \ \ date of birth: _...,"[-0.5337, 0.2896, -0.2017, 0.002855, -0.03348,..."


In [None]:
dis_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14335 entries, 0 to 14334
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   subject_id    14335 non-null  int64 
 1   text          14335 non-null  object
 2   cleaned_text  14335 non-null  object
 3   embeddings    14335 non-null  object
dtypes: int64(1), object(3)
memory usage: 448.1+ KB


In [None]:
dis_df.to_csv('discharge_with_embeddings.csv', index=False)