# Personal Notes

On a MacBook Pro:
- Running the shell under the native arm64 architecture is faster than x86_64 for PySpark jobs: `arch -arm64 /bin/zsh`
  - Verify the active architecture with `uname -m` (it should print `arm64`)

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

def start_spark(master="local"):
    """Create (or reuse) the SparkSession used throughout this notebook.

    Args:
        master: Spark master URL. The default "local" runs Spark with a single
            worker thread; pass "local[N]" or "local[*]" to use more cores.

    Returns:
        The active SparkSession, with spark.sql.shuffle.partitions set to 50.
    """
    spark = (
        SparkSession.builder
        .appName("OptimizedPySpark")
        .master(master)
        .config("spark.driver.memory", "8g")
        .config("spark.executor.memory", "6g")
        .config("spark.driver.maxResultSize", "2g")
        .config("spark.default.parallelism", "8")
        .config("spark.local.dir", "/tmp/spark-temp")
        .config("spark.rdd.compress", "true")
        .config("spark.memory.fraction", "0.8")
        .config("spark.memory.storageFraction", "0.3")
        .getOrCreate()
    )

    # Shuffle partitions are set once, here, rather than being given one value
    # in the builder and a different one afterwards. Setting it on the runtime
    # conf also applies when getOrCreate() returned an already-running session.
    spark.conf.set("spark.sql.shuffle.partitions", "50")

    print("Spark Version:", spark.version)
    # Report the address Spark actually bound to; the UI is not always on port
    # 4040 (Spark picks the next free port when 4040 is already in use).
    print("Spark UI:", spark.sparkContext.uiWebUrl)
    return spark

In [None]:
def clean_resources(spark):
    """Stop the given SparkSession and return a newly started one.

    Args:
        spark: The SparkSession to shut down.

    Returns:
        The session returned by start_spark().
    """
    spark.stop()
    fresh_session = start_spark()
    return fresh_session

In [5]:
# Start the Spark session that the cells below use via the `spark` variable
spark = start_spark()

25/03/06 14:49:49 WARN Utils: Your hostname, Saganas-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.4.69 instead (on interface en0)
25/03/06 14:49:49 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/06 14:49:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/03/06 14:49:49 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).


Spark Version: 3.5.4
Spark UI: http://localhost:4040


In [6]:
import os

# Local mount point of the shared Google Drive project folder.
# Set the GOOGLE_DRIVE_LOCAL_MOUNT environment variable to run this notebook on
# another machine; the fallback is the original hard-coded path.
# NOTE: the default ends with '/', and later cells append '/<sub-path>' to it.
GOOGLE_DRIVE_LOCAL_MOUNT = os.environ.get(
    'GOOGLE_DRIVE_LOCAL_MOUNT',
    '/Users/sagana/Library/CloudStorage/GoogleDrive-sondande@uchicago.edu/.shortcut-targets-by-id/1O2pwlZERv3B7ki78Wn0brrpnArRBTFdH/MLI_2025 Winter/',
)

# Check if Google Drive is accessible. isdir() is used instead of exists()
# because the directory is listed right after, which would raise if the path
# pointed at a regular file.
if os.path.isdir(GOOGLE_DRIVE_LOCAL_MOUNT):
    print("Google Drive is mounted successfully!")
    print("Files in Drive:", os.listdir(GOOGLE_DRIVE_LOCAL_MOUNT))
else:
    print("Google Drive is not mounted. Please check your installation.")

Google Drive is mounted successfully!
Files in Drive: ['(ReferHere)Final_Dataset_Data_Folder ', 'merged_5000_patient_radio.csv', 'mimic-iv-ext-clinical-decision-making-a-mimic-iv-derived-dataset-for-evaluation-of-large-language-models-on-the-task-of-clinical-decision-making-for-abdominal-pathologies-1.1.zip', '.DS_Store', 'extracted_zip', 'JM outputs', 'SQL DB Export', 'mimiciv.db', 'mimic-iv-3.1.zip', 'Machine Learning I Team 5 Project Proposal.gdoc', 'YY_codes', 'mimic-iv-note-deidentified-free-text-clinical-notes-2.2.zip', 'merged_5000_patient.csv', 'Project Idea.gdoc', 'Final_Dataset_Data_Folder_unzip', 'MLI_2025_Winter', 'Sagana Outputs', 'merged_5000_patient_radio_disc.csv', 'Project Milestone-I.gdoc', 'Dataset Readme.gdoc']


In [7]:
from pyspark.sql.types import *
from pyspark.sql.functions import collect_set, collect_list, struct, col, when, count, countDistinct, lit
import pandas as pd
import ast

# Load schema.csv (columns: table, schema) so the per-table column lists can be
# looked up when building the read schemas below
schemas_df = (
    spark.read
    .option("header", True)
    .csv(f'{GOOGLE_DRIVE_LOCAL_MOUNT}/SQL DB Export/CSV/schema.csv')
)
schemas_df.show(n=5)

+----------------+--------------------+
|           table|              schema|
+----------------+--------------------+
|   diagnoses_icd|['subject_id', 'h...|
|       discharge|['subject_id', 'h...|
|        drgcodes|['subject_id', 'h...|
| d_icd_diagnoses|['icd_code', 'icd...|
|d_icd_procedures|['icd_code', 'icd...|
+----------------+--------------------+
only showing top 5 rows



In [8]:
# Construct schema: look up the radiology column list recorded in schemas_df.
# first() fetches a single row and returns None when nothing matches, so a
# missing entry is reported clearly instead of surfacing as an IndexError.
radiology_schema_row = schemas_df.filter(col("table") == 'radiology').select(col("schema")).first()
if radiology_schema_row is None:
    raise ValueError("No 'radiology' entry found in schemas_df")
# The schema cell holds a Python-list literal stored as text; parse it safely.
radiology_schema_list = ast.literal_eval(radiology_schema_row[0])
# Every column is declared as a nullable string.
radiology_schema = StructType([
    StructField(column_name, StringType(), True) for column_name in radiology_schema_list
])

# Read in radiology dataset ('|'-delimited; multiLine lets quoted note text
# span several physical lines)
radiology_df = (
    spark.read
    .option("delimiter", "|")
    .option("quote", '"')
    .option("multiLine", "true")
    .csv(f'{GOOGLE_DRIVE_LOCAL_MOUNT}/Sagana Outputs/Clinical Notes Creation/Input Data/radiology.csv', schema=radiology_schema)
)
radiology_df.show(truncate=80)

+----------+--------+-------------------+--------------------------------------------------------------------------------+
|subject_id| hadm_id|          charttime|                                                                            text|
+----------+--------+-------------------+--------------------------------------------------------------------------------+
|  10000117|    NULL|2175-05-10 10:12:00|BILATERAL DIGITAL SCREENING MAMMOGRAM WITH CAD\\n\\nHISTORY:  Baseline screen...|
|  10000117|    NULL|2177-05-23 13:18:00|INDICATION:  ___ female with right epigastric pain radiating to back,\\nrule ...|
|  10000117|    NULL|2178-08-29 13:39:00|CLINICAL HISTORY:  Right upper quadrant pain, evaluate for gallstones.\\n\\nA...|
|  10000117|22927623|2181-11-15 00:40:00|EXAMINATION:   CHEST (PA AND LAT)\\n\\nINDICATION:  History: ___ with PMH GER...|
|  10000117|22927623|2181-11-15 00:47:00|EXAMINATION:   NECK SOFT TISSUES\\n\\nINDICATION:  ___ woman with dysphasia. ...|
|  10000117|    

In [9]:
# Construct schema: look up the discharge column list recorded in schemas_df.
# first() fetches a single row and returns None when nothing matches, so a
# missing entry is reported clearly instead of surfacing as an IndexError.
discharge_schema_row = schemas_df.filter(col("table") == 'discharge').select(col("schema")).first()
if discharge_schema_row is None:
    raise ValueError("No 'discharge' entry found in schemas_df")
# The schema cell holds a Python-list literal stored as text; parse it safely.
discharge_schema_list = ast.literal_eval(discharge_schema_row[0])
# Every column is declared as a nullable string.
discharge_schema = StructType([
    StructField(column_name, StringType(), True) for column_name in discharge_schema_list
])

# Read in discharge dataset ('|'-delimited; multiLine lets quoted note text
# span several physical lines)
discharge_df = (
    spark.read
    .option("delimiter", "|")
    .option("quote", '"')
    .option("multiLine", "true")
    .csv(f'{GOOGLE_DRIVE_LOCAL_MOUNT}/Sagana Outputs/Clinical Notes Creation/Input Data/discharge.csv', schema=discharge_schema)
)
discharge_df.show(truncate=80)

+----------+--------+-------------------+--------------------------------------------------------------------------------+
|subject_id| hadm_id|          charttime|                                                                            text|
+----------+--------+-------------------+--------------------------------------------------------------------------------+
|  10000117|27988844|2183-09-21 00:00:00| \\nName:  ___                 Unit No:   ___\\n \\nAdmission Date:  ___     ...|
|  10000117|22927623|2181-11-15 00:00:00| \\nName:  ___                 Unit No:   ___\\n \\nAdmission Date:  ___     ...|
|  10000248|20600184|2192-11-30 00:00:00| \\nName:  ___                      Unit No:   ___\\n \\nAdmission Date:  ___...|
|  10000560|28979390|2189-10-17 00:00:00| \\nName:  ___                     Unit No:   ___\\n \\nAdmission Date:  ___ ...|
|  10000764|27897940|2132-10-19 00:00:00| \\nName:  ___               Unit No:   ___\\n \\nAdmission Date:  ___       ...|
|  10000826|2828

In [11]:
# Keep just the patient identifier and the note text from both note tables
radiology_df_filtered, discharge_df_filtered = (
    notes_df.select('subject_id', 'text')
    for notes_df in (radiology_df, discharge_df)
)

In [29]:
# Load the cleaned patient table; it is used below to restrict the note
# datasets to subjects that have a patient record
patients_df = (
    spark.read
    .option("header", True)
    .csv(f'{GOOGLE_DRIVE_LOCAL_MOUNT}/JM outputs/patients_cleaned.csv')
)
patients_df.show(n=5)

+----------+------+----------+-----------+---------+--------+--------------+-----+-----------------------+------------------------+----+------+------+----+
|subject_id|gender|anchor_age|anchor_year|insurance|language|marital_status| race|blood_pressure_systolic|blood_pressure_diastolic| bmi|height|weight|egfr|
+----------+------+----------+-----------+---------+--------+--------------+-----+-----------------------+------------------------+----+------+------+----+
|  10000117|     F|        48|       2174| Medicaid| English|      DIVORCED|WHITE|                    108|                      74|18.9|    64|   110|NULL|
|  10000161|     M|        60|       2163| Medicaid| English|        SINGLE|WHITE|                    106|                      92|NULL|  NULL|  NULL|NULL|
|  10000248|     M|        34|       2192|  Private| English|       MARRIED|WHITE|                   NULL|                    NULL|25.5|    68|   168|NULL|
|  10000280|     M|        20|       2151|  Private| English|   

                                                                                

In [41]:
# Semi-join: keep radiology notes whose subject_id exists in patients_df;
# only the left-hand (notes) columns are returned
final_radiology_df = radiology_df_filtered.join(
    other=patients_df,
    on=radiology_df_filtered['subject_id'] == patients_df['subject_id'],
    how='left_semi',
)

In [42]:
# Semi-join: keep discharge notes whose subject_id exists in patients_df;
# only the left-hand (notes) columns are returned
final_discharge_df = discharge_df_filtered.join(
    other=patients_df,
    on=discharge_df_filtered['subject_id'] == patients_df['subject_id'],
    how='left_semi',
)

In [47]:
# Persist the filtered radiology notes as snappy-compressed parquet, replacing
# any previous output in the same (working-directory-relative) folder
final_radiology_df.write.parquet(
    'radiology_filtered/',
    mode="overwrite",
    compression="snappy",
)

                                                                                

In [48]:
# Persist the filtered discharge notes as snappy-compressed parquet, replacing
# any previous output in the same (working-directory-relative) folder
final_discharge_df.write.parquet(
    'discharge_filtered/',
    mode="overwrite",
    compression="snappy",
)

                                                                                

## Review Processed datasets

In [62]:
# Load the processed discharge notes.
# NOTE(review): 'discharge_processed/' is not written anywhere in this notebook;
# presumably it is produced by a separate processing step - confirm.
discharge_processed_df = spark.read.format('parquet').load('discharge_processed/')

In [63]:
# Load the processed radiology notes.
# NOTE(review): 'radiology_processed/' is not written anywhere in this notebook;
# presumably it is produced by a separate processing step - confirm.
radio_processed_df = spark.read.format('parquet').load('radiology_processed/')

In [77]:
from pyspark.sql.functions import expr
from pyspark.sql.functions import to_json, col

# Serialise the nested "sections" and "entities" columns to JSON strings so the
# frame can be stored in a flat CSV file
df = (
    discharge_processed_df
    .withColumn("sections", to_json(col("sections")))
    .withColumn("entities", to_json(col("entities")))
)
# toPandas() collects the whole dataset onto the driver; writing with Spark's
# own CSV writer was tried as an alternative and is not used here
df_PD = df.toPandas()

                                                                                

In [78]:
# pandas does not create missing parent directories, so make sure the output
# folder exists before writing (otherwise to_csv raises OSError on a fresh run)
os.makedirs('discharge_processed_csv', exist_ok=True)
df_PD.to_csv('discharge_processed_csv/discharge_processed.csv', index=False)

In [79]:
from pyspark.sql.functions import expr
from pyspark.sql.functions import to_json, col

# Serialise the nested "sections" and "entities" columns to JSON strings so the
# frame can be stored in a flat CSV file
df = (
    radio_processed_df
    .withColumn("sections", to_json(col("sections")))
    .withColumn("entities", to_json(col("entities")))
)
# toPandas() collects the whole dataset onto the driver
df_PD = df.toPandas()
# pandas does not create missing parent directories, so make sure the output
# folder exists before writing (otherwise to_csv raises OSError on a fresh run)
os.makedirs('radiology_processed_csv', exist_ok=True)
df_PD.to_csv('radiology_processed_csv/radio_processed.csv', index=False)

                                                                                

In [80]:
# Preview the first rows of df_PD; in notebook order it was last assigned from
# radio_processed_df, so this shows the radiology export
df_PD.head()

Unnamed: 0,subject_id,text,cleaned_text,sections,entities
0,10000117,BILATERAL DIGITAL SCREENING MAMMOGRAM WITH CAD...,BILATERAL DIGITAL SCREENING MAMMOGRAM WITH CAD...,"{""bilateral_digital_screening_mammogram_with_c...","{""birads"":[""BI-RADS 1"",""bi-rads 1"",""bi-rads""],..."
1,10000117,INDICATION: ___ female with right epigastric ...,INDICATION: ___ female with right epigastric p...,"{""impression"":""1. No gallstones and no signs o...","{""birads"":[],""locations"":[],""procedures"":[],""f..."
2,10000117,"CLINICAL HISTORY: Right upper quadrant pain, ...","CLINICAL HISTORY: Right upper quadrant pain, e...","{""impression"":""Normal gallbladder. No gallston...","{""birads"":[],""locations"":[""quadrant""],""procedu..."
3,10000117,EXAMINATION: CHEST (PA AND LAT)\\n\\nINDICAT...,EXAMINATION: CHEST (PA AND LAT) INDICATION: Hi...,"{""comparison"":""Chest radiograph from ___."",""ex...","{""birads"":[],""locations"":[],""procedures"":[],""f..."
4,10000117,EXAMINATION: NECK SOFT TISSUES\\n\\nINDICATI...,EXAMINATION: NECK SOFT TISSUES INDICATION: ___...,"{""comparison"":""None available."",""examination"":...","{""birads"":[],""locations"":[],""procedures"":[],""f..."
