In [None]:
# Mount Google Drive at /content/drive so the data files referenced by the
# later cells (all under /content/drive/MyDrive/...) are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Installation and Importing

In [None]:
# Install pytorch_tabnet into the notebook runtime.
# NOTE(review): the saved output under this cell shows `faker` being installed,
# not pytorch_tabnet, so that output looks stale — re-run the cell to refresh it.
!pip install pytorch_tabnet

Collecting faker
  Downloading Faker-24.8.0-py3-none-any.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: faker
Successfully installed faker-24.8.0


In [None]:
import torch
import pickle
import pandas as pd
import random
import os
import numpy as np
import multiprocessing
import statistics
import matplotlib.pyplot as plt

# DATA PREPROCESSING

In [None]:
# Name of the merged CSV (inside Merged_files/) that the next cell loads.
# The file name is pinned explicitly. The earlier `os.listdir(...)[-8]` lookup was
# removed: its result was overwritten on the very next line, os.listdir returns
# entries in arbitrary order (so a positional index does not mean "latest"), and
# it raised if the folder held fewer than eight entries.
latest_data = 'adm_pat_cpt_icu_note_v2.csv'

In [None]:
# Columns kept for the rest of the notebook, in the order they should appear.
kept_columns = [
    'HADM_ID', 'SUBJECT_ID', 'GENDER', 'AGE', 'RELIGION', 'ETHNICITY',
    'ADMISSION_TYPE', 'ADMISSION_LOCATION', 'DIAGNOSIS', 'PROCEDURE',
    'CATEGORY', 'DESCRIPTION', 'TEXT', 'DISCHARGE_LOCATION',
    'HOSPITAL_EXPIRE_FLAG', 'LOS',
]
merged_csv_path = f'/content/drive/MyDrive/mimic-iii-clinical-database-1.4/Merged_files/{latest_data}'

# Load the merged file, discard the irrelevant columns, then put the remaining
# ones in a practical order.
data = (
    pd.read_csv(merged_csv_path)
    .drop(columns=['CPT_CD', 'RUNTIME'])
    .loc[:, kept_columns]
)
data.head()

Unnamed: 0,HADM_ID,SUBJECT_ID,GENDER,AGE,RELIGION,ETHNICITY,ADMISSION_TYPE,ADMISSION_LOCATION,DIAGNOSIS,PROCEDURE,CATEGORY,DESCRIPTION,TEXT,DISCHARGE_LOCATION,HOSPITAL_EXPIRE_FLAG,LOS
0,100009,533,male,61,catholic,white,emergency,transfer from hosp/extram,coronary artery disease,"cardiovascular system,critical care services,h...",Echo,Report,PATIENT/TEST INFORMATION:\nIndication: Abnorma...,home health care,0,0.705405
1,100009,533,male,61,catholic,white,emergency,transfer from hosp/extram,coronary artery disease,"cardiovascular system,critical care services,h...",ECG,Report,Sinus bradycardia. Left atrial abnormality. ...,home health care,0,0.705405
2,100009,533,male,61,catholic,white,emergency,transfer from hosp/extram,coronary artery disease,"cardiovascular system,critical care services,h...",Radiology,CHEST (PA & LAT),[**2162-5-16**] 7:23 PM\n CHEST (PA & LAT) ...,home health care,0,0.705405
3,100009,533,male,61,catholic,white,emergency,transfer from hosp/extram,coronary artery disease,"cardiovascular system,critical care services,h...",Radiology,VEN DUP EXTEXT BIL (MAP/DVT),[**2162-5-17**] 8:12 AM\n [**Last Name (un) 12...,home health care,0,0.705405
4,100009,533,male,61,catholic,white,emergency,transfer from hosp/extram,coronary artery disease,"cardiovascular system,critical care services,h...",Radiology,CHEST PORT. LINE PLACEMENT,[**2162-5-17**] 6:41 PM\n CHEST PORT. LINE PLA...,home health care,0,0.705405


### Aggregating

In [None]:
# Collapse the note-level rows into a single row per hospital admission (HADM_ID).
# The note fields are concatenated with ', ' in row order; every other field
# keeps the first value seen within the admission.
joined_note_fields = {'CATEGORY', 'DESCRIPTION', 'TEXT'}
aggregated_fields = [
    'SUBJECT_ID', 'GENDER', 'AGE', 'RELIGION', 'ETHNICITY',
    'ADMISSION_TYPE', 'ADMISSION_LOCATION', 'DIAGNOSIS', 'PROCEDURE',
    'CATEGORY', 'DESCRIPTION', 'TEXT',
    'DISCHARGE_LOCATION', 'HOSPITAL_EXPIRE_FLAG', 'LOS',
]
aggregation_plan = {
    field: (', '.join if field in joined_note_fields else 'first')
    for field in aggregated_fields
}

df = data.groupby('HADM_ID').agg(aggregation_plan).reset_index()
df.head()

Unnamed: 0,HADM_ID,SUBJECT_ID,GENDER,AGE,RELIGION,ETHNICITY,ADMISSION_TYPE,ADMISSION_LOCATION,DIAGNOSIS,PROCEDURE,CATEGORY,DESCRIPTION,TEXT,DISCHARGE_LOCATION,HOSPITAL_EXPIRE_FLAG,LOS
0,100009,533,male,61,catholic,white,emergency,transfer from hosp/extram,coronary artery disease,"cardiovascular system,critical care services,h...","Echo, ECG, Radiology, Radiology, Radiology","Report, Report, CHEST (PA & LAT), VEN DUP EXTE...",PATIENT/TEST INFORMATION:\nIndication: Abnorma...,home health care,0,0.705405
1,100011,87977,male,21,not specified,hispanic or latino,emergency,clinic referral/premature,motor vehicle accident with unspecified injuries,"hospital inpatient services,consultations,inte...","Radiology, Radiology, Radiology, Radiology, Ra...","MR CERVICAL SPINE W/O CONTRAST, MR BRACHIAL PL...","[**Last Name (LF) 231**],[**First Name3 (LF) 2...",rehab/distinct part hosp,0,4.318437
2,100012,60039,male,68,catholic,white,emergency,transfer from hosp/extram,coronary artery disease,"cardiovascular system,critical care services,p...","Echo, Echo, Echo, ECG, ECG, ECG, ECG, Radiolog...","Report, Report, Report, Report, Report, Report...",PATIENT/TEST INFORMATION:\nIndication: H/O car...,home health care,0,1.072373
3,100016,68591,male,56,protestant quaker,white,emergency,clinic referral/premature,pneumonia,"critical care services,respiratory system,hosp...","ECG, Radiology, Radiology, Radiology, Radiolog...","Report, G/GJ/GI TUBE CHECK, CTA CHEST W&W/O C&...","Sinus tachycardia. Otherwise, no significant ...",snf,0,1.851157
4,100018,58128,male,55,protestant quaker,white,elective,phys referral/normal deli,herniated disc/sda,"hospital inpatient services,musculoskeletal sy...","Nursing, Physician , Nursing, Nursing, Respira...","Nursing Progress Note, Intensivist Note, Nursi...",Impaired Physical Mobility\n Assessment:\n ...,long term care hospital,0,2.215475


### Targeted Preprocessing

In [None]:
# Drop admissions whose admission location is the placeholder value (the original
# note says only one admission is affected). The explicit .copy() makes `df` an
# independent frame, so the label clean-up below does not assign into a slice of
# the aggregated frame (which is what triggers pandas' SettingWithCopyWarning).
df = df[df['ADMISSION_LOCATION'] != '** info not available **'].copy()

# Fold every "unknown"-style label into one shared 'not specified' category.
religion_unknown = ['unobtainable', 'other']
ethnicity_unknown = ['other', 'unknown/not specified', 'unable to obtain', 'patient declined to answer']
df.loc[df['RELIGION'].isin(religion_unknown), 'RELIGION'] = 'not specified'
df.loc[df['ETHNICITY'].isin(ethnicity_unknown), 'ETHNICITY'] = 'not specified'

In [None]:
# Row count of the aggregated, cleaned frame (one row per HADM_ID).
len(df)

15836

# Prompt Construction Module

In [None]:
import string
import re
def preprocess_text(text):
    """Normalise a sentence for use in a prompt.

    The substitutions run in this order: runs of newlines become one space,
    asterisks are removed, commas become spaces, and runs of '>', '<' and '='
    are spelled out as ' greater than ', ' less than ' and ' equal to '.
    The result is then stripped of surrounding whitespace and lower-cased.
    Repeated inner spaces are NOT collapsed.

    Args:
        text: The string to normalise.

    Returns:
        The normalised, lower-case string.
    """
    # (regex pattern, replacement) pairs; order matters and mirrors the steps above.
    substitutions = (
        (r'\n+', ' '),
        (r'\*', ''),
        (r',', ' '),
        (r'>+', ' greater than '),
        (r'<+', ' less than '),
        (r'=+', ' equal to '),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip().lower()

In [None]:
import random

# Admission location templates.
# generate_sentences picks the entry whose index equals the position of the row's
# value in df['ADMISSION_LOCATION'].unique(), so the order of this list must match
# the order in which the admission locations first appear in the data.
ALT = [
    "This patient was transferred from the hospital or extramural location.",
    "This patient was referred from a clinic due to premature circumstances.",
    "This patient was referred by a physician following a normal delivery.",
    "This patient was admitted via the emergency room.",
    "This patient was transferred from a skilled nursing facility.",
    "This patient was transferred from another healthcare facility.",
]

# Discharge location templates.
# generate_sentences picks the entry whose index equals the position of the row's
# value in df['DISCHARGE_LOCATION'].unique(), so the order of this list must match
# the order in which the discharge locations first appear in the data.
DLT = [
    "After receiving treatment, this patient was discharged to home health care.",
    "This patient was discharged from the hospital to a rehabilitation or distinct part hospital.",
    "This patient was discharged to a skilled nursing facility.",
    "After treatment, this patient was discharged to a long-term care hospital.",
    "After treatment, this patient returned home.",
    "Unfortunately, this patient has passed away or expired.",
    "This patient was discharged and transferred to a psychiatric hospital.",
    "After treatment, this patient was discharged from a short-term hospital.",
    "This patient left against medical advice.",
    "After treatment, this patient was discharged to hospice care at home.",
    "After treatment, this patient was transferred to a hospice medical facility.",
    "This patient was discharged and transferred to a facility specializing in cancer or children's health.",
    "This patient was discharged and transferred to another facility.",
    "This patient was discharged and transferred to a federal healthcare facility.",
    "After treatment, this patient was discharged and received home intravenous care.",
    "This patient was discharged to a skilled nursing facility with Medicaid-only certification.",
    "This patient was discharged to an intermediate care facility.",
]

# Sentence templates per column; "{value}" is replaced by the cell value.
# Kept at module level so the templates are built once instead of once per cell.
_SENTENCE_TEMPLATES = {
    'HADM_ID': [
        "The unique Hospital Admission ID assigned to this patient is {value}.",
        "This patient's Hospital Admission ID is {value}.",
        "The Hospital Admission ID recorded for this patient is {value}.",
        "The ID for this hospital admission of the patient is {value}.",
        "This patient has a Hospital Admission ID of {value}.",
    ],
    'SUBJECT_ID': [
        "The Subject ID associated with this patient is {value}.",
        "This patient's Subject ID is {value}.",
        "The Subject ID recorded for this patient is {value}.",
        "This patient is linked to the Subject ID {value}.",
        "The Subject ID for this patient is {value}.",
    ],
    'ADMISSION_TYPE': [
        "The admission type for this patient is {value}.",
        "This patient's admission type is {value}.",
        "The admission was categorized as {value} for this patient.",
        "This patient falls under the admission type of {value}.",
        "For this patient, the admission type is {value}.",
    ],
    'RELIGION': [
        "The religion of this patient is {value}.",
        "This patient's religious affiliation is {value}.",
        "The religion recorded for this patient is {value}.",
        "This patient practices {value} religion.",
        "This patient follows {value} religious beliefs.",
    ],
    'ETHNICITY': [
        "The ethnicity of this patient is {value}.",
        "This patient belongs to the {value} ethnicity.",
        "This patient is classified as {value} ethnicity.",
        "The recorded ethnicity for this patient is {value}.",
        "This patient identifies with {value} ethnicity.",
    ],
    'DIAGNOSIS': [
        "The diagnosis for this patient is {value}.",
        "This patient's diagnosis is {value}.",
        "The medical condition of this patient is {value}.",
        "This patient was diagnosed with {value}.",
        "The recorded diagnosis for this patient is {value}.",
    ],
    'GENDER': [
        "This patient's gender is {value}.",
        "The gender of this patient is {value}.",
        "The recorded gender for this patient is {value}.",
        "This patient identifies as {value}.",
        "This patient is {value} gender.",
    ],
    'AGE': [
        "The age of this patient is {value}.",
        "This patient's age is {value}.",
        "The recorded age for this patient is {value}.",
        "For this patient, the age is {value}.",
        "This patient is {value} years old.",
    ],
    'PROCEDURE': [
        "This patient underwent {value} procedures.",
        "The procedures performed on this patient is {value}.",
        "This patient's procedures during hospital stay was {value}.",
        "The recorded procedures for this patient is {value}.",
        "For this patient, the procedures conducted was {value}.",
    ],
    'CATEGORY': [
        "The category of diagnosis or procedure for this patient is {value}.",
        "This patient's diagnosis or procedure falls under the category of {value}.",
        "This patient's diagnosis or procedure is classified as {value} category.",
        "The recorded category for this patient's diagnosis or procedure is {value}.",
        "For this patient, the category of diagnosis or procedure is {value}.",
    ],
    'DESCRIPTION': [
        "The description for this patient's diagnosis or procedure is {value}.",
        "This patient's diagnosis or procedure is described as {value}.",
        "This patient's diagnosis or procedure has a description of {value}.",
        "The recorded description for this patient's diagnosis or procedure is {value}.",
        "For this patient, the description of diagnosis or procedure is {value}.",
    ],
    'TEXT': [
        "Additional notes or text associated with this patient's admission entry include {value}.",
        "This patient's admission entry includes additional notes or text such as {value}.",
        "Textual information related to this patient's admission entry consists of {value}.",
        "This patient's admission entry contains additional text or notes, including {value}.",
        "For this patient, the admission entry includes text such as {value}.",
    ],
}

# Used for any column that has no entry in _SENTENCE_TEMPLATES.
_FALLBACK_TEMPLATES = ["No template available"]

# When the value is 'not specified', only the first N templates of these columns
# are eligible (the remaining ones embed the value as an adjective, e.g.
# "This patient practices {value} religion.").
_NOT_SPECIFIED_TEMPLATE_LIMIT = {'RELIGION': 2, 'ETHNICITY': 1}

# Columns whose raw value is emitted unchanged instead of a sentence.
_PASSTHROUGH_COLUMNS = ('LOS', 'HOSPITAL_EXPIRE_FLAG')


def _position_keyed_templates(series, templates):
    """Pair each distinct value of `series` (in unique() order) with the template at the same index."""
    categories = series.unique()
    if len(categories) > len(templates):
        raise IndexError(
            f"column {series.name!r} has {len(categories)} distinct values "
            f"but only {len(templates)} templates are defined"
        )
    # Missing values never compared equal to anything in the original lookup, so
    # they get no template here either (the slot they occupy is still consumed).
    return {category: template
            for category, template in zip(categories, templates)
            if pd.notna(category)}


def generate_sentences(dataframe):
    """Turn every cell of `dataframe` into a natural-language sentence.

    For each row, columns are processed in `dataframe.columns` order:

    * ADMISSION_LOCATION / DISCHARGE_LOCATION use the fixed sentence from ALT / DLT
      whose index is the position of the value in that column's ``unique()`` of
      `dataframe`. These sentences are NOT passed through ``preprocess_text``.
      NOTE(review): every other sentence is lower-cased with commas removed, so
      confirm that leaving these two untouched is intended.
    * LOS and HOSPITAL_EXPIRE_FLAG are passed through as raw values.
    * Every other column gets one randomly chosen template from
      ``_SENTENCE_TEMPLATES`` filled with the cell value and normalised with
      ``preprocess_text``. For RELIGION / ETHNICITY equal to 'not specified',
      only the first 2 / 1 templates are eligible.

    The location lookups are built from `dataframe` itself (not from the global
    ``df``), so the function works on any frame passed to it.

    Args:
        dataframe: Frame with one row per admission.

    Returns:
        A list with one inner list per row; each inner list holds one entry per
        column, in column order.

    Raises:
        IndexError: If a location column has more distinct values than templates,
            or a row holds a missing location value.
    """
    # Build the value -> sentence lookups once instead of re-deriving them per row.
    location_lookups = {}
    if 'ADMISSION_LOCATION' in dataframe.columns:
        location_lookups['ADMISSION_LOCATION'] = _position_keyed_templates(
            dataframe['ADMISSION_LOCATION'], ALT)
    if 'DISCHARGE_LOCATION' in dataframe.columns:
        location_lookups['DISCHARGE_LOCATION'] = _position_keyed_templates(
            dataframe['DISCHARGE_LOCATION'], DLT)

    sentences = []
    for _, row in dataframe.iterrows():
        row_sentences = []
        for column in dataframe.columns:
            value = row[column]
            if column in location_lookups:
                try:
                    sentence = location_lookups[column][value]
                except KeyError:
                    raise IndexError(
                        f"no template for {column} value {value!r}"
                    ) from None
            elif column in _PASSTHROUGH_COLUMNS:
                sentence = value
            else:
                options = _SENTENCE_TEMPLATES.get(column, _FALLBACK_TEMPLATES)
                limit = _NOT_SPECIFIED_TEMPLATE_LIMIT.get(column)
                if limit is not None and value == 'not specified':
                    options = options[:limit]
                # Only the chosen template is formatted; random.choice is still
                # called once per cell on a list of the same length as before.
                sentence = preprocess_text(random.choice(options).format(value=value))
            row_sentences.append(sentence)
        sentences.append(row_sentences)
    return sentences

In [None]:
# Generate one entry per cell of `df`, then wrap the rows in a frame that keeps
# the original column labels.
sentence_rows = generate_sentences(df)
sentences_df = pd.DataFrame(sentence_rows, columns=df.columns)
sentences_df.head()

Unnamed: 0,HADM_ID,SUBJECT_ID,GENDER,AGE,RELIGION,ETHNICITY,ADMISSION_TYPE,ADMISSION_LOCATION,DIAGNOSIS,PROCEDURE,CATEGORY,DESCRIPTION,TEXT,DISCHARGE_LOCATION,HOSPITAL_EXPIRE_FLAG,LOS
0,this patient has a hospital admission id of 10...,the subject id associated with this patient is...,this patient's gender is male.,the age of this patient is 61.,the religion recorded for this patient is cath...,the ethnicity of this patient is white.,this patient falls under the admission type of...,This patient was transferred from the hospital...,the medical condition of this patient is coron...,this patient underwent cardiovascular system c...,the category of diagnosis or procedure for thi...,this patient's diagnosis or procedure has a de...,for this patient the admission entry includes...,"After receiving treatment, this patient was di...",0,0.705405
1,this patient has a hospital admission id of 10...,this patient is linked to the subject id 87977.,the gender of this patient is male.,this patient is 21 years old.,this patient's religious affiliation is not sp...,the ethnicity of this patient is hispanic or l...,the admission type for this patient is emergency.,This patient was referred from a clinic due to...,this patient's diagnosis is motor vehicle acci...,the recorded procedures for this patient is ho...,this patient's diagnosis or procedure is class...,for this patient the description of diagnosis...,this patient's admission entry contains additi...,This patient was discharged from the hospital ...,0,4.318437
2,this patient's hospital admission id is 100012.,the subject id recorded for this patient is 60...,this patient identifies as male.,the recorded age for this patient is 68.,this patient follows catholic religious beliefs.,the recorded ethnicity for this patient is white.,this patient's admission type is emergency.,This patient was transferred from the hospital...,this patient's diagnosis is coronary artery di...,this patient's procedures during hospital stay...,for this patient the category of diagnosis or...,the description for this patient's diagnosis o...,this patient's admission entry includes additi...,"After receiving treatment, this patient was di...",0,1.072373
3,this patient's hospital admission id is 100016.,the subject id recorded for this patient is 68...,the gender of this patient is male.,this patient's age is 56.,this patient practices protestant quaker relig...,this patient is classified as white ethnicity.,this patient's admission type is emergency.,This patient was referred from a clinic due to...,the diagnosis for this patient is pneumonia.,this patient underwent critical care services ...,this patient's diagnosis or procedure falls un...,this patient's diagnosis or procedure is descr...,additional notes or text associated with this ...,This patient was discharged to a skilled nursi...,0,1.851157
4,the unique hospital admission id assigned to t...,this patient's subject id is 58128.,this patient identifies as male.,the age of this patient is 55.,the religion of this patient is protestant qua...,the ethnicity of this patient is white.,for this patient the admission type is elective.,This patient was referred by a physician follo...,this patient's diagnosis is herniated disc/sda.,for this patient the procedures conducted was...,for this patient the category of diagnosis or...,this patient's diagnosis or procedure has a de...,textual information related to this patient's ...,"After treatment, this patient was discharged t...",0,2.215475


In [None]:
# Write the generated sentences to sentences.csv in the Merged_files folder on Drive.
# NOTE(review): `dir` shadows the built-in dir() for the rest of the session; consider
# renaming it (e.g. sentences_path) once confirmed that no later cell relies on the name.
dir = '/content/drive/MyDrive/mimic-iii-clinical-database-1.4/Merged_files/sentences.csv'
sentences_df.to_csv(dir, index=False)  # index=False: the row index is not written as a column