In [1]:
import pandas as pd
from IPython.display import display, HTML
import re

In [2]:
df = pd.read_csv('../data/discharge.csv1.csv')

In [3]:
text_column = df['text']

Working on the one row sample data

In [4]:
# Accessing the first row's text data
first_row_text = df['text'].iloc[0]

# Print the full text data from the first row
print(first_row_text)

 
Name:  ___                     Unit No:   ___
 
Admission Date:  ___              Discharge Date:   ___
 
Date of Birth:  ___             Sex:   F
 
Service: MEDICINE
 
Allergies: 
No Known Allergies / Adverse Drug Reactions
 
Attending: ___
 
Chief Complaint:
Worsening ABD distension and pain 
 
Major Surgical or Invasive Procedure:
Paracentesis

 
History of Present Illness:
___ HCV cirrhosis c/b ascites, hiv on ART, h/o IVDU, COPD, 
bioplar, PTSD, presented from OSH ED with worsening abd 
distension over past week.  
Pt reports self-discontinuing lasix and spirnolactone ___ weeks 
ago, because she feels like "they don't do anything" and that 
she "doesn't want to put more chemicals in her." She does not 
follow Na-restricted diets. In the past week, she notes that she 
has been having worsening abd distension and discomfort. She 
denies ___ edema, or SOB, or orthopnea. She denies f/c/n/v, d/c, 
dysuria. She had food poisoning a week ago from eating stale 
cake (n/v 20 min after fo

Extracting the text column data to sub-sections columns

In [5]:
patterns = {
    "Chief Complaint": r"Chief Complaint:\s*(.+?)(?=Major Surgical|History of Present Illness|$)",
    "Major Surgical or Invasive Procedure": r"Major Surgical or Invasive Procedure:\s*(.+?)(?=History of Present Illness|$)",
    "Discharge Diagnosis": r"Discharge Diagnosis:\s*(.+?)(?=Discharge Condition|$)",
    "Discharge Medications": r"Discharge Medications:\s*(.+?)(?=Discharge Diagnosis|$)",
    "Discharge Condition": r"Discharge Condition:\s*(.+?)(?=Discharge Instructions|$)",
    "Discharge Instructions": r"Discharge Instructions:\s*(.+?)(?=Followup Instructions|$)",
    "Followup Instructions": r"Followup Instructions:\s*(.+)$"
}

# Extract sections using regex
data = {}
for key, pattern in patterns.items():
    match = re.search(pattern, first_row_text, re.DOTALL)
    if match:
        data[key] = match.group(1).strip()
    else:
        data[key] = None

# Convert the data dictionary into a pandas DataFrame
df_new = pd.DataFrame([data])

print(df_new)

                     Chief Complaint Major Surgical or Invasive Procedure  \
0  Worsening ABD distension and pain                         Paracentesis   

       Discharge Diagnosis                              Discharge Medications  \
0  Ascites from Portal HTN  1. Albuterol Inhaler 2 PUFF IH Q4H:PRN wheezin...   

                                 Discharge Condition  \
0  Mental Status: Clear and coherent.\nLevel of C...   

                              Discharge Instructions Followup Instructions  
0  Dear Ms. ___,\nIt was a pleasure taking care o...                   ___  


In [6]:
display(HTML(df_new.head(2).to_html()))

Unnamed: 0,Chief Complaint,Major Surgical or Invasive Procedure,Discharge Diagnosis,Discharge Medications,Discharge Condition,Discharge Instructions,Followup Instructions
0,Worsening ABD distension and pain,Paracentesis,Ascites from Portal HTN,"1. Albuterol Inhaler 2 PUFF IH Q4H:PRN wheezing, SOB \n2. Emtricitabine-Tenofovir (Truvada) 1 TAB PO DAILY \n3. Furosemide 40 mg PO DAILY \nRX *furosemide 40 mg 1 tablet(s) by mouth Daily Disp #*30 Tablet \nRefills:*3\n4. Ipratropium Bromide Neb 1 NEB IH Q6H SOB \n5. Nicotine Patch 14 mg TD DAILY \n6. Raltegravir 400 mg PO BID \n7. Spironolactone 50 mg PO DAILY \n8. Acetaminophen 500 mg PO Q6H:PRN pain \n\n \nDischarge Disposition:\nHome",Mental Status: Clear and coherent.\nLevel of Consciousness: Alert and interactive.\nActivity Status: Ambulatory - Independent.,"Dear Ms. ___,\nIt was a pleasure taking care of you! You came to us with \nstomach pain and worsening distension. While you were here we \ndid a paracentesis to remove 1.5L of fluid from your belly. We \nalso placed you on you 40 mg of Lasix and 50 mg of Aldactone to \nhelp you urinate the excess fluid still in your belly. As we \ndiscussed, everyone has a different dose of lasix required to \nmake them urinate and it's likely that you weren't taking a high \nenough dose. Please take these medications daily to keep excess \nfluid off and eat a low salt diet. You will follow up with Dr. \n___ in liver clinic and from there have your colonoscopy \nand EGD scheduled. Of course, we are always here if you need us. \nWe wish you all the best!\nYour ___ Team.",___


Extracting text for each section

In [73]:
discharger_instruction = df_new.at[0,"Discharge Instructions"]
discharger_condition = df_new.at[0,"Discharge Condition"]
discharger_medications = df_new.at[0,"Discharge Medications"]
discharger_diagnosis = df_new.at[0,"Discharge Diagnosis"]
major_surgical = df_new.at[0,"Major Surgical or Invasive Procedure"]
chief_complaint = df_new.at[0,"Chief Complaint"]

Data Cleaning

In [70]:
def clean_text(x):
    x = " ".join(x.split())
    x= " ".join((" ".join(x.split("[**"))).split("**]"))
    x = re.sub(r"\([^()]*\)", "", x)
    key_value_strip =(x.split(":"))
    ##remove all sub strings which have a length lesser than 50 characters
    string = " ".join([sub_unit for sub_unit in key_value_strip if len(sub_unit)>50])
    x = re.sub(r"(\d+)+(\.|\))", "", string)## remove all serialization eg 1. 1)
    x = re.sub(r"(\*|\?|=)+", "", x) ##removing all *, ? and =
    x = re.sub(r"\b(\w+)( \1\b)+", r"\1", x) ## removing consecutive dupicate words
    x = x.replace("FOLLOW UP", "FOLLOWUP")
    x = x.replace("FOLLOW-UP", "FOLLOWUP")
    x = re.sub(r"(\b)(f|F)(irst)(\b)?[\d\-\d]*(\s)*(\b)?(n|N)(ame)[\d\-\d]*(\s)*[\d\-\d]*(\b)","",x)##remove firstname
    x = re.sub(r"(\b)(l|L)(ast)(\b)?[\d\-\d]*(\s)*(\b)?(n|N)(ame)[\d\-\d]*(\s)*[\d\-\d]*(\b)", "", x)
    x = re.sub(r"(\b)(d|D)\.?(r|R)\.?(\b)", "", x) #remove Dr abreviation
    x = re.sub(r"([^A-Za-z0-9\s](\s)){2,}", "", x)##remove consecutive punctuations
    

    return(x.replace("  ", " "))

df_new["Discharge Instructions"] = df_new["Discharge Instructions"].apply(lambda x: clean_text(x))




In [71]:
display(HTML(df_new.head(1).to_html()))

Unnamed: 0,Chief Complaint,Major Surgical or Invasive Procedure,Discharge Diagnosis,Discharge Medications,Discharge Condition,Discharge Instructions,Followup Instructions
0,Worsening ABD distension and pain,Paracentesis,Ascites from Portal HTN,"1. Albuterol Inhaler 2 PUFF IH Q4H:PRN wheezing, SOB \n2. Emtricitabine-Tenofovir (Truvada) 1 TAB PO DAILY \n3. Furosemide 40 mg PO DAILY \nRX *furosemide 40 mg 1 tablet(s) by mouth Daily Disp #*30 Tablet \nRefills:*3\n4. Ipratropium Bromide Neb 1 NEB IH Q6H SOB \n5. Nicotine Patch 14 mg TD DAILY \n6. Raltegravir 400 mg PO BID \n7. Spironolactone 50 mg PO DAILY \n8. Acetaminophen 500 mg PO Q6H:PRN pain \n\n \nDischarge Disposition:\nHome",,"Dear Ms. ___, It was a pleasure taking care of you! You came to us with stomach pain and worsening distension. While you were here we did a paracentesis to remove 5L of fluid from your belly. We also placed you on you 40 mg of Lasix and 50 mg of Aldactone to help you urinate the excess fluid still in your belly. As we discussed, everyone has a different dose of lasix required to make them urinate and it's likely that you weren't taking a high enough dose. Please take these medications daily to keep excess fluid off and eat a low salt diet. You will follow up with . ___ in liver clinic and from there have your colonoscopy and EGD scheduled. Of course, we are always here if you need us. We wish you all the best! Your ___ Team.",___


In [72]:
discharger_instruction = df_new.at[0,"Discharge Instructions"]
print(discharger_instruction)


Dear Ms. ___, It was a pleasure taking care of you! You came to us with stomach pain and worsening distension. While you were here we did a paracentesis to remove 5L of fluid from your belly. We also placed you on you 40 mg of Lasix and 50 mg of Aldactone to help you urinate the excess fluid still in your belly. As we discussed, everyone has a different dose of lasix required to make them urinate and it's likely that you weren't taking a high enough dose. Please take these medications daily to keep excess fluid off and eat a low salt diet. You will follow up with . ___ in liver clinic and from there have your colonoscopy and EGD scheduled. Of course, we are always here if you need us. We wish you all the best! Your ___ Team.


In [38]:
cleaned_text = re.sub(r"(\bMs\.\s)_{2,}", r"\1[Name]", discharger_instruction)

# Standardizing medical terms (e.g., Lasix to lasix)
cleaned_text = re.sub(r"\bLasix\b", "lasix", cleaned_text, flags=re.IGNORECASE)
cleaned_text = re.sub(r"\.\s_{2,}", ". [Clinic Name]", cleaned_text)  # Cleans ". ___"
cleaned_text = re.sub(r"\s_{2,}\s", " [Information]", cleaned_text)  # Cleans " ___ "

# Standardizing units (e.g., liters)
cleaned_text = re.sub(r"\b(\d+)L\b", r"\1 liters", cleaned_text)

print(cleaned_text)

Dear Ms. [Name], It was a pleasure taking care of you! You came to us with stomach pain and worsening distension. While you were here we did a paracentesis to remove 5 liters of fluid from your belly. We also placed you on you 40 mg of lasix and 50 mg of Aldactone to help you urinate the excess fluid still in your belly. As we discussed, everyone has a different dose of lasix required to make them urinate and it's likely that you weren't taking a high enough dose. Please take these medications daily to keep excess fluid off and eat a low salt diet. You will follow up with . [Clinic Name] in liver clinic and from there have your colonoscopy and EGD scheduled. Of course, we are always here if you need us. We wish you all the best! Your [Information]Team.


Applying LLM API to summary the text

In [41]:

my_request = "Summary this: " + cleaned_text

print(my_request)


Summary this: Dear Ms. [Name], It was a pleasure taking care of you! You came to us with stomach pain and worsening distension. While you were here we did a paracentesis to remove 5 liters of fluid from your belly. We also placed you on you 40 mg of lasix and 50 mg of Aldactone to help you urinate the excess fluid still in your belly. As we discussed, everyone has a different dose of lasix required to make them urinate and it's likely that you weren't taking a high enough dose. Please take these medications daily to keep excess fluid off and eat a low salt diet. You will follow up with . [Clinic Name] in liver clinic and from there have your colonoscopy and EGD scheduled. Of course, we are always here if you need us. We wish you all the best! Your [Information]Team.


In [43]:
import requests

API_TOKEN = "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"

API_URL = "https://api-inference.huggingface.co/models/facebook/bart-large-cnn"
headers = {"Authorization": f"Bearer {API_TOKEN}"}

def query(payload):
	response = requests.post(API_URL, headers=headers, json=payload)
	return response.json()
	

output = query({
	"inputs": my_request
})

print(output)

[{'summary_text': 'Ms. [Name] came to us with stomach pain and worsening distension. We did a paracentesis to remove 5 liters of fluid from her belly. We also placed you on you 40 mg of lasix and 50 mg of Aldactone to help you urinate the excess fluid still in your belly.'}]


Implement NLP

In [12]:
#!pip install -U spacy
#!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.0/12.8 MB 660.6 kB/s eta 0:00:20
      --------------------------------------- 0.2/12.8 MB 2.8 MB/s eta 0:00:05
     -- ------------------------------------- 0.8/12.8 MB 6.1 MB/s eta 0:00:02
     ---- ----------------------------------- 1.4/12.8 MB 8.3 MB/s eta 0:00:02
     ------ --------------------------------- 2.1/12.8 MB 9.5 MB/s eta 0:00:02
     -------- ------------------------------- 2.7/12.8 MB 10.3 MB/s eta 0:00:01
     ---------- ----------------------------- 3.4/12.8 MB 10.8 MB/s eta 0:00:01
     ------------ --------------------------- 4.0/12.8 MB 11.2 MB/s eta 0:00:01
     -------------- ------------------------- 4.7/12.8 MB 11.5 MB/s eta 0:00:01
     ---------------- --------------

In [28]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation

In [29]:
stopwords = list(STOP_WORDS)

In [30]:
nlp = spacy.load('en_core_web_sm')

In [48]:
text = discharger_instruction
print(text)

Dear Ms. ___, It was a pleasure taking care of you! You came to us with stomach pain and worsening distension. While you were here we did a paracentesis to remove 5L of fluid from your belly. We also placed you on you 40 mg of Lasix and 50 mg of Aldactone to help you urinate the excess fluid still in your belly. As we discussed, everyone has a different dose of lasix required to make them urinate and it's likely that you weren't taking a high enough dose. Please take these medications daily to keep excess fluid off and eat a low salt diet. You will follow up with . ___ in liver clinic and from there have your colonoscopy and EGD scheduled. Of course, we are always here if you need us. We wish you all the best! Your ___ Team.


In [74]:
text =discharger_medications
print(text)

1. Albuterol Inhaler 2 PUFF IH Q4H:PRN wheezing, SOB 
2. Emtricitabine-Tenofovir (Truvada) 1 TAB PO DAILY 
3. Furosemide 40 mg PO DAILY 
RX *furosemide 40 mg 1 tablet(s) by mouth Daily Disp #*30 Tablet 
Refills:*3
4. Ipratropium Bromide Neb 1 NEB IH Q6H SOB 
5. Nicotine Patch 14 mg TD DAILY 
6. Raltegravir 400 mg PO BID 
7. Spironolactone 50 mg PO DAILY 
8. Acetaminophen 500 mg PO Q6H:PRN pain 

 
Discharge Disposition:
Home


In [75]:
doc = nlp(text)

In [76]:
tokens = [token.text for token in doc]
print(tokens)

['1', '.', 'Albuterol', 'Inhaler', '2', 'PUFF', 'IH', 'Q4H', ':', 'PRN', 'wheezing', ',', 'SOB', '\n', '2', '.', 'Emtricitabine', '-', 'Tenofovir', '(', 'Truvada', ')', '1', 'TAB', 'PO', 'DAILY', '\n', '3', '.', 'Furosemide', '40', 'mg', 'PO', 'DAILY', '\n', 'RX', '*', 'furosemide', '40', 'mg', '1', 'tablet(s', ')', 'by', 'mouth', 'Daily', 'Disp', '#', '*', '30', 'Tablet', '\n', 'Refills:*3', '\n', '4', '.', 'Ipratropium', 'Bromide', 'Neb', '1', 'NEB', 'IH', 'Q6H', 'SOB', '\n', '5', '.', 'Nicotine', 'Patch', '14', 'mg', 'TD', 'DAILY', '\n', '6', '.', 'Raltegravir', '400', 'mg', 'PO', 'BID', '\n', '7', '.', 'Spironolactone', '50', 'mg', 'PO', 'DAILY', '\n', '8', '.', 'Acetaminophen', '500', 'mg', 'PO', 'Q6H', ':', 'PRN', 'pain', '\n\n \n', 'Discharge', 'Disposition', ':', '\n', 'Home']


In [77]:
punctuation = punctuation + '\n'
punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\n\n'

In [78]:
word_frequencies = {}
for word in doc:
    if word.text.lower() not in stopwords:
        if word.text.lower() not in punctuation:
            if word.text not in word_frequencies.keys():
                word_frequencies[word.text] = 1
            else:
                word_frequencies[word.text] += 1

In [79]:
word_frequencies

{'1': 4,
 'Albuterol': 1,
 'Inhaler': 1,
 '2': 2,
 'PUFF': 1,
 'IH': 2,
 'Q4H': 1,
 'PRN': 2,
 'wheezing': 1,
 'SOB': 2,
 'Emtricitabine': 1,
 'Tenofovir': 1,
 'Truvada': 1,
 'TAB': 1,
 'PO': 5,
 'DAILY': 4,
 '3': 1,
 'Furosemide': 1,
 '40': 2,
 'mg': 6,
 'RX': 1,
 'furosemide': 1,
 'tablet(s': 1,
 'mouth': 1,
 'Daily': 1,
 'Disp': 1,
 '30': 1,
 'Tablet': 1,
 'Refills:*3': 1,
 '4': 1,
 'Ipratropium': 1,
 'Bromide': 1,
 'Neb': 1,
 'NEB': 1,
 'Q6H': 2,
 '5': 1,
 'Nicotine': 1,
 'Patch': 1,
 '14': 1,
 'TD': 1,
 '6': 1,
 'Raltegravir': 1,
 '400': 1,
 'BID': 1,
 '7': 1,
 'Spironolactone': 1,
 '50': 1,
 '8': 1,
 'Acetaminophen': 1,
 '500': 1,
 'pain': 1,
 '\n\n \n': 1,
 'Discharge': 1,
 'Disposition': 1,
 'Home': 1}

In [80]:
max_frequency = max(word_frequencies.values())

In [81]:
max_frequency

6

In [82]:
for word in word_frequencies.keys():  
    word_frequencies[word] = word_frequencies[word]/max_frequency

In [83]:
print(word_frequencies)

{'1': 0.6666666666666666, 'Albuterol': 0.16666666666666666, 'Inhaler': 0.16666666666666666, '2': 0.3333333333333333, 'PUFF': 0.16666666666666666, 'IH': 0.3333333333333333, 'Q4H': 0.16666666666666666, 'PRN': 0.3333333333333333, 'wheezing': 0.16666666666666666, 'SOB': 0.3333333333333333, 'Emtricitabine': 0.16666666666666666, 'Tenofovir': 0.16666666666666666, 'Truvada': 0.16666666666666666, 'TAB': 0.16666666666666666, 'PO': 0.8333333333333334, 'DAILY': 0.6666666666666666, '3': 0.16666666666666666, 'Furosemide': 0.16666666666666666, '40': 0.3333333333333333, 'mg': 1.0, 'RX': 0.16666666666666666, 'furosemide': 0.16666666666666666, 'tablet(s': 0.16666666666666666, 'mouth': 0.16666666666666666, 'Daily': 0.16666666666666666, 'Disp': 0.16666666666666666, '30': 0.16666666666666666, 'Tablet': 0.16666666666666666, 'Refills:*3': 0.16666666666666666, '4': 0.16666666666666666, 'Ipratropium': 0.16666666666666666, 'Bromide': 0.16666666666666666, 'Neb': 0.16666666666666666, 'NEB': 0.16666666666666666, '

In [84]:
sentence_tokens = [sent for sent in doc.sents]
print(sentence_tokens)

[1. Albuterol Inhaler 2 PUFF IH Q4H:, PRN wheezing, SOB 
2., Emtricitabine-Tenofovir (Truvada) 1 TAB PO DAILY 
3., Furosemide 40 mg PO DAILY 
RX *furosemide 40 mg 1 tablet(s) by mouth Daily Disp #*30 Tablet 
Refills:*3
4., Ipratropium Bromide Neb 1 NEB IH Q6H SOB 
5., Nicotine Patch 14 mg TD DAILY 
6., Raltegravir 400, mg PO BID 
7., Spironolactone 50 mg PO DAILY 
8., Acetaminophen 500 mg PO Q6H:, PRN pain 

 
Discharge Disposition:
Home]


In [85]:
sentence_scores = {}
for sent in sentence_tokens:  
    for word in sent:
        if word.text.lower() in word_frequencies.keys():
            if sent not in sentence_scores.keys():
                sentence_scores[sent] = word_frequencies[word.text.lower()]
            else:
                sentence_scores[sent] += word_frequencies[word.text.lower()]

In [86]:
sentence_scores

{1. Albuterol Inhaler 2 PUFF IH Q4H:: 1.0,
 PRN wheezing, SOB 
 2.: 0.5,
 Emtricitabine-Tenofovir (Truvada) 1 TAB PO DAILY 
 3.: 0.8333333333333333,
 Furosemide 40 mg PO DAILY 
 RX *furosemide 40 mg 1 tablet(s) by mouth Daily Disp #*30 Tablet 
 Refills:*3
 4.: 4.333333333333333,
 Ipratropium Bromide Neb 1 NEB IH Q6H SOB 
 5.: 0.8333333333333333,
 Nicotine Patch 14 mg TD DAILY 
 6.: 1.3333333333333335,
 Raltegravir 400: 0.16666666666666666,
 mg PO BID 
 7.: 1.1666666666666667,
 Spironolactone 50 mg PO DAILY 
 8.: 1.3333333333333335,
 Acetaminophen 500 mg PO Q6H:: 1.1666666666666667,
 PRN pain 
 
  
 Discharge Disposition:
 Home: 0.3333333333333333}

In [87]:
from heapq import nlargest

In [88]:
select_length = int(len(sentence_tokens)*0.3)
select_length

3

In [89]:
summary = nlargest(select_length, sentence_scores, key=sentence_scores.get)

In [90]:
summary

[Furosemide 40 mg PO DAILY 
 RX *furosemide 40 mg 1 tablet(s) by mouth Daily Disp #*30 Tablet 
 Refills:*3
 4.,
 Nicotine Patch 14 mg TD DAILY 
 6.,
 Spironolactone 50 mg PO DAILY 
 8.]

In [91]:
finally_summary = [word.text for word in summary]

In [92]:
summary =''.join(finally_summary)

In [93]:
print(summary)

Furosemide 40 mg PO DAILY 
RX *furosemide 40 mg 1 tablet(s) by mouth Daily Disp #*30 Tablet 
Refills:*3
4.Nicotine Patch 14 mg TD DAILY 
6.Spironolactone 50 mg PO DAILY 
8.
