In [55]:
import pandas as pd
import json
import os
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer
import spacy
import numpy as np

In [2]:
# Load spaCy's small English pipeline; later cells use it for sentence
# segmentation via `doc.sents`.
nlp = spacy.load("en_core_web_sm")


In [62]:
project_path = os.getcwd()
# All processed CSVs are resolved relative to the current working directory.
processed_dir = os.path.join(project_path, "..", "..", "data", "processed")
train_df = pd.read_csv(os.path.join(processed_dir, "train.csv"))
test_df = pd.read_csv(os.path.join(processed_dir, "test.csv"))

# Stack the train rows on top of the test rows (original index labels are kept).
combined_df = pd.concat([train_df, test_df])

# Strip the literal "Input:\n" marker from every report.
# NOTE: only combined_df is cleaned; train_df and test_df keep the marker.
cleaned_reports = combined_df['discharge_report'].str.replace("Input:\n", "")
combined_df['discharge_report'] = cleaned_reports

# Persist the cleaned, combined data next to the source files.
combined_df.to_csv(os.path.join(processed_dir, "all.csv"), index=False)


In [66]:
# Number of whitespace-separated "words" in each report of the combined set.
report_word_count = [
    len(report_text.split()) for report_text in combined_df['discharge_report']
]
print(f"Mean number of words in reports: {np.mean(report_word_count)}")

Mean number of words in reports: 1674.0497382198953


In [6]:
# Character length (and row label) of the longest discharge summary in the
# test split. Note this shows the length, not the summary text itself.
summary_char_lengths = test_df['discharge_summary'].str.len()
summary_char_lengths.sort_values(ascending=False).head(1)


23    1925
Name: discharge_summary, dtype: int64

In [4]:
# Split every reference and generated summary into sentences with spaCy.
# NOTE(review): `generated` is not assigned anywhere in the visible notebook
# (the recorded run of this cell ended in a NameError). It presumably is a
# DataFrame with "discharge_summary" and "generated_summary" columns - TODO
# confirm and load it before this cell.
original_summaries_sentences = []
generated_summaries_sentences = []
summary_pairs = zip(generated["discharge_summary"], generated["generated_summary"])
for reference_text, generated_text in summary_pairs:
    reference_sents = [span.text for span in nlp(reference_text).sents]
    original_summaries_sentences.append(reference_sents)
    generated_sents = [span.text for span in nlp(generated_text).sents]
    generated_summaries_sentences.append(generated_sents)


NameError: name 'generated' is not defined

In [16]:
def write_sent_lines(summaries, file_path, mode="a"):
    """Write sentence-split summaries to a text file, one sentence per line.

    Each summary is written as a ``###SUMMARY: `` header line, then its
    sentences (one per line), then two blank lines.

    Args:
        summaries: Iterable of summaries, each an iterable of sentence strings.
        file_path: Path of the text file to write to.
        mode: Mode passed to ``open``. Defaults to ``"a"`` (append), so calling
            this again on the same path adds to the existing content; pass
            ``"w"`` to overwrite, e.g. when re-running a notebook cell.
    """
    # Open the file once for the whole batch instead of once per summary, and
    # pin the encoding so the output does not depend on the platform default.
    with open(file_path, mode, encoding="utf-8") as f:
        for summary in summaries:
            f.write("###SUMMARY: \n")
            f.writelines(sent + "\n" for sent in summary)
            f.write("\n\n")




In [17]:
# Dump the sentence-split reference and generated summaries side by side in
# the few-shot output directory, one text file per set.
few_shot_dir = os.path.join(project_path, "..", "..", "output", "few_shot_summaries")
sentence_dumps = (
    (original_summaries_sentences, "original_summaries_sentences.txt"),
    (generated_summaries_sentences, "generated_summaries_sentences.txt"),
)
for sentence_lists, file_name in sentence_dumps:
    write_sent_lines(sentence_lists, os.path.join(few_shot_dir, file_name))

In [3]:
# Character-length statistics (min, max, mean, std) of the training reports
# and summaries. Series.std() is the sample standard deviation (ddof=1).
length_columns = (
    ('discharge_report', 'report'),
    ('discharge_summary', 'summary'),
)
for column_name, label in length_columns:
    lengths = train_df[column_name].str.len()  # characters per text

    min_length, max_length = lengths.min(), lengths.max()
    mean_length, std_length = lengths.mean(), lengths.std()

    print(f"Min length {label}: {min_length}")
    print(f"Max length {label}: {max_length}")
    print(f"Mean length {label}: {mean_length}")
    print(f"Standard deviation {label}: {std_length}")


Min length report: 3416
Max length report: 33209
Mean length report: 11007.517482517482
Standard deviation report: 4990.852398247897
Min length summary: 576
Max length summary: 2210
Mean length summary: 1145.2202797202797
Standard deviation summary: 322.84188255421503


In [12]:
# Read the generated test-set summaries from the zero_shot_summaries output
# directory.
output_path = os.path.join(project_path, '..', '..', 'output', 'zero_shot_summaries')
generated_csv_path = os.path.join(output_path, 'test_generated.csv')
df_generated = pd.read_csv(generated_csv_path)


In [13]:
# Show which columns the generated-summaries file provides.
df_generated.columns

Index(['discharge_report', 'discharge_summary', 'generated_summary'], dtype='object')

In [15]:
# Reference (human-written) summary of the first row, for eyeballing.
reference_summaries = df_generated['discharge_summary']
reference_summaries[0]

"The patient is a 51-year-old male who underwent a living non-related renal transplant due to end-stage renal disease (ESRD) caused by medication toxicity and chronic dehydration. He has a history of Crohn's disease, which was treated with total colectomy and ileoanal pullthrough 20 years ago. He has been on hemodialysis since then. The patient has allergies to sulfa, penicillins, and Asacol. His past medical history includes ESRD, interstitial nephritis, and Asacol on HD via RIJ tunnel cath. \n\nDuring his hospital stay, the patient received induction immunosuppression and underwent a successful transplant with a living donor. He experienced some complications, including decreased urine output and low blood pressure, which were managed with fluid replacement and medication adjustments. The patient was discharged with a stable condition and was instructed to follow up with the transplant office and to have regular lab tests. His medications at discharge included acetaminophen, dapsone,

In [16]:
# Model-generated summary of the first row, to compare with the reference.
generated_summaries = df_generated['generated_summary']
generated_summaries[0]


" \nThe patient is a 51-year-old male with a history of Crohn's disease and end-stage renal disease (ESRD) who underwent a living non-related renal transplant. He received induction immunosuppression and was started on Prograf, Cellcept, and Solu-Medrol. Postoperatively, he experienced a decrease in urine output, which was managed with fluid replacements and a decrease in Prograf dose. He was also started on Dapsone for PCP prophylaxis due to a sulfa allergy. He was discharged on a regimen of immunosuppressive medications, including Tacrolimus, Mycophenolate Mofetil, and Valganciclovir, as well as pain medication and anti-diarrheal medication. He was instructed to follow up with the transplant office and to have labs drawn regularly. He was also advised to avoid heavy lifting, straining, and driving while taking pain medication.\nThe final answer is: There is no final answer to this problem as it is a discharge report summary. However, the key points from the report are:\n\n* The patie

In [7]:
# Load the tokenizer of the BlueBERT PubMed (uncased) checkpoint named below.
# NOTE: this checkpoint is BlueBERT, not PubMedBERT.
tokenizer = AutoTokenizer.from_pretrained("bionlp/bluebert_pubmed_uncased_L-24_H-1024_A-16")


config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [35]:
# Token counts per training example for reports and summaries.
# Only the length of `input_ids` is needed, so the tokenizer is asked for
# plain Python lists; the previous `return_tensors='pt'` built torch tensors
# (and required torch) just to read their shape. Counts are as returned by
# the tokenizer, i.e. including any special tokens it adds by default.
list_num_of_tokens_report = []
list_num_of_tokens_summary = []
text_pairs = zip(train_df['discharge_report'], train_df['discharge_summary'])
for report_text, summary_text in text_pairs:
    list_num_of_tokens_report.append(len(tokenizer(report_text)['input_ids']))
    list_num_of_tokens_summary.append(len(tokenizer(summary_text)['input_ids']))

# max, min, mean, std of the number of tokens in the reports and summaries
# (np.std is the population standard deviation, ddof=0)
token_count_groups = (
    ("reports", list_num_of_tokens_report),
    ("summaries", list_num_of_tokens_summary),
)
for label, token_counts in token_count_groups:
    print(f"Max number of tokens in {label}: {max(token_counts)}")
    print(f"Min number of tokens in {label}: {min(token_counts)}")
    print(f"Mean number of tokens in {label}: {np.mean(token_counts)}")
    print(f"Std number of tokens in {label}: {np.std(token_counts)}")

# Index labels of the three longest training reports (by character count).
list_longest_report_ids = train_df['discharge_report'].str.len().sort_values(ascending=False).head(3).index.tolist()
# These are index *labels*, so select with .loc. The previous .iloc lookup was
# only correct while labels happened to equal row positions.
longest_rows = train_df.loc[list_longest_report_ids]
longest_reports = longest_rows['discharge_report'].tolist()
longest_report_summaries = longest_rows['discharge_summary'].tolist()

def write_sent_lines(longest_reports, longest_report_summaries, file_path):
    """Write each report followed by its sentence-split summary to a text file.

    For every report the file gets a ``###REPORT: `` header and the report
    text, then a ``###SUMMARY: `` header and the matching summary split into
    sentences by the notebook-level spaCy pipeline ``nlp`` (one per line).
    Any existing file at ``file_path`` is overwritten.

    NOTE(review): an earlier cell defines ``write_sent_lines`` with a different
    signature; whichever definition runs last wins. Consider giving the two
    functions distinct names.

    Args:
        longest_reports: Sequence of report texts.
        longest_report_summaries: Sequence of summary texts, indexed in
            parallel with ``longest_reports``.
        file_path: Path of the text file to (over)write.

    Raises:
        IndexError: If there are fewer summaries than reports.
    """
    # Explicit UTF-8 so the output does not depend on the platform default.
    with open(file_path, "w", encoding="utf-8") as f:
        for position, report_text in enumerate(longest_reports):
            f.write("###REPORT: \n")
            f.write(report_text + "\n\n")
            f.write("###SUMMARY: \n")
            doc = nlp(longest_report_summaries[position])
            f.writelines(sent.text + "\n" for sent in doc.sents)
            f.write("\n\n")
            
            
# Save the longest reports and their sentence-split summaries for manual
# inspection.
longest_examples_path = os.path.join(
    project_path, '..', '..', 'scripts', 'data_exploration', 'longest_report_and_summary.txt'
)
write_sent_lines(longest_reports, longest_report_summaries, longest_examples_path)


Max number of tokens in reports: 9047
Min number of tokens in reports: 888
Mean number of tokens in reports: 3052.8321678321677
Std number of tokens in reports: 1423.0700437626888
Max number of tokens in summaries: 523
Min number of tokens in summaries: 112
Mean number of tokens in summaries: 272.9300699300699
Std number of tokens in summaries: 78.4147258852535


In [18]:
# Total and average token counts over the test split.
# Plain `input_ids` lists are enough for counting; no need to build torch
# tensors with `return_tensors='pt'` just to read their length.
report_total_tokens = sum(
    len(tokenizer(report_text)['input_ids'])
    for report_text in test_df['discharge_report']
)
summary_total_tokens = sum(
    len(tokenizer(summary_text)['input_ids'])
    for summary_text in test_df['discharge_summary']
)

print(f"Total tokens in test set reports: {report_total_tokens}")
print(f"Average tokens in test set reports: {report_total_tokens / len(test_df)}")
print(f"Total tokens in test set summaries: {summary_total_tokens}")
print(f"Average tokens in test set summaries: {summary_total_tokens / len(test_df)}")



Total tokens in test set reports: 299322
Average tokens in test set reports: 3117.9375
Total tokens in test set summaries: 25759
Average tokens in test set summaries: 268.3229166666667


In [37]:
# Sample summary text followed by stray "<br>" / ")" fragments; it is fed to
# the spaCy pipeline in the next cell to inspect sentence segmentation.
txt = """
A 68-year-old male came to the hospital because he fell while putting away Christmas decorations.
After three hours, he became disoriented and confused while talking to his family members.
His vital signs were checked, revealing that he had low blood pressure.
Anemia, low glucose levels, elevated lactate level, high sodium level, and high creatinine level were noted during his lab tests.
His vital signs were closely monitored.
An initial CT scan revealed two small subdural hematomas behind his ears.
After being treated by multiple specialists including neurosurgery, and undergoing further tests like MRI scans and a lumbar puncture, it became apparent that there was bleeding within the brain cavity behind his ears.
After extensive treatment, including medication to prevent seizures and managing fluid balance within his skull, the patient made progress towards recovery.
Eventually, he underwent procedures such as IVC filter placement and PEG tube insertion through surgery.
Eventually cleared out the blood around his ears, making significant progress toward stability.
However, neurological assessment indicated slight decrease in motor function on both sides of his legs; nonetheless, his overall health improved enough to make a full recovery.    
<br> <br>  )<br>  )
<br>  )
<br>  )
<br>  )
<br>   
"""

In [39]:
# Print each sentence spaCy finds in `txt`, followed by its character count.
doc = nlp(txt)
for sent in doc.sents:
    sentence = sent.text
    print(sentence, len(sentence), sep="\n")


A 68-year-old male came to the hospital because he fell while putting away Christmas decorations.

After three hours, he became disoriented and confused while talking to his family members.

His vital signs were checked, revealing that he had low blood pressure.

Anemia, low glucose levels, elevated lactate level, high sodium level, and high creatinine level were noted during his lab tests.

His vital signs were closely monitored.

An initial CT scan revealed two small subdural hematomas behind his ears.

After being treated by multiple specialists including neurosurgery, and undergoing further tests like MRI scans and a lumbar puncture, it became apparent that there was bleeding within the brain cavity behind his ears.

After extensive treatment, including medication to prevent seizures and managing fluid balance within his skull, the patient made progress towards recovery.

Eventually, he underwent procedures such as IVC filter placement and PEG tube insertion through surgery.

Even

In [54]:
from constants import LLM_NAME

ModuleNotFoundError: No module named 'constants'