# Data Science & LLM Technical Assessment Part 2

In [None]:
import pandas as pd
#location of the raw dataset
path = './data.csv'

#read the csv into a dataframe
df = pd.read_csv(filepath_or_buffer=path)

#report the (rows, columns) of the loaded data
print("Dataframe shape:", df.shape)

## Exploratory data analysis

### Sample input

In [None]:
#keep only the free-text note and the outcome label for the nlp analysis
nlp_columns = ['discharge_note', 'readmitted_30_days']
nlp_df = df.loc[:, nlp_columns].copy()

#show the note stored under index label 0 as a sample
nlp_df.loc[0, 'discharge_note']

### Get descriptive statistics

In [None]:
import string
import nltk
#download libs
# nltk.download('punkt')
# nltk.download('punkt_tab')
# nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

#build the english stopword set once; GetTextStats uses it for membership checks
stop_words = set(stopwords.words('english'))

#create function to produce descriptive stats on text features
def GetTextStats(text):
    """
    :des: compute descriptive statistics for a single piece of text; a missing
          value (None/NaN) is treated as an empty note, so every statistic is 0
    :inp: text - raw note as a string, or a missing value
    :out: pd.Series with char_count, word_count, avg_word_length, unique_words,
          stopword_ratio and punctuation_count
    """

    if pd.isnull(text):
        #missing note: nothing to tokenize, and text.lower() would raise on NaN
        text, words_clean = "", []
    else:
        #tokenize words in lowercase
        words = word_tokenize(text.lower())

        #keep purely alphabetic tokens (drops numbers and punctuation tokens)
        words_clean = [w for w in words if w.isalpha()]

    #guard the ratios against division by zero when no words remain
    word_total = max(len(words_clean), 1)

    #create table with descriptive stats
    stats_df = pd.Series({
        'char_count': len(text),
        'word_count': len(words_clean),
        'avg_word_length': sum(len(w) for w in words_clean) / word_total,
        'unique_words': len(set(words_clean)),
        'stopword_ratio': sum(w in stop_words for w in words_clean) / word_total,
        'punctuation_count': sum(1 for c in text if c in string.punctuation)
    })

    return stats_df

#compute the statistics for every discharge note
text_stats_df = nlp_df['discharge_note'].apply(GetTextStats)

#attach the statistics columns to the notes, aligned on the index
nlp_df = nlp_df.join(text_stats_df, how='left')

#preview the first two rows
nlp_df.head(n=2)

### Visualise descriptive features

#### char_count

In [None]:
import plotly.express as px

#figure dimensions
height = 600
width = 600

#boxplot of character count per note, drawing every note as a point,
#with the layout applied in the same chained call
fig = px.box(
    data_frame=nlp_df,
    y='char_count',
    points='all',
    boxmode='overlay',
    title='Distribution of Character Counts in Discharge Notes',
).update_layout(
    template='plotly_white',
    yaxis_title='Character Count',
    showlegend=False,
    width=width,
    height=height,
)

#render the figure
fig.show()


#### word_count

In [None]:
import plotly.express as px

#figure dimensions
height = 600
width = 600

#boxplot of word count per note, drawing every note as a point,
#with the layout applied in the same chained call
fig = px.box(
    data_frame=nlp_df,
    y='word_count',
    points='all',
    boxmode='overlay',
    title='Distribution of Word Counts in Discharge Notes',
).update_layout(
    template='plotly_white',
    yaxis_title='Word Count',
    showlegend=False,
    width=width,
    height=height,
)

#render the figure
fig.show()

#### avg_word_length

In [None]:
import plotly.express as px

#figure dimensions
height = 600
width = 600

#boxplot of average word length per note, drawing every note as a point,
#with the layout applied in the same chained call
fig = px.box(
    data_frame=nlp_df,
    y='avg_word_length',
    points='all',
    boxmode='overlay',
    title='Distribution of Average Word Length in Discharge Notes',
).update_layout(
    template='plotly_white',
    yaxis_title='Average Word Length',
    showlegend=False,
    width=width,
    height=height,
)

#render the figure
fig.show()

#### unique_words

In [None]:
import plotly.express as px

#figure dimensions
height = 600
width = 600

#boxplot of unique word count per note, drawing every note as a point,
#with the layout applied in the same chained call
fig = px.box(
    data_frame=nlp_df,
    y='unique_words',
    points='all',
    boxmode='overlay',
    title='Distribution of Unique Words in Discharge Notes',
).update_layout(
    template='plotly_white',
    yaxis_title='Unique Word Count',
    showlegend=False,
    width=width,
    height=height,
)

#render the figure
fig.show()

#### stopword_ratio

In [None]:
import plotly.express as px

#figure dimensions
height = 600
width = 600

#boxplot of stopword ratio per note, drawing every note as a point,
#with the layout applied in the same chained call
fig = px.box(
    data_frame=nlp_df,
    y='stopword_ratio',
    points='all',
    boxmode='overlay',
    title='Distribution of Stopword Ratio in Discharge Notes',
).update_layout(
    template='plotly_white',
    yaxis_title='Stopword Ratio',
    showlegend=False,
    width=width,
    height=height,
)

#render the figure
fig.show()

#### punctuation_count

In [None]:
import plotly.express as px

#create boxplot for punctuation count
fig = px.box(
    nlp_df,
    y='punctuation_count',
    points='all',  # Show individual data points
    title='Distribution of Punctuation Count in Discharge Notes',
    boxmode='overlay'
)

#specify height and width
height, width = 600, 600

#formatting
fig.update_layout(
    yaxis_title='Punctuation Count',
    template='plotly_white',
    showlegend=False,
    height=height,
    width=width
)

#show plot
fig.show()

#### WordCloud

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import nltk

#english stopwords to exclude from the cloud
stop_words = set(stopwords.words('english'))

#concatenate every non-null discharge note into a single corpus string
text = " ".join(
    nlp_df['discharge_note']
    .dropna()
    .astype(str)
    .tolist()
)

#drop whitespace-separated tokens whose lowercase form is a stopword
filtered_words = [
    token
    for token in text.split()
    if token.lower() not in stop_words
]
filtered_text = " ".join(filtered_words)

#configure the cloud first, then fit it to the filtered corpus
wordcloud = WordCloud(
    background_color='white',
    width=800,
    height=400,
    stopwords=stop_words,
    collocations=False,  # prevent joining of common word pairs
)
wordcloud = wordcloud.generate(filtered_text)

#draw the cloud as an image without axes
plt.figure(figsize=(12, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

#### Observations

- Discharge notes were similar in form, showing small ranges in word count, number of unique words, and average word length when compared to the range typically observed in a broader corpus.  
- The word cloud showed that neutral words such as *"patient"*, *"advised"*, and *"discharged"* were common. Words with negative medical sentiment (e.g., *"complications"*, *"pneumonia"*) also appeared, as well as words with positive sentiment such as *"improvement"*.  


### Word counts between readmitted and non-readmitted

#### Split dataframe

In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
import re

#rebuild the english stopword set (this does not download anything; the nltk 'stopwords' corpus must already be installed)
stop_words = set(stopwords.words('english'))

#function to clean and tokenize discharge notes
def preprocess_discharge_note(text):
    """
    :des: lowercase a discharge note, extract its alphabetic words and drop stopwords
    :inp: text - raw discharge note, may be null
    :out: list of lowercase non-stopword tokens ([] when the note is null)
    """
    #missing notes contribute no tokens
    if pd.isnull(text):
        return []

    #walk the alphabetic runs in order, keeping the ones that are not stopwords
    tokens = []
    for match in re.finditer(r'\b[a-zA-Z]+\b', text.lower()):
        token = match.group(0)
        if token not in stop_words:
            tokens.append(token)
    return tokens

#tokenize every note up front so both subsets carry the clean_tokens column
nlp_df['clean_tokens'] = nlp_df['discharge_note'].apply(preprocess_discharge_note)

#outcome label used to separate the two groups
readmitted_flag = nlp_df['readmitted_30_days']

#independent copies of the readmitted (1) and non-readmitted (0) rows
nlp_df_1 = nlp_df.loc[readmitted_flag == 1].copy()
nlp_df_0 = nlp_df.loc[readmitted_flag == 0].copy()

#size of the readmitted subset
nlp_df_1.shape

#### Compare prevalent sentences across outcome variable

In [None]:
#list the distinct discharge notes (returns the values themselves, not their count)
nlp_df['discharge_note'].unique()

In [None]:
#count how often each distinct token sequence occurs among readmitted patients
#(lists are unhashable, so each token list is converted to a tuple before counting)
nlp_df_1['clean_tokens'].apply(tuple).value_counts()

In [None]:
#count how often each distinct token sequence occurs among non-readmitted patients
#(lists are unhashable, so each token list is converted to a tuple before counting)
nlp_df_0['clean_tokens'].apply(tuple).value_counts()

In [None]:
#flatten the per-note token lists into one long series per outcome group
tokens_0 = pd.Series([
    token
    for note_tokens in nlp_df_0['clean_tokens'].values
    for token in note_tokens
])
tokens_1 = pd.Series([
    token
    for note_tokens in nlp_df_1['clean_tokens'].values
    for token in note_tokens
])

#frequency of each token within its group
tokens_0_counts = tokens_0.value_counts()
tokens_1_counts = tokens_1.value_counts()

#align both count series on the token index; a token absent from a group gets 0
token_counts_df = pd.DataFrame({
    'non_readmitted_count': tokens_0_counts,
    'readmitted_count': tokens_1_counts
})
token_counts_df = token_counts_df.fillna(0).astype(int)

#group sizes, used as the denominators below
total_tokens_0 = len(tokens_0)
total_tokens_1 = len(tokens_1)

#relative frequency per group and the absolute gap between the two
token_counts_df['prop_non_readmitted_count'] = token_counts_df['non_readmitted_count'] / total_tokens_0
token_counts_df['prop_readmitted_count'] = token_counts_df['readmitted_count'] / total_tokens_1
token_counts_df['prop_difference'] = (
    token_counts_df['prop_readmitted_count'] - token_counts_df['prop_non_readmitted_count']
).abs()

#largest gap first
token_counts_df = token_counts_df.sort_values(by='prop_difference', ascending=False)

#show the ten tokens with the largest gap
token_counts_df.head(10)

#### Observations

- There are only 10 unique `discharge_note` values in the dataset.  
- Proportional differences in specific terms can be observed across the outcome variable — for example, the tokens *"blood"* and *"pressure"* appeared marginally more frequently in the positive class of the outcome variable.  

## Entity extraction

### Create ent_df

In [None]:
import pandas as pd
import numpy as np

#the distinct discharge notes, written out by hand
sentences = np.array([
    'Good recovery trajectory. Follow-up scan scheduled next month.',
    'Stable post-surgery. Advised to avoid physical exertion.',
    'Symptoms controlled. Monitoring for relapse advised.',
    'Discharge after recovery from pneumonia. No complications observed.',
    'Patient discharged in stable condition. Recommend follow-up in 2 weeks.',
    'Patient showed improvement. Prescribed antibiotics for 5 days.',
    'Blood pressure under control. Continue current medication.',
    'Patient discharged with minor discomfort. Advised rest and hydration.',
    'No further signs of infection. Resume normal diet and activity.',
    'Mild reaction to medication. Switched to alternative treatment.'
])

#one row per note, in a single discharge_note column
ent_df = pd.DataFrame({"discharge_note": sentences})

#preview the first two notes
ent_df.head(n=2)

### Flan-t5

#### Load model

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

#checkpoint to load and the device used for inference
model_name = "google/flan-t5-base"
device = 'cpu'

#fetch the tokenizer that matches the checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)

#fetch the seq2seq model, then place it on the chosen device
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model = model.to(device)

#### Create function to extract entities

In [None]:
import pandas as pd
from tqdm import tqdm

#register tqdm with pandas so that progress_apply (used in the extraction cell) is available
tqdm.pandas()

#function to format a prompt using a discharge note
def generate_prompt(note, custom_prompt=None):
    """
    :des: fill a prompt template with a discharge note
    :inp: note - discharge note text substituted for the {note} placeholder
          custom_prompt - template string containing a {note} placeholder (required)
    :out: the formatted prompt string
    :raises: ValueError if custom_prompt is not provided
    """
    #explicit raise rather than assert, so the check still runs under python -O
    if custom_prompt is None:
        raise ValueError("You must provide a custom_prompt.")
    return custom_prompt.format(note=note)

#function to run LLM inference
def extract_entity(note, custom_prompt):
    """
    :des: run the seq2seq model on one discharge note and return its text answer
    :inp: note - discharge note text
          custom_prompt - template string with a {note} placeholder
    :out: decoded model output with special tokens removed and whitespace stripped
    """
    #build the prompt and encode it as pytorch tensors on the inference device
    encoded = tokenizer(
        generate_prompt(note, custom_prompt),
        return_tensors="pt",
        truncation=True,
    ).to(device)

    #generate at most 10 new tokens
    generated = model.generate(**encoded, max_new_tokens=10)

    #decode the first returned sequence back to text
    answer = tokenizer.decode(generated[0], skip_special_tokens=True)
    return answer.strip()

#### Extract entities

In [None]:
#define prompts
#each template asks for a "1"/"0" answer about one entity type and contains a
#{note} placeholder, which generate_prompt fills in through str.format
diagnosis_prompt = """Does the following discharge note contain a diagnosis? 
Answer with "1" if a diagnosis is present, or "0" if there is no diagnosis.
Discharge note: "{note}"
"""

treatment_prompt = """Does the following discharge note contain a treatment? 
Answer with "1" if a treatment is present, or "0" if there is no treatment.
Discharge note: "{note}"
"""

symptoms_prompt = """Does the following discharge note contain symptoms? 
Answer with "1" if symptoms are present, or "0" if there are no symptoms.
Discharge note: "{note}"
"""

medications_prompt = """Does the following discharge note mention any medications? 
Answer with "1" if a medication is present, or "0" if there is no medication.
Discharge note: "{note}"
"""

followup_prompt = """Does the following discharge note include any follow-up actions? 
Answer with "1" if a follow-up action is present, or "0" if there is no follow-up action.
Discharge note: "{note}"
"""

#map each output column to the prompt that fills it (insertion order = column order)
entity_prompts = {
    'diagnosis': diagnosis_prompt,
    'treatment': treatment_prompt,
    'symptoms': symptoms_prompt,
    'medications': medications_prompt,
    'follow_up': followup_prompt,
}

#run the model over every note once per entity, with a progress bar per pass
for column, prompt in entity_prompts.items():
    ent_df[column] = ent_df['discharge_note'].progress_apply(
        lambda note, prompt=prompt: extract_entity(note, prompt)
    )

#show df
ent_df.head(2)

#### Check outputs

In [None]:
#(printed label, ent_df column) pairs, in the order they are printed
tag_fields = [
    ('diagnosis', 'diagnosis'),
    ('treatment', 'treatment'),
    ('symptoms', 'symptoms'),
    ('medications', 'medications'),
    ('follow-up', 'follow_up'),
]

#print each note followed by the model's answer for every entity
for idx, row in ent_df.iterrows():
    tag_line = " | ".join(f"{label}={row[column]}" for label, column in tag_fields)
    print(f"[{idx}] {row['discharge_note']}")
    print(f"     {tag_line}")
    print()

In [None]:
#define true labels (one 0/1 flag per note, keyed by entity)
y_true = {
    "diagnosis":      [0, 0, 0, 1, 0, 0, 0, 0, 1, 0],
    "treatment":      [0, 1, 0, 0, 0, 1, 1, 1, 0, 1],
    "symptoms":       [0, 0, 1, 0, 0, 0, 1, 1, 0, 1],
    "medication":     [0, 0, 0, 0, 0, 1, 1, 0, 0, 1],
    "follow-up":      [1, 0, 0, 0, 1, 0, 0, 0, 0, 0]
}

#define predictions (same keys and ordering as y_true)
y_pred = {
    "diagnosis":      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    "treatment":      [1, 1, 1, 0, 1, 1, 1, 1, 0, 0],
    "symptoms":       [0, 1, 1, 0, 0, 1, 1, 1, 0, 1],
    "medication":     [0, 0, 1, 0, 0, 1, 1, 0, 0, 1],
    "follow-up":      [1, 0, 1, 0, 1, 1, 1, 1, 0, 0]
}

#build the table column by column: text_index first, then a
#<entity>_true / <entity>_pred pair per entity with snake_case headers
results = {"text_index": list(range(10))}
for entity in y_true:
    column_prefix = entity.replace("-", "_")
    results[f"{column_prefix}_true"] = y_true[entity]
    results[f"{column_prefix}_pred"] = y_pred[entity]
results_df = pd.DataFrame(results)

#show DataFrame
results_df.head(2)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

#score each entity's predictions against its labels, rounded to 2 decimal places
metrics = [
    {
        "entity": entity,
        "accuracy": round(accuracy_score(labels, y_pred[entity]), 2),
        "precision": round(precision_score(labels, y_pred[entity], zero_division=0), 2),
        "recall": round(recall_score(labels, y_pred[entity], zero_division=0), 2),
    }
    for entity, labels in y_true.items()
]

#one row per entity
metrics_df = pd.DataFrame(metrics)
metrics_df

#### Observations

- Diagnosis: The model achieved high accuracy (0.80) but failed to identify any true positives, resulting in 0.00 precision and recall. This confirms earlier observations that the model struggles to detect diagnoses, especially when they are implied or refer to resolved conditions.
- Treatment: Performance was mixed, with moderate accuracy (0.60), good recall (0.80), but lower precision (0.57), indicating the model often predicts treatment where it isn’t present (i.e. false positives).
- Symptoms: Strong overall performance with moderate precision (0.67) and perfect recall (1.00). The model identified all true symptom cases but added two false positives.
- Medication: The model performed best here, with high accuracy (0.90), perfect recall (1.00) and a precision of 0.75 (one false positive), showing it reliably detects explicit medication mentions.
- Follow-up: The model captured all true follow-up instances (recall = 1.00) but with lower precision (0.33), suggesting frequent over-prediction of follow-up when it was not actually mentioned.


## Discussion on using LLMs for medical entity tagging

#### Risks and Limitations of Using LLMs in Clinical Entity Extraction

Large language models (LLMs) like Flan-T5 offer strong zero-shot and few-shot capabilities, but applying them in clinical NLP tasks presents several challenges. These include hallucination, entity ambiguity, and limitations stemming from general-purpose training. My use of Flan-T5 to extract clinical entities from discharge notes highlights several of these risks:

- Hallucination and Omission:  
  When a model generates information that isn’t present in the input (hallucination) or fails to extract relevant information that is present (omission). In clinical tasks, this can lead to missing important diagnoses or fabricating details.

- Entity Ambiguity:  
  Occurs when similar or overlapping concepts (e.g., treatment vs medication) confuse the model, leading to inconsistent or incorrect tagging of entities.

- Prompt Sensitivity and Inconsistency:  
  Refers to how small changes in prompt wording or structure can lead to different outputs. Multi-entity prompts can overwhelm the model, reducing consistency across tasks.

- Limitations of General-Purpose Models:  
  General LLMs are not trained on domain-specific data like clinical notes, so they may lack medical knowledge, context awareness, or the ability to interpret subtle clinical language without fine-tuning.

#### Observations from Model Output

The Flan-T5 model showed all of these problems in my results. It missed both true diagnosis cases, even though the conditions (like pneumonia) were clearly mentioned. It also confused `treatment` and `medication`, especially in sample [9], where both were present but only `medication` was tagged. When I used one prompt to extract all entities, the outputs were inconsistent — sometimes missing tags completely. This shows that while the model is capable, it’s not yet reliable for clinical tasks without more targeted training or support.
