In [None]:
# Install dependencies first if you haven't:
# pip install pandas numpy scikit-learn nltk spacy gensim matplotlib seaborn wordcloud
# python -m spacy download en_core_web_sm

import pandas as pd
import spacy
import nltk
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
# Download the NLTK 'stopwords' corpus. The preprocessing step later calls
# stopwords.words('english'), which reads from this corpus.
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import numpy as np 

In [None]:


# Load the small English spaCy pipeline; the same `nlp` object is reused by
# both the preprocessing and the entity-extraction helpers below.
nlp = spacy.load('en_core_web_sm')

# -------------------------
# Step 1: Create Synthetic Clinical Notes
# -------------------------
# Ten short synthetic notes, one string per note.
clinical_notes = [
    "Patient presents with chest pain, shortness of breath, and a history of hypertension.",
    "Complains of severe headache and blurred vision. Diagnosed with migraine.",
    "Reports lower back pain, previously treated with ibuprofen, no known allergies.",
    "History of diabetes and currently taking insulin therapy.",
    "Admitted for abdominal pain; ultrasound suggests gallstones.",
    "Persistent cough and fever; chest X-ray shows signs of pneumonia.",
    "Diagnosed with asthma during childhood, prescribed inhaler therapy.",
    "Patient undergoing chemotherapy for breast cancer, experiencing nausea.",
    "Reports joint stiffness and swelling, probable rheumatoid arthritis.",
    "Follow-up for coronary artery disease, recent angioplasty completed.",
]

# One row per note, held in a single 'notes' column.
data = pd.DataFrame(clinical_notes, columns=['notes'])

# -------------------------
# Step 2: Preprocessing
# -------------------------
# A set gives constant-time membership checks while filtering tokens.
stop_words = {*stopwords.words('english')}

def preprocess(text):
    """Normalise one note into a space-separated string of lemmas.

    The text is lower-cased and run through the module-level spaCy pipeline
    ``nlp``. A token contributes its lemma only if it is purely alphabetic
    and its surface form is not in the module-level ``stop_words`` set.

    Args:
        text: Raw note text (must support ``.lower()``).

    Returns:
        The kept lemmas joined by single spaces; an empty string if no
        token survives the filters.
    """
    kept_lemmas = []
    for tok in nlp(text.lower()):
        if not tok.is_alpha:
            # Drops punctuation, numbers and mixed tokens.
            continue
        if tok.text in stop_words:
            # The stop-word check is on the surface form, not the lemma.
            continue
        kept_lemmas.append(tok.lemma_)
    return " ".join(kept_lemmas)

# Run every raw note through preprocess() and keep the result alongside the original text.
data['processed_notes'] = data['notes'].map(preprocess)

# -------------------------
# Step 3: Named Entity Recognition (NER)
# -------------------------
def extract_entities(text):
    """Return the named entities spaCy finds in ``text``.

    The text is passed unmodified (no lower-casing) to the module-level
    spaCy pipeline ``nlp``.

    Args:
        text: Raw note text.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples, in the order the
        pipeline reports them; empty if no entities are found.
    """
    pairs = []
    for span in nlp(text).ents:
        pairs.append((span.text, span.label_))
    return pairs

# Attach the (text, label) entity pairs found in each raw note.
data['entities'] = data['notes'].apply(extract_entities)

# Display extracted entities
# A plain print(DataFrame) uses the repr, which shortens every cell to
# pandas' display.max_colwidth (50 characters by default) and wraps columns
# to the display width. Most notes are longer than that, so the text would be
# cut off with "...". to_string() renders each cell in full on one line.
print("\n--- Named Entities Extracted ---\n")
print(data[['notes', 'entities']].to_string())

# -------------------------
# Step 4: Topic Modeling (LDA)
# -------------------------
# Bag-of-words counts over the preprocessed notes. Terms appearing in more
# than 90% of the notes are dropped, as are scikit-learn's English stop words.
vectorizer = CountVectorizer(
    stop_words='english',
    min_df=1,
    max_df=0.9,
)
dtm = vectorizer.fit_transform(data['processed_notes'])

# Three-topic LDA with a fixed seed; fit() returns the estimator itself,
# so the fitted model is bound to `lda` directly.
lda = LatentDirichletAllocation(n_components=3, random_state=42).fit(dtm)

def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Args:
        model: Object exposing ``components_``, iterated one row per topic;
            each row must support ``argsort()``.
        feature_names: Indexable sequence mapping a column index to its term.
        no_top_words: Number of terms to list per topic.

    Returns:
        None. One line per topic is written to stdout, numbered from 1,
        with terms ordered from largest to smallest weight.
    """
    for topic_number, weights in enumerate(model.components_, start=1):
        # argsort is ascending, so reverse it to get the largest weight first.
        ranked = weights.argsort()[::-1]
        top_terms = [feature_names[col] for col in ranked[:no_top_words]]
        print(f"\nTopic {topic_number}: ", top_terms)

print("\n--- Topics Discovered ---")
# Show the ten strongest vocabulary terms for each fitted LDA topic.
display_topics(
    model=lda,
    feature_names=vectorizer.get_feature_names_out(),
    no_top_words=10,
)

# -------------------------
# Step 5: Word Cloud Visualization
# -------------------------
# Concatenate every preprocessed note into one corpus string for the cloud.
all_text = " ".join(data['processed_notes'].tolist())

# Seed the layout so the figure is identical on every Restart & Run All,
# in line with the fixed random_state used for the LDA model.
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
    random_state=42,
).generate(all_text)

# Use explicit Figure/Axes objects rather than the implicit pyplot state,
# so the plot cannot be affected by a figure left over from another cell.
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')  # the rendered cloud is an image; axes ticks carry no meaning
ax.set_title("Word Cloud of Clinical Notes", fontsize=16)
plt.show()
