In [None]:
import os
import pickle
import string

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from gensim.models import Word2Vec
from imblearn.over_sampling import SMOTE
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Download necessary NLTK data.
# 'punkt_tab' is the tokenizer resource that word_tokenize looks up in
# NLTK >= 3.9 (it replaced the pickled 'punkt' models); 'punkt' is still
# fetched so that older NLTK releases keep working.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(nltk_resource)

In [None]:
# Load the dataset
# The path is relative to the directory the notebook kernel runs in.
file_path = '../data/ED-triage-obs-final.xlsx'  # Update with your local file path
df = pd.read_excel(file_path)

# Preview the first rows to check that the columns were read as expected
df.head()

In [None]:
# Map triage levels to ranges
def map_to_range(level):
    """Collapse a triage level into one of three coarse ranges.

    Levels 1 and 2 give "Range 1", level 3 gives "Range 2" and levels 4 and 5
    give "Range 3". Any other value gives None.
    """
    if level in (1, 2):
        return "Range 1"
    if level == 3:
        return "Range 2"
    if level in (4, 5):
        return "Range 3"
    return None

# Apply mapping to the triage levels
# Rows whose 'Triage' value is not one of 1-5 get None here, because
# map_to_range has no branch for them.
df['Triage-Range'] = df['Triage'].apply(map_to_range)

# Preview the frame with the new 'Triage-Range' column
df.head()

In [None]:
# (rows, columns) before de-duplication and filtering
df.shape

In [None]:
# Drop duplicate rows
# (no subset is given, so a row is dropped only if it repeats another in every column)
df = df.drop_duplicates()

In [None]:
# Drop unnecessary columns
# The only vitals read later in this notebook are SpO2, Peripheral Pulse Rate
# and Systolic Blood Pressure, so the remaining vitals are removed here along
# with the visit timing/status columns.
df = df.drop(columns=["Blood Glucose, Capillary", "Departed", "Arrived", "Departure Status", 'Diastolic Blood Pressure', 'Temperature Tympanic', 'Respiratory Rate'])

In [None]:
# Keep only rows that have a value in every field used downstream: the triage
# label, both free-text fields and the three vital signs.
required_columns = [
    'Triage',
    'Chief Complaint',
    'Visit Reason',
    'Systolic Blood Pressure',
    "SpO2",
    "Peripheral Pulse Rate",
]
df = df.dropna(subset=required_columns)

In [None]:
# (rows, columns) after dropping duplicates, unused columns and incomplete rows
df.shape

In [None]:
# Initialize NLTK resources
# The negation words listed below are taken OUT of the stopword set so that
# preprocess_text keeps them in the text.
# NOTE(review): preprocess_text strips punctuation before the stopword filter
# and compares one token at a time, so the apostrophe entries ("wasn't",
# "isn't") and the two-word entries ("was not", "is not") can never equal a
# token. Only "no" and "not" look able to have an effect - confirm the intent.
stop_words = set(stopwords.words('english')) - {"no", "not", "wasn't", "was not", "isn't", "is not"}
# Shared lemmatizer instance used by preprocess_text
lemmatizer = WordNetLemmatizer()

# Preprocessing function
def preprocess_text(text):
    """Normalise one free-text cell before it is embedded.

    Steps, in order: lowercase, strip ASCII punctuation, tokenize, drop
    stopwords (module-level ``stop_words``) and lemmatize (module-level
    ``lemmatizer``).

    Args:
        text: Cell value. Missing values (None / NaN) give an empty string.
            Non-string scalars such as numbers are converted with ``str``
            first instead of failing on ``.lower()``.

    Returns:
        str: The surviving lemmas joined by single spaces.
    """
    if pd.isna(text):
        return ""
    if not isinstance(text, str):
        # Spreadsheet cells may be parsed as numbers or timestamps rather than text.
        text = str(text)
    text = text.lower()  # Lowercasing
    # Punctuation is deleted (not replaced by a space) before tokenizing
    text = text.translate(str.maketrans('', '', string.punctuation))
    words = word_tokenize(text)  # Tokenization
    words = [word for word in words if word not in stop_words]  # Remove stopwords
    words = [lemmatizer.lemmatize(word) for word in words]  # Lemmatization
    return ' '.join(words)

In [None]:
# Define valid triage levels
valid_triage_levels = [1, 2, 3, 4, 5]

# Keep only rows with a valid triage level. The explicit .copy() makes df an
# independent frame, so the column assignments in later cells do not raise
# pandas' SettingWithCopyWarning.
df = df[df['Triage'].isin(valid_triage_levels)].copy()

In [None]:
# Clean both free-text fields with preprocess_text
for text_column in ('Visit Reason', 'Chief Complaint'):
    df[text_column] = df[text_column].apply(preprocess_text)

# Join the two cleaned fields into one space-separated string per row
df['combined_text'] = df['Visit Reason'] + ' ' + df['Chief Complaint']

# Whitespace-split every combined string into a list of tokens for Word2Vec
sentences = df['combined_text'].apply(str.split)

In [None]:
# Step 2: Train Word2Vec model on the combined_text column.
# workers=1: gensim only honours `seed` deterministically with a single worker
# thread; with several workers the OS thread scheduling changes the training
# order from run to run. (Reproducibility across interpreter launches also
# needs PYTHONHASHSEED to be set.)
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=1, seed=42)

word2vec_model_path = "models/word2vec.model"
# Create the output directory if it is missing so save() works on a fresh checkout
os.makedirs(os.path.dirname(word2vec_model_path), exist_ok=True)
word2vec_model.save(word2vec_model_path)
print(f"Word2Vec model saved at {word2vec_model_path}")

In [None]:
# Convert combined_text to a vector by averaging word embeddings
def get_sentence_embedding(sentence, model):
    """Return the mean word vector of a whitespace-separated sentence.

    Args:
        sentence: Text whose tokens are separated by whitespace.
        model: Object exposing ``wv`` (supports ``in`` and ``[]`` by token)
            and ``vector_size``.

    Returns:
        The element-wise mean of the vectors of all tokens found in
        ``model.wv``, or a zero vector of length ``model.vector_size`` when
        none of the tokens are known.
    """
    keyed_vectors = model.wv
    known_vectors = []
    for token in sentence.split():
        if token in keyed_vectors:
            known_vectors.append(keyed_vectors[token])
    if not known_vectors:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)

# Embed every row's combined text; each cell of 'text_embedding' holds one vector
df['text_embedding'] = df['combined_text'].apply(lambda x: get_sentence_embedding(x, word2vec_model))

In [None]:
# Extract the vital signs
# (.values gives a NumPy array with one column per listed vital, in this order)
vital_signs = df[['SpO2', 'Peripheral Pulse Rate', 'Systolic Blood Pressure']].values

In [None]:
# Step 4: Standardise the two feature groups with separate scalers
embedding_matrix = np.vstack(df['text_embedding'])
scaler_embeddings = StandardScaler()
text_embeddings_scaled = scaler_embeddings.fit_transform(embedding_matrix)

scaler_vitals = StandardScaler()
vital_signs_scaled = scaler_vitals.fit_transform(vital_signs)

# Persist the fitted embedding scaler
scaler_embeddings_path = "models/scaler-embeddings.pkl"
with open(scaler_embeddings_path, "wb") as embeddings_scaler_file:
    pickle.dump(scaler_embeddings, embeddings_scaler_file)
print(f"Scaler for embeddings saved at {scaler_embeddings_path}")

# Persist the fitted vital-sign scaler
scaler_vitals_path = "models/scaler-vitals.pkl"
with open(scaler_vitals_path, "wb") as vitals_scaler_file:
    pickle.dump(scaler_vitals, vitals_scaler_file)
print(f"Scaler for vitals saved at {scaler_vitals_path}")

# Feature matrix: scaled embedding columns followed by scaled vital-sign columns
X = np.concatenate([text_embeddings_scaled, vital_signs_scaled], axis=1)

In [None]:
# Target: the triage level of each row (levels are mapped to ranges only after resampling)
y = df['Triage']

In [None]:
# Visualize the class balance before applying SMOTE
df['Triage'] = df['Triage'].astype('category')

# Bar chart: number of rows per triage level
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='Triage', data=df, hue='Triage', palette='Blues', legend=False, ax=ax)
ax.set_title('Triage level Distribution before applying SMOTE')
ax.set_xlabel('Triage Level')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Apply SMOTE 
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Plot the distribution after SMOTE
resampled_df = pd.DataFrame({'Triage': y_resampled})
resampled_df['Triage'] = resampled_df['Triage'].astype('category')

plt.figure(figsize=(8, 6))
sns.countplot(x='Triage', data=resampled_df, hue='Triage', palette='Blues', legend=False)
plt.title('Triage Level Distribution After SMOTE')
plt.xlabel('Triage Level')
plt.ylabel('Count')
plt.show()

In [None]:
# Map resampled Triage levels to Triage-Range
resampled_df['Triage-Range'] = resampled_df['Triage'].apply(map_to_range)

# Update the target variable to the resampled Triage-Range
y_resampled_range = resampled_df['Triage-Range']

In [None]:
# Split data for Triage Range classification
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled_range, test_size=0.2, random_state=42)

In [None]:
# Train the Random Forest Classifier for Triage Range
# (100 trees; random_state is fixed so repeated runs on the same data build the same forest)
range_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
range_classifier.fit(X_train, y_train)

In [None]:
# Make predictions
# One predicted range label per row of X_test
y_pred = range_classifier.predict(X_test)

In [None]:
# Per-class precision / recall / F1 on the test split
print("Classification Report for Triage Range:")
range_report_text = classification_report(y_test, y_pred)
print(range_report_text)

# Rows are true labels, columns are predicted labels
print("Confusion Matrix for Triage Range:")
range_confusion = confusion_matrix(y_test, y_pred)
print(range_confusion)

# Overall fraction of correct predictions
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Persist the fitted range classifier
range_model_path = "models/classifier3.pkl"
with open(range_model_path, "wb") as model_file:
    pickle.dump(range_classifier, model_file)
print(f"Triage Range Random Forest model saved at {range_model_path}")


In [None]:
# Generate classification report as a dictionary
report = classification_report(y_test, y_pred, output_dict=True)

# Convert the classification report to a DataFrame (one row per class / average)
report_df = pd.DataFrame(report).transpose()
# Filter the report DataFrame to include only the relevant classes
filtered_report_df = report_df.loc[['Range 1', 'Range 2', 'Range 3']]

# Plot only the 0-1 metrics on a fixed 0-1 colour scale. 'support' is a row
# count, so including it stretches the scale until precision, recall and F1
# are all drawn in practically the same colour.
metric_columns = ['precision', 'recall', 'f1-score']
plt.figure(figsize=(8, 6))
sns.heatmap(
    filtered_report_df[metric_columns],
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    cbar=True,
    vmin=0,
    vmax=1,
)
plt.title("Classification Report Heatmap")
plt.xlabel("Metrics")
plt.ylabel("Triage Ranges")
plt.show()