In [None]:
import pandas as pd
import numpy as np
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, LeakyReLU
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import pickle
import gensim.downloader as api
from tensorflow.keras.layers import Input
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import resample

In [None]:
# Download the NLTK data used by the preprocessing cells below.
# 'punkt_tab' is the tokenizer data that word_tokenize looks up on recent NLTK
# releases; older releases use 'punkt'. Requesting both keeps the notebook
# runnable on either. An id the installed NLTK index does not know is expected
# to be reported by nltk.download without raising.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(nltk_resource)

In [None]:
# Location of the ED triage observations workbook
file_path = '../data/ED-triage-obs-final.xlsx'  # Update with your local file path

# Read the workbook, keep only rows whose Triage value is 1 or 2,
# then discard exact duplicate rows
df = (
    pd.read_excel(file_path)
    .loc[lambda frame: frame['Triage'].isin([1, 2])]
    .drop_duplicates()
)
df.shape

In [None]:
# Terms removed from the English stop-word list so they survive stop-word filtering.
# NOTE(review): the two-word entries presumably never match anything in a
# single-token stop-word list — confirm whether they are needed.
negation_terms = {"no", "not", "wasn't", "was not", "isn't", "is not"}

# Stop-word set used by preprocess_text, minus the terms above
stop_words = set(stopwords.words('english')).difference(negation_terms)

# Shared WordNet lemmatizer instance
lemmatizer = WordNetLemmatizer()

# Preprocessing function
def preprocess_text(text):
    """Normalise one free-text cell into a space-joined string of lemmas.

    Steps: lowercase, strip ASCII punctuation, tokenize with NLTK, drop tokens
    found in the module-level ``stop_words`` set, and lemmatize the remaining
    tokens with the module-level ``lemmatizer``.

    Args:
        text: Cell value to clean. Values flagged by ``pd.isna`` yield ``""``.
            Non-string values (for example a numeric cell read from the
            spreadsheet) are converted with ``str`` instead of failing on
            ``.lower()``.

    Returns:
        str: Processed tokens joined by single spaces; may be empty.
    """
    if pd.isna(text):
        return ""
    # str() guards against non-string cells, which have no .lower()
    text = str(text).lower()  # Lowercasing
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    words = word_tokenize(text)  # Tokenization
    # Drop stop words, then lemmatize what is left
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words)

# Columns that are not used further in this notebook
unused_columns = [
    "Blood Glucose, Capillary",
    "Departed",
    "Arrived",
    "Diastolic Blood Pressure",
    "Departure Status",
    "Respiratory Rate",
    "Temperature Tympanic",
]

# Fields that must be present: the label, both text fields and the three vital signs
required_columns = [
    'Triage',
    'Chief Complaint',
    'Visit Reason',
    'SpO2',
    'Peripheral Pulse Rate',
    'Systolic Blood Pressure',
]

# Remove the unused columns, then every row missing any required field
df = df.drop(columns=unused_columns).dropna(subset=required_columns)

In [None]:
# Clean both free-text columns with the shared preprocessing routine
for text_column in ('Visit Reason', 'Chief Complaint'):
    df[text_column] = df[text_column].apply(preprocess_text)

# One string per row: cleaned visit reason, a space, cleaned chief complaint
df['combined_text'] = df['Visit Reason'].add(' ').add(df['Chief Complaint'])

# Whitespace-split token list for each combined string
sentences = df['combined_text'].map(str.split)

In [None]:
# Step 1: Fetch the pre-trained GloVe vectors through the gensim downloader API
glove_embedding_name = "glove-wiki-gigaword-200"
glove_model = api.load(glove_embedding_name)

In [None]:
# Step 2: Convert combined_text to a vector using pre-trained GloVe embeddings
def get_sentence_embedding_glove(sentence, glove_model):
    """Return the mean embedding of the words in ``sentence``.

    Args:
        sentence: Whitespace-separated text.
        glove_model: Mapping-like embedding lookup supporting ``word in model``
            and ``model[word]``. If it exposes a ``vector_size`` attribute,
            that size is used for the all-zero fallback; otherwise the
            fallback keeps the previous fixed size of 200.

    Returns:
        numpy.ndarray: Element-wise mean of the vectors of all words found in
        ``glove_model``, or a zero vector when no word is found.
    """
    word_vecs = [glove_model[word] for word in sentence.split() if word in glove_model]
    if not word_vecs:
        # No known word: fall back to zeros of the model's own dimensionality
        # instead of assuming a 200-dimensional model.
        return np.zeros(getattr(glove_model, "vector_size", 200))
    return np.mean(word_vecs, axis=0)

# One averaged GloVe vector per row of combined text
df['text_embedding'] = df['combined_text'].map(
    lambda combined: get_sentence_embedding_glove(combined, glove_model)
)

# Vital-sign columns as a plain 2-D array
vital_sign_columns = ['SpO2', 'Peripheral Pulse Rate', 'Systolic Blood Pressure']
vital_signs = df[vital_sign_columns].to_numpy()

In [None]:
# Step 3: Assemble one feature matrix: text-embedding columns followed by the vital signs
text_embeddings = np.vstack(df['text_embedding'].values)
X_combined = np.hstack((text_embeddings, vital_signs))

# Fit the scaler on the training rows only, so statistics of the held-out test
# rows do not leak into preprocessing. With no stratify argument,
# train_test_split chooses rows from the sample count and random_state only, so
# splitting row positions here with the same test_size / random_state as the
# later X / y split selects the same training rows.
# Keep these two values in sync with that split.
train_rows, held_out_rows = train_test_split(
    np.arange(X_combined.shape[0]), test_size=0.2, random_state=42
)
scaler = StandardScaler().fit(X_combined[train_rows])

# Apply the train-fitted scaling to every row
X_combined_scaled = scaler.transform(X_combined)

# Feature matrix used by the train/test split.
# Assigned before the save below so a failed write cannot leave it undefined.
X = X_combined_scaled

# Save the scaler. Errors are deliberately not caught here: printing and
# continuing left X undefined and surfaced later as an unrelated NameError.
scaler_path = "../models/fcdnn/spe2/scaler.pkl"
with open(scaler_path, "wb") as f:
    pickle.dump(scaler, f)

In [None]:
# Zero-based class ids for training: Triage 1 -> 0, Triage 2 -> 1
y = df['Triage'].sub(1)

In [None]:
# Bar chart of row counts per triage level (Seaborn), labelled through the Axes object
class_ax = sns.countplot(x='Triage', data=df, palette='pastel')
class_ax.set_title('Class Distribution Before Embedding')
class_ax.set_xlabel('Triage Level')
class_ax.set_ylabel('Number of Instances')
plt.show()

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Put training features and labels in one frame so rows are resampled together
train_df = pd.DataFrame(X_train).assign(Triage=y_train.values)

# Partition rows by label: 1 is treated as the majority class, 0 as the minority
is_majority = train_df['Triage'] == 1
is_minority = train_df['Triage'] == 0
majority = train_df[is_majority]
minority = train_df[is_minority]

# Draw minority rows with replacement until they match the majority row count
minority_oversampled = resample(
    minority,
    replace=True,
    n_samples=len(majority),
    random_state=42,
)

# Majority rows followed by the oversampled minority rows
balanced_train_df = pd.concat([majority, minority_oversampled])

# Separate the balanced frame back into a label array and a feature array
y_train_resampled = balanced_train_df['Triage'].to_numpy()
X_train_resampled = balanced_train_df.drop(columns=['Triage']).to_numpy()

# Bar chart of label counts in the resampled training set
resampled_ax = sns.countplot(x=y_train_resampled, palette='pastel')
resampled_ax.set_title('Class Distribution After Resampling')
resampled_ax.set_xlabel('Triage Level')
resampled_ax.set_ylabel('Number of Instances')
plt.show()

In [None]:
# Fully connected network for classification, declared as one ordered layer list
model = Sequential([
    # Input: one value per feature column of the training matrix
    Input(shape=(X_train.shape[1],)),

    # Hidden block 1: 128 units -> LeakyReLU -> batch normalisation -> dropout
    Dense(128),
    LeakyReLU(negative_slope=0.1),
    BatchNormalization(),
    Dropout(0.3),

    # Hidden block 2: 64 units, same activation / normalisation / dropout stack
    Dense(64),
    LeakyReLU(negative_slope=0.1),
    BatchNormalization(),
    Dropout(0.3),

    # Hidden block 3: 32 ReLU units
    Dense(32, activation='relu'),

    # Output: softmax over the two classes
    Dense(2, activation='softmax'),
])

In [None]:
# Compile the model with sparse categorical crossentropy (labels are integer class ids)
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Callbacks: stop after 10 epochs without val_loss improvement (restoring the best
# weights) and multiply the learning rate by 0.1 after 5 epochs without improvement
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=5)

# Train on the class-balanced arrays from the oversampling step; fitting on
# X_train / y_train would ignore that resampling entirely.
# The balanced arrays are ordered majority-then-minority and Keras takes
# validation_split from the end of the data, so the 20% validation set is carved
# out explicitly with a stratified, shuffled split instead.
# NOTE(review): oversampled duplicates can appear on both sides of this split, so
# val_loss is optimistic; a validation set taken before resampling would be stricter.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train_resampled, y_train_resampled,
    test_size=0.2, random_state=42, stratify=y_train_resampled,
)

# Train the model
history = model.fit(X_fit, y_fit, epochs=100, batch_size=64,
                    validation_data=(X_val, y_val), verbose=1,
                    callbacks=[early_stopping, lr_scheduler])

# Save the trained model in the native Keras (.keras) format
model_path = "../models/fcdnn/spe2/fcdnn_model.keras"
model.save(model_path)
print(f"Neural Network model saved at {model_path}")

# Predict class probabilities for the held-out test set
y_pred = model.predict(X_test)

In [None]:
# Predicted class id (0 or 1) is the index of the largest softmax output per row
y_pred_classes = y_pred.argmax(axis=1)

# Overall accuracy on the test set
accuracy = accuracy_score(y_test, y_pred_classes)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Per-class precision / recall / F1
print(classification_report(y_test, y_pred_classes))

# Confusion matrix: rows are true labels, columns are predicted labels
cm = confusion_matrix(y_test, y_pred_classes)

# Draw the matrix as an annotated heatmap on an explicit Axes
triage_names = ['Triage 1', 'Triage 2']
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='g',
    cmap='Blues',
    xticklabels=triage_names,
    yticklabels=triage_names,
    ax=ax,
)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
plt.show()
