# MEDHACK LSTM Test

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
import warnings
warnings.filterwarnings('ignore')

In [10]:
# Columns removed from both frames once they are loaded (names and address fields).
# They might not directly help us predict in a simple DNN approach.
dropped_columns = ['first_name', 'last_name', 'address', 'city', 'state', 'postcode']

# Read the training data and preview it
dataset_df = pd.read_csv('train_data.csv')
print("Data Loaded! Shape:", dataset_df.shape)
print(dataset_df.head())

# Read the test data and preview it
test_df = pd.read_csv('test_data.csv')
print("Test Data Loaded! Shape:", test_df.shape)
print(test_df.head())

# Discard the listed columns from the training frame and from the test frame
dataset_df = dataset_df.drop(columns=dropped_columns)
test_df = test_df.drop(columns=dropped_columns)

Data Loaded! Shape: (12055680, 16)
             timestamp                            patient_id first_name  \
0  2025-01-01 19:00:00  b317e7ee-8af7-3e9c-3e0f-646395b8c81a  Howard613   
1  2025-01-01 19:00:05  b317e7ee-8af7-3e9c-3e0f-646395b8c81a  Howard613   
2  2025-01-01 19:00:10  b317e7ee-8af7-3e9c-3e0f-646395b8c81a  Howard613   
3  2025-01-01 19:00:15  b317e7ee-8af7-3e9c-3e0f-646395b8c81a  Howard613   
4  2025-01-01 19:00:20  b317e7ee-8af7-3e9c-3e0f-646395b8c81a  Howard613   

       last_name  age gender           address       city state  postcode  \
0  Altenwerth646   42      M  2/58 JASPER ROAD  BENTLEIGH   VIC      3204   
1  Altenwerth646   42      M  2/58 JASPER ROAD  BENTLEIGH   VIC      3204   
2  Altenwerth646   42      M  2/58 JASPER ROAD  BENTLEIGH   VIC      3204   
3  Altenwerth646   42      M  2/58 JASPER ROAD  BENTLEIGH   VIC      3204   
4  Altenwerth646   42      M  2/58 JASPER ROAD  BENTLEIGH   VIC      3204   

   diastolic_bp  systolic_bp  heart_rate  respirato

In [11]:
# =========================================================
# 2. Preprocessing
# =========================================================

# 2.1: Check for missing values (only the training frame is inspected here;
# test_df is left untouched by this cell).
print("Missing values in training data:\n", dataset_df.isna().sum())

# Example strategy: just drop rows with missing data.
# (Real-world might do more nuanced imputation.)
dataset_df = dataset_df.dropna()

# 2.2: Extract features (X) and labels (y) from the training set
X_all = dataset_df.drop('state_label', axis=1)
y_all = dataset_df['state_label']

# 2.3: Hold out 25% of the rows as a test split.
# NOTE(review): this is a row-level shuffled split, so consecutive readings of
# the same patient end up on both sides. The sequence-preparation cell further
# down rebuilds X_train / y_train from dataset_df and overwrites these names.
X_train, X_test, y_train, y_test = train_test_split(
    X_all,
    y_all,
    test_size=0.25,      # Size of the test set (0.25 = 25% of data)
    random_state=42,     # Set seed for reproducibility
    shuffle=True         # Shuffle the data before splitting
)

# 2.4: Report the shapes of the training split
print("Training Features shape:", X_train.shape)
print("Training Labels shape:", y_train.shape)

# 2.5: Report the shapes of the held-out test split
# (these lines previously reused the "Training ..." labels by mistake)
print("Test Features shape:", X_test.shape)
print("Test Labels shape:", y_test.shape)

Missing values in training data:
 timestamp            0
patient_id           0
age                  0
gender               0
diastolic_bp         0
systolic_bp          0
heart_rate           0
respiratory_rate     0
oxygen_saturation    0
state_label          0
dtype: int64
Training Features shape: (9041760, 9)
Training Labels shape: (9041760,)
Training Features shape: (3013920, 9)
Training Labels shape: (3013920,)


In [15]:
# Fix the NumPy and TensorFlow random seeds so repeated runs are comparable
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

def prepare_sequences(df, sequence_length=12, step=1):
    """
    Build fixed-length sliding windows of vital signs, one patient at a time.

    Rows are grouped by ``patient_id`` and ordered by ``timestamp`` inside each
    group, so a window never mixes two patients. Every window holds
    ``sequence_length`` consecutive rows of the six numeric feature columns plus
    a constant 0/1 gender column (1 when the patient's first ``gender`` value is
    ``'M'``), and is labelled with the ``state_label`` of its last row.

    The time span of a window depends on the sampling interval of ``df``. The
    rows previewed in this notebook are 5 seconds apart, so 12 rows cover about
    one minute, not one hour.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``patient_id``, ``timestamp``, ``gender``, ``state_label``
        and the feature columns listed in ``features`` below.
    sequence_length : int
        Number of rows (time steps) in each window. Must be >= 1.
    step : int
        Number of rows to move forward between consecutive windows. Must be >= 1.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``X`` with shape ``(n_windows, sequence_length, 7)`` and ``y`` with shape
        ``(n_windows,)``. When no patient has at least ``sequence_length`` rows,
        ``X`` has shape ``(0, sequence_length, 7)`` and ``y`` has shape ``(0,)``.

    Raises
    ------
    ValueError
        If ``sequence_length`` or ``step`` is smaller than 1.
    """
    if sequence_length < 1 or step < 1:
        raise ValueError("sequence_length and step must both be >= 1")

    features = ['diastolic_bp', 'systolic_bp', 'heart_rate',
                'respiratory_rate', 'oxygen_saturation', 'age']
    n_features = len(features) + 1  # +1 for the appended gender column

    sequences = []
    labels = []

    # groupby partitions the frame in a single pass instead of re-scanning every
    # row once per patient; sort=False keeps patients in first-appearance order,
    # which is the order df['patient_id'].unique() would give.
    for _, patient_data in df.groupby('patient_id', sort=False):
        patient_data = patient_data.sort_values('timestamp')

        # Convert gender to numeric (taken from the patient's first row)
        gender_numeric = int(patient_data['gender'].iloc[0] == 'M')

        # Get patient features and labels as plain arrays
        patient_sequence = patient_data[features].values
        patient_states = patient_data['state_label'].values

        # Add gender as a constant feature in the last column
        patient_sequence = np.column_stack([
            patient_sequence,
            np.full(len(patient_sequence), gender_numeric),
        ])

        # Slide a window over this patient's rows; the label is the state at the
        # window's final time step.
        for start in range(0, len(patient_sequence) - sequence_length + 1, step):
            stop = start + sequence_length
            sequences.append(patient_sequence[start:stop])
            labels.append(patient_states[stop - 1])

    if not sequences:
        # Keep the 3-D layout so downstream reshape / shape[-1] calls still work.
        return (np.empty((0, sequence_length, n_features)),
                np.empty((0,), dtype=int))

    return np.array(sequences), np.array(labels)

# Prepare the data
# 12 consecutive readings per window. The rows previewed when the data was
# loaded are 5 seconds apart, so this is roughly one minute of data per window.
sequence_length = 12
X_sequences, y_sequences = prepare_sequences(dataset_df, sequence_length)

# Split into train and validation sets.
# The split is positional: the first 80% of the windows train the model and the
# remaining 20% validate it. prepare_sequences emits windows patient by patient,
# so the validation windows come from the patients processed last (the patient
# at the boundary may contribute to both sides).
train_idx = int(0.8 * len(X_sequences))
X_train, X_val = X_sequences[:train_idx], X_sequences[train_idx:]
y_train, y_val = y_sequences[:train_idx], y_sequences[train_idx:]

# Scale the features.
# StandardScaler works on 2-D input, so the (windows, steps, features) arrays are
# flattened to (windows * steps, features), scaled, then restored. The scaler is
# fitted on the training windows only and merely applied to the validation ones.
scaler = StandardScaler()
X_train_reshaped = X_train.reshape(-1, X_train.shape[-1])
X_val_reshaped = X_val.reshape(-1, X_val.shape[-1])

X_train_scaled = scaler.fit_transform(X_train_reshaped)
X_val_scaled = scaler.transform(X_val_reshaped)

X_train_scaled = X_train_scaled.reshape(X_train.shape)
X_val_scaled = X_val_scaled.reshape(X_val.shape)

# Convert labels to categorical.
# num_classes is pinned so both splits get the same one-hot width even if the
# highest label is missing from one of them; 4 matches the 4-unit softmax output
# layer built by create_model (classes 0, 1, 2, 3).
n_classes = 4
y_train_cat = to_categorical(y_train, num_classes=n_classes)
y_val_cat = to_categorical(y_val, num_classes=n_classes)

# Create and compile the model
def create_model(sequence_length, n_features, n_classes=4):
    """
    Build and compile a stacked two-layer LSTM classifier.

    Layer stack: LSTM(64, returning the full sequence) -> Dropout(0.3) ->
    LSTM(32) -> Dropout(0.3) -> Dense(16, relu) -> Dense(n_classes, softmax).

    Args:
        sequence_length: Number of time steps in each input window.
        n_features: Number of features per time step.
        n_classes: Width of the softmax output layer. Defaults to 4
            (classes 0, 1, 2, 3); must equal the width of the one-hot targets
            passed to ``fit``.

    Returns:
        A ``tf.keras.Sequential`` model compiled with the Adam optimizer,
        categorical cross-entropy loss and an accuracy metric.
    """
    model = tf.keras.Sequential([
        # First recurrent layer keeps the per-step outputs so the second LSTM
        # receives a sequence rather than a single vector.
        tf.keras.layers.LSTM(64, input_shape=(sequence_length, n_features),
                            return_sequences=True),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.LSTM(32),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(16, activation='relu'),
        tf.keras.layers.Dense(n_classes, activation='softmax')
    ])

    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )
    return model

In [None]:

# Create and train the model
model = create_model(sequence_length, X_train.shape[-1])

# Class weights to handle imbalance: weight_c = n_samples / (n_classes * count_c),
# so rarer classes contribute more to the loss.
# minlength guarantees one entry per class (keys 0..n_classes-1) even when the
# highest label is absent from the training split, and a class with no training
# samples gets weight 0.0 instead of a division-by-zero inf.
n_classes = 4  # matches the 4-unit softmax output layer built by create_model
class_counts = np.bincount(np.asarray(y_train).astype(int), minlength=n_classes)
class_weights = {
    label: (1 / count * len(y_train) / n_classes if count else 0.0)
    for label, count in enumerate(class_counts)
}


# Train the model; early stopping watches the validation loss and restores the
# weights from the best epoch if it has not improved for 3 epochs.
history = model.fit(
    X_train_scaled, y_train_cat,
    epochs=10,
    batch_size=32,
    validation_data=(X_val_scaled, y_val_cat),
    class_weight=class_weights,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=3,
            restore_best_weights=True
        )
    ]
)

In [None]:
# Plotting functions
def plot_training_history(history):
    """
    Show training vs. validation curves in two side-by-side panels.

    The left panel plots ``accuracy`` / ``val_accuracy`` and the right panel
    plots ``loss`` / ``val_loss``, all read from ``history.history``.
    """
    _, axes = plt.subplots(1, 2, figsize=(15, 5))

    # One row per panel:
    # (train key, validation key, train label, validation label, title, y label)
    panels = (
        ('accuracy', 'val_accuracy',
         'Training Accuracy', 'Validation Accuracy',
         'Model Accuracy', 'Accuracy'),
        ('loss', 'val_loss',
         'Training Loss', 'Validation Loss',
         'Model Loss', 'Loss'),
    )

    for axis, panel in zip(axes, panels):
        train_key, val_key, train_label, val_label, title, y_label = panel
        axis.plot(history.history[train_key], label=train_label)
        axis.plot(history.history[val_key], label=val_label)
        axis.set_title(title)
        axis.set_xlabel('Epoch')
        axis.set_ylabel(y_label)
        axis.legend()

    plt.tight_layout()
    plt.show()

def plot_confusion_matrix(y_true, y_pred):
    """
    Render the confusion matrix of ``y_true`` against ``y_pred`` as an
    annotated heatmap (integer counts, 'Blues' colour map).
    """
    counts = confusion_matrix(y_true, y_pred)

    _, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(counts, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title('Confusion Matrix')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')

    plt.show()

# Training curves first
plot_training_history(history)

# Class probabilities for the validation windows, reduced to the most likely class
y_pred = model.predict(X_val_scaled)
y_pred_classes = y_pred.argmax(axis=1)

# Recover integer labels from the one-hot validation targets
y_val_classes = y_val_cat.argmax(axis=1)

# Confusion matrix of true vs. predicted classes
plot_confusion_matrix(y_val_classes, y_pred_classes)

# Per-class precision / recall / F1 summary
print("\nClassification Report:",
      classification_report(y_val_classes, y_pred_classes),
      sep="\n")