# Benchmark for Beginner Track

In [1]:
# =========================================================
# 1. Import Libraries & Load Data
# =========================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# For our Deep Neural Network
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# For splitting the data, encoding labels and scaling features
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler


# Plot defaults: larger figures on a white grid background
plt.rc('figure', figsize=(10, 6))
sns.set_style('whitegrid')

# Read both CSV files up front, then report on each in turn
train_df = pd.read_csv('train_data.csv')
test_df = pd.read_csv('test_data.csv')

# Training data overview
print("Data Loaded! Shape:", train_df.shape)
print(train_df.head())

# Test data overview
print("Test Data Loaded! Shape:", test_df.shape)
print(test_df.head())

Data Loaded! Shape: (12055680, 16)
             timestamp                            patient_id first_name  \
0  2025-01-01 19:00:00  b317e7ee-8af7-3e9c-3e0f-646395b8c81a  Howard613   
1  2025-01-01 19:00:05  b317e7ee-8af7-3e9c-3e0f-646395b8c81a  Howard613   
2  2025-01-01 19:00:10  b317e7ee-8af7-3e9c-3e0f-646395b8c81a  Howard613   
3  2025-01-01 19:00:15  b317e7ee-8af7-3e9c-3e0f-646395b8c81a  Howard613   
4  2025-01-01 19:00:20  b317e7ee-8af7-3e9c-3e0f-646395b8c81a  Howard613   

       last_name  age gender           address       city state  postcode  \
0  Altenwerth646   42      M  2/58 JASPER ROAD  BENTLEIGH   VIC      3204   
1  Altenwerth646   42      M  2/58 JASPER ROAD  BENTLEIGH   VIC      3204   
2  Altenwerth646   42      M  2/58 JASPER ROAD  BENTLEIGH   VIC      3204   
3  Altenwerth646   42      M  2/58 JASPER ROAD  BENTLEIGH   VIC      3204   
4  Altenwerth646   42      M  2/58 JASPER ROAD  BENTLEIGH   VIC      3204   

   diastolic_bp  systolic_bp  heart_rate  respirato

In [None]:
# Raw (non-engineered) model inputs: patient age plus the five vital signs.
feature_columns = [
    'age',
    'diastolic_bp',
    'systolic_bp',
    'heart_rate',
    'respiratory_rate',
    'oxygen_saturation',
]
def _add_lags_and_diffs(df, features, steps):
    """Add per-patient lag and lag-difference columns to ``df`` in place.

    For every ``feature`` and every ``step`` this creates
    ``{feature}_lag{step}`` (the value ``step`` rows earlier for the same
    ``patient_id``) and ``{feature}_diff{step}`` (current value minus that lag).
    All lag columns are created before any diff column.
    """
    # Build the grouping once and reuse it for every shift.
    by_patient = df.groupby('patient_id')

    for feature in features:
        for step in steps:
            df[f'{feature}_lag{step}'] = by_patient[feature].shift(step)

    for feature in features:
        for step in steps:
            df[f'{feature}_diff{step}'] = df[feature] - df[f'{feature}_lag{step}']


def convert_df(df, lag_steps=(1, 2, 5, 10), derived_lag_steps=(1, 2, 3)):
    """Engineer model features from a raw vitals frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``patient_id``, ``gender``, ``age`` and the vital-sign
        columns ``diastolic_bp``, ``systolic_bp``, ``heart_rate``,
        ``respiratory_rate`` and ``oxygen_saturation``. Rows are assumed to be
        in time order within each patient (the lags are row-based).
        The frame is not modified.
    lag_steps : sequence of int
        Row offsets used for lags/diffs of the raw vitals
        (heart rate, systolic and diastolic pressure).
    derived_lag_steps : sequence of int
        Row offsets used for lags/diffs of the derived features.

    Returns
    -------
    X : pandas.DataFrame
        The feature matrix (base columns, encoded gender, lags, diffs and
        derived features), indexed like ``df``.
    y : pandas.Series or None
        Integer-encoded ``state_label`` aligned with ``X``'s index, or ``None``
        when ``df`` has no ``state_label`` column.
    df : pandas.DataFrame
        A copy of the input with every engineered column added and missing
        values back-filled.
    """
    # Work on a copy so the caller's frame keeps its original columns.
    df = df.copy()

    feature_columns = ['age', 'diastolic_bp', 'systolic_bp', 'heart_rate',
                       'respiratory_rate', 'oxygen_saturation']

    # Encode gender as an integer code.
    df['gender_encoded'] = LabelEncoder().fit_transform(df['gender'])
    feature_columns.append('gender_encoded')

    # Lags and lag-differences of the raw vitals.
    lag_features = ['heart_rate', 'systolic_bp', 'diastolic_bp']
    _add_lags_and_diffs(df, lag_features, lag_steps)
    feature_columns.extend(
        f'{feature}_diff{step}' for step in lag_steps for feature in lag_features
    )
    feature_columns.extend(
        f'{feature}_lag{step}' for step in lag_steps for feature in lag_features
    )

    # Derived oxygen and blood-pressure features.
    df['oxygen_delivery'] = df['heart_rate'] * df['oxygen_saturation'] * df['systolic_bp']
    df['pulse_pressure'] = df['systolic_bp'] - df['diastolic_bp']
    df['mean_arterial_pressure'] = (2 * df['diastolic_bp'] + df['systolic_bp']) / 3
    df['rate_pressure_product'] = df['heart_rate'] * df['systolic_bp']
    derived_features = ['oxygen_delivery', 'pulse_pressure',
                        'mean_arterial_pressure', 'rate_pressure_product']

    # Lags and lag-differences of the derived features.
    _add_lags_and_diffs(df, derived_features, derived_lag_steps)
    feature_columns.extend(derived_features)
    feature_columns.extend(
        f'{feature}_lag{step}' for step in derived_lag_steps for feature in derived_features
    )
    feature_columns.extend(
        f'{feature}_diff{step}' for step in derived_lag_steps for feature in derived_features
    )

    # The first rows of each patient have no history, so their lag/diff cells
    # are NaN; fill them from the next valid row. `bfill()` replaces the
    # deprecated `fillna(method='bfill')`.
    # NOTE(review): the fill is global, not per patient - a patient with fewer
    # rows than the largest lag would be filled from the following patient.
    df = df.bfill()

    X = df[feature_columns]
    if 'state_label' in df:
        # Keep df's index so y stays aligned with X for any input index.
        y = pd.Series(LabelEncoder().fit_transform(df['state_label']), index=df.index)
    else:
        y = None
    return X, y, df


def simple_preprocess(df, target_column='state_label', test_size=0.2, random_state=42):
    """Split a frame into scaled training/validation features and targets.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns and ``target_column``. Not modified.
    target_column : str
        Name of the label column.
    test_size : float
        Fraction of rows placed in the validation split.
    random_state : int
        Seed passed to ``train_test_split``.

    Returns
    -------
    X_train, X_val : pandas.DataFrame
        Standardised feature frames (original row index preserved).
    y_train, y_val : pandas.Series
        Matching target values.
    scaler : StandardScaler
        The scaler fitted on the training split; reuse it on new data.
    features : list of str
        Names of the scaled feature columns, in order.

    Notes
    -----
    Non-numeric columns (identifiers, names, timestamps stored as text) cannot
    be standardised and are left out of ``features``.
    """
    # Back-fill any remaining gaps; `bfill()` replaces the deprecated
    # `fillna(method='bfill')`.
    df = df.bfill()

    # Only numeric, non-target columns are usable as scaled model inputs.
    numeric_columns = df.select_dtypes(include='number').columns
    features = [col for col in numeric_columns if col != target_column]

    X = df[features]
    y = df[target_column]

    # Split first, stratified on the target to keep class proportions.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # Fit the scaler on the training rows only so validation statistics do not
    # leak into training, then apply the same transform to the validation rows.
    scaler = StandardScaler()
    X_train = pd.DataFrame(scaler.fit_transform(X_train),
                           columns=features, index=X_train.index)
    X_val = pd.DataFrame(scaler.transform(X_val),
                         columns=features, index=X_val.index)

    return X_train, X_val, y_train, y_val, scaler, features

# Engineer features first; `train_df` is rebound to the enriched frame.
X_engineered, y_engineered, train_df = convert_df(train_df)

# Hand only the engineered model inputs plus the target to preprocessing.
# Passing the whole frame would also feed identifiers and free-text columns
# (patient_id, names, address, timestamp, ...) to the scaler, and the model's
# input columns would no longer match what convert_df produces for the test set.
X_train, X_val, y_train, y_val, scaler, features = simple_preprocess(
    train_df[list(X_engineered.columns) + ['state_label']],
    target_column='state_label',
)

In [None]:
# Carve a hold-out set out of the current training data: 75% stays for
# training, 25% is set aside (e.g. for model analysis later on).
# NOTE(review): this rebinds X_train/X_val/y_train/y_val, so re-running the
# cell splits the already-reduced training set again.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    shuffle=True,       # shuffle rows before splitting
    random_state=69,    # fixed seed for reproducibility
    test_size=0.25,     # fraction of rows placed in the hold-out set
)

# Report the size of what is left for training
print("Training Features shape:", X_train.shape)
print("Training Labels shape:", y_train.shape)

In [None]:
from tensorflow.keras.utils import to_categorical

# Distinct labels present in the training targets
unique_states = y_train.unique()
num_classes = len(unique_states)
print("Unique state labels:", unique_states)
print("Number of classes:", num_classes)

# Label -> class index, assigned in sorted label order
unique_states_sorted = sorted(unique_states)
state_to_index = dict(zip(unique_states_sorted, range(num_classes)))
print("State to Index mapping:", state_to_index)

# Replace each training label by its class index ...
y_train_indices = y_train.map(state_to_index)

# ... and expand the indices into one-hot rows for the softmax output
y_train_encoded = to_categorical(y_train_indices, num_classes=num_classes)
print("y_train_encoded shape:", y_train_encoded.shape,
      "Example one-hot vector:", y_train_encoded[0])


In [None]:
from tensorflow.keras import backend as K

# Clear the Keras session so the model defined in the next cell is built
# from a clean slate when this notebook section is re-run.
K.clear_session()

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Feed-forward classifier: three identical hidden blocks
# (Dense 64 + L2 -> BatchNorm -> Dropout 0.3) followed by a softmax output.
model = Sequential()

# First block also declares the input width (one unit per feature column)
model.add(Dense(64, activation='relu', input_shape=(X_train.shape[1],),
                kernel_regularizer=regularizers.l2(0.01)))
model.add(BatchNormalization())
model.add(Dropout(0.3))

# Second and third hidden blocks
for hidden_block in range(2):
    model.add(Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
    model.add(BatchNormalization())
    model.add(Dropout(0.3))

# One output unit per class
model.add(Dense(num_classes, activation='softmax'))

# One-hot targets -> categorical cross-entropy
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Stop when validation loss has not improved for 5 epochs and roll back to
# the best weights seen
early_stopping = EarlyStopping(
    monitor='val_loss', patience=5, restore_best_weights=True, verbose=1
)

# Halve the learning rate after 3 epochs without validation-loss improvement,
# never going below 1e-5
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss', factor=0.5, patience=3, min_lr=1e-5, verbose=1
)

# Print the layer overview
model.summary()

In [None]:
# Fit the network on the training split. `validation_split=0.3` reserves 30%
# of the rows passed here for the validation metrics the callbacks monitor.
history = model.fit(
    x=X_train,
    y=y_train_encoded,
    batch_size=128,   # tunable
    epochs=100,       # upper bound; early stopping may end training sooner
    validation_split=0.3,
    callbacks=[early_stopping, reduce_lr],
)

In [None]:
# Training curves: loss on the left, accuracy on the right
plt.figure(figsize=(12, 4))

# (training key, validation key, training label, validation label, title)
curve_panels = [
    ('loss', 'val_loss', 'Training Loss', 'Validation Loss', "Loss over Epochs"),
    ('accuracy', 'val_accuracy', 'Training Accuracy', 'Validation Accuracy',
     "Accuracy over Epochs"),
]

for panel_number, (train_key, val_key, train_label, val_label, panel_title) in enumerate(
        curve_panels, start=1):
    plt.subplot(1, 2, panel_number)
    plt.plot(history.history[train_key], label=train_label)
    plt.plot(history.history[val_key], label=val_label)
    plt.legend()
    plt.title(panel_title)

plt.show()

In [None]:
# Build the same engineered features for the test set
X_test, y_test, test_df = convert_df(test_df)

# Keep the row identifiers for the submission file
ids = test_df['ID']

# The model was trained on standardised inputs, so the test features must go
# through the scaler fitted on the training data, using the training columns
# in the training order.
X_test_scaled = scaler.transform(X_test[list(X_train.columns)])

# Generate class probabilities for the test set
pred_probabilities = model.predict(X_test_scaled)

# Convert probabilities to predicted class indices
pred_class_indices = np.argmax(pred_probabilities, axis=1)

# Map back to original state labels (using the mapping from training)
index_to_state = {v: k for k, v in state_to_index.items()}
predicted_state_labels = [index_to_state[idx] for idx in pred_class_indices]

# Create a DataFrame with the ID and predicted_label columns
predictions_df = pd.DataFrame({
    'ID': ids,
    'predicted_label': predicted_state_labels
})

# Save the predictions and report the file that was actually written
output_path = 'predictions_trial.csv'
predictions_df.to_csv(output_path, index=False)
print(f"Predictions saved to {output_path}")

In [None]:
# Show the column names of the training feature matrix.
X_train.columns