# Benchmark for Beginner Track

In [1]:
# =========================================================
# 1. Import Libraries & Load Data
# =========================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# For our Deep Neural Network
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

#For splitting the training and testing data
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
pd.set_option('display.max_rows', 100)



# Some display settings for nicer graphs
plt.rcParams['figure.figsize'] = (10, 6)
sns.set_style('whitegrid')

# Load dataset
train_df = pd.read_csv('train_data.csv')
print("Data Loaded! Shape:", train_df.shape)
print(train_df.head())

# Load test data
test_df = pd.read_csv('test_data.csv')
print("Test Data Loaded! Shape:", test_df.shape)
print(test_df.head())

Data Loaded! Shape: (12055680, 16)
             timestamp                            patient_id first_name  \
0  2025-01-01 19:00:00  b317e7ee-8af7-3e9c-3e0f-646395b8c81a  Howard613   
1  2025-01-01 19:00:05  b317e7ee-8af7-3e9c-3e0f-646395b8c81a  Howard613   
2  2025-01-01 19:00:10  b317e7ee-8af7-3e9c-3e0f-646395b8c81a  Howard613   
3  2025-01-01 19:00:15  b317e7ee-8af7-3e9c-3e0f-646395b8c81a  Howard613   
4  2025-01-01 19:00:20  b317e7ee-8af7-3e9c-3e0f-646395b8c81a  Howard613   

       last_name  age gender           address       city state  postcode  \
0  Altenwerth646   42      M  2/58 JASPER ROAD  BENTLEIGH   VIC      3204   
1  Altenwerth646   42      M  2/58 JASPER ROAD  BENTLEIGH   VIC      3204   
2  Altenwerth646   42      M  2/58 JASPER ROAD  BENTLEIGH   VIC      3204   
3  Altenwerth646   42      M  2/58 JASPER ROAD  BENTLEIGH   VIC      3204   
4  Altenwerth646   42      M  2/58 JASPER ROAD  BENTLEIGH   VIC      3204   

   diastolic_bp  systolic_bp  heart_rate  respirato

In [2]:
feature_columns = ['age', 'diastolic_bp', 'systolic_bp', 'heart_rate',
                  'respiratory_rate', 'oxygen_saturation']
def convert_df(df):
    columns_to_drop = ['timestamp', 'patient_id', 'first_name', 'last_name', 
                   'address', 'city', 'state', 'postcode']

    # Drop the columns from the DataFrame
    df = df.drop(columns=columns_to_drop)

    feature_columns = ['age', 'diastolic_bp', 'systolic_bp', 'heart_rate',
                  'respiratory_rate', 'oxygen_saturation']
    # Encode gender
    df['gender_encoded'] = LabelEncoder().fit_transform(df['gender'])
    feature_columns.append('gender_encoded')

    # Assuming df is your dataframe and it's sorted by timestamp
    lag_features = ['heart_rate', 'systolic_bp', 'diastolic_bp']

    # Create lags
    for feature in lag_features:
        df[f'{feature}_lag1'] = df.groupby('patient_id')[feature].shift(1)
        df[f'{feature}_lag2'] = df.groupby('patient_id')[feature].shift(2)
        df[f'{feature}_lag5'] = df.groupby('patient_id')[feature].shift(5)
        df[f'{feature}_lag10'] = df.groupby('patient_id')[feature].shift(10)

    # Create differences between current and lagged values
    for feature in lag_features:
        df[f'{feature}_diff1'] = df[feature] - df[f'{feature}_lag1']
        df[f'{feature}_diff2'] = df[feature] - df[f'{feature}_lag2']
        df[f'{feature}_diff5'] = df[feature] - df[f'{feature}_lag5']
        df[f'{feature}_diff10'] = df[feature] - df[f'{feature}_lag10']

    # Add new features to feature columns
    feature_columns.extend([f'{feature}_diff1' for feature in lag_features])
    feature_columns.extend([f'{feature}_diff2' for feature in lag_features])
    feature_columns.extend([f'{feature}_diff5' for feature in lag_features])
    feature_columns.extend([f'{feature}_diff10' for feature in lag_features])
    feature_columns.extend([f'{feature}_lag1' for feature in lag_features])
    feature_columns.extend([f'{feature}_lag2' for feature in lag_features])
    feature_columns.extend([f'{feature}_lag5' for feature in lag_features])
    feature_columns.extend([f'{feature}_lag10' for feature in lag_features])


    ## Feature lag and inference for derived features

    # Original oxygen and blood pressure derived features
    df['oxygen_delivery'] = df['heart_rate'] * df['oxygen_saturation'] * df['systolic_bp']
    df['pulse_pressure'] = df['systolic_bp'] - df['diastolic_bp']
    df['mean_arterial_pressure'] = (2 * df['diastolic_bp'] + df['systolic_bp']) / 3
    df['rate_pressure_product'] = df['heart_rate'] * df['systolic_bp']

    # Define derived features for lagging
    derived_features = ['oxygen_delivery', 'pulse_pressure', 'mean_arterial_pressure', 'rate_pressure_product']

    # Create lags
    for feature in derived_features:
        df[f'{feature}_lag1'] = df.groupby('patient_id')[feature].shift(1)
        df[f'{feature}_lag2'] = df.groupby('patient_id')[feature].shift(2)
        df[f'{feature}_lag3'] = df.groupby('patient_id')[feature].shift(3)

    # Create differences
    for feature in derived_features:
        # Difference with 1 timestep ago
        df[f'{feature}_diff1'] = df[feature] - df[f'{feature}_lag1']
        # Difference with 2 timesteps ago
        df[f'{feature}_diff2'] = df[feature] - df[f'{feature}_lag2']
        # Difference with 3 timesteps ago
        df[f'{feature}_diff3'] = df[feature] - df[f'{feature}_lag3']

    # Add all new features to feature columns
    feature_columns.extend(derived_features)  # Add original derived features
    feature_columns.extend([f'{feature}_lag1' for feature in derived_features])
    feature_columns.extend([f'{feature}_lag2' for feature in derived_features])
    feature_columns.extend([f'{feature}_lag3' for feature in derived_features])
    feature_columns.extend([f'{feature}_diff1' for feature in derived_features])
    feature_columns.extend([f'{feature}_diff2' for feature in derived_features])
    feature_columns.extend([f'{feature}_diff3' for feature in derived_features])

    df = df.fillna(method='bfill')



    X = df[feature_columns]
    if 'state_label' in df:
        y = pd.Series(LabelEncoder().fit_transform(df['state_label']))
    else:
        y = None
    return X, y, df


def simple_preprocess(df, feature_columns, target_column='state_label', test_size=0.2, random_state=420):
    # 1. Handle missing values using backward fill (if any NaN values remain)
    df = df.fillna(method='bfill')

    # 3. Scale the features while preserving outliers
    scaler = StandardScaler()
    df[feature_columns] = scaler.fit_transform(df[feature_columns])  # Apply scaling

    # 4. Prepare X and y
    X = df[feature_columns, 'age', 'timestamp', 'patient_id']
    y = None
    if target_column in df:
        y = pd.Series(LabelEncoder().fit_transform(df['state_label']))

    return X, y, df

X_train, y_train, train_df= convert_df(train_df)
fc = ['diastolic_bp', 'systolic_bp',
       'heart_rate', 'respiratory_rate', 'oxygen_saturation', 'gender_encoded',
       'heart_rate_diff1', 'systolic_bp_diff1', 'diastolic_bp_diff1',
       'heart_rate_diff2', 'systolic_bp_diff2', 'diastolic_bp_diff2',
       'heart_rate_diff5', 'systolic_bp_diff5', 'diastolic_bp_diff5',
       'heart_rate_diff10', 'systolic_bp_diff10', 'diastolic_bp_diff10',
       'heart_rate_lag1', 'systolic_bp_lag1', 'diastolic_bp_lag1',
       'heart_rate_lag2', 'systolic_bp_lag2', 'diastolic_bp_lag2',
       'heart_rate_lag5', 'systolic_bp_lag5', 'diastolic_bp_lag5',
       'heart_rate_lag10', 'systolic_bp_lag10', 'diastolic_bp_lag10',
       'oxygen_delivery', 'pulse_pressure', 'mean_arterial_pressure',
       'rate_pressure_product', 'oxygen_delivery_lag1', 'pulse_pressure_lag1',
       'mean_arterial_pressure_lag1', 'rate_pressure_product_lag1',
       'oxygen_delivery_lag2', 'pulse_pressure_lag2',
       'mean_arterial_pressure_lag2', 'rate_pressure_product_lag2',
       'oxygen_delivery_lag3', 'pulse_pressure_lag3',
       'mean_arterial_pressure_lag3', 'rate_pressure_product_lag3',
       'oxygen_delivery_diff1', 'pulse_pressure_diff1',
       'mean_arterial_pressure_diff1', 'rate_pressure_product_diff1',
       'oxygen_delivery_diff2', 'pulse_pressure_diff2',
       'mean_arterial_pressure_diff2', 'rate_pressure_product_diff2',
       'oxygen_delivery_diff3', 'pulse_pressure_diff3',
       'mean_arterial_pressure_diff3', 'rate_pressure_product_diff3']

KeyError: 'patient_id'

In [None]:
#X_train, y_train, train_df = simple_preprocess(train_df, fc)

In [None]:
# Perform the split (default is 75% train, 25% test) for holdout
X_train, X_val, y_train, y_val = train_test_split(
    X_train, 
    y_train,
    test_size=0.25,     # Size of the test set (0.25 = 25% of data)
    random_state=69,     # Set seed for reproducibility
    shuffle=True         # Shuffle the data before splitting
) # this is for model analysis later on if the val not being nice

#2.4 Extract features (X) and labels (y) from the training set
print("Training Features shape:", X_train.shape)
print("Training Labels shape:", y_train.shape)

In [None]:
unique_states = y_train.unique()
num_classes = len(unique_states)
print("Unique state labels:", unique_states)
print("Number of classes:", num_classes)

# Sort them to create a mapping
unique_states_sorted = sorted(unique_states)
state_to_index = {state: i for i, state in enumerate(unique_states_sorted)}
print("State to Index mapping:", state_to_index)

# Convert our training labels to numerical indices
y_train_indices = y_train.map(state_to_index)

# One-hot encode the training labels
from tensorflow.keras.utils import to_categorical

y_train_encoded = to_categorical(y_train_indices, num_classes=num_classes)
print("y_train_encoded shape:", y_train_encoded.shape, 
      "Example one-hot vector:", y_train_encoded[0])


In [None]:
from tensorflow.keras import backend as K

# Clear the Keras session before model creation
K.clear_session()

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam

def create_model(input_shape, num_classes):
    model = Sequential([
        # Input Layer
        Dense(128, activation='selu', input_shape=input_shape,
              kernel_regularizer=regularizers.l2(0.001),
              kernel_initializer='lecun_normal'),
        BatchNormalization(),
        Dropout(0.2),
        
        # Hidden Layer
        Dense(64, activation='selu',
              kernel_regularizer=regularizers.l2(0.001),
              kernel_initializer='lecun_normal'),
        BatchNormalization(),
        Dropout(0.2),
        
        # Output Layer
        Dense(num_classes, activation='softmax')
    ])
    
    optimizer = Adam(learning_rate=0.001)
    model.compile(
        optimizer=optimizer,
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )
    
    return model

# Simpler callbacks
callbacks = [
    EarlyStopping(
        monitor='val_loss',
        patience=5,
        restore_best_weights=True,
        verbose=1
    ),
    ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.2,
        patience=3,
        min_lr=1e-6,
        verbose=1
    )
]

In [None]:
model = create_model(input_shape=(X_train.shape[1],), num_classes=num_classes)
history = model.fit(
    X_train, y_train_encoded,
    validation_data=(X_val, y_val),
    epochs=50,
    batch_size=32,
    callbacks=callbacks,
    verbose=1
)

In [None]:
# Plotting training and validation performance
plt.figure(figsize=(12, 4))
# Plot training and validation loss
plt.subplot(1, 2, 1)
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.legend()
plt.title("Loss over Epochs")

# Plot training and validation accuracy
plt.subplot(1, 2, 2)
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.legend()
plt.title("Accuracy over Epochs")

plt.show()

In [None]:
X_test, y_test, test_df = convert_df(test_df)
#X_test, y_test, test_df = simple_preprocess(test_df, fc)

#  Extract the ID column from the test DataFrame
ids = test_df['ID']
# Remove the ID column to get only the 5 vital sign features (diastolic_bp, systolic_bp, heart_rate, respiratory_rate, oxygen_saturation)
X_test_features = test_df.drop('ID', axis=1)

# Generate predictions (probabilities) for the test set using the correct feature set
pred_probabilities = model.predict(X_test)

# Convert probabilities to predicted class indices
pred_class_indices = np.argmax(pred_probabilities, axis=1)

# Map back to original state labels (using the mapping from training)
index_to_state = {v: k for k, v in state_to_index.items()}
predicted_state_labels = [index_to_state[idx] for idx in pred_class_indices]

# Create a DataFrame with the ID and predicted_label columns
predictions_df = pd.DataFrame({
    'ID': ids,
    'predicted_label': predicted_state_labels
})

# Save the predictions to a CSV file
predictions_df.to_csv('predictions_trial.csv', index=False)
print("Predictions saved to predictions.csv")