In [None]:
%matplotlib inline
import pandas as pd 
import numpy as np

import torch 
from torch.utils.data import DataLoader, random_split
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder


# Pick the compute device. `torch.accelerator` is only present in recent PyTorch
# releases, so guard the attribute access and fall back to the classic CUDA check
# (and finally the CPU) instead of failing with AttributeError on older installs.
if hasattr(torch, "accelerator") and torch.accelerator.is_available():
    device = torch.accelerator.current_accelerator().type
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device} device")


In [14]:

# Heart dataset CSV hosted on Google Cloud Storage. Fetched over HTTPS (the
# original used plain HTTP) so TLS protects the download in transit.
file_url = "https://storage.googleapis.com/download.tensorflow.org/data/heart.csv"
heart_csv_df = pd.read_csv(file_url)

# Last expression of the cell: rich display of the (auto-truncated) frame
heart_csv_df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,1,145,233,1,2,150,0,2.3,3,0,fixed,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3,normal,1
2,67,1,4,120,229,0,2,129,1,2.6,2,2,reversible,0
3,37,1,3,130,250,0,0,187,0,3.5,3,0,normal,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0,normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,52,1,1,118,186,0,2,190,0,0.0,2,0,fixed,0
299,43,0,4,132,341,1,2,136,1,3.0,2,0,reversible,1
300,65,1,4,135,254,0,2,127,0,2.8,2,1,reversible,1
301,48,1,4,130,256,1,2,150,1,0.0,1,2,reversible,1


In [15]:

class ClassifierDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding (features, label) float32 tensor pairs.

    The label column is split off the input frame and every remaining column
    is min-max scaled. Non-numeric columns must be encoded beforehand, e.g.
    with `onehot_encode_datafames`.
    """

    def __init__(self, raw_df, label_column):
        """Build the dataset from a dataframe.

        Args:
            raw_df: dataframe holding the feature columns and the label column.
                It is copied, so the caller's frame is left untouched.
            label_column: name of the column in `raw_df` that holds the labels.
        """
        df_copy = raw_df.copy()

        # first, set aside the labels (pop also removes them from the features)
        self.labels_ndarray = df_copy.pop(label_column).values

        # TODO consider asserting about dtypes (ie only numerics at this point)

        # process columns to normalize values
        # NOTE: the scaler is fit on every row of raw_df; if the dataset is split
        # into train/test afterwards, the test rows have influenced the scaling.
        scaler = preprocessing.MinMaxScaler()
        self.features_ndarray = scaler.fit_transform(df_copy)

    def __len__(self):
        """Return the number of records (rows of the feature matrix)."""
        return self.features_ndarray.shape[0]

    def __getitem__(self, idx):
        """Return the `(features, label)` pair at `idx` as float32 tensors."""
        features_tensor = torch.tensor(self.features_ndarray[idx], dtype=torch.float32)
        label_tensor = torch.tensor(self.labels_ndarray[idx], dtype=torch.float32)

        return features_tensor, label_tensor

    def get_feature_count(self):
        """Return the number of feature columns.

        Read from the matrix shape rather than from the first row, so it also
        works when the dataset holds zero records.
        """
        return self.features_ndarray.shape[1]

    @staticmethod
    def onehot_encode_datafames(df_array):
        """One-hot encode the non-numeric columns of several dataframes consistently.

        A single encoder is fit on the union of all frames, so every returned
        frame ends up with the same encoded columns.

        Args:
            df_array: iterable of dataframes sharing the same columns.

        Returns:
            A list of new dataframes, in input order, in which each non-numeric
            column is replaced by its one-hot columns. Each frame keeps its
            original row index.
        """
        df_array = list(df_array)
        if not df_array:
            # pd.concat refuses an empty list; there is nothing to encode anyway
            return []

        unioned_df = pd.concat(df_array)
        union_categorical_cols = unioned_df.select_dtypes(exclude=['number']).columns

        if len(union_categorical_cols) == 0:
            # nothing to encode: skip fitting an encoder on zero columns
            return [df.copy() for df in df_array]

        ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        ohe.fit(unioned_df[union_categorical_cols])
        encoded_value_features = ohe.get_feature_names_out()

        processed_df_array = []
        for df in df_array:
            # first transform using the encoder that was fit on the unioned data
            encoded_values = ohe.transform(df[union_categorical_cols])
            # Reuse df's own index: the column-wise concat below aligns rows on
            # index labels, so a fresh RangeIndex would misalign (and NaN-pad)
            # any frame whose index is not 0..n-1, e.g. a slice of a larger frame.
            encoded_df = pd.DataFrame(
                encoded_values, columns=encoded_value_features, index=df.index
            )
            # drop the encoded features from the original df and concat the encodings
            df_processed = pd.concat(
                [df.drop(columns=union_categorical_cols), encoded_df], axis=1
            )
            # and finally, set it aside
            processed_df_array.append(df_processed)

        return processed_df_array

In [16]:
## TODO consider skip last!


# One-hot encode the categorical columns, then wrap the frame in a Dataset
[encoded_df] = ClassifierDataset.onehot_encode_datafames( [heart_csv_df] )

overall_ds = ClassifierDataset(encoded_df, "target")

# Use a dedicated, seeded generator so the train/test split is identical on
# every run, independent of whatever global RNG state earlier cells left behind.
SPLIT_SEED = 123
split_generator = torch.Generator().manual_seed(SPLIT_SEED)
train_ds, test_ds = random_split(overall_ds, [.75, .25], generator=split_generator)

# Roughly ten batches per epoch; never let the batch size collapse to 0 on tiny datasets
batch_size = max(1, len(train_ds) // 10)
train_dataloader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
# Evaluation does not benefit from shuffling, so keep the test order stable
test_dataloader = DataLoader(test_ds, batch_size=batch_size, shuffle=False)

print(f'{len(train_ds)} training records in with batch size {batch_size}, {len(test_ds)} records for test')

first_training_record, _ = train_ds[0]
num_features = first_training_record.shape[-1]
print(f'datasets have {num_features} features')


228 training records in with batch size 22, 75 records for test
datasets have 17 features


In [None]:
DROPOUT_RATE_01 = .2


class HeartRateNN(nn.Module):
    """Small fully connected binary classifier.

    Two hidden layers (32 and 16 units), each followed by dropout and a GELU
    activation, then a single output unit squashed by a sigmoid.
    """

    def __init__(self, num_feature_columns):
        """Create the layer stack for inputs with `num_feature_columns` features."""
        super().__init__()
        first_hidden, second_hidden = 32, 16
        # Layer order matters: positions in the Sequential determine the
        # parameter names in the state dict.
        layers = [
            nn.Linear(num_feature_columns, first_hidden),
            nn.Dropout(p=DROPOUT_RATE_01),
            nn.GELU(),
            nn.Linear(first_hidden, second_hidden),
            nn.Dropout(p=DROPOUT_RATE_01),
            nn.GELU(),
            nn.Linear(second_hidden, 1),
            nn.Sigmoid(),
        ]
        self.linear_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Run `x` through the stack and return the sigmoid outputs."""
        sigmoid_out = self.linear_stack(x)
        return sigmoid_out
    

# Size the input layer from one training sample, then print the architecture
first_training_record = train_ds[0][0]
num_features = first_training_record.shape[-1]

model = HeartRateNN(num_features)
print(model)

In [None]:


SEED = 123
torch.manual_seed(SEED)


if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)

# Size the input layer from one training sample
first_training_record, _ = train_ds[0]
num_feature_columns = first_training_record.shape[-1]

# Fresh model, created after seeding so the weight initialisation is repeatable
model = HeartRateNN( num_feature_columns )
model.to(device)

loss_fn   = nn.BCELoss()  # binary cross entropy on the model's sigmoid outputs
optimizer = optim.Adam(model.parameters(), lr=0.001)
num_epochs = 300

# Be explicit about train mode so dropout is active even if this cell is
# re-run after something switched the module to eval mode.
model.train()

for epoch in range(num_epochs):
    epoch_correct_count, epoch_pred_count = 0, 0
    epoch_loss_sum = 0.0
    for X, y in train_dataloader:

        X = X.to(device)
        y = y.to(device)

        # the model emits one column per record; flatten it to match the labels
        y_pred = model(X).reshape(y.shape)
        loss = loss_fn(y_pred, y)

        # Accumulate plain Python numbers (not tensors) for the running metrics;
        # detach so the bookkeeping stays out of the autograd graph.
        y_pred_guess = torch.round(y_pred.detach())
        epoch_correct_count += (y == y_pred_guess).sum().item()
        epoch_pred_count += len(y)
        # loss is the batch mean, so weight it by the batch size
        epoch_loss_sum += loss.item() * len(y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch [{epoch+1}/{num_epochs}], {epoch_correct_count} of {epoch_pred_count} correct {(100*epoch_correct_count/epoch_pred_count):.1f} %, mean loss {(epoch_loss_sum/epoch_pred_count):.4f}")



In [None]:

model.eval()  # Set the model to evaluation mode (disables dropout)

# The network already ends in a Sigmoid, so its outputs are probabilities.
# Score them with BCELoss, the same loss used for training; BCEWithLogitsLoss
# would apply a second sigmoid and report a meaningless loss value.
criterion = nn.BCELoss()

# Lists to store results
test_losses = []
test_batch_sizes = []
all_preds = []
all_labels = []

# Iterate over the test batches
with torch.no_grad():  # Disable gradient calculations
    for inputs, labels in test_dataloader:

        # the model lives on `device`, so the batch has to be moved there too
        inputs = inputs.to(device)
        # just make this an array of the labels (instead array of arrays with one element)
        labels = labels.to(device).reshape(-1)

        y_pred = model(inputs)
        y_pred = y_pred.reshape(labels.shape)  # make sure it matches

        loss = criterion(y_pred, labels)
        test_losses.append(loss.item())
        test_batch_sizes.append(len(labels))

        y_pred_guess = torch.round(y_pred)

        # bring results back to the CPU before converting to numpy
        all_preds.extend(y_pred_guess.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Calculate overall metrics
# Each stored loss is a batch mean; weight by batch size so a smaller final
# batch does not count as much as a full one.
total_test_records = sum(test_batch_sizes)
if total_test_records:
    avg_test_loss = sum(
        batch_loss * batch_len
        for batch_loss, batch_len in zip(test_losses, test_batch_sizes)
    ) / total_test_records
else:
    avg_test_loss = float("nan")
accuracy = accuracy_score(all_labels, all_preds)
precision = precision_score(all_labels, all_preds)
recall = recall_score(all_labels, all_preds)
f1 = f1_score(all_labels, all_preds)

print(f"Average Test Loss: {avg_test_loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test Precision: {precision:.4f}")
print(f"Test Recall: {recall:.4f}")
print(f"Test F1-Score: {f1:.4f}")