# Objective

Author: Kate Lassiter

Addressing class imbalance to improve the precision and recall of a deep convolutional neural network (DCNN).

In [3]:
import sys
sys.path.append("../")
import joblib
import os
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader
from tqdm import tqdm
from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import LabelEncoder
from utils.data import NormalDataset, resize, get_inverse_class_weights
from utils.utils import EarlyStopping
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from collections import Counter
import torch
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
from imblearn.over_sampling import RandomOverSampler
from torch.utils.data import DataLoader, TensorDataset
from collections import Counter

In [None]:
# Data transformations
device = "cpu"
csv_file_path = "/XXXX.csv"
hip_data = pd.read_csv(csv_file_path)

# Add a synthetic 'group' column: ten contiguous, equally sized chunks of rows
# labelled 1..10.
num_rows = len(hip_data)
group_size = num_rows // 10  # rows per group
evenly_sized = np.repeat(np.arange(1, 11), group_size)

# Rows left over by the integer division are folded into the last group (10).
leftover = np.full(num_rows - evenly_sized.size, 10)
if leftover.size:
    group_labels = np.concatenate([evenly_sized, leftover])
else:
    group_labels = evenly_sized
hip_data["group"] = group_labels

# Per-row features, labels and group ids as plain numpy arrays.
X = hip_data[["x", "y", "z"]].to_numpy()
y = hip_data["annotation"].to_numpy()
groups = hip_data["group"].to_numpy()

# Reshape the data into windows of size X
def create_windows(data, labels, groups, window_size):
    """Cut row-wise data into consecutive, non-overlapping windows.

    Trailing rows that do not fill a complete window are dropped. Each window
    takes the label and the group of its last row.

    Returns:
        (X_windows, y_windows, group_windows) where X_windows has shape
        (num_windows, window_size, -1).
    """
    n_full = data.shape[0] // window_size
    usable_rows = n_full * window_size
    # Index of the final row of every complete window.
    last_rows = slice(window_size - 1, usable_rows, window_size)
    windowed = data[:usable_rows].reshape(n_full, window_size, -1)
    return windowed, labels[last_rows], groups[last_rows]

In [17]:
# Count the rows per annotation value to gauge the class balance.
hip_data['annotation'].value_counts()

annotation
0    1071224
1       8774
Name: count, dtype: int64

In [15]:
# Summarise the frame: column names, non-null counts and dtypes.
hip_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1079998 entries, 0 to 1079997
Data columns (total 6 columns):
 #   Column      Non-Null Count    Dtype  
---  ------      --------------    -----  
 0   timestamp   1079998 non-null  object 
 1   x           1079998 non-null  float64
 2   y           1079998 non-null  float64
 3   z           1079998 non-null  float64
 4   annotation  1079998 non-null  int64  
 5   patient_id  1079998 non-null  int64  
dtypes: float64(3), int64(2), object(1)
memory usage: 49.4+ MB


In [21]:
window_size = 1000
X_windows, y_windows, group_windows = create_windows(X, y, groups, window_size)

# 70 % of the windows go to training; the remaining 30 % is held out ...
(x_train, x_temp,
 y_train, y_temp,
 group_train, group_temp) = train_test_split(
    X_windows, y_windows, group_windows, test_size=0.3, random_state=42
)

# ... and then halved into validation and test.
(x_val, x_test,
 y_val, y_test,
 group_val, group_test) = train_test_split(
    x_temp, y_temp, group_temp, test_size=0.5, random_state=42
)

# Output shapes, one (X, y, group) triple per split:
split_shapes = tuple(
    tuple(part.shape for part in split)
    for split in (
        (x_train, y_train, group_train),
        (x_val, y_val, group_val),
        (x_test, y_test, group_test),
    )
)
print(split_shapes)

(((755, 1000, 3), (755,), (755,)), ((162, 1000, 3), (162,), (162,)), ((162, 1000, 3), (162,), (162,)))


In [23]:
# Load the pretrained harnet30 network from torch.hub with a two-class head.
repo = 'OxWearables/ssl-wearables'
sslnet: nn.Module = torch.hub.load(
    repo,
    'harnet30',
    trust_repo=True,
    class_num=2,
    pretrained=True,
    weights_only=False,
)
sslnet.to(device)

# Only the training split is built with transform=True.
train_dataset = NormalDataset(x_train, y_train, group_train, name="training", transform=True)
val_dataset = NormalDataset(x_val, y_val, group_val, name="validation")
test_dataset = NormalDataset(x_test, y_test, group_test, name="test")

# Settings shared by all three loaders; only the training loader shuffles.
loader_options = {"batch_size": 128, "num_workers": 1}
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_options)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_options)

Using cache found in /Users/kat/.cache/torch/hub/OxWearables_ssl-wearables_main
  pretrained_dict = torch.load(weight_path, map_location=my_device)


131 Weights loaded
training set sample count : 755
validation set sample count : 162
test set sample count : 162


In [25]:
def train(model, train_loader, val_loader, device, loss_fn, num_epoch=5, state_dict='state_dict.pt'):
    """Train ``model`` with Adam and validate after every epoch.

    Args:
        model: network to optimise (updated in place).
        train_loader: iterable of batches whose first two fields are
            ``(x, y)``; any further fields (e.g. group ids) are ignored, so
            both ``(x, y)`` and ``(x, y, metadata)`` loaders are accepted.
        val_loader: loader passed to ``_validate_model``.
        device: device the batches are moved to.
        loss_fn: callable ``loss_fn(logits, targets)`` returning a scalar loss.
        num_epoch: maximum number of epochs.
        state_dict: path handed to ``EarlyStopping(path=...)``.

    Returns:
        None. Progress is printed once per epoch.

    NOTE(review): ``EarlyStopping`` comes from ``utils.utils``; presumably it
    checkpoints the model to ``state_dict`` whenever the validation loss
    improves -- confirm against its implementation.
    """
    optimizer = torch.optim.Adam(model.parameters())
    early_stopping = EarlyStopping(patience=5, verbose=True, path=state_dict)

    for epoch in range(num_epoch):
        model.train()
        train_losses = []
        train_acces = []
        for batch in tqdm(train_loader):
            # Index instead of unpacking so loaders without a metadata field work too.
            x, y = batch[0], batch[1]
            # Only the model parameters need gradients; the inputs are not
            # marked with requires_grad.
            x = x.to(device, dtype=torch.float)
            true_y = y.to(device, dtype=torch.long)

            optimizer.zero_grad()
            logits = model(x)
            loss = loss_fn(logits, true_y)
            loss.backward()
            optimizer.step()

            pred_y = torch.argmax(logits, dim=1)
            train_acc = torch.sum(pred_y == true_y) / pred_y.size()[0]

            # Store plain Python floats so nothing keeps the graph alive.
            train_losses.append(loss.item())
            train_acces.append(train_acc.item())

        val_loss, val_acc = _validate_model(model, val_loader, device, loss_fn)

        epoch_len = len(str(num_epoch))
        print_msg = (
            f"[{epoch:>{epoch_len}}/{num_epoch:>{epoch_len}}] | "
            + f"train_loss: {np.mean(train_losses):.3f} | "
            + f"train_acc: {np.mean(train_acces):.3f} | "
            + f"val_loss: {val_loss:.3f} | "
            + f"val_acc: {val_acc:.2f}"
        )

        early_stopping(val_loss, model)
        print(print_msg)

        if early_stopping.early_stop:
            print('Early stopping')
            print(f'SSLNet weights saved to {state_dict}')
            break

def _validate_model(model, val_loader, device, loss_fn):
    model.eval()
    losses = []
    acces = []
    for x, y, _ in val_loader:
        with torch.inference_mode():
            x = x.to(device, dtype=torch.float)
            true_y = y.to(device, dtype=torch.long)

            logits = model(x)
            loss = loss_fn(logits, true_y)

            pred_y = torch.argmax(logits, dim=1)
            val_acc = torch.sum(pred_y == true_y) / pred_y.size()[0]

            losses.append(loss.cpu().detach())
            acces.append(val_acc.cpu().detach())

    return np.mean(losses), np.mean(acces)

def predict(model, data_loader, device):
    """Run ``model`` over ``data_loader`` and collect labels, predictions and ids.

    Args:
        model: network producing one row of class logits per sample.
        data_loader: iterable of ``(x, y, pid)`` batches.
        device: device the inputs are moved to before the forward pass.

    Returns:
        Tuple of numpy arrays ``(true_labels, predicted_labels, pids)``; the
        first two are flattened.
    """
    from tqdm import tqdm

    predictions_list = []
    true_list = []
    pid_list = []
    model.eval()

    for x, y, pid in tqdm(data_loader):
        with torch.inference_mode():
            x = x.to(device, dtype=torch.float)
            logits = model(x)
            pred_y = torch.argmax(logits, dim=1)
            # Bring everything back to the CPU: Tensor.numpy() and np.array()
            # below only work on CPU tensors, and some loaders in this file
            # hand out tensors that already live on `device`.
            true_list.append(y.cpu())
            predictions_list.append(pred_y.cpu())
            pid_list.extend(pid.cpu() if torch.is_tensor(pid) else pid)
    true_list = torch.cat(true_list)
    predictions_list = torch.cat(predictions_list)

    return (
        torch.flatten(true_list).numpy(),
        torch.flatten(predictions_list).numpy(),
        np.array(pid_list),
    )


## Perfect Accuracy, Poor Precision/Recall
### 1. SMOTE
### 2. Oversample Minority Class With and Without Added Random Noise

## SMOTE

In [30]:
def smote_oversample(train_loader, device):
    """Balance the classes of ``train_loader`` with SMOTE.

    Every batch is flattened to one row per sample, SMOTE
    (``k_neighbors=1``, ``random_state=42``) is fitted on the whole set, and
    the resampled rows are restored to the per-sample shape seen in the
    loader.

    Args:
        train_loader: loader whose batches start with ``(x, y)``; any extra
            fields such as group ids are ignored.
        device: device the resampled tensors are moved to.

    Returns:
        A shuffling ``DataLoader`` over a ``TensorDataset`` of ``(x, y)``
        pairs (no metadata field), using the batch size of ``train_loader``.
    """
    all_data = []
    all_labels = []
    sample_shape = None
    for batch in train_loader:
        x, y = batch[0], batch[1]
        if sample_shape is None:
            # Remember the per-sample layout instead of hard-coding (3, 1000).
            sample_shape = tuple(x.shape[1:])
        all_data.append(x.reshape(x.size(0), -1))
        all_labels.append(y)
    all_data = torch.cat(all_data, dim=0).cpu().numpy()
    all_labels = torch.cat(all_labels, dim=0).cpu().numpy()

    smote = SMOTE(random_state=42, k_neighbors=1)
    X_resampled, y_resampled = smote.fit_resample(all_data, all_labels)

    X_resampled = torch.tensor(X_resampled).to(device).reshape(-1, *sample_shape)
    y_resampled = torch.tensor(y_resampled).to(device)
    resampled_dataset = TensorDataset(X_resampled, y_resampled)
    resampled_loader = DataLoader(
        resampled_dataset, batch_size=train_loader.batch_size, shuffle=True
    )
    return resampled_loader

balanced_train_loader = smote_oversample(train_loader, device)
# Count individual labels. Counting the batch tensors themselves would tally
# tensor objects rather than classes. The labels are read by position so the
# loop works whether or not a batch carries a metadata field.
balanced_classes = Counter(
    label.item() for batch in balanced_train_loader for label in batch[1]
)
print("Class distribution after SMOTE:", balanced_classes)

# The network loaded in this notebook is named `sslnet`. The classes are
# balanced after SMOTE, so a plain, unweighted cross-entropy loss is used.
loss_fn = nn.CrossEntropyLoss()
train(sslnet, balanced_train_loader, val_loader, device, loss_fn)




In [29]:
# Print the first batch produced by the resampled loader, then stop.
for x in balanced_train_loader:
    print(x)
    break

[tensor([[[ 0.9792,  0.7696,  0.7637,  ...,  0.6142,  0.5616,  0.6234],
         [-0.7948, -0.4440, -0.1553,  ..., -0.7147, -0.8886, -0.7669],
         [-0.5895, -0.7967, -0.9871,  ..., -0.6965, -0.4968, -0.6669]],

        [[-0.5607, -0.6266, -0.6641,  ..., -0.4624, -0.3890, -0.3532],
         [-0.2344, -0.2416, -0.2372,  ..., -0.0586, -0.0021,  0.0344],
         [ 0.9527,  1.0320,  1.0962,  ...,  0.7521,  0.6956,  0.6397]],

        [[-0.9103, -0.9099, -0.9101,  ..., -0.8424, -0.6897, -0.5804],
         [ 0.3820,  0.3664,  0.3660,  ...,  0.5981,  0.4133,  0.3186],
         [-0.0053, -0.0198, -0.0217,  ..., -0.4965, -0.5230, -0.4307]],

        ...,

        [[-0.5469, -0.6025, -0.6357,  ..., -0.5009, -0.4188, -0.3749],
         [-0.1905, -0.1949, -0.1912,  ..., -0.0214,  0.0405,  0.0813],
         [ 0.9237,  0.9924,  1.0470,  ...,  0.7910,  0.7277,  0.6630]],

        [[ 1.0070,  0.7709,  0.7709,  ...,  0.6056,  0.5487,  0.6212],
         [-0.8845, -0.4847, -0.1557,  ..., -0.7781, -0

## Random Oversampling Minority Class, with and without Noise

In [31]:
# Load the pretrained network again from torch.hub.
repo = 'OxWearables/ssl-wearables'
sslnet: nn.Module = torch.hub.load(
    repo,
    'harnet30',
    trust_repo=True,
    class_num=2,
    pretrained=True,
    weights_only=False,
)
sslnet.to(device)

Using cache found in /Users/kat/.cache/torch/hub/OxWearables_ssl-wearables_main
  pretrained_dict = torch.load(weight_path, map_location=my_device)


131 Weights loaded


Resnet(
  (feature_extractor): Sequential(
    (layer1): Sequential(
      (0): Conv1d(3, 64, kernel_size=(5,), stride=(1,), padding=(2,), bias=False, padding_mode=circular)
      (1): ResBlock(
        (bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (bn2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv1): Conv1d(64, 64, kernel_size=(5,), stride=(1,), padding=(2,), bias=False, padding_mode=circular)
        (conv2): Conv1d(64, 64, kernel_size=(5,), stride=(1,), padding=(2,), bias=False, padding_mode=circular)
        (relu): ReLU(inplace=True)
      )
      (2): ResBlock(
        (bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (bn2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv1): Conv1d(64, 64, kernel_size=(5,), stride=(1,), padding=(2,), bias=False, padding_mode=circular)
        (conv2): Conv1d(6

In [33]:
def random_oversample(train_loader, device):
    """Balance the classes of ``train_loader`` by random oversampling.

    Every batch is flattened to one row per sample and
    ``RandomOverSampler(random_state=42)`` is fitted on the whole set. The
    metadata of each resampled row is looked up through the sampler's
    ``sample_indices_``, so it stays aligned with the row it belongs to.

    Args:
        train_loader: loader yielding ``(x, y, metadata)`` tensor batches.
        device: device the resampled tensors are moved to.

    Returns:
        A shuffling ``DataLoader`` over a ``TensorDataset`` of
        ``(x, y, metadata)``, using the batch size of ``train_loader``.
    """
    all_data = []
    all_labels = []
    all_metadata = []
    sample_shape = None
    for x, y, metadata in train_loader:
        if sample_shape is None:
            # Remember the per-sample layout instead of hard-coding (3, 1000).
            sample_shape = tuple(x.shape[1:])
        all_data.append(x.reshape(x.size(0), -1))
        all_labels.append(y)
        all_metadata.append(metadata)
    # Concatenate once, after the loop. Doing it inside the loop turns the
    # list into an ndarray, and the next batch's .append() then fails.
    all_data = torch.cat(all_data, dim=0).cpu().numpy()
    all_labels = torch.cat(all_labels, dim=0).cpu().numpy()
    all_metadata = torch.cat(all_metadata, dim=0).cpu().numpy()

    ros = RandomOverSampler(random_state=42)
    X_resampled, y_resampled = ros.fit_resample(all_data, all_labels)
    metadata_resampled = all_metadata[ros.sample_indices_]

    X_resampled = torch.tensor(X_resampled).to(device).reshape(-1, *sample_shape)
    y_resampled = torch.tensor(y_resampled).to(device)
    metadata_resampled = torch.tensor(metadata_resampled).to(device)
    resampled_dataset = TensorDataset(X_resampled, y_resampled, metadata_resampled)
    resampled_loader = DataLoader(
        resampled_dataset, batch_size=train_loader.batch_size, shuffle=True
    )
    return resampled_loader

balanced_train_loader = random_oversample(train_loader, device)
# Tally the individual labels across every batch of the resampled loader.
balanced_classes = Counter(
    label
    for _, y_batch, _ in balanced_train_loader
    for label in y_batch.tolist()
)
print("Class distribution after Random Oversampling:", balanced_classes)

# NOTE(review): the weights come from the original y_train while the loader is
# already balanced, so the minority class is compensated twice -- confirm that
# this is intended.
inverse_weights = get_inverse_class_weights(y_train)
class_weights = torch.tensor(inverse_weights, dtype=torch.float).to(device)
loss_fn = nn.CrossEntropyLoss(weight=class_weights)
train(sslnet, balanced_train_loader, val_loader, device, loss_fn)

Class distribution after Random Oversampling: Counter({1: 747, 0: 747})
Inverse class weights: 
[1.0107095046854082, 94.375]


100%|███████████████████████████████████████████| 12/12 [01:54<00:00,  9.50s/it]


Validation loss decreased (inf --> 1000.756531). Saving model ...
[0/5] | train_loss: 1.789 | train_acc: 0.643 | val_loss: 1000.757 | val_acc: 1.00


100%|███████████████████████████████████████████| 12/12 [02:09<00:00, 10.83s/it]


Validation loss decreased (1000.756531 --> 481.209015). Saving model ...
[1/5] | train_loss: 0.356 | train_acc: 0.821 | val_loss: 481.209 | val_acc: 1.00


100%|███████████████████████████████████████████| 12/12 [02:19<00:00, 11.63s/it]


Validation loss decreased (481.209015 --> 2.249826). Saving model ...
[2/5] | train_loss: 0.132 | train_acc: 0.823 | val_loss: 2.250 | val_acc: 0.84


100%|███████████████████████████████████████████| 12/12 [02:38<00:00, 13.25s/it]


EarlyStopping counter: 1/5
[3/5] | train_loss: 0.194 | train_acc: 0.719 | val_loss: 15.655 | val_acc: 0.33


  0%|                                                    | 0/12 [00:07<?, ?it/s]


KeyboardInterrupt: 

In [41]:
# Print the first batch produced by the oversampled loader, then stop.
for x in balanced_train_loader:
    print(x)
    break

[tensor([[[-0.0143, -0.0113, -0.0258,  ...,  0.0236,  0.0236,  0.0236],
         [-0.1942, -0.1114, -0.0710,  ...,  0.3786,  0.3786,  0.3786],
         [ 0.8867,  1.0207,  1.0674,  ...,  0.9193,  0.9193,  0.9193]],

        [[-0.7366, -0.7368, -0.7481,  ..., -0.7665, -0.8016, -0.7404],
         [ 0.0775,  0.0815,  0.0786,  ..., -0.0435, -0.0698, -0.0854],
         [ 0.7106,  0.7133,  0.7050,  ...,  0.7487,  0.7974,  0.7713]],

        [[-0.7366, -0.7368, -0.7481,  ..., -0.7665, -0.8016, -0.7404],
         [ 0.0775,  0.0815,  0.0786,  ..., -0.0435, -0.0698, -0.0854],
         [ 0.7106,  0.7133,  0.7050,  ...,  0.7487,  0.7974,  0.7713]],

        ...,

        [[-0.7366, -0.7368, -0.7481,  ..., -0.7665, -0.8016, -0.7404],
         [ 0.0775,  0.0815,  0.0786,  ..., -0.0435, -0.0698, -0.0854],
         [ 0.7106,  0.7133,  0.7050,  ...,  0.7487,  0.7974,  0.7713]],

        [[-0.7366, -0.7368, -0.7481,  ..., -0.7665, -0.8016, -0.7404],
         [ 0.0775,  0.0815,  0.0786,  ..., -0.0435, -0

In [48]:
# Looking for data leakage
window_size = 1000
X_windows, y_windows, group_windows = create_windows(X, y, groups, window_size)

# Hold out 30 % of the windows, then halve that hold-out into validation and test.
holdout_fraction = 0.3
x_train, x_temp, y_train, y_temp, group_train, group_temp = train_test_split(
    X_windows,
    y_windows,
    group_windows,
    test_size=holdout_fraction,
    random_state=42,
)
x_val, x_test, y_val, y_test, group_val, group_test = train_test_split(
    x_temp,
    y_temp,
    group_temp,
    test_size=0.5,
    random_state=42,
)

# Output the shapes as expected:
train_shapes = (x_train.shape, y_train.shape, group_train.shape)
val_shapes = (x_val.shape, y_val.shape, group_val.shape)
test_shapes = (x_test.shape, y_test.shape, group_test.shape)
print((train_shapes, val_shapes, test_shapes))

ValueError: Found input variables with inconsistent numbers of samples: [359, 359999]

In [62]:
# Fixing the Windowing 
def create_windows(X, y, groups, window_size):
    """Cut per-sample data into consecutive, non-overlapping windows.

    Only complete windows are kept. Each window is labelled with the
    annotation and the group of its *last* sample, so labels stay aligned
    with the samples they describe. (Indexing ``y`` / ``groups`` by the
    window number would read the first few samples of the recording
    instead.)

    Args:
        X: array-like with one row per sample.
        y: per-sample labels.
        groups: per-sample group ids.
        window_size: number of samples per window; must be at least 1.

    Returns:
        ``(X_windows, y_windows, group_windows)`` as numpy arrays, where
        ``X_windows`` has shape ``(num_windows, window_size, *X.shape[1:])``.

    Raises:
        ValueError: if ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError("window_size must be a positive integer")

    X = np.asarray(X)
    y = np.asarray(y)
    groups = np.asarray(groups)

    # Never read past the shortest of the three inputs.
    num_windows = min(len(X), len(y), len(groups)) // window_size
    usable = num_windows * window_size

    X_windows = X[:usable].reshape((num_windows, window_size) + X.shape[1:])
    # Index of the final sample of every window.
    last_samples = np.arange(window_size - 1, usable, window_size)
    y_windows = y[last_samples]
    group_windows = groups[last_samples]
    return X_windows, y_windows, group_windows

x_windows, y_windows, group_windows = create_windows(X, y, groups, window_size)

# 70 % train, with the remaining 30 % shared equally by validation and test.
(x_train, x_temp,
 y_train, y_temp,
 group_train, group_temp) = train_test_split(
    x_windows, y_windows, group_windows, test_size=0.3, random_state=42
)
(x_val, x_test,
 y_val, y_test,
 group_val, group_test) = train_test_split(
    x_temp, y_temp, group_temp, test_size=0.5, random_state=42
)

# Output shapes:
shape_report = tuple(
    (features.shape, targets.shape, ids.shape)
    for features, targets, ids in (
        (x_train, y_train, group_train),
        (x_val, y_val, group_val),
        (x_test, y_test, group_test),
    )
)
print(shape_report)

(((251, 1000, 3), (251,), (251,)), ((54, 1000, 3), (54,), (54,)), ((54, 1000, 3), (54,), (54,)))


In [60]:
# Show how many windows carry each label.
print(Counter(y_windows)) 

Counter({0: 359})


In [64]:
# Rebuild the datasets from the current split; only training uses transform=True.
train_dataset = NormalDataset(x_train, y_train, group_train, name="training", transform=True)
val_dataset = NormalDataset(x_val, y_val, group_val, name="validation")
test_dataset = NormalDataset(x_test, y_test, group_test, name="test")

# Training shuffles and uses two worker processes.
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True, num_workers=2)

# Validation and test keep their order and load in the main process.
eval_loader_options = {"batch_size": 128, "shuffle": False, "num_workers": 0}
val_loader = DataLoader(val_dataset, **eval_loader_options)
test_loader = DataLoader(test_dataset, **eval_loader_options)

training set sample count : 251
validation set sample count : 54
test set sample count : 54


### Random Oversampling Minority Class With No Noise

In [None]:

def random_oversample(train_loader, device):
    """Balance the classes of ``train_loader`` by random oversampling.

    Every batch is flattened to one row per sample and
    ``RandomOverSampler(random_state=42)`` is fitted on the whole set. The
    metadata of each resampled row is looked up through the sampler's
    ``sample_indices_``, so it stays aligned with the row it belongs to.

    Args:
        train_loader: loader yielding ``(x, y, metadata)`` tensor batches.
        device: device the resampled tensors are moved to.

    Returns:
        A shuffling ``DataLoader`` over a ``TensorDataset`` of
        ``(x, y, metadata)``, using the batch size of ``train_loader``.
    """
    all_data = []
    all_labels = []
    all_metadata = []
    sample_shape = None
    for x, y, metadata in train_loader:
        if sample_shape is None:
            # Remember the per-sample layout instead of hard-coding (3, 1000).
            sample_shape = tuple(x.shape[1:])
        all_data.append(x.reshape(x.size(0), -1))
        all_labels.append(y)
        all_metadata.append(metadata)
    # Concatenate once, after the loop. Doing it inside the loop turns the
    # list into an ndarray, and the next batch's .append() then fails.
    all_data = torch.cat(all_data, dim=0).cpu().numpy()
    all_labels = torch.cat(all_labels, dim=0).cpu().numpy()
    all_metadata = torch.cat(all_metadata, dim=0).cpu().numpy()

    ros = RandomOverSampler(random_state=42)
    X_resampled, y_resampled = ros.fit_resample(all_data, all_labels)
    metadata_resampled = all_metadata[ros.sample_indices_]

    X_resampled = torch.tensor(X_resampled).to(device).reshape(-1, *sample_shape)
    y_resampled = torch.tensor(y_resampled).to(device)
    metadata_resampled = torch.tensor(metadata_resampled).to(device)
    resampled_dataset = TensorDataset(X_resampled, y_resampled, metadata_resampled)
    resampled_loader = DataLoader(
        resampled_dataset, batch_size=train_loader.batch_size, shuffle=True
    )
    return resampled_loader

balanced_train_loader = random_oversample(train_loader, device)
# Tally the individual labels across every batch of the resampled loader.
balanced_classes = Counter(
    label
    for _, y_batch, _ in balanced_train_loader
    for label in y_batch.tolist()
)
print("Class distribution after Random Oversampling:", balanced_classes)

# NOTE(review): the weights come from the original y_train while the loader is
# already balanced, so the minority class is compensated twice -- confirm that
# this is intended.
inverse_weights = get_inverse_class_weights(y_train)
class_weights = torch.tensor(inverse_weights, dtype=torch.float).to(device)
loss_fn = nn.CrossEntropyLoss(weight=class_weights)
train(sslnet, balanced_train_loader, val_loader, device, loss_fn)

### Random Oversampling Minority Class With Noise

In [None]:
# Reshape the data into windows of size X
def create_windows(data, labels, groups, window_size):
    """Split row-wise data into back-to-back windows of ``window_size`` rows.

    Rows beyond the last complete window are discarded. The label and the
    group of a window are those of its final row.

    Returns:
        (X_windows, y_windows, group_windows) where X_windows has shape
        (num_windows, window_size, -1).
    """
    window_count = data.shape[0] // window_size
    stop = window_count * window_size
    # Final row of each complete window: window_size-1, 2*window_size-1, ...
    window_ends = slice(window_size - 1, stop, window_size)
    stacked = data[:stop].reshape(window_count, window_size, -1)
    return stacked, labels[window_ends], groups[window_ends]

X = hip_data[['x', 'y', 'z']].values
y = hip_data['annotation'].values
groups = hip_data['group'].values

window_size = 1000
X_windows, y_windows, group_windows = create_windows(X, y, groups, window_size)
x_train, x_temp, y_train, y_temp, group_train, group_temp = train_test_split(
    X_windows, y_windows, group_windows, test_size=0.3, random_state=42
)

# Validation and test sets come from the untouched hold-out (x_temp), not from
# the oversampled training data. Otherwise duplicated and noised training
# windows would leak into evaluation and distort its class balance.
x_val, x_test, y_val, y_test, group_val, group_test = train_test_split(
    x_temp, y_temp, group_temp, test_size=0.5, random_state=42
)

# Oversample the training split only; the sampler works on flattened windows.
ros = RandomOverSampler(random_state=42)
x_train_ros, y_train_ros = ros.fit_resample(x_train.reshape(x_train.shape[0], -1), y_train)
# Look up the group of every resampled row through the sampler's indices so
# the ids stay aligned with the rows that were actually drawn.
group_train_ros = group_train[ros.sample_indices_]

noise_factor = 0.1  # Slight random noise, avoid duplications for minority
rng = np.random.default_rng(42)  # seeded so that the noise is reproducible
x_train_ros = x_train_ros + (rng.normal(size=x_train_ros.shape) * noise_factor)

print(((x_train_ros.reshape(-1, window_size, 3).shape, y_train_ros.shape, group_train_ros.shape),
       (x_val.shape, y_val.shape, group_val.shape),
       (x_test.shape, y_test.shape, group_test.shape)))