<h3>Tokenizer Evaluation on HAR, Sex 

Compare classification of magnitude model to tokenized magnitude with various quantisations. 

In [1]:
# Configure GPU

import sys

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.backends.cudnn as cudnn
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from tqdm.auto import tqdm

import utils

# For reproducibility
np.random.seed(42)
torch.manual_seed(42)
cudnn.benchmark = True


  from .autonotebook import tqdm as notebook_tqdm


Set up

In [2]:
import torch
print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
print("Compiled with CUDA:", torch.version.cuda)
print("GPU support enabled:", torch.backends.cuda.is_built())
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

PyTorch version: 2.7.0+cu126
CUDA available: True
Compiled with CUDA: 12.6
GPU support enabled: True
Using device: cuda


In [21]:
import torch

device = torch.device("cpu")
print("Using device:", device)

Using device: cpu


Load data

In [41]:
import pandas as pd

#tokenized string for each participant - about a day of data. 
pid_tokens=pd.read_csv("/home/hrs675/Wearables_Tokenizer/data/pid_token_string_numbers.csv")

In [42]:
pid_tokens['pid'] = pid_tokens['pid'].str[:4]
pid_tokens.head()

Unnamed: 0,pid,token_string
0,P030,1 b 1 b 3 bb 4 b 3 bbb 1 bb 4 b 2 b 1 bb 4 bb...
1,P036,kjfgghhjlhhhihhighggjigheddddddcdefefeekkgjjhi...
2,P113,bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbcbbbebbbbb...
3,P048,cbb 2 gebbbb 1 bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb...
4,P099,1 c 1 cbbcbbbbbbccccccccccccccccccccccccccccc...


In [43]:
metadata= pd.read_csv("/data/UKBB/kyra/capture24/metadata.csv")

In [44]:
#combine meta data and token strings]
meta_tokens= pd.merge(metadata, pid_tokens, on='pid', how='inner')
meta_tokens.head()

Unnamed: 0,pid,age,sex,token_string
0,P001,38-52,F,bb 3 bbbbbbbbb 1 bbbbbbbbbbbbbbbbbbbbbbbbbbbbb...
1,P002,30-37,F,cccbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb...
2,P003,30-37,F,bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb...
3,P004,53+,F,21 b 3 ebbbbbbbbbcbbbbbbbbbcbbbbbbbdgbbbbcbbb...
4,P005,38-52,F,cccccccccccccccccccccccccccccccccccccccccccccc...


In [45]:
# Save the combined meta_tokens DataFrame to CSV
meta_tokens.to_csv("/home/hrs675/Wearables_Tokenizer/data/meta_tokens.csv", index=False)

Train/ Test split

In [46]:
X= meta_tokens['token_string']
Y=meta_tokens['sex']

In [47]:
pid=meta_tokens['pid']

In [54]:
from sklearn.model_selection import train_test_split

X = meta_tokens['token_string']
Y = meta_tokens['sex']

X_train, X_test, Y_train, Y_test, pid_train, pid_test = train_test_split(
    X, Y, pid, test_size=0.2, random_state=42, stratify=Y  # stratify if classification
)

print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)

Shape of X_train: (120,)
Shape of X_test: (31,)


In [53]:
# Hold out participants P101-P151 for testing (51 participants)
test_ids = [f'P{i}' for i in range(101,152)]
mask_test = np.isin(pid, test_ids)
mask_train = ~mask_test
X_train, Y_train, pid_train = \
    X[mask_train], Y[mask_train], pid[mask_train]
X_test, Y_test, pid_test = \
    X[mask_test], Y[mask_test], pid[mask_test]
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)

Shape of X_train: (100,)
Shape of X_test: (51,)


<h3> Sex

In [None]:
class TransformerClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim=512, nhead=8, num_layers=6, num_classes=2, max_len=2048, dropout=0.3):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.pos_embedding = nn.Embedding(max_len, embed_dim)
        encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=nhead, dropout=dropout, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.norm = nn.LayerNorm(embed_dim)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        positions = torch.arange(x.size(1), device=x.device).unsqueeze(0).expand(x.size(0), x.size(1))
        x = self.embedding(x) + self.pos_embedding(positions)
        x = self.transformer(x)
        x = self.norm(x)
        x = x.mean(dim=1)
        x = self.dropout(x)
        return self.fc(x)

In [50]:
from tokenizers import Tokenizer
tokenizer = Tokenizer.from_file("trained_tokenizer.json")

In [None]:
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

Y=meta_tokens['sex']

# Encode labels
le = LabelEncoder()
Y_train_enc = le.fit_transform(Y_train)
Y_test_enc = le.transform(Y_test)

# Encode token strings to token ids
def encode_token_string(tokenizer, s, max_len=512):
    ids = tokenizer.encode(s).ids
    if len(ids) < max_len:
        ids += [0] * (max_len - len(ids))  # pad
    else:
        ids = ids[:max_len]
    return ids

X_train_ids = [encode_token_string(tokenizer, s) for s in X_train]
X_test_ids = [encode_token_string(tokenizer, s) for s in X_test]

# PyTorch Dataset
class TokenDataset(Dataset):
    def __init__(self, X, Y):
        self.X = torch.tensor(X, dtype=torch.long)
        self.Y = torch.tensor(Y, dtype=torch.long)
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        return self.X[idx], self.Y[idx]

train_ds = TokenDataset(X_train_ids, Y_train_enc)
test_ds = TokenDataset(X_test_ids, Y_test_enc)
train_dl = DataLoader(train_ds, batch_size=32, shuffle=True)
test_dl = DataLoader(test_ds, batch_size=32)

# Model, optimizer, loss

# Compute class weights
class_weights = compute_class_weight('balanced', classes=np.unique(Y_train_enc), y=Y_train_enc)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

# Model, optimizer, loss
model = TransformerClassifier(vocab_size).to(device)  # use your improved model here
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss(weight=class_weights)
# Training loop

EPOCHS = 6  # Try more epochs
optimizer = torch.optim.Adam(model.parameters(), lr=5e-3)  # Try a higher learning rate
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for xb, yb in train_dl:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        logits = model(xb)
        loss = loss_fn(logits, yb)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1} loss: {total_loss/len(train_dl):.4f}")

# Evaluation
model.eval()
all_preds, all_true = [], []
with torch.no_grad():
    for xb, yb in test_dl:
        xb = xb.to(device)
        logits = model(xb)
        preds = torch.argmax(logits, dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_true.extend(yb.numpy())

print("Accuracy:", accuracy_score(all_true, all_preds))
print(classification_report(all_true, all_preds, target_names=le.classes_))

Epoch 1 loss: 2.7533
Epoch 2 loss: 2.1466
Epoch 3 loss: 1.0185
Epoch 4 loss: 0.8361
Epoch 5 loss: 0.7952
Epoch 6 loss: 0.7591
Accuracy: 0.6451612903225806
              precision    recall  f1-score   support

           F       0.65      1.00      0.78        20
           M       0.00      0.00      0.00        11

    accuracy                           0.65        31
   macro avg       0.32      0.50      0.39        31
weighted avg       0.42      0.65      0.51        31



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


<h3> Age

In [31]:
Y=meta_tokens['age']

In [32]:
# Hold out participants P101-P151 for testing (51 participants)
test_ids = [f'P{i}' for i in range(101,152)]
mask_test = np.isin(pid, test_ids)
mask_train = ~mask_test
X_train, Y_train, pid_train = \
    X[mask_train], Y[mask_train], pid[mask_train]
X_test, Y_test, pid_test = \
    X[mask_test], Y[mask_test], pid[mask_test]
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)

Shape of X_train: (100,)
Shape of X_test: (51,)


In [39]:
# ...existing code...

# 1. Set Y to age
Y = meta_tokens['age']

# 2. Train/test split (already done above, but repeat for clarity)
X_train, Y_train, pid_train = X[mask_train], Y[mask_train], pid[mask_train]
X_test, Y_test, pid_test = X[mask_test], Y[mask_test], pid[mask_test]

# 3. Encode token strings to token ids (reuse your function)
X_train_ids = [encode_token_string(tokenizer, s) for s in X_train]
X_test_ids = [encode_token_string(tokenizer, s) for s in X_test]

# ...existing code...

# 4. PyTorch Dataset for regression
class TokenDataset(Dataset):
    def __init__(self, X, Y):
        self.X = torch.tensor(X, dtype=torch.long)
        self.Y = torch.tensor(pd.to_numeric(Y).astype(np.float32).values, dtype=torch.float32)
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        return self.X[idx], self.Y[idx]

# Re-create DataLoaders for regression!
train_ds = TokenDataset(X_train_ids, Y_train)
test_ds = TokenDataset(X_test_ids, Y_test)
train_dl = DataLoader(train_ds, batch_size=32, shuffle=True)
test_dl = DataLoader(test_ds, batch_size=32)

# ...existing code...

# 5. Update model output for regression
class BetterTokenTransformerRegressor(nn.Module):
    def __init__(self, vocab_size, embed_dim=512, nhead=8, num_layers=6, max_len=2048, dropout=0.3):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.pos_embedding = nn.Embedding(max_len, embed_dim)
        encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=nhead, dropout=dropout, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.norm = nn.LayerNorm(embed_dim)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(embed_dim, 1)  # Output 1 value for regression

    def forward(self, x):
        positions = torch.arange(x.size(1), device=x.device).unsqueeze(0).expand(x.size(0), x.size(1))
        x = self.embedding(x) + self.pos_embedding(positions)
        x = self.transformer(x)
        x = self.norm(x)
        x = x.mean(dim=1)
        x = self.dropout(x)
        return self.fc(x).squeeze(-1)  # shape: (batch,)

# 6. Train the model
vocab_size = tokenizer.get_vocab_size()
model = BetterTokenTransformerRegressor(vocab_size).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.MSELoss()

EPOCHS = 6
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for xb, yb in train_dl:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        preds = model(xb)
        loss = loss_fn(preds, yb)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1} loss: {total_loss/len(train_dl):.4f}")

# 7. Evaluate
model.eval()
all_preds, all_true = [], []
with torch.no_grad():
    for xb, yb in test_dl:
        xb = xb.to(device)
        preds = model(xb).cpu().numpy()
        all_preds.extend(preds)
        all_true.extend(yb.numpy())

from sklearn.metrics import mean_absolute_error, r2_score
print("MAE:", mean_absolute_error(all_true, all_preds))
print("R2:", r2_score(all_true, all_preds))
# ...existing code...

ValueError: Unable to parse string "38-52" at position 0