We import the ICD codes and the dataset-specific codes, then encode them into embeddings using a transformer model. To visualize potential clusters to aid cleaning, we apply UMAP for dimensionality reduction and HDBSCAN for cluster detection.

In [None]:
import os
import numpy as np
import pandas as pd

from transformers import AutoTokenizer, CanineModel, AutoModel
import torch
from torch.utils.data import DataLoader
import gc

from tqdm.auto import tqdm

import glob

icd_codes = pd.read_csv("../data/raw/icd_codes.csv")
dataset_codes = pd.read_csv("../data/raw/dataset_codes.csv")

In [20]:
# codes = list(dataset_codes.iloc[list(range(0, 10)) + list(range(100, 200)),0])
codes = list(dataset_codes.code)
ref_codes = list(icd_codes.code)

In [None]:
# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("google/canine-s")
model = CanineModel.from_pretrained("google/canine-s")

# Tokenize character-level (automatically handled)
code_inputs = tokenizer(codes, return_tensors="pt", padding="longest", truncation=True)
reference_inputs = tokenizer(ref_codes, return_tensors="pt", padding="longest", truncation=True)

# Move to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

code_inputs = {k: v.to(device) for k, v in code_inputs.items()}
reference_inputs = {k: v.to(device) for k, v in reference_inputs.items()}

# Get embeddings
with torch.no_grad():
    code_outputs = model(**code_inputs)
    reference_outputs = model(**reference_inputs)

    code_embeddings = code_outputs.last_hidden_state.mean(dim=1)
    reference_embeddings = reference_outputs.last_hidden_state.mean(dim=1)


In [None]:
# This batches our codes and then processes them so that we do not get OOM errors
def embed_and_save(
    code_list,
    model,
    tokenizer,
    output_dir="embeddings",
    batch_size=16,
    device="cuda",
    prefix="code",
):
    os.makedirs(output_dir, exist_ok=True)

    model.eval()
    dataloader = DataLoader(code_list, batch_size=batch_size)

    batch_paths = []
    with torch.no_grad():
        for i, batch in enumerate(dataloader):
            inputs = tokenizer(list(batch), return_tensors="pt", padding=True, truncation=True)
            inputs = {k: v.to(device) for k, v in inputs.items()}

            outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1)
            embeddings = embeddings.cpu().numpy()

            # Save this batch
            batch_path = os.path.join(output_dir, f"{prefix}_batch_{i:03d}.npy")
            np.save(batch_path, embeddings)
            batch_paths.append(batch_path)

            # Clear memory
            del inputs, outputs, embeddings
            torch.cuda.empty_cache()
            gc.collect()

    return batch_paths


In [None]:
# Embed and save; alternatively we could use cpu if OOM becomes a hassle
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained("google/canine-s")
model = CanineModel.from_pretrained("google/canine-s").to(device)

# embed_and_save(codes, model, tokenizer, output_dir="embeddings", prefix="dataset") # <- 32 seconds
# embed_and_save(ref_codes, model, tokenizer, output_dir="embeddings", prefix="ref") # <- 15 minutes

In [24]:
# Load dataset code embeddings
code_files = sorted(glob.glob("embeddings/dataset_batch_*.npy"))
code_embeddings = np.vstack([np.load(f) for f in code_files])

# Load reference code embeddings
ref_files = sorted(glob.glob("embeddings/ref_batch_*.npy"))
ref_embeddings = np.vstack([np.load(f) for f in ref_files])


Now, we use cosine similarity and edit distance to compare the codes

In [33]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute similarity matrix
sim_matrix = cosine_similarity(code_embeddings, ref_embeddings)

# Get top-3 nearest reference codes for each dataset code
top_k = 5
top_matches = np.argsort(-sim_matrix, axis=1)[:, :top_k]

for i, idxs in enumerate(top_matches):
    print(f"{codes[i]} → {[ref_codes[j] for j in idxs]}")


e14.1 → ['p772', 'p768', 'r1905', 'p762', 'r1904']
i95.9 → ['xm3375', 'xm7n03', 'nb92.41', 'xn2qh', 'nb92.20']
c53.9 → ['c50329', 'c50319', 'g9205', 'c519', 'c4359']
d64.9 → ['m7109', 'm76829', 'm71072', 'm7108', 's83144a']
a41.9 → ['c419', 'o650', 't83092s', 'c49a1', 'c49a4']
i50.0 → ['xm8qv3', 'xm4hd6', 't85623s', 's52571p', 't82867s']
g03.9 → ['1b70.01', '1b41.1z', 'ef7z', 't85630a', 'eg00']
a16.9 → ['s14121d', 't83098s', 't83092s', 's42141g', 'v4900xa']
1g40 → ['t85694d', 't85694s', 't85630d', 't85630s', '7b00.z']
ma18.0 → ['5c55.0z', '5c55.2', '5c56.00', '5c55.z', '5c55.y']
mg27 → ['7a20.1', '7a20.z', '7a20.0', '7a40.1', '7a0z']
mc82.4 → ['6c00.z', '6c00.1', '6c00.2', '6c00.0', '6b8z']
ca4z → ['w451xxs', 'w5329xa', 'w51xxxs', 'w44f1xd', 'w450xxd']
db9z → ['x782xxa', 'x3908xa', 'w3312xa', 'x12xxxs', 'x360xxs']
db99.5 → ['x779xxs', 'x779xxd', 'x75xxxs', 'x779xxa', 'x781xxa']
3a9z → ['v2491xa', 'v2459xa', 'v2421xs', 't85858s', 't82867d']
1b10.z → ['t83111s', 't83118a', 't83190s', 'v2

In [32]:
'a16.9' in ref_codes

False

Using SapBERT:

In [34]:
import numpy as np
import torch

tokenizer = AutoTokenizer.from_pretrained("cambridgeltl/SapBERT-from-PubMedBERT-fulltext")  
model = AutoModel.from_pretrained("cambridgeltl/SapBERT-from-PubMedBERT-fulltext").cuda()

# replace with your own list of entity names
all_names = codes
all_names2 = ref_codes

bs = 128 # batch size during inference
all_embs = []
all_embs2 = []
for i in tqdm(np.arange(0, len(all_names), bs)):
    toks = tokenizer.batch_encode_plus(all_names[i:i+bs], 
                                       padding="max_length", 
                                       max_length=25, 
                                       truncation=True,
                                       return_tensors="pt")
    toks_cuda = {}
    for k,v in toks.items():
        toks_cuda[k] = v.cuda()
    cls_rep = model(**toks_cuda)[0][:,0,:] # use CLS representation as the embedding
    all_embs.append(cls_rep.cpu().detach().numpy())

for i in tqdm(np.arange(0, len(all_names2), bs)):
    toks = tokenizer.batch_encode_plus(all_names2[i:i+bs], 
                                       padding="max_length", 
                                       max_length=25, 
                                       truncation=True,
                                       return_tensors="pt")
    toks_cuda = {}
    for k,v in toks.items():
        toks_cuda[k] = v.cuda()
    cls_rep = model(**toks_cuda)[0][:,0,:] # use CLS representation as the embedding
    all_embs2.append(cls_rep.cpu().detach().numpy())

all_embs = np.concatenate(all_embs, axis=0)
all_embs2 = np.concatenate(all_embs2, axis=0)


  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/795 [00:00<?, ?it/s]

In [35]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute similarity matrix
sim_matrix = cosine_similarity(all_embs, all_embs2)

# Get top-3 nearest reference codes for each dataset code
top_k = 5
top_matches = np.argsort(-sim_matrix, axis=1)[:, :top_k]

for i, idxs in enumerate(top_matches):
    print(f"{codes[i]} → {[ref_codes[j] for j in idxs]}")


e14.1 → ['e161', 'e160', 'e10a1', 'e139', 'e164']
i95.9 → ['i959', 'i952', 'i96', 'i953', 'i951']
c53.9 → ['c55', 'c52', 'c58', '5c50.9', 'c49a3']
d64.9 → ['d65', 'd62', 'd66', 'd67', 'd619']
a41.9 → ['a46', '2a85.4', '2a85.2', 'a431', '2a85.6']
i50.0 → ['qe50.0', 'i5020', 'lc50.0', 'da50.0', 'bd50.50']
g03.9 → ['g09', 'g02', 'g07', 'g01', 'g08']
a16.9 → ['a09', 'a99', 'a186', 'a191', 'a159']
1g40 → ['1g40', '1g41', '1c40', '1g80', '1g84']
ma18.0 → ['ma18.00', 'ma18.1', 'ma18.2', 'ma18.3', 'ma18.4']
mg27 → ['mg27', 'mg23', 'mg26', 'mg21', 'mg22']
mc82.4 → ['mc82.4', 'mc82.3', 'mc82.1', 'mc82.2', 'mc82.0']
ca4z → ['ca4z', 'ca7z', 'ca4y', 'ca0z', 'ca8z']
db9z → ['db9z', 'db7z', 'db6z', 'db1z', 'db3z']
db99.5 → ['db99.5', 'db99.3', 'db99.7', 'db99.8', 'db99.4']
3a9z → ['3a9z', '3a2z', '3a8z', '3a6z', '3a9y']
1b10.z → ['1b10.z', '1b11.z', '1b11.y', '1b13.z', '1b10.0']
8e4y → ['8e4y', '8e2y', '8e7y', '8e0y', '8e4a.y']
bd1z → ['bd1z', 'bd7z', 'bd9z', 'bd5z', 'bd4z']
1d01.10 → ['1d01.10', '1d

Now, we 

In [36]:
from sklearn.metrics.pairwise import cosine_similarity

similarities = cosine_similarity(code_embeddings, code_embeddings)
np.fill_diagonal(similarities, 0)
print("Top self-similar pairs:")
for i in np.argsort(-similarities, axis=1)[:, :3]:
    print(i)


Top self-similar pairs:
[1930  564    4]
[118 102   4]
[  6 904   4]
[  4  64 979]
[7 3 2]
[118  43  20]
[  2 904 407]
[  4 118  64]
[ 659 1558   27]
[  52 1739 1539]
[116 524 119]
[1702   26  144]
[129 340  41]
[ 416 1127 1085]
[  95 1482 1648]
[1320 1711  420]
[265 675  36]
[1826  372 1054]
[ 585  978 1891]
[1751  927  391]
[1066  207 1641]
[1690   20 1219]
[ 134  714 1011]
[1672  418  284]
[ 253 1907 1013]
[ 273 1028 1070]
[1702  289  323]
[1252  977    8]
[ 342  238 1518]
[828 831  43]
[1301 1084  639]
[1515 1375  772]
[ 375 1135 1957]
[1067  981 1548]
[ 439 2058  303]
[113 258 386]
[  38   16 1107]
[ 133  549 1638]
[2042   36  615]
[38  4  7]
[ 471 1146 1089]
[  23 1672  129]
[132 913 224]
[ 53 831  29]
[ 146 1200 1885]
[1707 1010 1746]
[145 124 984]
[1114  230 1880]
[970 163 660]
[853 741 270]
[ 570 1629 1758]
[ 791 1442 1303]
[1539 1985    9]
[  43 1245  831]
[ 356 1013 1637]
[1934  968 1548]
[1642 2037   48]
[1270  589  863]
[1488  181 1554]
[1412 1072 1245]
[1833  955  378]
[ 