We import the ICD codes and the dataset-specific codes, then encode them into embeddings using a transformer model. To visualize potential clusters to aid cleaning, we apply UMAP for dimensionality reduction and HDBSCAN for cluster detection.

In [1]:
import os
import numpy as np
import pandas as pd

from transformers import AutoTokenizer, CanineModel, AutoModel
import torch
from torch.utils.data import DataLoader
import gc

import glob

from tqdm.auto import tqdm

# Read both raw code tables in one pass: first the reference ICD codes,
# then the dataset-specific codes (same order as the paths below).
icd_codes, dataset_codes = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/raw/icd_codes.csv",
        "../data/raw/dataset_codes.csv",
    )
)

In [2]:
# Flatten the `code` column of each table into a plain Python list so the
# values can be sliced into batches for the tokenizer.
# (For a quick smoke test, take a small `.iloc` slice of `dataset_codes`
# before converting it to a list.)
ref_codes = list(icd_codes["code"])
codes = list(dataset_codes["code"])

Using SapBERT:

In [None]:
# From https://huggingface.co/cambridgeltl/SapBERT-from-PubMedBERT-fulltext
MODEL_NAME = "cambridgeltl/SapBERT-from-PubMedBERT-fulltext"

# Fall back to CPU so the cell still runs (slowly) on a machine without a GPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME).to(device)
model.eval()  # inference only: keep dropout disabled

bs = 128  # batch size during inference


def embed_codes(texts, batch_size=bs, max_length=25):
    """Encode a list of code strings into CLS embeddings.

    Parameters
    ----------
    texts : list of str
        Codes to embed; processed in order, `batch_size` at a time.
    batch_size : int
        Number of codes sent through the model per forward pass.
    max_length : int
        Every input is padded/truncated to this many tokens.

    Returns
    -------
    np.ndarray
        Array with one row per input code, in input order. An empty input
        yields an array with zero rows instead of raising.
    """
    batches = []
    # No gradients are needed for inference; skipping autograd bookkeeping
    # keeps GPU memory flat across batches.
    with torch.no_grad():
        for start in tqdm(range(0, len(texts), batch_size)):
            toks = tokenizer(
                texts[start:start + batch_size],
                padding="max_length",
                max_length=max_length,
                truncation=True,
                return_tensors="pt",
            )
            toks = {name: tensor.to(device) for name, tensor in toks.items()}
            # Use the CLS (first-token) representation as the embedding.
            cls_rep = model(**toks)[0][:, 0, :]
            batches.append(cls_rep.cpu().numpy())

    if not batches:
        # np.concatenate raises on an empty list; return a 0-row array instead.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)
    return np.concatenate(batches, axis=0)


dataset_embeddings = embed_codes(codes)
reference_embeddings = embed_codes(ref_codes)

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/883 [00:00<?, ?it/s]

In [None]:
# Cache the embeddings on disk so the expensive encoding step does not have
# to be repeated. np.save does not create missing directories, so make sure
# the target folder exists first.
os.makedirs("embeddings", exist_ok=True)
np.save("embeddings/dataset_embeddings.npy", dataset_embeddings)
np.save("embeddings/reference_embeddings.npy", reference_embeddings)

# To skip re-encoding in a later session, reload the cached arrays instead:
# dataset_embeddings = np.load("embeddings/dataset_embeddings.npy")
# reference_embeddings = np.load("embeddings/reference_embeddings.npy")


Now, we use cosine similarity and edit distance to compare the codes

In [35]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between every dataset code (rows) and every reference
# ICD code (columns). Uses the arrays produced by the embedding cell, so the
# row order matches `codes` and the column order matches `ref_codes`.
sim_matrix = cosine_similarity(dataset_embeddings, reference_embeddings)

# Get the top-k nearest reference codes for each dataset code
top_k = 5
# Negate so that argsort's ascending order puts the most similar first.
top_matches = np.argsort(-sim_matrix, axis=1)[:, :top_k]

for i, idxs in enumerate(top_matches):
    print(f"{codes[i]} → {[ref_codes[j] for j in idxs]}")


e14.1 → ['e161', 'e160', 'e10a1', 'e139', 'e164']
i95.9 → ['i959', 'i952', 'i96', 'i953', 'i951']
c53.9 → ['c55', 'c52', 'c58', '5c50.9', 'c49a3']
d64.9 → ['d65', 'd62', 'd66', 'd67', 'd619']
a41.9 → ['a46', '2a85.4', '2a85.2', 'a431', '2a85.6']
i50.0 → ['qe50.0', 'i5020', 'lc50.0', 'da50.0', 'bd50.50']
g03.9 → ['g09', 'g02', 'g07', 'g01', 'g08']
a16.9 → ['a09', 'a99', 'a186', 'a191', 'a159']
1g40 → ['1g40', '1g41', '1c40', '1g80', '1g84']
ma18.0 → ['ma18.00', 'ma18.1', 'ma18.2', 'ma18.3', 'ma18.4']
mg27 → ['mg27', 'mg23', 'mg26', 'mg21', 'mg22']
mc82.4 → ['mc82.4', 'mc82.3', 'mc82.1', 'mc82.2', 'mc82.0']
ca4z → ['ca4z', 'ca7z', 'ca4y', 'ca0z', 'ca8z']
db9z → ['db9z', 'db7z', 'db6z', 'db1z', 'db3z']
db99.5 → ['db99.5', 'db99.3', 'db99.7', 'db99.8', 'db99.4']
3a9z → ['3a9z', '3a2z', '3a8z', '3a6z', '3a9y']
1b10.z → ['1b10.z', '1b11.z', '1b11.y', '1b13.z', '1b10.0']
8e4y → ['8e4y', '8e2y', '8e7y', '8e0y', '8e4a.y']
bd1z → ['bd1z', 'bd7z', 'bd9z', 'bd5z', 'bd4z']
1d01.10 → ['1d01.10', '1d

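Now, we compare the dataset codes against each other: for each code, we list the indices of its three most similar dataset codes by cosine similarity.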

In [36]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity among the dataset codes themselves. Uses
# `dataset_embeddings` from the embedding cell so the cell runs on a fresh
# kernel; row/column order matches `codes`.
similarities = cosine_similarity(dataset_embeddings, dataset_embeddings)
# Zero the diagonal so a code's trivial self-match does not rank first.
np.fill_diagonal(similarities, 0)
print("Top self-similar pairs:")
# Each printed row holds the indices (into `codes`) of the three most
# similar other codes for the code at that row position.
for neighbour_idxs in np.argsort(-similarities, axis=1)[:, :3]:
    print(neighbour_idxs)


Top self-similar pairs:
[1930  564    4]
[118 102   4]
[  6 904   4]
[  4  64 979]
[7 3 2]
[118  43  20]
[  2 904 407]
[  4 118  64]
[ 659 1558   27]
[  52 1739 1539]
[116 524 119]
[1702   26  144]
[129 340  41]
[ 416 1127 1085]
[  95 1482 1648]
[1320 1711  420]
[265 675  36]
[1826  372 1054]
[ 585  978 1891]
[1751  927  391]
[1066  207 1641]
[1690   20 1219]
[ 134  714 1011]
[1672  418  284]
[ 253 1907 1013]
[ 273 1028 1070]
[1702  289  323]
[1252  977    8]
[ 342  238 1518]
[828 831  43]
[1301 1084  639]
[1515 1375  772]
[ 375 1135 1957]
[1067  981 1548]
[ 439 2058  303]
[113 258 386]
[  38   16 1107]
[ 133  549 1638]
[2042   36  615]
[38  4  7]
[ 471 1146 1089]
[  23 1672  129]
[132 913 224]
[ 53 831  29]
[ 146 1200 1885]
[1707 1010 1746]
[145 124 984]
[1114  230 1880]
[970 163 660]
[853 741 270]
[ 570 1629 1758]
[ 791 1442 1303]
[1539 1985    9]
[  43 1245  831]
[ 356 1013 1637]
[1934  968 1548]
[1642 2037   48]
[1270  589  863]
[1488  181 1554]
[1412 1072 1245]
[1833  955  378]
[ 