In [1]:
# Imports
import pickle
import torch
from transformers import BertTokenizer, BertModel

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Load data
X_filePath = '../../../../../data/processedData/X_Admin_Targets_combinedBERT.pkl'
with open(X_filePath, 'rb') as f:
    data = pickle.load(f)

In [5]:
procedure_sequences = []
for subject_id, nested_dict in data.items():
    for hadm_id, record in nested_dict.items():
        if 'procedures' in record:
            procedure_sequences.append(record['procedures'])

print(procedure_sequences)

[['pcs_549'], ['pcs_549'], [], ['pcs_549', 'pcs_549'], ['pcs_549', 'pcs_549', 'pcs_549'], ['pcs_456', 'pcs_545'], ['pcs_893'], [], [], [], ['pcs_006', 'pcs_360', 'pcs_004', 'pcs_004', 'pcs_372', 'pcs_885', 'pcs_004'], [], ['pcs_451'], [], [], ['pcs_020'], ['pcs_866', 'pcs_867'], ['pcs_756'], ['pcs_735'], ['pcs_457'], [], [], ['pcs_577', 'pcs_401', 'pcs_685', 'pcs_674', 'pcs_655', 'pcs_703', 'pcs_565', 'pcs_174'], ['pcs_389', 'pcs_991', 'pcs_549'], [], ['pcs_542', 'pcs_549', 'pcs_549', 'pcs_389'], ['pcs_598', 'pcs_877', 'pcs_563', 'pcs_563'], [], [], [], [], [], [], ['pcs_395', 'pcs_005', 'pcs_004', 'pcs_004'], [], [], [], [], ['pcs_422'], [], [], [], [], [], [], [], [], [], ['pcs_836', 'pcs_044'], [], ['pcs_360', 'pcs_006', 'pcs_004', 'pcs_372', 'pcs_885', 'pcs_004'], ['pcs_006', 'pcs_360', 'pcs_372', 'pcs_004', 'pcs_004', 'pcs_885'], ['pcs_361', 'pcs_361'], ['pcs_372', 'pcs_885'], [], [], [], ['pcs_885', 'pcs_833', 'pcs_885', 'pcs_885'], ['pcs_841', 'pcs_833', 'pcs_776', 'pcs_389'], [

In [6]:
from gensim.models import Word2Vec

# Train Word2Vec model
model = Word2Vec(sentences=procedure_sequences, vector_size=100, window=5, min_count=1, sg=1, epochs=50)

# Save or load the model
model.save("procedure_word2vec.model")
# To reload the model later
# model = Word2Vec.load("procedure_word2vec.model")



In [7]:
# Get the embedding for a specific procedure
pcs_549_embedding = model.wv['pcs_549']

print(pcs_549_embedding)  # Output: a vector of shape (100,) or the specified vector_size


[ 0.12507811 -0.01357266  0.01869522  0.14039353  0.63252723  0.01502571
  0.26399842  0.38741416 -0.02682849  0.40282336  0.3799061  -0.02194242
 -0.4123789   0.12403336  0.16870461 -0.14907822  0.04965076 -0.26423773
 -0.09401745 -0.5258702   0.10427132 -0.13181019  0.51169556  0.01599191
 -0.10147692  0.08208602 -0.39787456  0.11947596 -0.51482874  0.10917711
  0.8871573   0.03815945  0.30097166 -0.19585419  0.22125186  0.24824399
  0.10886615 -0.04488536 -0.1663236   0.0920204   0.24709798 -0.4883173
 -0.4452676   0.29190633  0.0133641  -0.08257803  0.07568514  0.0600797
  0.15944129 -0.08451862  0.3206894  -0.06740846 -0.13321002 -0.06490457
  0.07681157 -0.06611727 -0.35810933 -0.30497926 -0.21817605 -0.09164032
  0.30058965  0.20289567  0.05278697  0.09023628 -0.33829138  0.20379364
  0.2311802   0.5479083  -0.5422826   0.69728976 -0.17970961  0.46424016
  0.0941702   0.37453535  0.23645493  0.30077273 -0.19465448 -0.21605884
 -0.13511819 -0.11034152 -0.1253325  -0.11218584 -0.2

In [8]:
for subject_id, nested_dict in data.items():
    for hadm_id, record in nested_dict.items():
        if 'procedures' in record:
            # Replace procedures with their embeddings
            record['procedures_embedding'] = [model.wv[proc] for proc in record['procedures']]


In [9]:
def explore_nested_dict(data, top_key):
    if top_key in data:
        nested_dict = data[top_key]
        print(f"Top-level Key: {top_key}")
        print(f"Keys in nested dictionary: {nested_dict.keys()}")
        
        # Inspect one of the nested keys in detail
        for nested_key in nested_dict:
            print(f"\nNested Key: {nested_key}")
            print(f"Type of value: {type(nested_dict[nested_key])}")
            print(f"Value sample: {nested_dict[nested_key].keys()}")
    else:
        print(f"Key {top_key} not found in the data.")


top_key_to_inspect = 10000032

explore_nested_dict(data, top_key_to_inspect)

Top-level Key: 10000032
Keys in nested dictionary: dict_keys([22595853, 22841357, 29079034])

Nested Key: 22595853
Type of value: <class 'dict'>
Value sample: dict_keys(['diagnoses', 'procedures', 'drugs', 'admitdate', 'timespent', 'admission_type', 'admission_location', 'insurance', 'marital_status', 'hospital_expire_flag', 'days_to_next_admission', 'readmitted_30', 'readmitted_60', 'combined_embedding', 'procedures_embedding'])

Nested Key: 22841357
Type of value: <class 'dict'>
Value sample: dict_keys(['diagnoses', 'procedures', 'drugs', 'admitdate', 'timespent', 'admission_type', 'admission_location', 'insurance', 'marital_status', 'hospital_expire_flag', 'days_to_next_admission', 'readmitted_30', 'readmitted_60', 'combined_embedding', 'procedures_embedding'])

Nested Key: 29079034
Type of value: <class 'dict'>
Value sample: dict_keys(['diagnoses', 'procedures', 'drugs', 'admitdate', 'timespent', 'admission_type', 'admission_location', 'insurance', 'marital_status', 'hospital_expir

In [10]:
# Save the updated dictionary with embeddings
output_file_path = '../../../../../data/processedData/X_RNN.pkl'
with open(output_file_path, 'wb') as f:
    pickle.dump(data, f)

print(f"Updated dictionary saved to {output_file_path}")

Updated dictionary saved to ../../../../../data/processedData/X_RNN.pkl
