### Single Word2Vec

- A single Word2Vec embedding space shared by diagnoses, procedures, and drugs

In [2]:
# imports
import pickle
from gensim.models import Word2Vec

In [None]:
# Load the preprocessed visit data.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load this file if it comes from a trusted source.
X_filePath = '../../../data/preprocessed_X_visit_over3.pkl'
with open(X_filePath, 'rb') as f:
    data = pickle.load(f)

# Prepare the Word2Vec input: one "sentence" per inner entry of `data`, formed by
# concatenating that entry's 'diagnoses', 'procedures', and 'drugs' values in
# that order.
# `data` is a two-level mapping (presumably subject_id -> hadm_id -> feature
# dict; verify against the preprocessing step).
corpus = [
    features['diagnoses'] + features['procedures'] + features['drugs']
    for visits in data.values()
    for features in visits.values()
]

# Check the first few sequences
print("Sample sequences for Word2Vec:")

# Slice rather than index so a corpus with fewer than five sequences does not
# raise IndexError.
for sequence in corpus[:5]:
    print(sequence)


Sample sequences for Word2Vec:
['d_572', 'd_789', 'd_571', 'd_070', 'd_496', 'd_296', 'd_309', 'pcs_549', 'p_NACLFLUSH', 'p_SPIR25', 'p_RALT400', 'p_ALBU17H', 'p_FURO20', 'p_MICROK10', 'p_NICO14P', 'p_HEPA5I', 'p_SPIR25', 'p_IPRA2H', 'p_INFL0.5LF', 'p_TRUV200/300', 'p_FURO40', 'p_APAP500']
['d_070', 'd_789', 'd_287', 'd_276', 'd_496', 'd_571', 'd_305', 'pcs_549', 'p_BACTDS', 'p_TIOT', 'p_RIFA550', 'p_APAP500', 'p_ALBU25', 'p_NACLFLUSH', 'p_RALT400', 'p_HEPA5I', 'p_TRUV200/300', 'p_CAL1250', 'p_FURO40', 'p_INFL0.5LF', 'p_FURO20', 'p_ALBU17H', 'p_LACT30L']
['d_458', 'd_070', 'd_799', 'd_276', 'd_789', 'd_276', 'd_305', 'd_496', 'd_296', 'd_571', 'p_TRAM50', 'p_TIOT', 'p_TRUV200/300', 'p_ALBU25', 'p_TRAM50', 'p_FLUT110HFA', 'p_RIFA550', 'p_SENN187', 'p_INFL0.5LF', 'p_BISA5', 'p_ALBU3H', 'p_BISA10R', 'p_CALC500', 'p_ALBU17H', 'p_RALT400', 'p_DOCU100L', 'p_DOCU100L', 'p_SENN187', 'p_NACLFLUSH', 'p_BISA5', 'p_BISA10R', 'p_LACT30L', 'p_ALBU25', 'p_HEPA5I']
['d_571', 'd_486', 'd_789', 'd_572',

In [None]:
# Build the Word2Vec model

# Hyperparameters gathered in one place so they are easy to review and tune.
w2v_params = dict(
    vector_size=100,  # standard baseline size = 100
    window=5,         # just set to 5 for now
    min_count=1,      # keep tokens seen only once; rare codes may still matter
    workers=4,
    sg=1,             # Skip-gram model (use sg=0 for CBOW)
)

word2vec_model = Word2Vec(sentences=corpus, **w2v_params)

# Persist the trained model to disk.
word2vec_model.save("single_w2v_embeddings.model")


In [None]:
# Sanity-check the Word2Vec model by reloading it from the saved file

word2vec_model = Word2Vec.load("single_w2v_embeddings.model")
keyed_vectors = word2vec_model.wv  # token -> vector lookup, aliased once

print("Vocabulary size:", len(keyed_vectors.index_to_key))

# Look up the vector learned for a single token.
diagnosis_embedding = keyed_vectors['d_572']
print("Embedding for d_572:", diagnosis_embedding)

# Nearest neighbours of the same token.
# NOTE(review): the training sequences mix diagnosis, procedure, and drug
# tokens, so the neighbours may not all be diagnoses — confirm before relying
# on the label below.
similar_diagnoses = keyed_vectors.most_similar('d_572')
print("Most similar diagnoses to d_572:", similar_diagnoses)

Vocabulary size: 5145
Embedding for d_572: [ 0.43213654 -0.07269233 -0.35849983  0.1171606  -0.10541403  0.42179534
 -0.3777496   0.00635634 -0.0681117   0.54373664 -0.16981858 -0.04998092
 -0.8013413   0.33431193  0.13255507 -0.07952484 -0.02630047 -0.67499405
 -0.1532572   0.46115348 -0.1299382   0.31279632 -0.20523222 -0.01393937
 -0.31712964  0.1607313   0.09532557 -0.10853101  0.674924   -0.6551267
  0.13211031 -0.36103818  0.32552233  0.00238413 -0.05489227 -0.40951723
 -0.10167913 -0.12177955 -0.11126038 -0.44760582  0.41310793  0.41556662
  0.13027777  0.29256997 -0.00574209 -0.2293887  -0.11028181 -0.632644
  0.12453412 -0.05511767 -0.30835095 -0.18154167 -0.0109947  -0.5619737
  0.02250702 -0.4615514  -0.2117955  -0.39175692 -0.38175717 -0.15047053
  0.5094287   0.3981909  -0.9996721  -1.0411705   0.10332346  0.32288644
 -0.22617443 -0.09451224 -0.1693531  -0.38064343  0.14339     0.10654876
 -0.6787372   0.15967907  0.2724633  -0.05568499 -0.88747454  0.07161202
 -0.32878315