### Multiple Word2Vec

- 3 separate w2v embeddings for diagnoses, procedures, and drugs

In [1]:
# imports
import pickle
from gensim.models import Word2Vec

In [4]:
# Load the pickled object stored at X_filePath into `data`.
# NOTE: pickle.load can execute arbitrary code while deserializing, so this
# file should only ever come from a trusted source.
X_filePath = '../../../../data/preprocessed_X_visit_over3.pkl'
with open(X_filePath, mode='rb') as pickle_file:
    data = pickle.load(pickle_file)

In [5]:
# Word2Vec input for diagnoses: one "sentence" per visit, taken from that
# visit's 'diagnoses' entry. The outer and inner dict keys are not needed,
# so only the values are iterated.
corpusDiagnosis = [
    visit_features['diagnoses']
    for patient_visits in data.values()
    for visit_features in patient_visits.values()
]

In [6]:
# Word2Vec input for procedures: one "sentence" per visit, taken from that
# visit's 'procedures' entry. The outer and inner dict keys are not needed,
# so only the values are iterated.
corpusProcedures = [
    visit_features['procedures']
    for patient_visits in data.values()
    for visit_features in patient_visits.values()
]

In [7]:
# Word2Vec input for drugs: one "sentence" per visit, taken from that
# visit's 'drugs' entry. The outer and inner dict keys are not needed,
# so only the values are iterated.
corpusDrugs = [
    visit_features['drugs']
    for patient_visits in data.values()
    for visit_features in patient_visits.values()
]

In [None]:
# Build the word2vec models: one per code type (diagnoses, procedures, drugs).
#
# Previously this cell trained and saved only the procedures corpus, so a
# fresh "Restart & Run All" produced just one of the three embeddings this
# notebook is meant to create. All three corpora are now trained in one pass.
#
# NOTE(review): the diagnosis and drug file names below follow the existing
# "procedure_w2v_embeddings.model" naming pattern -- confirm they match what
# downstream loaders expect.
corpora_by_name = {
    "diagnosis": corpusDiagnosis,
    "procedure": corpusProcedures,
    "drug": corpusDrugs,
}

w2v_models = {}
for corpus_name, corpus in corpora_by_name.items():
    model = Word2Vec(
        sentences=corpus,
        vector_size=100,  # standard baseline size = 100
        window=5,  # simply set to 5 for now
        min_count=1,  # keep tokens seen only once; they may still be important
        workers=4,
        sg=1,  # Skip-gram model (use sg=0 for CBOW)
    )
    model.save(f"{corpus_name}_w2v_embeddings.model")
    w2v_models[corpus_name] = model

# Keep the original variable bound to the procedures model, as before.
word2vec_model = w2v_models["procedure"]


In [None]:
# Sanity-check a saved Word2Vec model by inspecting the token 'd_572'.
# NOTE(review): this loads "single_w2v_embeddings.model", not the
# "procedure_w2v_embeddings.model" file written by the training cell in this
# notebook -- confirm that the single-model file is the intended target.
word2vec_model = Word2Vec.load("single_w2v_embeddings.model")
keyed_vectors = word2vec_model.wv

print("Vocabulary size:", len(keyed_vectors.index_to_key))

# Raw embedding vector for the token.
diagnosis_embedding = keyed_vectors['d_572']
print("Embedding for d_572:", diagnosis_embedding)

# Nearest neighbours of the token in the embedding space.
similar_diagnoses = keyed_vectors.most_similar('d_572')
print("Most similar diagnoses to d_572:", similar_diagnoses)

Vocabulary size: 5145
Embedding for d_572: [ 0.43213654 -0.07269233 -0.35849983  0.1171606  -0.10541403  0.42179534
 -0.3777496   0.00635634 -0.0681117   0.54373664 -0.16981858 -0.04998092
 -0.8013413   0.33431193  0.13255507 -0.07952484 -0.02630047 -0.67499405
 -0.1532572   0.46115348 -0.1299382   0.31279632 -0.20523222 -0.01393937
 -0.31712964  0.1607313   0.09532557 -0.10853101  0.674924   -0.6551267
  0.13211031 -0.36103818  0.32552233  0.00238413 -0.05489227 -0.40951723
 -0.10167913 -0.12177955 -0.11126038 -0.44760582  0.41310793  0.41556662
  0.13027777  0.29256997 -0.00574209 -0.2293887  -0.11028181 -0.632644
  0.12453412 -0.05511767 -0.30835095 -0.18154167 -0.0109947  -0.5619737
  0.02250702 -0.4615514  -0.2117955  -0.39175692 -0.38175717 -0.15047053
  0.5094287   0.3981909  -0.9996721  -1.0411705   0.10332346  0.32288644
 -0.22617443 -0.09451224 -0.1693531  -0.38064343  0.14339     0.10654876
 -0.6787372   0.15967907  0.2724633  -0.05568499 -0.88747454  0.07161202
 -0.32878315