In [10]:
import numpy as np
from sklearn.metrics import recall_score
import torch
from sklearn.metrics.pairwise import cosine_similarity


1. We need to compute the latent space representation and the std and mean to obtain the mask to apply over the training points
2. then compute the average across modalities and across samples
3. compute the std and select the training points that fall outside of the distribution

In [11]:
data_folder_inference_data = "../shamoutlab/"
task = "phenotyping"
seed = 0

In [12]:
npz_data = np.load(f"{data_folder_inference_data}/train_points_for_context_{task}_original_splits.npz")

In [13]:
latent_data = np.load(f"{data_folder_inference_data}/latent_space_representation_phenotyping.npz")

In [14]:
print(latent_data.files)

['ehr_features', 'cxr_features']


In [17]:
ehr_features = latent_data["ehr_features"]
ehr_features.shape

(7756, 256)

In [18]:
cxr_feats = latent_data["cxr_features"]
cxr_feats.shape

(7756, 256)

In [19]:
print(npz_data.files)

['ehr_inputs', 'ehr_targets', 'cxr_inputs', 'cxr_targets', 'ehr_cxr_pairs', 'preds', 'targets']


In [20]:
ehr_inputs = npz_data["ehr_inputs"]
ehr_inputs.shape

(7756, 100, 76)

In [21]:
ehr_targets = npz_data["ehr_targets"]
ehr_targets.shape

(7756, 25)

In [22]:
(ehr_targets.sum(axis=0)/7756)*100

array([32.27178958,  8.14853017,  8.63847344, 37.91903043, 23.22073234,
       16.3744198 , 21.76379577, 10.71428571, 31.03403816, 30.94378546,
       11.81021145, 20.52604435, 40.5234657 , 44.31407942, 45.70654977,
        7.14285714, 21.09334709, 16.32284683, 12.86745745,  6.03403816,
        9.77307891, 18.78545642, 28.13305828, 22.40845797, 17.93450232])

In [23]:
(ehr_targets.sum(axis=1) > 0).shape

(7756,)

In [24]:
(ehr_targets.sum(axis=1) > 0)

array([ True,  True,  True, ...,  True,  True,  True])

In [13]:
for i in range(1,26):
    print(f"{(ehr_targets.sum(axis=1) > i).sum()} patients with {i} conditions")

7052 patients with 1 conditions
6387 patients with 2 conditions
5563 patients with 3 conditions
4647 patients with 4 conditions
3652 patients with 5 conditions
2724 patients with 6 conditions
1905 patients with 7 conditions
1253 patients with 8 conditions
746 patients with 9 conditions
413 patients with 10 conditions
235 patients with 11 conditions
114 patients with 12 conditions
36 patients with 13 conditions
13 patients with 14 conditions
3 patients with 15 conditions
0 patients with 16 conditions
0 patients with 17 conditions
0 patients with 18 conditions
0 patients with 19 conditions
0 patients with 20 conditions
0 patients with 21 conditions
0 patients with 22 conditions
0 patients with 23 conditions
0 patients with 24 conditions
0 patients with 25 conditions


In [90]:
mask = ((ehr_targets.sum(axis=1) > 15).astype(int))

In [92]:
mask.shape

(7756,)

In [102]:
np.tile(mask, (3,1)).shape

(3, 7756)

In [36]:
mask.shape

(7756,)

In [55]:
mask_2d = np.repeat(mask[:, np.newaxis], 25, axis=1)  # shape: (7756, 25)

In [57]:
mask_2d.shape

(7756, 25)

In [69]:
ehr_inputs.shape

(7756, 100, 76)

In [76]:
(ehr_inputs.T*mask).T.shape

(7756, 100, 76)

In [73]:
(ehr_targets*mask_2d).shape

(7756, 25)

In [80]:
(ehr_targets.T*mask

(25, 7756)

In [86]:
np.dot(mask, ehr_targets.T)

ValueError: shapes (7756,) and (25,7756) not aligned: 7756 (dim 0) != 25 (dim 0)

In [26]:
ehr_targets[mask].shape

(3, 25)

In [72]:
((ehr_targets.T*mask).T).shape

(7756, 25)

In [33]:
selected_labels = [0,1,2,3,4]
(ehr_targets[:, selected_labels].sum(axis=1) == 5).sum()

5

In [45]:
imbalanced_labels = [1, 2, 5, 6, 10, 11, 15, 17, 18, 19, 20, 21, 23, 24]
(ehr_targets[:, imbalanced_labels].sum(axis=1) > 5).sum()

176

In [43]:
len(imbalanced_labels)

14

In [50]:
np.argwhere(ehr_targets.sum(axis=1) == 0)

array([[  39],
       [  54],
       [ 118],
       [ 152],
       [ 170],
       [ 211],
       [ 214],
       [ 244],
       [ 269],
       [ 276],
       [ 285],
       [ 330],
       [ 359],
       [ 378],
       [ 417],
       [ 457],
       [ 465],
       [ 473],
       [ 501],
       [ 505],
       [ 565],
       [ 579],
       [ 596],
       [ 604],
       [ 645],
       [ 670],
       [ 675],
       [ 677],
       [ 678],
       [ 716],
       [ 729],
       [ 752],
       [ 762],
       [ 774],
       [ 781],
       [ 847],
       [ 870],
       [ 914],
       [ 917],
       [ 931],
       [ 953],
       [ 992],
       [1033],
       [1043],
       [1077],
       [1089],
       [1155],
       [1198],
       [1205],
       [1305],
       [1327],
       [1362],
       [1370],
       [1378],
       [1420],
       [1423],
       [1516],
       [1517],
       [1531],
       [1560],
       [1623],
       [1624],
       [1626],
       [1636],
       [1641],
       [1661],
       [16

In [52]:
ehr_targets[40]

array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0], dtype=int32)

In [7]:
for task in ["in-hospital-mortality", "phenotyping"]:
    # This will have to be per seed!!
    # latent_points = np.load(f"shamoutlab/latent_space_representation_{task}.npz")

    for seed in [0,1,2,3,4]:
        # latent_points = np.load(f"shamoutlab/latent_space_representation_{task}_seed_{seed}.npz")
        original_train_points = np.load(f"shamoutlab/train_points_for_context_{task}_seed_{seed}.npz")

        shape = original_train_points["ehr_inputs"].shape[0]

        # latent_ehr_points = latent_points["ehr_features"]
        # latent_cxr_points = latent_points["cxr_features"]

        # cosine_distances = np.diag(cosine_similarity(latent_ehr_points, latent_cxr_points))
        # mean_distance = cosine_distances.mean()
        # std_distance = cosine_distances.std()

        for n_std in [1,2]:
            # n_std = 1
            # left_tail = mean_distance - n_std*std_distance
            # context_points_idx = (cosine_distances < left_tail)
            context_points_idx = np.random.choice([True, False], size=shape, p=[0.1, 0.9])

            # print(f"Misclassified points: {context_points_idx.sum()} | Mean: {mean_distance:0.6f} | STD: {std_distance:0.6f} | Left tail: {left_tail:0.6f}")

            ehr_inputs = original_train_points["ehr_inputs"][context_points_idx]
            ehr_targets = original_train_points["ehr_targets"][context_points_idx]
            cxr_inputs = original_train_points["cxr_inputs"][context_points_idx]
            cxr_targets = original_train_points["cxr_targets"][context_points_idx]
            ehr_cxr_pairs = original_train_points["ehr_cxr_pairs"][context_points_idx]

            print(f"Shapes\n EHR inputs: {ehr_inputs.shape} | EHR targets: {ehr_targets.shape} | CXR inputs: {cxr_inputs.shape} | CXR targets: {cxr_targets.shape} | EHR-CXR pairs: {ehr_cxr_pairs.shape}")

            np.savez(f"shamoutlab/data/MedFuse/ContextPoints/Context-III/{task}/context_set_3_cos_sim_type1_{task}_{n_std}_std_seed_{seed}.npz", 
                                        ehr_inputs=ehr_inputs, 
                                        ehr_targets=ehr_targets, 
                                        cxr_inputs=cxr_inputs, 
                                        cxr_targets=cxr_targets, 
                                        ehr_cxr_pairs=ehr_cxr_pairs)

In [8]:
ehr_inputs.shape

(7841, 100, 76)