In [33]:
import argparse 
import os 

import scipy.stats as stats
import scipy.spatial as sp
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Change to the top-level project dir.
# Guarded so that re-running this cell is harmless: an unconditional relative chdir
# would climb three more levels on every execution and break the relative
# "resources/..." paths used by the cells below.
if not os.path.isdir("resources/h5ad_files"):
    os.chdir("../../..")

In [16]:
# Read the two PBMC batch files (A and B) from the project's resources directory
pbmc_1, pbmc_2 = (
    sc.read_h5ad(h5ad_path)
    for h5ad_path in (
        "resources/h5ad_files/pbmc_2_batch/ding_uf_non_lln_batch_10x_Chromium_v2_A.h5ad",
        "resources/h5ad_files/pbmc_2_batch/ding_uf_non_lln_batch_10x_Chromium_v2_B.h5ad",
    )
)



In [17]:
# Show how many cells of each annotated celltype are present in each batch
for header, adata in (
    ("PBMC 1 celltypes: ", pbmc_1),
    ("PBMC 2 celltypes: ", pbmc_2),
):
    print(header)
    print(adata.obs.celltype.value_counts())

PBMC 1 celltypes: 
Cytotoxic T cell               1174
CD14+ monocyte                  640
CD4+ T cell                     550
B cell                          288
Megakaryocyte                   221
Natural killer cell             166
CD16+ monocyte                  102
Dendritic cell                   55
Plasmacytoid dendritic cell      26
Name: celltype, dtype: int64
PBMC 2 celltypes: 
Cytotoxic T cell               954
CD4+ T cell                    908
B cell                         388
CD14+ monocyte                 379
Natural killer cell            263
Megakaryocyte                  212
CD16+ monocyte                  73
Dendritic cell                  33
Plasmacytoid dendritic cell     12
Name: celltype, dtype: int64


In [18]:
# Get PCA embeddings for batch 1 and batch 2 by running the same four steps on
# each batch in turn: normalise total counts per cell, log1p-transform, flag the
# top highly variable genes, then run PCA.
# NOTE: the results are stored on the objects themselves (later cells read
# .obsm["X_pca"] and .uns["pca"]), so run this cell once per freshly loaded dataset.
for adata in (pbmc_1, pbmc_2):
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)
    sc.pp.highly_variable_genes(adata, n_top_genes=2500)
    sc.pp.pca(adata)

In [19]:
# Inspect the batch 1 PCA output: embedding shape first, then the stored PCA metadata
batch_1_embedding = pbmc_1.obsm["X_pca"]
print(batch_1_embedding.shape)
print(pbmc_1.uns["pca"])

(3222, 50)
{'params': {'zero_center': True, 'use_highly_variable': True}, 'variance': array([23.87086   , 18.486961  ,  7.1285424 ,  4.834932  ,  2.7258003 ,
        2.6111705 ,  2.2125478 ,  1.8687997 ,  1.7128493 ,  1.6090814 ,
        1.5501417 ,  1.4864388 ,  1.4494972 ,  1.4055374 ,  1.3845247 ,
        1.3371836 ,  1.2714379 ,  1.2541604 ,  1.2095394 ,  1.1614062 ,
        1.1529328 ,  1.1458145 ,  1.103052  ,  1.0899106 ,  1.0518675 ,
        1.0462028 ,  1.0280734 ,  1.0244967 ,  0.9990723 ,  0.9961167 ,
        0.98874325,  0.9717327 ,  0.9645092 ,  0.9465862 ,  0.93925995,
        0.9324451 ,  0.91310483,  0.9082902 ,  0.8989654 ,  0.8944627 ,
        0.8811608 ,  0.8687664 ,  0.8645556 ,  0.86323416,  0.8451527 ,
        0.8431874 ,  0.8303624 ,  0.8193776 ,  0.81749725,  0.80744916],
      dtype=float32), 'variance_ratio': array([0.07254787, 0.05618523, 0.02166493, 0.01469423, 0.0082842 ,
       0.00793582, 0.00672433, 0.00567962, 0.00520566, 0.00489029,
       0.00471116, 

In [53]:
# Utilize the cosine distance between the average PCA embedding for celltype i and
# the average PCA embedding for celltype j, over every ordered pair of batch 1
# celltypes (self-pairs included). Only the first 20 PCs are used.
batch_1_labels = pbmc_1.obs["celltype"].to_numpy()
batch_1_celltypes = np.unique(batch_1_labels)
batch_1_pca_top_20 = pbmc_1.obsm["X_pca"][:, 0:20]

# A celltype's mean embedding does not depend on which celltype it is paired with,
# so compute it once per celltype rather than once per pair.
celltype_avgs = {
    celltype: batch_1_pca_top_20[batch_1_labels == celltype].mean(axis=0)
    for celltype in batch_1_celltypes
}

# Parallel lists: one entry per (celltype_i, celltype_j) pair, consumed by the
# results table built in the next cell.
celltype_is = []
celltype_js = []
celltype_i_avgs = []
celltype_j_avgs = []
pca_cosine_dists = []
for celltype_i in batch_1_celltypes:
    for celltype_j in batch_1_celltypes:
        pca_celltype_i_avg = celltype_avgs[celltype_i]
        pca_celltype_j_avg = celltype_avgs[celltype_j]
        celltype_is.append(celltype_i)
        celltype_js.append(celltype_j)
        celltype_i_avgs.append(pca_celltype_i_avg)
        celltype_j_avgs.append(pca_celltype_j_avg)
        pca_cosine_dist = sp.distance.cosine(
            pca_celltype_i_avg,
            pca_celltype_j_avg
        )
        pca_cosine_dists.append(pca_cosine_dist)

In [54]:
# View the PCA cosine distance results as a celltype x celltype matrix
dist_long_df = pd.DataFrame({
    "Celltype 1": celltype_is,
    "Celltype 2": celltype_js,
    "PCA cosine dist": pca_cosine_dists,
})
# Keyword arguments are required: DataFrame.pivot no longer accepts positional
# arguments in pandas >= 2.0.
dist_results_df = dist_long_df.pivot(
    index="Celltype 1", columns="Celltype 2", values="PCA cosine dist"
)
dist_results_df

Celltype 2,B cell,CD14+ monocyte,CD16+ monocyte,CD4+ T cell,Cytotoxic T cell,Dendritic cell,Megakaryocyte,Natural killer cell,Plasmacytoid dendritic cell
Celltype 1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
B cell,0.0,1.249786,1.147389,0.560559,1.07075,0.994024,1.247328,1.167417,0.449213
CD14+ monocyte,1.249786,0.0,0.532955,1.572898,1.774245,0.324714,1.096489,1.62232,1.08948
CD16+ monocyte,1.147389,0.532955,0.0,1.315778,1.509438,0.332696,1.00617,1.315532,0.835921
CD4+ T cell,0.560559,1.572898,1.315778,0.0,0.660269,1.489113,1.260662,0.906674,0.920727
Cytotoxic T cell,1.07075,1.774245,1.509438,0.660269,0.0,1.709628,1.323047,0.180266,1.246867
Dendritic cell,0.994024,0.324714,0.332696,1.489113,1.709628,0.0,0.993011,1.47959,0.52139
Megakaryocyte,1.247328,1.096489,1.00617,1.260662,1.323047,0.993011,0.0,1.245781,0.964608
Natural killer cell,1.167417,1.62232,1.315532,0.906674,0.180266,1.47959,1.245781,0.0,1.125602
Plasmacytoid dendritic cell,0.449213,1.08948,0.835921,0.920727,1.246867,0.52139,0.964608,1.125602,0.0


The fact that the cosine distance is greater than 1 indicates that the negative PCA values are having an effect in this case. 

If two vectors lie along the same axis but point in opposite directions, their cosine similarity is -1, so the cosine distance is 2 (1 - (-1)). 

We don't want this behaviour, as all we care about is whether the celltype embeddings are collinear in the given PCA dimensions, i.e. whether they participate along the same axis of biological variation (even if it occurs in the opposite direction for one of them). 

Therefore, let's use the absolute values of the PCA embeddings instead and see how this makes a difference.

In [55]:
# This time use the absolute values of the PCA components, so that the sign of a
# cell's coordinate along a PC no longer affects the cosine distance.
batch_1_labels = pbmc_1.obs["celltype"].to_numpy()
batch_1_celltypes = np.unique(batch_1_labels)
batch_1_pca_top_20 = np.absolute(pbmc_1.obsm["X_pca"][:, 0:20])

# Mean absolute embedding per celltype, computed once per celltype rather than
# once per pair (it does not depend on the pairing).
celltype_avgs = {
    celltype: batch_1_pca_top_20[batch_1_labels == celltype].mean(axis=0)
    for celltype in batch_1_celltypes
}

# Parallel lists: one entry per ordered (celltype_i, celltype_j) pair
celltype_is = []
celltype_js = []
celltype_i_avgs = []
celltype_j_avgs = []
pca_cosine_dists = []
for celltype_i in batch_1_celltypes:
    for celltype_j in batch_1_celltypes:
        pca_celltype_i_avg = celltype_avgs[celltype_i]
        pca_celltype_j_avg = celltype_avgs[celltype_j]
        celltype_is.append(celltype_i)
        celltype_js.append(celltype_j)
        celltype_i_avgs.append(pca_celltype_i_avg)
        celltype_j_avgs.append(pca_celltype_j_avg)
        pca_cosine_dist = sp.distance.cosine(
            pca_celltype_i_avg,
            pca_celltype_j_avg
        )
        pca_cosine_dists.append(pca_cosine_dist)

# View the PCA cosine distance results as a celltype x celltype matrix
dist_long_df = pd.DataFrame({
    "Celltype 1": celltype_is,
    "Celltype 2": celltype_js,
    "PCA cosine dist": pca_cosine_dists,
})
# Keyword arguments are required: DataFrame.pivot no longer accepts positional
# arguments in pandas >= 2.0.
dist_results_df = dist_long_df.pivot(
    index="Celltype 1", columns="Celltype 2", values="PCA cosine dist"
)
dist_results_df

Celltype 2,B cell,CD14+ monocyte,CD16+ monocyte,CD4+ T cell,Cytotoxic T cell,Dendritic cell,Megakaryocyte,Natural killer cell,Plasmacytoid dendritic cell
Celltype 1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
B cell,0.0,0.38439,0.278715,0.090259,0.19963,0.351186,0.405977,0.160413,0.214605
CD14+ monocyte,0.38439,0.0,0.190363,0.215269,0.092602,0.09359,0.331507,0.145709,0.516888
CD16+ monocyte,0.278715,0.190363,0.0,0.132571,0.115915,0.136446,0.404916,0.167432,0.238775
CD4+ T cell,0.090259,0.215269,0.132571,0.0,0.045646,0.240078,0.376313,0.038746,0.283736
Cytotoxic T cell,0.19963,0.092602,0.115915,0.045646,0.0,0.134949,0.413519,0.015134,0.373164
Dendritic cell,0.351186,0.09359,0.136446,0.240078,0.134949,0.0,0.447982,0.204034,0.309554
Megakaryocyte,0.405977,0.331507,0.404916,0.376313,0.413519,0.447982,0.0,0.461709,0.477603
Natural killer cell,0.160413,0.145709,0.167432,0.038746,0.015134,0.204034,0.461709,0.0,0.402783
Plasmacytoid dendritic cell,0.214605,0.516888,0.238775,0.283736,0.373164,0.309554,0.477603,0.402783,0.0


These results don't necessarily indicate better performance overall in terms of identifying celltype similarities. Perhaps preserving the information about divergent direction is valuable.

Test these results using euclidean distance instead of cosine. 

In [56]:
# This time use euclidean distance between the mean absolute PCA embeddings
# (first 20 PCs) of every ordered pair of batch 1 celltypes.
batch_1_labels = pbmc_1.obs["celltype"].to_numpy()
batch_1_celltypes = np.unique(batch_1_labels)
batch_1_pca_top_20 = np.absolute(pbmc_1.obsm["X_pca"][:, 0:20])

# Mean absolute embedding per celltype, computed once per celltype rather than
# once per pair (it does not depend on the pairing).
celltype_avgs = {
    celltype: batch_1_pca_top_20[batch_1_labels == celltype].mean(axis=0)
    for celltype in batch_1_celltypes
}

# Parallel lists: one entry per ordered (celltype_i, celltype_j) pair
celltype_is = []
celltype_js = []
celltype_i_avgs = []
celltype_j_avgs = []
pca_euclidean_dists = []
for celltype_i in batch_1_celltypes:
    for celltype_j in batch_1_celltypes:
        pca_celltype_i_avg = celltype_avgs[celltype_i]
        pca_celltype_j_avg = celltype_avgs[celltype_j]
        celltype_is.append(celltype_i)
        celltype_js.append(celltype_j)
        celltype_i_avgs.append(pca_celltype_i_avg)
        celltype_j_avgs.append(pca_celltype_j_avg)
        pca_euclidean_dist = sp.distance.euclidean(
            pca_celltype_i_avg,
            pca_celltype_j_avg
        )
        pca_euclidean_dists.append(pca_euclidean_dist)

# View the PCA euclidean distance results as a celltype x celltype matrix
dist_long_df = pd.DataFrame({
    "Celltype 1": celltype_is,
    "Celltype 2": celltype_js,
    "PCA euclidean dist": pca_euclidean_dists,
})
# Keyword arguments are required: DataFrame.pivot no longer accepts positional
# arguments in pandas >= 2.0.
dist_results_df = dist_long_df.pivot(
    index="Celltype 1", columns="Celltype 2", values="PCA euclidean dist"
)
dist_results_df

Celltype 2,B cell,CD14+ monocyte,CD16+ monocyte,CD4+ T cell,Cytotoxic T cell,Dendritic cell,Megakaryocyte,Natural killer cell,Plasmacytoid dendritic cell
Celltype 1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
B cell,0.0,7.715706,5.451938,3.283922,4.193461,6.489493,14.59274,3.815633,4.875742
CD14+ monocyte,7.715706,0.0,5.704632,6.62914,5.208852,4.135499,13.266801,5.452577,9.027034
CD16+ monocyte,5.451938,5.704632,0.0,4.127565,3.695145,4.181748,14.461837,4.180079,5.283035
CD4+ T cell,3.283922,6.62914,4.127565,0.0,1.731078,5.553131,15.086865,2.064936,5.444751
Cytotoxic T cell,4.193461,5.208852,3.695145,1.731078,0.0,4.427451,15.046068,1.215354,6.069794
Dendritic cell,6.489493,4.135499,4.181748,5.553131,4.427451,0.0,14.765397,5.006584,6.307189
Megakaryocyte,14.59274,13.266801,14.461837,15.086865,15.046068,14.765397,0.0,15.203093,15.095599
Natural killer cell,3.815633,5.452577,4.180079,2.064936,1.215354,5.006584,15.203093,0.0,6.40794
Plasmacytoid dendritic cell,4.875742,9.027034,5.283035,5.444751,6.069794,6.307189,15.095599,6.40794,0.0


Once again not making the most sense biologically. At least not as much as the first result.

Test using cosine similarity instead of cosine distance.   

In [59]:
# Define cosine similarity function
def cos_sim(vec_1, vec_2):
    """Return the cosine similarity between two NumPy vectors.

    Computed as dot(vec_1, vec_2) divided by the product of the two Euclidean
    norms. Intended for 1-D arrays of equal length.

    There is no guard against zero-length vectors: if either norm is zero the
    division is undefined.
    """
    norm_product = np.sqrt(np.sum(vec_1 * vec_1)) * np.sqrt(np.sum(vec_2 * vec_2))
    return np.dot(vec_1, vec_2) / norm_product

# This time use cosine similarity (via cos_sim) between the mean signed PCA
# embeddings (first 20 PCs) of every ordered pair of batch 1 celltypes.
batch_1_labels = pbmc_1.obs["celltype"].to_numpy()
batch_1_celltypes = np.unique(batch_1_labels)
batch_1_pca_top_20 = pbmc_1.obsm["X_pca"][:, 0:20]

# Mean embedding per celltype, computed once per celltype rather than once per
# pair (it does not depend on the pairing).
celltype_avgs = {
    celltype: batch_1_pca_top_20[batch_1_labels == celltype].mean(axis=0)
    for celltype in batch_1_celltypes
}

# Parallel lists: one entry per ordered (celltype_i, celltype_j) pair
celltype_is = []
celltype_js = []
celltype_i_avgs = []
celltype_j_avgs = []
pca_cosine_sims = []
for celltype_i in batch_1_celltypes:
    for celltype_j in batch_1_celltypes:
        pca_celltype_i_avg = celltype_avgs[celltype_i]
        pca_celltype_j_avg = celltype_avgs[celltype_j]
        celltype_is.append(celltype_i)
        celltype_js.append(celltype_j)
        celltype_i_avgs.append(pca_celltype_i_avg)
        celltype_j_avgs.append(pca_celltype_j_avg)
        pca_cosine_sim = cos_sim(
            pca_celltype_i_avg,
            pca_celltype_j_avg
        )
        pca_cosine_sims.append(pca_cosine_sim)

# View the PCA cosine similarity results as a celltype x celltype matrix
dist_long_df = pd.DataFrame({
    "Celltype 1": celltype_is,
    "Celltype 2": celltype_js,
    "PCA cosine sim": pca_cosine_sims,
})
# Keyword arguments are required: DataFrame.pivot no longer accepts positional
# arguments in pandas >= 2.0.
dist_results_df = dist_long_df.pivot(
    index="Celltype 1", columns="Celltype 2", values="PCA cosine sim"
)
dist_results_df

Celltype 2,B cell,CD14+ monocyte,CD16+ monocyte,CD4+ T cell,Cytotoxic T cell,Dendritic cell,Megakaryocyte,Natural killer cell,Plasmacytoid dendritic cell
Celltype 1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
B cell,1.0,-0.249786,-0.147389,0.439442,-0.07075,0.005976,-0.247328,-0.167417,0.550787
CD14+ monocyte,-0.249786,1.0,0.467045,-0.572898,-0.774245,0.675286,-0.096489,-0.62232,-0.08948
CD16+ monocyte,-0.147389,0.467045,1.0,-0.315778,-0.509439,0.667304,-0.00617,-0.315532,0.164079
CD4+ T cell,0.439442,-0.572898,-0.315778,1.0,0.339731,-0.489113,-0.260662,0.093326,0.079273
Cytotoxic T cell,-0.07075,-0.774245,-0.509439,0.339731,1.0,-0.709628,-0.323047,0.819734,-0.246867
Dendritic cell,0.005976,0.675286,0.667304,-0.489113,-0.709628,1.0,0.006989,-0.47959,0.47861
Megakaryocyte,-0.247328,-0.096489,-0.00617,-0.260662,-0.323047,0.006989,1.0,-0.245781,0.035392
Natural killer cell,-0.167417,-0.62232,-0.315532,0.093326,0.819734,-0.47959,-0.245781,1.0,-0.125602
Plasmacytoid dendritic cell,0.550787,-0.08948,0.164079,0.079273,-0.246867,0.47861,0.035392,-0.125602,1.0


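This isn't working that well either - the most sensible approach for now is likely to take the median and use euclidean distance (note that the cell below still summarises each celltype with the mean, not the median). Each PC's contribution to the distance should be weighted by the fraction of variance that PC explains. 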

In [63]:
# This time use euclidean distance with each PC weighted by its explained-variance
# ratio, between the mean absolute PCA embeddings (first 20 PCs) of every ordered
# pair of batch 1 celltypes.
# NOTE(review): the preceding narrative mentions taking the median, but the
# per-celltype summary computed here is the mean - confirm which is intended.
batch_1_labels = pbmc_1.obs["celltype"].to_numpy()
batch_1_celltypes = np.unique(batch_1_labels)
batch_1_pca_top_20 = np.absolute(pbmc_1.obsm["X_pca"][:, 0:20])
top_20_pc_weights = pbmc_1.uns["pca"]["variance_ratio"][0:20]

# Mean absolute embedding per celltype, computed once per celltype rather than
# once per pair (it does not depend on the pairing).
celltype_avgs = {
    celltype: batch_1_pca_top_20[batch_1_labels == celltype].mean(axis=0)
    for celltype in batch_1_celltypes
}

# Parallel lists: one entry per ordered (celltype_i, celltype_j) pair
celltype_is = []
celltype_js = []
celltype_i_avgs = []
celltype_j_avgs = []
pca_euclidean_dists = []
for celltype_i in batch_1_celltypes:
    for celltype_j in batch_1_celltypes:
        pca_celltype_i_avg = celltype_avgs[celltype_i]
        pca_celltype_j_avg = celltype_avgs[celltype_j]
        celltype_is.append(celltype_i)
        celltype_js.append(celltype_j)
        celltype_i_avgs.append(pca_celltype_i_avg)
        celltype_j_avgs.append(pca_celltype_j_avg)
        pca_euclidean_dist = sp.distance.euclidean(
            pca_celltype_i_avg,
            pca_celltype_j_avg,
            w = top_20_pc_weights
        )
        pca_euclidean_dists.append(pca_euclidean_dist)

# View the weighted PCA euclidean distance results as a celltype x celltype matrix
dist_long_df = pd.DataFrame({
    "Celltype 1": celltype_is,
    "Celltype 2": celltype_js,
    "PCA euclidean dist": pca_euclidean_dists,
})
# Keyword arguments are required: DataFrame.pivot no longer accepts positional
# arguments in pandas >= 2.0.
dist_results_df = dist_long_df.pivot(
    index="Celltype 1", columns="Celltype 2", values="PCA euclidean dist"
)
dist_results_df

Celltype 2,B cell,CD14+ monocyte,CD16+ monocyte,CD4+ T cell,Cytotoxic T cell,Dendritic cell,Megakaryocyte,Natural killer cell,Plasmacytoid dendritic cell
Celltype 1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
B cell,0.0,1.866053,0.876003,0.498025,0.777741,1.330557,3.150909,0.756018,0.648659
CD14+ monocyte,1.866053,0.0,1.284994,1.708088,1.325414,0.829119,2.929336,1.313435,2.16222
CD16+ monocyte,0.876003,1.284994,0.0,0.601172,0.373774,0.622028,3.206627,0.462089,0.97512
CD4+ T cell,0.498025,1.708088,0.601172,0.0,0.432074,1.120367,3.286296,0.491913,0.71877
Cytotoxic T cell,0.777741,1.325414,0.373774,0.432074,0.0,0.746866,3.285017,0.166474,1.029091
Dendritic cell,1.330557,0.829119,0.622028,1.120367,0.746866,0.0,3.221508,0.777415,1.47379
Megakaryocyte,3.150909,2.929336,3.206627,3.286296,3.285017,3.221508,0.0,3.308429,3.37112
Natural killer cell,0.756018,1.313435,0.462089,0.491913,0.166474,0.777415,3.308429,0.0,1.085817
Plasmacytoid dendritic cell,0.648659,2.16222,0.97512,0.71877,1.029091,1.47379,3.37112,1.085817,0.0


For completeness, test the results of the weighted cosine distance 

In [64]:
# Utilize the cosine distance, with each PC weighted by its explained-variance
# ratio, between the average PCA embedding for celltype i and the average PCA
# embedding for celltype j (first 20 PCs, signed values, batch 1).
batch_1_labels = pbmc_1.obs["celltype"].to_numpy()
batch_1_celltypes = np.unique(batch_1_labels)
batch_1_pca_top_20 = pbmc_1.obsm["X_pca"][:, 0:20]
top_20_pc_weights = pbmc_1.uns["pca"]["variance_ratio"][0:20]

# Mean embedding per celltype, computed once per celltype rather than once per
# pair (it does not depend on the pairing).
celltype_avgs = {
    celltype: batch_1_pca_top_20[batch_1_labels == celltype].mean(axis=0)
    for celltype in batch_1_celltypes
}

# Parallel lists: one entry per ordered (celltype_i, celltype_j) pair
celltype_is = []
celltype_js = []
celltype_i_avgs = []
celltype_j_avgs = []
pca_cosine_dists = []
for celltype_i in batch_1_celltypes:
    for celltype_j in batch_1_celltypes:
        pca_celltype_i_avg = celltype_avgs[celltype_i]
        pca_celltype_j_avg = celltype_avgs[celltype_j]
        celltype_is.append(celltype_i)
        celltype_js.append(celltype_j)
        celltype_i_avgs.append(pca_celltype_i_avg)
        celltype_j_avgs.append(pca_celltype_j_avg)
        pca_cosine_dist = sp.distance.cosine(
            pca_celltype_i_avg,
            pca_celltype_j_avg,
            w = top_20_pc_weights
        )
        pca_cosine_dists.append(pca_cosine_dist)

# View the weighted PCA cosine distance results as a celltype x celltype matrix
dist_long_df = pd.DataFrame({
    "Celltype 1": celltype_is,
    "Celltype 2": celltype_js,
    "PCA cosine dist": pca_cosine_dists,
})
# Keyword arguments are required: DataFrame.pivot no longer accepts positional
# arguments in pandas >= 2.0.
dist_results_df = dist_long_df.pivot(
    index="Celltype 1", columns="Celltype 2", values="PCA cosine dist"
)
dist_results_df

Celltype 2,B cell,CD14+ monocyte,CD16+ monocyte,CD4+ T cell,Cytotoxic T cell,Dendritic cell,Megakaryocyte,Natural killer cell,Plasmacytoid dendritic cell
Celltype 1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
B cell,0.0,1.361822,1.358707,0.293029,0.714012,1.298105,1.432737,0.83666,0.508297
CD14+ monocyte,1.361822,0.0,0.14687,1.781379,1.884616,0.088274,1.02952,1.840903,1.020896
CD16+ monocyte,1.358707,0.14687,0.0,1.754154,1.836614,0.059529,0.895897,1.756883,0.759778
CD4+ T cell,0.293029,1.781379,1.754154,0.0,0.24559,1.795752,1.387004,0.377928,0.951895
Cytotoxic T cell,0.714012,1.884616,1.836614,0.24559,0.0,1.904521,1.35603,0.034313,1.232098
Dendritic cell,1.298105,0.088274,0.059529,1.795752,1.904521,0.0,0.884158,1.830712,0.682478
Megakaryocyte,1.432737,1.02952,0.895897,1.387004,1.35603,0.884158,0.0,1.306662,0.850062
Natural killer cell,0.83666,1.840903,1.756883,0.377928,0.034313,1.830712,1.306662,0.0,1.216049
Plasmacytoid dendritic cell,0.508297,1.020896,0.759778,0.951895,1.232098,0.682478,0.850062,1.216049,0.0


These results actually look the most reasonable out of all of them - especially for the monocytes. Therefore, let's
use this cosine distance reweighted metric. 

Biologically, this is sensible in that orthogonal directions in the PCA space indicate participation/non-participation in a biological process (which is what each principal component is assumed to represent), whereas opposite directionality may indicate the upregulation or downregulation of a given process.

Further and more thorough investigation is likely needed to build upon this significantly, but cosine distance with PCA variance reweighting seems a reasonable start.