In [1]:
import os
os.chdir("/root/data/DBP_sa_bc/")
from os.path import join as pj
import argparse
import sys
sys.path.append("modules")
import utils
import numpy as np
import scib
import scib.metrics as me
import anndata as ad
import scipy
import pandas as pd
from scipy.stats import kurtosis
import re
from scipy.stats import f_oneway

In [2]:
# Run options. Every option has a default so the notebook runs without any
# command-line arguments.
parser = argparse.ArgumentParser()
parser.add_argument('--task', type=str, default='wnn_rna')
parser.add_argument('--experiment', type=str, default='e1')
parser.add_argument('--model', type=str, default='default')
parser.add_argument('--init_model', type=str, default='sp_00001899')
parser.add_argument('--method', type=str, default='liger')
# The default is given as an int to match `type=int` (it used to be the
# string '50', which only worked because argparse converts string defaults).
parser.add_argument('--K', type=int, default=50)
# parse_known_args tolerates unrecognized argv entries, which is what makes
# this usable from an interactive session.
o, _ = parser.parse_known_args()  # for python interactive
# o = parser.parse_args()

In [3]:
# Number of leading columns kept after the embedding is reordered by the
# break index (see the embedding-loading cell).
K = o.K
break_index_dir = pj("result", o.task, o.experiment, o.model, "predict", o.init_model)

# Output directory for this comparison. Methods whose name contains
# "DBP_sa_bc" get an extra experiment/init_model level.
result_parts = ["result", "comparison", o.task, o.method]
if "DBP_sa_bc" in o.method:
    result_parts += [o.experiment, o.init_model]
result_dir = pj(*result_parts)

# Strip the task-variant suffix to find the matching entry in data.toml,
# then copy every entry of that data config onto the options namespace.
cfg_task = re.sub("_atlas|_generalize|_transfer|_ref_.*", "", o.task)
data_config = utils.load_toml("configs/data.toml")[cfg_task]
vars(o).update(data_config)

# Model config: start from "default" and overlay the selected model, if any.
model_config = utils.load_toml("configs/model.toml")["default"]
if o.model != "default":
    model_config.update(utils.load_toml("configs/model.toml")[o.model])
vars(o).update(model_config)

# Only the first two values returned by gen_all_batch_ids are kept.
o.s_joint, o.combs, *_ = utils.gen_all_batch_ids(o.s_joint, o.combs)


In [4]:
# Load cell type labels
# Index of the column in label/meta.csv that is read as the cell-type label
# for each supported task. (For lung_ts, column 14 was previously tried as an
# alternative.)
LABEL_COLUMN_BY_TASK = {"wnn_rna": 10, "lung_ts": 13}

if o.task not in LABEL_COLUMN_BY_TASK:
    # Fail here with a clear message instead of leaving `labels` undefined
    # and hitting a NameError in a later cell.
    raise ValueError(
        "No label column configured for task {!r}; supported tasks: {}".format(
            o.task, sorted(LABEL_COLUMN_BY_TASK)
        )
    )

label_col = LABEL_COLUMN_BY_TASK[o.task]
labels = []
for raw_data_dir in o.raw_data_dirs:
    label = utils.load_csv(pj(raw_data_dir, "label", "meta.csv"))
    # Transpose rows to columns, pick the label column and drop its first
    # entry (presumably the header cell -- TODO confirm against meta.csv).
    labels += utils.transpose_list(label)[label_col][1:]
labels = np.array(labels)
print(np.unique(labels))

['B' 'CD4 T' 'CD8 T' 'DC' 'Mono' 'NK' 'other' 'other T']


In [5]:
def _factor_frame(embedding, cell_labels):
    """Build the factor names and the scaled per-cell factor table.

    The absolute value of every entry is taken, then each column is min-max
    scaled independently. A column whose values are all equal has zero range
    and therefore comes out as NaN (unchanged from the previous behaviour).

    Args:
        embedding: 2-D array-like of shape (cells, factors).
        cell_labels: one label per row of `embedding`; used as the index.

    Returns:
        (names, frame): `names` is ["F1", "F2", ...], one per column, and
        `frame` is a DataFrame indexed by `cell_labels` with `names` as
        columns.
    """
    names = ["F{}".format(i + 1) for i in range(embedding.shape[1])]
    frame = pd.DataFrame(abs(embedding))
    col_min = frame.min(axis=0)
    frame = (frame - col_min) / (frame.max(axis=0) - col_min)
    frame.index = cell_labels
    frame.columns = names
    return names, frame


# Load the cell-by-factor embedding `z` for the selected method.
if o.method == "DBP_sa_bc":
    # Load predicted latent variables
    o.mods = ["rna"]
    o.pred_dir = pj("result", o.task, o.experiment, o.model, "predict", o.init_model)
    pred = utils.load_predicted(o, input=True, batch_correct=True)

    w = pred["w"]["joint"]
    c = pred["z"]["joint"][:, :o.dim_c]*w
    s = pred["s"]["joint"]
    # Reorder the latent dimensions by the saved break index and keep the
    # first K of them.
    index = np.loadtxt(pj(break_index_dir, "break_index.csv"), delimiter=",", dtype=int)
    c_ord = c[:, index]
    c_bre = c_ord[:, :K]

    # Loadings: rows of A limited to the number of RNA features, with the
    # same column reordering and truncation as the embedding.
    ngenes = pred["x"]["rna"].shape[1]
    A = np.array(pred["A"]["joint"][:ngenes, :])
    A_ord = A[:, index]
    A_bre = A_ord[:, :K]

    z = c_bre
    load = A_bre.astype(np.float32)
elif o.method in ["mofa", "liger"]:
    z = utils.load_csv(pj(result_dir, "embeddings.csv"))
    # Drop the first row and first column before converting to float
    # (presumably header and row names -- TODO confirm against the CSV).
    z = np.array(z)[1:, 1:].astype(np.float32)
elif o.method in ["LDVAE", "scETM"]:
    # These files are converted as-is, with no row/column dropped.
    z = utils.load_csv(pj(result_dir, "embeddings.csv"))
    z = np.array(z).astype(np.float32)
else:
    # Previously an unhandled method left z/F/dfc undefined and the notebook
    # failed later with a NameError.
    raise ValueError("Unsupported method for embedding loading: {!r}".format(o.method))

# Factor names and the |z| min-max scaled table, indexed by cell-type label.
F, dfc = _factor_frame(z, labels)

In [None]:
# Summary statistics of the embedding `z`, collected in `results`.
results = {}

# l2/l1: per-column ratio of the L2 norm to the L1 norm, averaged over columns.
l1, l2 = (np.linalg.norm(z, ord=p, axis=0) for p in (1, 2))
l2_l1 = l2/l1
results['l2_l1'] = l2_l1.mean()
print("l2_l1: " + str(results['l2_l1']))

# k4: per-column kurtosis (scipy defaults), averaged over columns.
k4 = kurtosis(z)
results['k4'] = k4.mean()
print("k4: " + str(results['k4']))

# # HG
# hg = -np.log(abs(z)**2 + 1)
# results['hg'] = hg.mean()
# print("hg: " + str(results['hg']))

# l0: fraction of entries of z that are exactly zero (1 - nonzero fraction).
l0 = 1 - np.count_nonzero(z)/np.size(z)
results['l0'] = l0
print("L0 Norm:", l0)

# ## Significant
# mean_values = dfc.groupby(dfc.index).mean()
# proportion = mean_values.div(mean_values.sum(axis=0))

# distances_mean = []
# for f in proportion.columns:
#     vector = proportion[f]
#     mean_value = mean_values[f]
#     # half_vec = vector.max()/2
#     mean_vec = vector.mean()
#     positive_values = mean_value[vector > mean_vec]
#     non_positive_values = mean_value[vector <= mean_vec]
#     distances = np.abs(positive_values.values[:, np.newaxis] - non_positive_values.values)
#     distances_mean.append(distances.mean()) 
# Significant = np.nanmean(distances_mean)
# results['Significant'] = Significant
# print("Significant: " + str(results['Significant']))


In [None]:
# One-row table of the metrics computed above, in a fixed column order.
# ('Significant' is currently disabled together with its computation.)
metric_names = ('l2_l1', 'k4', 'l0')
df = pd.DataFrame({name: [results[name]] for name in metric_names})
print(df)

# Create the output directory if needed (existing contents are kept) and
# write the table as an Excel sheet without the row index.
utils.mkdirs(result_dir, remove_old=False)
metrics_path = pj(result_dir, "metrics_fa.xlsx")
df.to_excel(metrics_path, index=False)

### F statistic

In [6]:
# One-vs-rest one-way ANOVA: for every factor and every cell type, compare
# the factor values of cells of that type against all other cells and keep
# the F statistic. Result layout: result_dict[factor][cell_type] = F value.

# Move the cell-type labels from the index into an 'index' column. The guard
# makes the cell safe to re-run: a second reset_index() would raise because
# the 'index' column already exists.
if 'index' not in dfc.columns:
    dfc = dfc.reset_index()

cts = np.unique(labels)
# Membership masks depend only on the cell type, so build them once instead
# of once per factor.
ct_masks = {ct: (dfc['index'] == ct).to_numpy() for ct in cts}

result_dict = {}
for factor in F:
    factor_values = dfc[factor]
    ct_results = {}
    for ct, in_ct in ct_masks.items():
        f_value, _p_value = f_oneway(factor_values[in_ct], factor_values[~in_ct])
        # Only the F statistic is stored; the p-value is discarded.
        ct_results[ct] = f_value
    result_dict[factor] = ct_results

In [7]:
# Min-max scale the F statistics within each row of the table (rows are cell
# types, columns are factors), so every row spans [0, 1].
# NOTE: the following cell rebinds df1 to the unscaled table before saving.
result_df1 = pd.DataFrame(result_dict)
row_min = result_df1.min(axis=1)
v1 = result_df1.sub(row_min, axis=0)
v2 = result_df1.max(axis=1) - row_min
df1 = v1.div(v2, axis=0)

In [8]:
# Save the raw (unscaled) F statistics: rows are cell types, columns are
# factors. The row-wise min-max scaling and the "GEP{n}" row renaming that
# were tried here earlier are intentionally not applied.
result_df1 = pd.DataFrame(result_dict)
df1 = result_df1

# Make sure the output directory exists; the cell that normally creates it
# may not have been run in this session. exist_ok keeps existing files.
os.makedirs(result_dir, exist_ok=True)
# index=True keeps the cell-type labels as the first CSV column. (A bare
# df1.reset_index() used to sit here; its result was discarded.)
df1.to_csv(pj(result_dir, "oneway_results.csv"), index=True)