In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import scanpy as sc 
import pandas as pd
import numpy as np

import sys 
sys.path.append('../src')

from spaceoracle.prophets import Prophet



In [3]:
adata = sc.read_h5ad(
    '/ix/djishnu/shared/djishnu_kor11/training_data/day3_lymph_rep_1.h5ad')

pythia = Prophet(
    adata=adata,
    models_dir='/ix/djishnu/shared/djishnu_kor11/models_v2',
    annot='rctd_cluster',
    annot_labels='rctd_celltypes'
)

In [4]:
pythia.compute_betas()
pythia.beta_dict.ligands_set # These are the only active ligands 

100%|██████████| 2000/2000 [01:44<00:00, 19.09it/s]


{'Angpt2',
 'Angptl4',
 'C3',
 'C4b',
 'Ccl11',
 'Ccl2',
 'Ccl20',
 'Ccl4',
 'Ccl5',
 'Ccl6',
 'Ccl7',
 'Ccl8',
 'Csf1',
 'Cxcl1',
 'Cxcl13',
 'Cxcl9',
 'Ebi3',
 'Gdf10',
 'Gzma',
 'Il12b',
 'Il15',
 'Il2',
 'Il21',
 'Il24',
 'Il4',
 'Il7',
 'Lif',
 'Lta',
 'Osm',
 'Sema3a',
 'Sema3d',
 'Tnfsf13b',
 'Tnfsf14'}

In [267]:
target_index = list(adata.var_names).index('Ccl8') # KO of Ccl8 to predict Il2 expression
target_index

457

In [268]:
# Set up variables 

gene_mtx = pythia.adata.layers['imputed_count']
simulation_input = gene_mtx.copy()
simulation_input[:, target_index] = 0 

weighted_ligands_0 = pythia._compute_weighted_ligands(gene_mtx)
weighted_ligands_0 = weighted_ligands_0.reindex(columns=adata.var_names, fill_value=0)

weighted_ligands_1 = pythia._compute_weighted_ligands(simulation_input)
weighted_ligands_1 = weighted_ligands_1.reindex(columns=adata.var_names, fill_value=0)


In [269]:
# Plug n chug to recompute y

betadata = pythia.beta_dict.data['Il2']
betadata.head()

Unnamed: 0,beta0,beta_Cebpb,beta_Ctcf,beta_Mga,beta_Gdf10$Acvr2a,beta_Ccl8$Ccr2,beta_Ccl6$Ccr2,beta_Ccl7$Ccr2,beta_Ccl2$Ccr2,beta_Ccl4$Ccr5,...,beta_Il24#Ctcf,beta_Il21#Ctcf,beta_Ebi3#Mga,beta_Il21#Mga,beta_Il24#Mga,beta_Il12b#Mga,rctd_cluster,rctd_celltypes,x,y
AAAAAACGGTAGAT,0.008607,0.0,-0.0,0.031692,-0.0,0.0,-0.0,-0.0,0.0,0.0,...,-0.0,0.000927,-0.002002,0.0,-0.0,0.025076,0,B-cell,-4252.0,2626.2
AAAAAATCCGGCGA,0.01032,-0.010856,0.0,0.0,-0.0,0.002998,-0.000725,0.0,-0.0,-0.0,...,0.004045,0.0,-0.002125,-0.0,0.001758,-0.001702,4,Tfh,-2952.4,1975.3
AAAAACCGTCAGAC,0.010287,0.0,-0.0,0.011,-0.0,0.0,-0.0,-0.0,0.0,0.0,...,-0.0,-0.000857,-3.8e-05,-0.0,0.0,0.014484,0,B-cell,-2480.0,1516.6
AAAAACGCTCCGAC,0.009967,0.0,-0.0,0.013757,-0.0,0.0,-0.0,-0.0,0.0,0.0,...,-0.0,-0.000683,0.00026,-0.0,0.0,0.015646,0,B-cell,-2764.1,1255.7
AAAAAGCACGTGTT,0.010653,0.0,-0.0,0.01646,-0.0,0.0,-0.0,-0.0,0.0,0.0,...,-0.0,-0.000474,-0.000928,-0.0,0.0,0.017206,0,B-cell,-3925.9,1618.5


In [270]:
# test original y (without KO)

In [271]:
gene_df = pd.DataFrame(gene_mtx.copy(), columns=adata.var_names, index=adata.obs_names)

In [272]:
import copy
y = copy.deepcopy(betadata)
y.drop(columns=['rctd_cluster', 'rctd_celltypes', 'x', 'y'], inplace=True)

for l, r in betadata.lr_pairs:
    label = f'beta_{l}${r}'
    l_val = weighted_ligands_0[l]
    r_val = gene_df[r]
    y[label] = y[label] * l_val * r_val

for l, tf in betadata.tfl_pairs:
    label = f'beta_{l}#{tf}'
    tf_val = gene_df[tf]
    l_val = weighted_ligands_0[l]
    y[label] = y[label] * tf_val * l_val

for tf in betadata.tfs:
    y[f'beta_{tf}'] = y[f'beta_{tf}'] * tf_val

y

Unnamed: 0,beta0,beta_Cebpb,beta_Ctcf,beta_Mga,beta_Gdf10$Acvr2a,beta_Ccl8$Ccr2,beta_Ccl6$Ccr2,beta_Ccl7$Ccr2,beta_Ccl2$Ccr2,beta_Ccl4$Ccr5,...,beta_Il21#Cebpb,beta_Il4#Cebpb,beta_Ebi3#Cebpb,beta_Ebi3#Ctcf,beta_Il24#Ctcf,beta_Il21#Ctcf,beta_Ebi3#Mga,beta_Il21#Mga,beta_Il24#Mga,beta_Il12b#Mga
AAAAAACGGTAGAT,0.008607,0.000000,-0.000000,0.000536,-0.0,0.000000,-0.000000,-0.0,0.000000,0.0,...,0.0,0.000087,0.000000,1.382968e-03,-0.000000,0.000023,-1.223303e-05,0.0,-0.000000,0.000099
AAAAAATCCGGCGA,0.010320,-0.000232,0.000000,0.000000,-0.0,0.000090,-0.000006,0.0,-0.000000,-0.0,...,0.0,0.000000,-0.000000,5.963625e-07,0.000017,0.000000,-1.344681e-05,-0.0,0.000002,-0.000018
AAAAACCGTCAGAC,0.010287,0.000000,-0.000000,0.000193,-0.0,0.000000,-0.000000,-0.0,0.000000,0.0,...,0.0,0.000033,-0.000000,4.736279e-04,-0.000000,-0.000027,-2.741920e-07,-0.0,0.000000,0.000035
AAAAACGCTCCGAC,0.009967,0.000000,-0.000000,0.000365,-0.0,0.000000,-0.000000,-0.0,0.000000,0.0,...,0.0,0.000058,-0.000000,8.583114e-04,-0.000000,-0.000046,2.950378e-06,-0.0,0.000000,0.000067
AAAAAGCACGTGTT,0.010653,0.000000,-0.000000,0.000290,-0.0,0.000000,-0.000000,-0.0,0.000000,0.0,...,0.0,0.000066,0.000000,1.411926e-03,-0.000000,-0.000029,-1.053518e-05,-0.0,0.000000,0.000085
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTTTTCGCGGCTA,0.009434,-0.000224,0.000000,0.000000,-0.0,0.000445,-0.000019,0.0,-0.000000,-0.0,...,0.0,0.000000,-0.000000,-1.485320e-05,0.000049,0.000000,-2.214176e-05,-0.0,0.000008,-0.000012
TTTTTTGGGCTGGT,0.008219,0.000000,0.000445,-0.000000,-0.0,-0.000000,0.000017,-0.0,0.000002,-0.0,...,0.0,0.000486,0.000004,7.595042e-05,0.000028,-0.000000,2.212685e-05,-0.0,-0.000000,-0.000000
TTTTTTGTACCCAC,0.008514,0.000000,0.000678,-0.000000,-0.0,-0.000000,0.000014,-0.0,0.000002,-0.0,...,0.0,0.000406,0.000003,1.061748e-04,0.000095,-0.000000,3.598585e-05,-0.0,0.000000,-0.000000
TTTTTTGTACGATT,0.009535,0.000000,-0.000000,0.000646,-0.0,0.000000,-0.000000,0.0,0.000000,0.0,...,0.0,0.000042,0.000000,2.026158e-03,-0.000000,0.000017,-1.895038e-05,-0.0,-0.000000,0.000220


In [273]:
preds = y.sum(axis=1)
preds = pd.concat([gene_df['Il2'], preds], axis=1)
preds = preds.rename(columns={0: 'y_0'})
preds

Unnamed: 0,Il2,y_0
AAAAAACGGTAGAT,0.008844,0.008026
AAAAAATCCGGCGA,0.009123,0.010604
AAAAACCGTCAGAC,0.009442,0.010425
AAAAACGCTCCGAC,0.016603,0.011082
AAAAAGCACGTGTT,0.010626,0.010255
...,...,...
TTTTTTCGCGGCTA,0.008604,0.006488
TTTTTTGGGCTGGT,0.007852,0.007995
TTTTTTGTACCCAC,0.008473,0.008332
TTTTTTGTACGATT,0.008791,0.011268


In [274]:
gene_df[['Cebpb', 'Ctcf', 'Mga']]

Unnamed: 0,Cebpb,Ctcf,Mga
AAAAAACGGTAGAT,0.009046,0.063595,0.016904
AAAAAATCCGGCGA,0.004028,0.064521,0.021324
AAAAACCGTCAGAC,0.006471,0.039963,0.017552
AAAAACGCTCCGAC,0.008116,0.061389,0.026500
AAAAAGCACGTGTT,0.005580,0.060482,0.017608
...,...,...,...
TTTTTTCGCGGCTA,0.017977,0.066515,0.021119
TTTTTTGGGCTGGT,0.006834,0.091793,0.020035
TTTTTTGTACCCAC,0.004892,0.086803,0.029815
TTTTTTGTACGATT,0.007202,0.079428,0.023220


In [275]:
# see what the expected KO value should be 
# aka this manual computation but with gene_mtx_1

gene_df_1 = pd.DataFrame(simulation_input.copy(), columns=adata.var_names, index=adata.obs_names)

y_1 = copy.deepcopy(betadata)
y_1.drop(columns=['rctd_cluster', 'rctd_celltypes', 'x', 'y'], inplace=True)

for l, r in betadata.lr_pairs: 
    label = f'beta_{l}${r}'

    # if beta coef is 0, then use the original values
    # l_val = np.where(y_1[label]==0, weighted_ligands_0[l], weighted_ligands_1[l])
    # r_val = np.where(y_1[label]==0, gene_df[r], gene_df_1[r])
    l_val = weighted_ligands_1[l]
    r_val = gene_df_1[r]

    if not (np.all(l_val == weighted_ligands_0[l]) and np.all(r_val == gene_df[r])):
        print(label)

    y_1[label] = y_1[label] * l_val * r_val

for l, tf in betadata.tfl_pairs:
    label = f'beta_{l}#{tf}'
    # l_val = np.where(y_1[label]==0, weighted_ligands_0[l], weighted_ligands_1[l])
    # tf_val = np.where(y_1[label]==0, gene_df[tf], gene_df_1[tf])
    l_val = weighted_ligands_1[l]
    tf_val = gene_df_1[tf]

    y_1[label] = y_1[label] * tf_val * l_val

for tf in betadata.tfs:
    # tf_val = np.where(y_1[f'beta_{tf}']==0, gene_df[tf], gene_df_1[tf])
    # if not np.all(gene_df[tf] == tf_val):
    #     print(f'beta_{tf}')
    tf_val = gene_df_1[tf]
    y_1[f'beta_{tf}'] = y_1[f'beta_{tf}'] * tf_val

preds['y_1'] = y_1.sum(axis=1)
preds

beta_Ccl8$Ccr2
beta_Ccl8$Ccr5
beta_Ccl8$Ackr4
beta_Ccl8$Ackr1


Unnamed: 0,Il2,y_0,y_1
AAAAAACGGTAGAT,0.008844,0.008026,0.008026
AAAAAATCCGGCGA,0.009123,0.010604,0.010668
AAAAACCGTCAGAC,0.009442,0.010425,0.010425
AAAAACGCTCCGAC,0.016603,0.011082,0.011082
AAAAAGCACGTGTT,0.010626,0.010255,0.010255
...,...,...,...
TTTTTTCGCGGCTA,0.008604,0.006488,0.010800
TTTTTTGGGCTGGT,0.007852,0.007995,0.009635
TTTTTTGTACCCAC,0.008473,0.008332,0.009740
TTTTTTGTACGATT,0.008791,0.011268,0.011268


In [276]:
preds['y_1-y_0'] = preds['y_1'] - preds['y_0']
preds

Unnamed: 0,Il2,y_0,y_1,y_1-y_0
AAAAAACGGTAGAT,0.008844,0.008026,0.008026,0.000000
AAAAAATCCGGCGA,0.009123,0.010604,0.010668,0.000064
AAAAACCGTCAGAC,0.009442,0.010425,0.010425,0.000000
AAAAACGCTCCGAC,0.016603,0.011082,0.011082,0.000000
AAAAAGCACGTGTT,0.010626,0.010255,0.010255,0.000000
...,...,...,...,...
TTTTTTCGCGGCTA,0.008604,0.006488,0.010800,0.004311
TTTTTTGGGCTGGT,0.007852,0.007995,0.009635,0.001640
TTTTTTGTACCCAC,0.008473,0.008332,0.009740,0.001408
TTTTTTGTACGATT,0.008791,0.011268,0.011268,0.000000


In [278]:
# see if our function matches y_1

y2 = pythia.perturb(target='Ccl8', n_propagation=1)
y2 = pd.DataFrame(y2, columns=adata.var_names, index=adata.obs_names)



Interactions: 100%|██████████| 2000/2000 [03:42<00:00,  9.01it/s]
Running simulation 1/1: 100%|██████████| 11567/11567 [08:28<00:00, 22.73it/s]


In [308]:
preds['y_2'] = y2['Il2']
preds['y_1-y_0'] = preds['y_1'] - preds['y_0']
preds

Unnamed: 0,Il2,y_0,y_1,y_1-y_0,y_2
AAAAAACGGTAGAT,0.008844,0.008026,0.008026,0.000000,0.000000
AAAAAATCCGGCGA,0.009123,0.010604,0.010668,0.000064,-0.000124
AAAAACCGTCAGAC,0.009442,0.010425,0.010425,0.000000,0.000000
AAAAACGCTCCGAC,0.016603,0.011082,0.011082,0.000000,0.000000
AAAAAGCACGTGTT,0.010626,0.010255,0.010255,0.000000,0.000000
...,...,...,...,...,...
TTTTTTCGCGGCTA,0.008604,0.006488,0.010800,0.004311,0.004278
TTTTTTGGGCTGGT,0.007852,0.007995,0.009635,0.001640,0.000046
TTTTTTGTACCCAC,0.008473,0.008332,0.009740,0.001408,0.000112
TTTTTTGTACGATT,0.008791,0.011268,0.011268,0.000000,0.000000


In [279]:
betadata[[x for x in betadata.columns if 'Ccl8' in x]]

Unnamed: 0,beta_Ccl8$Ccr2,beta_Ccl8$Ccr5,beta_Ccl8$Ackr4,beta_Ccl8$Ackr1
AAAAAACGGTAGAT,0.000000,0.000000,-0.000000,0.0
AAAAAATCCGGCGA,0.002998,0.003569,-0.021198,-0.0
AAAAACCGTCAGAC,0.000000,0.000000,-0.000000,-0.0
AAAAACGCTCCGAC,0.000000,0.000000,-0.000000,-0.0
AAAAAGCACGTGTT,0.000000,0.000000,-0.000000,0.0
...,...,...,...,...
TTTTTTCGCGGCTA,0.001820,0.002936,-0.020040,-0.0
TTTTTTGGGCTGGT,-0.000000,-0.005951,-0.023696,0.0
TTTTTTGTACCCAC,-0.000000,-0.005446,-0.015015,0.0
TTTTTTGTACGATT,0.000000,0.000000,-0.000000,-0.0


In [284]:
gene_df_1[['Ccr2', 'Ccr5', 'Ackr4', 'Ackr1']]

Unnamed: 0,Ccr2,Ccr5,Ackr4,Ackr1
AAAAAACGGTAGAT,0.011819,0.002540,0.001697,0.003498
AAAAAATCCGGCGA,0.010484,0.004072,0.000128,0.000753
AAAAACCGTCAGAC,0.002632,0.001709,0.002440,0.000749
AAAAACGCTCCGAC,0.004250,0.001302,0.001811,0.001307
AAAAAGCACGTGTT,0.002533,0.000979,0.000553,0.001199
...,...,...,...,...
TTTTTTCGCGGCTA,0.004510,0.002419,0.004698,0.000772
TTTTTTGGGCTGGT,0.004622,0.002026,0.000118,0.002365
TTTTTTGTACCCAC,0.003279,0.001890,0.000556,0.000168
TTTTTTGTACGATT,0.012496,0.001979,0.002260,0.002775


In [288]:
x = betadata[[x for x in betadata.columns if 'beta_Ccl8' in x]].values * gene_df_1[['Ccr2', 'Ccr5', 'Ackr4', 'Ackr1']]
x = x.sum(axis=1)
x * (weighted_ligands_0['Ccl8'] - weighted_ligands_1['Ccl8'])

AAAAAACGGTAGAT    0.000000
AAAAAATCCGGCGA    0.000124
AAAAACCGTCAGAC    0.000000
AAAAACGCTCCGAC    0.000000
AAAAAGCACGTGTT    0.000000
                    ...   
TTTTTTCGCGGCTA   -0.004278
TTTTTTGGGCTGGT   -0.000046
TTTTTTGTACCCAC   -0.000112
TTTTTTGTACGATT    0.000000
TTTTTTTTGCTTTA    0.000000
Length: 11567, dtype: float64

In [307]:
x = np.tile((weighted_ligands_0['Ccl8'] - weighted_ligands_1['Ccl8']), (4, 1)).T *  gene_df_1[['Ccr2', 'Ccr5', 'Ackr4', 'Ackr1']].values
np.sum(x * betadata[[x for x in betadata.columns if 'beta_Ccl8' in x]].values, axis=1)

array([ 0.        ,  0.0001239 ,  0.        , ..., -0.00011201,
        0.        ,  0.        ])

In [None]:
betadata[[x for x in betadata.columns if 'Mga' in x]]

In [None]:
x = betadata[[x for x in betadata.columns if '#Mga' in x]].values * weighted_ligands_1[['Ebi3', 'Il21', 'Il24', 'Il12b']]
x['Mga'] = betadata['beta_Mga']
x = x.sum(axis=1)
x * gene_df['Mga']

AAAAAACGGTAGAT    0.000623
AAAAAATCCGGCGA   -0.000029
AAAAACCGTCAGAC    0.000228
AAAAACGCTCCGAC    0.000434
AAAAAGCACGTGTT    0.000364
                    ...   
TTTTTTCGCGGCTA   -0.000026
TTTTTTGGGCTGGT    0.000022
TTTTTTGTACCCAC    0.000036
TTTTTTGTACGATT    0.000846
TTTTTTTTGCTTTA    0.000747
Length: 11567, dtype: float64

In [168]:
gene_df['Mga'].rename('beta_Mga')

AAAAAACGGTAGAT    0.016904
AAAAAATCCGGCGA    0.021324
AAAAACCGTCAGAC    0.017552
AAAAACGCTCCGAC    0.026500
AAAAAGCACGTGTT    0.017608
                    ...   
TTTTTTCGCGGCTA    0.021119
TTTTTTGGGCTGGT    0.020035
TTTTTTGTACCCAC    0.029815
TTTTTTGTACGATT    0.023220
TTTTTTTTGCTTTA    0.017846
Name: beta_Mga, Length: 11567, dtype: float64

In [173]:
(wbetas_dict.data['Il2'].wbetas[[x for x in wbetas_dict.data['Il2'].wbetas.columns if 'Mga' in x]]).multiply(gene_df['Mga'], axis=0)


Unnamed: 0,beta_Mga
AAAAAACGGTAGAT,0.000623
AAAAAATCCGGCGA,-0.000029
AAAAACCGTCAGAC,0.000228
AAAAACGCTCCGAC,0.000434
AAAAAGCACGTGTT,0.000364
...,...
TTTTTTCGCGGCTA,-0.000026
TTTTTTGGGCTGGT,0.000022
TTTTTTGTACCCAC,0.000036
TTTTTTGTACGATT,0.000846


In [None]:
wbetas_dict = pythia._get_wbetas_dict(pythia.beta_dict, weighted_ligands_1, gene_df_1.values)

In [None]:

pythia.adata.layers['simulated_count']

In [None]:
pythia.adata.layers['imputed_count']