---

In [None]:
# Move the working directory four levels up; the relative
# `pipeline_activation/...` paths used below are resolved from there.
# NOTE(review): presumably this is the repository root — confirm.
%cd ../../../..

In [2]:
import numpy as np

In [3]:
from sklearn.preprocessing import minmax_scale, robust_scale, quantile_transform

In [4]:
import pandas as pd

In [5]:
import xarray

In [6]:
from bokeh.io import output_notebook, export_svg, export_png
from bokeh.resources import INLINE
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource, HoverTool

In [7]:
# Direct Bokeh output to the notebook, using the `INLINE` resources object
# imported from `bokeh.resources`.
output_notebook(resources=INLINE)

---

Load data:

In [8]:
# Model output; `E` is exported further below as the "MAP" sheet.
map_ds = xarray.load_dataset("pipeline_activation/model-output/global_map.h5")

# Put the derived coefficients (`Ederiv`) on the same `coef` axis as the
# primary ones so that both can be selected from a single array.
E = xarray.concat(
    [
        map_ds['E'],
        map_ds['Ederiv'].rename({"deriv_coef": "coef"}),
    ],
    dim='coef',
)

# Boolean mask: both the intercept and the OOPS abundance are at or above
# their respective cutoffs.
intercept_ok = E.sel(coef='Intercept') >= -0.75
abundance_ok = E.sel(coef='OOPS_abundance') >= -0.25
w_expr = intercept_ok & abundance_ok

# Sanity check: at least 60% of the mask entries are True.
assert w_expr.mean().item() >= 0.6

In [9]:
# Per-gene HDIs; opened lazily, the arrays used here are loaded explicitly.
hdi_ds = xarray.open_dataset("pipeline_activation/model-output/pergene_hdis.h5")

# The 90% HDIs of the primary coefficients are mandatory.
assert 'H90' in hdi_ds.variables

# Start from the primary coefficients and, when available, append the HDIs
# of the derived coefficients along the shared `coef` axis.
coefhdi = hdi_ds['H90'].load()
if 'Hd90' in hdi_ds.variables:
    coefhdi = xarray.concat(
        [coefhdi, hdi_ds['Hd90'].load().rename({"deriv_coef": "coef"})],
        "coef",
    )
else:
    print("Missing derived coeffs")

Load metadata:

In [10]:
# Gene metadata table; the first column of the TSV becomes the index
# (used below to look genes up by the `gene` labels of `E`).
hgnc_metadata = pd.read_csv(
    "pipeline_activation/raw/hgnc_metadata.tsv.gz",
    index_col=0,
    sep="\t",
)

---

Prepare plotting data:

For some coefficients we use a clipped value: 0 if the 90% HDI overlaps 0, otherwise the HDI bound closest to 0.

In [11]:
# One row per gene, one column per coefficient (see the key listing below).
ds = ColumnDataSource(data=E.to_pandas())

In [12]:
# Gene symbol for every gene in `E`, in `E`'s gene order (row and column are
# selected in a single `.loc` call instead of chained indexing).
ds.data['symbols'] = hgnc_metadata.loc[E.coords['gene'].values, 'symbols'].values

In [13]:
# Marker size derived from the OOPS abundance, mapped onto [2, 15].
# Step 1: centre the abundances (no scaling).
# NOTE(review): scikit-learn documents `quantile_range` in percent (0-100);
# with `with_scaling=False` it should not influence the result — confirm.
abundance_centered = robust_scale(
    ds.data['OOPS_abundance'],
    quantile_range=(0.2, 0.8),
    with_centering=True,
    with_scaling=False,
)

# Step 2: quantile-transform the values (expects a 2-D column vector).
abundance_quantiles = quantile_transform(
    abundance_centered.reshape(-1, 1)
).squeeze()

# Step 3: rescale to [0, 1], cube to emphasise the top of the range, and
# stretch to the final marker-size range.
ds.data['size'] = (15 - 2) * (minmax_scale(abundance_quantiles) ** 3.) + 2

In [14]:
# List the columns currently held by the Bokeh data source.
ds.data.keys()

dict_keys(['gene', 'Intercept', 'libtype[S.OOPS]', 'act[S.no]', 'cl[S.no]', 'libtype[S.OOPS]:act[S.no]', 'libtype[S.OOPS]:cl[S.no]', 'act[S.no]:cl[S.no]', 'libtype[S.OOPS]:act[S.no]:cl[S.no]', 'OOPS_abundance', 'activation', 'activation_FP', 'RBPness', 'lt_deltaRBPness', 'act_deltaRBPness', 'act_deltaCL', 'symbols', 'size'])

Add the clipped (90% HDI) coefficients as columns of the data source:

In [15]:
# HDIs of the primary and derived coefficients on one `coef` axis.
# Reuse `coefhdi`, which was assembled from the same `H90`/`Hd90` arrays when
# the HDI file was loaded: rebuilding the concat here raised a KeyError
# whenever `Hd90` was absent, although the loading cell explicitly supports
# that case.
H = coefhdi

In [16]:
# Bounds of each interval along the `hdi` axis.
hdi_lower = H.min(dim='hdi')
hdi_upper = H.max(dim='hdi')

# min(upper, max(lower, 0)): 0 when the interval contains 0, otherwise the
# bound of the interval that lies closest to 0.
H_clipped = hdi_upper.clip(max=hdi_lower.clip(min=0))

In [17]:
# Add one `<coef>_clipped` column per coefficient to the data source.
# The rows of `ds` follow the gene order of `E`, whereas `H_clipped` comes
# from a different file; select by gene label first so the values cannot be
# silently misaligned (a gene missing from the HDIs raises a KeyError).
H_clipped_aligned = H_clipped.sel(gene=E.coords['gene'].values)
for coef in H_clipped_aligned.coords['coef'].values.tolist():
    ds.data['%s_clipped' % coef] = H_clipped_aligned.sel(coef=coef).values

In [18]:
# List the columns again to check that the `*_clipped` columns were added.
ds.data.keys()

dict_keys(['gene', 'Intercept', 'libtype[S.OOPS]', 'act[S.no]', 'cl[S.no]', 'libtype[S.OOPS]:act[S.no]', 'libtype[S.OOPS]:cl[S.no]', 'act[S.no]:cl[S.no]', 'libtype[S.OOPS]:act[S.no]:cl[S.no]', 'OOPS_abundance', 'activation', 'activation_FP', 'RBPness', 'lt_deltaRBPness', 'act_deltaRBPness', 'act_deltaCL', 'symbols', 'size', 'Intercept_clipped', 'libtype[S.OOPS]_clipped', 'act[S.no]_clipped', 'cl[S.no]_clipped', 'libtype[S.OOPS]:act[S.no]_clipped', 'libtype[S.OOPS]:cl[S.no]_clipped', 'act[S.no]:cl[S.no]_clipped', 'libtype[S.OOPS]:act[S.no]:cl[S.no]_clipped', 'OOPS_abundance_clipped', 'activation_clipped', 'activation_FP_clipped', 'RBPness_clipped', 'lt_deltaRBPness_clipped', 'act_deltaRBPness_clipped', 'act_deltaCL_clipped'])

In [19]:
# Tooltip rows shown on hover: cursor position, gene symbol and the value of
# the `gene` column of the data source.
hover_tooltips = [
    ("(x,y)", "($x, $y)"),
    ('symbols', '@symbols'),
    ('hgnc_ids', '@gene'),
]

# Label both axes with the plotted columns so the figure can be read on its
# own, without looking at the code.
f = figure(
    width=800,
    height=740,
    tooltips=hover_tooltips,
    x_axis_label='activation',
    y_axis_label='RBPness',
)

# One semi-transparent marker per row of `ds`; marker size comes from the
# precomputed `size` column.
f.scatter(
    x='activation',
    y='RBPness',
    size='size',
    alpha=0.2,
    source=ds,
)

show(f)

---

### Export tables

#### Raw data

In [20]:
# Intensity table; the first column of the TSV becomes the index.
intensities = pd.read_csv(
    "pipeline_activation/intensity-values.tsv",
    index_col=0,
    sep="\t",
)

# The `w_valid` column, kept under its own name.
w_valid = intensities.loc[:, 'w_valid']

In [21]:
# The sum of `w_valid` must equal the number of genes in the model output.
assert intensities["w_valid"].sum() == map_ds.sizes['gene']

In [22]:
# Sample sheet; its `sample_label` column names the sample columns to export.
sample_subset = pd.read_csv(
    "pipeline_activation/samplesheet.tsv",
    sep="\t",
)

In [23]:
# Keep only the intensity columns of the samples listed in the sample sheet.
obs = intensities.loc[:, sample_subset['sample_label'].tolist()]

In [24]:
# (rows, columns) of the selected observations.
obs.shape

(7531, 24)

In [25]:
# Columns of `intensities` whose name starts with one of the cutoff prefixes
# (`str.startswith` accepts a tuple of alternatives).
cutoff_cols = [
    col
    for col in intensities.columns
    if col.startswith(("n_donor", "n_cond", "w_"))
]

In [26]:
# Create (or overwrite) the workbook and write the observed intensities,
# joined on the index with the `symbols` column, as its first sheet.
with pd.ExcelWriter(
    "pipeline_activation/raw/observed-data.xlsx",
    mode="w",
    engine="openpyxl",
) as excel_writer:
    (
        intensities[['symbols']]
        .merge(obs, left_index=True, right_index=True)
        .to_excel(excel_writer, sheet_name="raw_observations")
    )

In [27]:
# Append the cutoff columns, joined on the index with the `symbols` column,
# as a "cutoffs" sheet of the existing workbook.
# `if_sheet_exists="replace"` keeps this cell re-runnable: in append mode the
# default ("error") raises a ValueError as soon as the sheet already exists,
# i.e. on every second execution of this cell.
with pd.ExcelWriter(
    "pipeline_activation/raw/observed-data.xlsx",
    mode="a",
    engine="openpyxl",
    if_sheet_exists="replace",
) as excel_writer:
    pd.merge(
        intensities[['symbols']],
        intensities[cutoff_cols],
        left_index=True, right_index=True,
    ).to_excel(excel_writer, sheet_name="cutoffs")

#### HDIs (and clipped HDIs, MAP)

In [28]:
# HDIs of the primary and derived coefficients on one `coef` axis.
# Reuse `coefhdi`, which was assembled from the same `H90`/`Hd90` arrays when
# the HDI file was loaded: rebuilding the concat here raised a KeyError
# whenever `Hd90` was absent, although the loading cell explicitly supports
# that case.
V = coefhdi

In [29]:
# Wide view of the HDIs: one row per gene, one column per (coef, hdi) pair.
(
    V
    .to_dataframe()
    .iloc[:, 0]
    .reset_index()
    .pivot(columns=['coef', 'hdi'], index='gene')
)

Unnamed: 0_level_0,H90,H90,H90,H90,H90,H90,H90,H90,H90,H90,H90,H90,H90,H90,H90,H90,H90,H90,H90,H90,H90
coef,Intercept,Intercept,libtype[S.OOPS],libtype[S.OOPS],act[S.no],act[S.no],cl[S.no],cl[S.no],libtype[S.OOPS]:act[S.no],libtype[S.OOPS]:act[S.no],...,activation_FP,activation_FP,RBPness,RBPness,lt_deltaRBPness,lt_deltaRBPness,act_deltaRBPness,act_deltaRBPness,act_deltaCL,act_deltaCL
hdi,lower,higher,lower,higher,lower,higher,lower,higher,lower,higher,...,lower,higher,lower,higher,lower,higher,lower,higher,lower,higher
gene,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
HGNC:10011,-1.473887,-0.290737,-1.531506,-0.346823,-0.396788,0.365119,-0.385455,0.395676,-0.376958,0.385011,...,-0.005996,0.019663,-0.403183,0.376306,-0.393809,0.385750,-0.401524,0.350636,-0.277874,0.273487
HGNC:10018,-0.074719,-0.039954,0.081274,0.116365,-0.007024,0.027709,0.001778,0.035273,-0.002341,0.032676,...,-0.009883,0.015494,-0.033745,-0.010507,-0.041791,-0.006252,0.002952,0.027136,-0.007422,0.009743
HGNC:10019,-0.003793,0.037381,-0.058744,-0.016545,-0.016645,0.025392,-0.027906,0.014539,-0.013640,0.027719,...,-0.011697,0.014030,-0.007431,0.025854,-0.007440,0.034314,-0.009512,0.023177,-0.008673,0.010504
HGNC:10021,-1.978536,-0.753959,0.370872,1.548211,-0.599722,0.288897,-0.287116,0.596800,-0.637198,0.264749,...,-0.416180,0.417041,-0.306682,-0.006308,-0.595632,0.281566,-0.016617,0.282590,0.005891,0.300528
HGNC:10059,-0.397919,-0.205495,-0.088588,0.104853,0.007234,0.201896,-0.158249,0.035565,-0.018566,0.174731,...,-0.025309,0.001847,-0.031598,0.156872,-0.025202,0.166340,-0.013013,0.176161,-0.018166,0.006939
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HGNC:9986,-0.714573,-0.147470,-0.560533,-0.001271,-0.166423,0.197937,-0.151347,0.210177,-0.155623,0.207277,...,-0.007247,0.017544,-0.187422,0.173813,-0.178031,0.182721,0.017443,0.578781,-0.000657,0.314784
HGNC:9988,-2.058675,-0.753144,-1.230291,0.035003,-0.643441,0.262843,-0.585354,0.354382,-0.292260,0.629644,...,0.000054,0.311090,-0.425615,0.426720,-0.607664,0.332691,-0.404467,0.424455,-0.267009,0.328856
HGNC:9992,0.122858,0.160806,-0.115942,-0.079221,-0.022518,0.013853,0.000142,0.036999,-0.019726,0.016409,...,-0.011657,0.014277,-0.032847,-0.006433,-0.037439,-0.000450,-0.017071,0.007871,-0.009931,0.008773
HGNC:9996,-0.262571,-0.004778,-0.296947,-0.041889,-0.026197,0.228619,-0.248469,0.004731,-0.034139,0.220902,...,-0.018258,0.007504,0.003644,0.251759,-0.001266,0.249936,-0.004612,0.245232,-0.001444,0.025619


In [30]:
# Coefficients that are NOT clipped towards 0 below; for these the median of
# the two HDI bounds is reported instead.
UNCLIPPED_COEFS = ['Intercept']

In [31]:
# Split the coefficients into those reported as-is and those to be clipped.
all_coefs = V.coords['coef'].values
V_unclipped_part = V.sel(coef=[c for c in all_coefs if c in UNCLIPPED_COEFS])
V_clippable_part = V.sel(coef=[c for c in all_coefs if c not in UNCLIPPED_COEFS])

V_clipped = xarray.concat(
    [
        # Unclipped coefficients: median over the `hdi` axis.
        V_unclipped_part.median(dim='hdi'),
        # Clipped coefficients: min(upper, max(lower, 0)), i.e. 0 when the
        # interval contains 0, otherwise the bound closest to 0.
        V_clippable_part.max(dim='hdi').clip(
            max=V_clippable_part.min(dim='hdi').clip(min=0)
        ),
    ],
    dim='coef',
)

In [32]:
# Wide view of the clipped values: one row per gene, one column per coef.
(
    V_clipped
    .to_dataframe()
    .iloc[:, 0]
    .reset_index()
    .pivot(columns=['coef'], index='gene')
)

Unnamed: 0_level_0,H90,H90,H90,H90,H90,H90,H90,H90,H90,H90,H90,H90,H90,H90,H90
coef,Intercept,OOPS_abundance,RBPness,act[S.no],act[S.no]:cl[S.no],act_deltaCL,act_deltaRBPness,activation,activation_FP,cl[S.no],libtype[S.OOPS],libtype[S.OOPS]:act[S.no],libtype[S.OOPS]:act[S.no]:cl[S.no],libtype[S.OOPS]:cl[S.no],lt_deltaRBPness
gene,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
HGNC:10011,-0.882312,-0.338228,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.346823,0.000000,0.000000,0.000000,0.000000
HGNC:10018,-0.057336,0.008216,-0.010507,0.000000,0.000559,0.000000,0.002952,0.000000,0.000000,0.001778,0.081274,0.000000,0.000000,0.006252,-0.006252
HGNC:10019,0.016794,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.016545,0.000000,0.000000,0.000000,0.000000
HGNC:10021,-1.366247,-0.052117,-0.006308,0.000000,0.000000,0.005891,0.000000,0.000000,0.000000,0.000000,0.370872,0.000000,0.000000,0.000000,0.000000
HGNC:10059,-0.301707,-0.052792,0.000000,0.007234,0.000000,0.000000,0.000000,-0.007234,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HGNC:9986,-0.431021,-0.073296,0.000000,0.000000,0.018471,0.000000,0.017443,0.000000,0.000000,0.000000,-0.001271,0.000000,0.022535,0.000000,0.000000
HGNC:9988,-1.405910,-0.357424,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000054,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
HGNC:9992,0.141832,0.010184,-0.006433,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000142,-0.079221,0.000000,0.000000,0.000450,-0.000450
HGNC:9996,-0.133675,-0.025378,0.003644,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.041889,0.000000,0.000000,0.000000,0.000000


In [33]:
# Create (or overwrite) the model-output workbook and write the clipped
# values, joined on the index with the gene symbols, as its first sheet.
with pd.ExcelWriter(
    "pipeline_activation/model-output/oops_2022_model_output.xlsx",
    mode="w",
    engine="openpyxl",
) as excel_writer:
    (
        hgnc_metadata[['symbols']]
        .merge(
            (
                V_clipped
                .to_dataframe()
                .iloc[:, 0]
                .reset_index()
                .pivot(columns=['coef'], index='gene')
                # Flatten the column MultiIndex before merging.
                .pipe(lambda df: df.set_axis(df.columns.to_flat_index(), axis=1))
            ),
            left_index=True,
            right_index=True,
        )
        .to_excel(excel_writer, sheet_name="clipped_hdis")
    )

In [34]:
# Append the full HDIs (one column per (coef, hdi) pair), joined on the index
# with the gene symbols, as an "hdis" sheet of the existing workbook.
# `if_sheet_exists="replace"` keeps this cell re-runnable: in append mode the
# default ("error") raises a ValueError once the sheet already exists.
with pd.ExcelWriter(
    "pipeline_activation/model-output/oops_2022_model_output.xlsx",
    mode="a",
    engine="openpyxl",
    if_sheet_exists="replace",
) as excel_writer:
    pd.merge(
        hgnc_metadata[['symbols']],
        (
            V
            .to_dataframe()
            .iloc[:, 0]
            .reset_index()
            .pivot(columns=['coef', 'hdi'], index='gene')
            # Flatten the column MultiIndex before merging.
            .pipe(lambda df: df.set_axis(df.columns.to_flat_index(), axis=1))
        ),
        left_index=True, right_index=True,
    ).to_excel(excel_writer, sheet_name="hdis")

In [35]:
# Append the values of `E` (column renamed to "MAP"), joined on the index
# with the gene symbols, as a "MAP" sheet of the existing workbook.
# `if_sheet_exists="replace"` keeps this cell re-runnable: in append mode the
# default ("error") raises a ValueError once the sheet already exists.
with pd.ExcelWriter(
    "pipeline_activation/model-output/oops_2022_model_output.xlsx",
    mode="a",
    engine="openpyxl",
    if_sheet_exists="replace",
) as excel_writer:
    pd.merge(
        hgnc_metadata[['symbols']],
        (
            E
            .to_dataframe()
            .iloc[:, 0]
            .rename("MAP")
            .reset_index()
            .pivot(columns=['coef'], index='gene')
            # Flatten the column MultiIndex before merging.
            .pipe(lambda df: df.set_axis(df.columns.to_flat_index(), axis=1))
        ),
        left_index=True, right_index=True,
    ).to_excel(excel_writer, sheet_name="MAP")

---